From b15474988e7c376227934a1c70439d8fb6c26354 Mon Sep 17 00:00:00 2001 From: "Angel A. Mata" Date: Fri, 22 May 2026 10:32:45 -0700 Subject: [PATCH 1/6] perf(onboard): operationalize profiling traces Add opt-in trace artifacts across onboard and inference validation so maintainers can identify dominant latency phases from local or CI runs. --- .github/workflows/e2e-branch-validation.yaml | 1 + .github/workflows/regression-e2e.yaml | 9 + src/lib/adapters/http/probe.ts | 72 +++- src/lib/inference/onboard-probes.ts | 42 +- src/lib/onboard.ts | 396 ++++++++++++------- src/lib/onboard/gateway-http-readiness.ts | 90 +++-- src/lib/onboard/gateway-tcp-readiness.ts | 13 + src/lib/trace.test.ts | 127 ++++++ src/lib/trace.ts | 300 ++++++++++++++ test/e2e/brev-e2e.test.ts | 1 + trace-demo.json | 132 +++++++ 11 files changed, 1003 insertions(+), 180 deletions(-) create mode 100644 src/lib/trace.test.ts create mode 100644 src/lib/trace.ts create mode 100644 trace-demo.json diff --git a/.github/workflows/e2e-branch-validation.yaml b/.github/workflows/e2e-branch-validation.yaml index 0a35f70b1d..c041da7006 100644 --- a/.github/workflows/e2e-branch-validation.yaml +++ b/.github/workflows/e2e-branch-validation.yaml @@ -322,6 +322,7 @@ jobs: 'set +e mkdir -p /tmp/nc-debug cp /tmp/nemoclaw-onboard.log /tmp/nc-debug/ 2>/dev/null || true + cp -R /tmp/nemoclaw-traces /tmp/nc-debug/traces 2>/dev/null || true timeout 15s openshell sandbox list > /tmp/nc-debug/sandbox-list.txt 2>&1 timeout 15s openshell gateway status > /tmp/nc-debug/gateway-status.txt 2>&1 timeout 15s docker ps -a > /tmp/nc-debug/docker-ps.txt 2>&1 diff --git a/.github/workflows/regression-e2e.yaml b/.github/workflows/regression-e2e.yaml index 29409fb1bd..3a18ae879a 100644 --- a/.github/workflows/regression-e2e.yaml +++ b/.github/workflows/regression-e2e.yaml @@ -227,6 +227,7 @@ jobs: env: NEMOCLAW_NON_INTERACTIVE: "1" NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE: "1" + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces run: bash test/e2e/test-onboard-inference-smoke.sh - name: Upload onboard inference smoke logs on failure @@ -239,6 +240,14 @@ jobs: /tmp/nemoclaw-e2e-onboard-inference-smoke-node.log if-no-files-found: ignore + - name: Upload onboard profiling traces + if: always() + uses: actions/upload-artifact@v4 + with: + name: onboard-inference-smoke-traces + path: /tmp/nemoclaw-traces/ + if-no-files-found: ignore + # ── Gateway drift preflight E2E ───────────────────────────── # Coverage guard for #3399 / #3423. A stale OpenShell gateway image can # make sandbox-state RPCs fail with protobuf invalid-wire decode errors. diff --git a/src/lib/adapters/http/probe.ts b/src/lib/adapters/http/probe.ts index dcc08b187d..c92a32fde6 100644 --- a/src/lib/adapters/http/probe.ts +++ b/src/lib/adapters/http/probe.ts @@ -13,6 +13,7 @@ import { isErrnoException } from "../../core/errno"; import { compactText } from "../../core/url-utils"; import type { ProbeResult } from "../../onboard/types"; import { ROOT } from "../../state/paths"; +import { addTraceEvent, withTraceSpan } from "../../trace"; export type CurlProbeResult = ProbeResult; @@ -68,6 +69,33 @@ export function getCurlTimingArgs(): string[] { return ["--connect-timeout", "10", "--max-time", "60"]; } +function sanitizeCurlUrl(value: string): string { + try { + const url = new URL(value); + url.username = ""; + url.password = ""; + for (const key of [...url.searchParams.keys()]) { + url.searchParams.set(key, ""); + } + url.hash = ""; + return url.toString(); + } catch { + return value.replace(/(Bearer\s+)\S+/gi, "$1"); + } +} + +function getCurlProbeTraceAttributes(argv: string[], opts: CurlProbeOptions): Record { + const url = argv.at(-1) || ""; + const methodIndex = argv.findIndex((arg) => arg === "-X" || arg === "--request"); + const method = + methodIndex >= 0 && argv[methodIndex + 1] ? argv[methodIndex + 1].toUpperCase() : "POST"; + return { + "http.url": sanitizeCurlUrl(String(url)), + "http.request.method": method, + "process.timeout_ms": opts.timeoutMs ?? 30_000, + }; +} + export function summarizeCurlFailure(curlStatus = 0, stderr = "", body = ""): string { const detail = compactText(stderr || body); return detail @@ -130,6 +158,12 @@ export function summarizeProbeFailure(body = "", status = 0, curlStatus = 0, std } export function runCurlProbe(argv: string[], opts: CurlProbeOptions = {}): CurlProbeResult { + return withTraceSpan("nemoclaw.inference.curl_probe", getCurlProbeTraceAttributes(argv, opts), () => + runCurlProbeImpl(argv, opts), + ); +} + +function runCurlProbeImpl(argv: string[], opts: CurlProbeOptions = {}): CurlProbeResult { const bodyFile = secureTempFile("nemoclaw-curl-probe", ".json"); try { const args = [...argv]; @@ -154,7 +188,7 @@ export function runCurlProbe(argv: string[], opts: CurlProbeOptions = {}): CurlP const errorMessage = compactText( `${result.error.message || String(result.error)} ${String(result.stderr || "")}`, ); - return { + const failure = { ok: false, httpStatus: 0, curlStatus: errorCode, @@ -162,9 +196,11 @@ export function runCurlProbe(argv: string[], opts: CurlProbeOptions = {}): CurlP stderr: errorMessage, message: summarizeProbeFailure(body, 0, errorCode, errorMessage), }; + addTraceEvent("curl_result", { ok: false, http_status: 0, curl_status: errorCode }); + return failure; } const status = Number(String(result.stdout || "").trim()); - return { + const probeResult = { ok: result.status === 0 && status >= 200 && status < 300, httpStatus: Number.isFinite(status) ? status : 0, curlStatus: result.status || 0, @@ -177,9 +213,15 @@ export function runCurlProbe(argv: string[], opts: CurlProbeOptions = {}): CurlP String(result.stderr || ""), ), }; + addTraceEvent("curl_result", { + ok: probeResult.ok, + http_status: probeResult.httpStatus, + curl_status: probeResult.curlStatus, + }); + return probeResult; } catch (error) { const detail = error instanceof Error ? error.message : String(error); - return { + const probeResult = { ok: false, httpStatus: 0, curlStatus: @@ -191,6 +233,8 @@ export function runCurlProbe(argv: string[], opts: CurlProbeOptions = {}): CurlP detail, ), }; + addTraceEvent("curl_result", { ok: false, http_status: 0, curl_status: probeResult.curlStatus }); + return probeResult; } finally { cleanupTempDir(bodyFile, "nemoclaw-curl-probe"); } @@ -218,6 +262,17 @@ function hasChatCompletionsStreamingData(body: string): boolean { export function runChatCompletionsStreamingProbe( argv: string[], opts: CurlProbeOptions = {}, +): CurlProbeResult { + return withTraceSpan( + "nemoclaw.inference.curl_streaming_probe", + getCurlProbeTraceAttributes(argv, opts), + () => runChatCompletionsStreamingProbeImpl(argv, opts), + ); +} + +function runChatCompletionsStreamingProbeImpl( + argv: string[], + opts: CurlProbeOptions = {}, ): CurlProbeResult { const bodyFile = secureTempFile("nemoclaw-chat-streaming-probe", ".sse"); try { @@ -323,6 +378,17 @@ const REQUIRED_STREAMING_EVENTS = ["response.output_text.delta"]; export function runStreamingEventProbe( argv: string[], opts: CurlProbeOptions = {}, +): StreamingProbeResult { + return withTraceSpan( + "nemoclaw.inference.curl_streaming_event_probe", + getCurlProbeTraceAttributes(argv, opts), + () => runStreamingEventProbeImpl(argv, opts), + ); +} + +function runStreamingEventProbeImpl( + argv: string[], + opts: CurlProbeOptions = {}, ): StreamingProbeResult { const bodyFile = secureTempFile("nemoclaw-streaming-probe", ".sse"); try { diff --git a/src/lib/inference/onboard-probes.ts b/src/lib/inference/onboard-probes.ts index 641cd31a66..c21003df81 100644 --- a/src/lib/inference/onboard-probes.ts +++ b/src/lib/inference/onboard-probes.ts @@ -20,6 +20,7 @@ const { runChatCompletionsStreamingProbe, runStreamingEventProbe, } = httpProbe; +const trace = require("../trace"); // ── Helpers ────────────────────────────────────────────────────── @@ -228,16 +229,37 @@ function isCurlTimeout(result) { } function executeProbeWithHttpRetry(probe) { - let result = probe.execute(); - for (const delayMs of HTTP_PROBE_RETRY_DELAYS_MS) { - if (!shouldRetryHttpProbe(result)) break; - console.log( - ` ${probe.name} validation returned HTTP ${result.httpStatus}; retrying in ${Math.round(delayMs / 1000)}s...`, - ); - sleepSync(delayMs); - result = probe.execute(); - } - return result; + return trace.withTraceSpan( + "nemoclaw.inference.validation_probe", + { probe_name: probe.name, api: probe.api || null }, + () => { + let attempt = 1; + let result = probe.execute(); + trace.addTraceEvent("probe_result", { + attempt, + ok: result.ok, + http_status: result.httpStatus, + curl_status: result.curlStatus, + }); + for (const delayMs of HTTP_PROBE_RETRY_DELAYS_MS) { + if (!shouldRetryHttpProbe(result)) break; + console.log( + ` ${probe.name} validation returned HTTP ${result.httpStatus}; retrying in ${Math.round(delayMs / 1000)}s...`, + ); + trace.addTraceEvent("probe_retry_sleep", { delay_ms: delayMs, http_status: result.httpStatus }); + sleepSync(delayMs); + attempt += 1; + result = probe.execute(); + trace.addTraceEvent("probe_result", { + attempt, + ok: result.ok, + http_status: result.httpStatus, + curl_status: result.curlStatus, + }); + } + return result; + }, + ); } // ── Responses API probe ────────────────────────────────────────── diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index c9543bbb8f..9a51f1aadc 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -78,6 +78,7 @@ const { }: typeof import("./onboard/gateway-gpu-passthrough") = require("./onboard/gateway-gpu-passthrough"); const { syncPresetSelection }: typeof import("./onboard/policy-preset-sync") = require("./onboard/policy-preset-sync"); const { maybeForceE2eStepFailure }: typeof import("./onboard/e2e-failure-injection") = require("./onboard/e2e-failure-injection"); +const trace: typeof import("./trace") = require("./trace"); const { gatherWechatConfig, hasWechatConfigDrift, @@ -1690,13 +1691,18 @@ function upsertProvider( baseUrl: string | null, env: NodeJS.ProcessEnv = {}, ) { - const result = onboardProviders.upsertProvider( - name, - type, - credentialEnv, - baseUrl, - env, - runOpenshell, + const result = trace.withTraceSpan( + "nemoclaw.policy.provider_upsert", + { provider: name, provider_type: type, credential_env: credentialEnv, base_url: baseUrl }, + () => + onboardProviders.upsertProvider( + name, + type, + credentialEnv, + baseUrl, + env, + runOpenshell, + ), ); if (result.ok && credentialEnv) { const stagedValue = stagedLegacyValues.get(credentialEnv); @@ -1759,7 +1765,11 @@ function verifyDirectSandboxGpu(sandboxName: string): void { } function upsertMessagingProviders(tokenDefs: MessagingTokenDef[]) { - const upserted = onboardProviders.upsertMessagingProviders(tokenDefs, runOpenshell); + const upserted = trace.withTraceSpan( + "nemoclaw.policy.messaging_provider_upsert", + { provider_count: tokenDefs.length, providers: tokenDefs.map(({ name }) => name) }, + () => onboardProviders.upsertMessagingProviders(tokenDefs, runOpenshell), + ); // upsertMessagingProviders process.exits on failure, so reaching this // point means every entry in tokenDefs that had a token was registered. // Mark migrated only when the registered token equals the staged legacy @@ -2132,7 +2142,11 @@ async function validateOpenAiLikeSelection( } = {}, ): Promise { const apiKey = credentialEnv ? getCredential(credentialEnv) : ""; - const probe = probeOpenAiLikeEndpoint(endpointUrl, model, apiKey, options); + const probe = trace.withTraceSpan( + "nemoclaw.inference.validation.openai_like", + { provider_label: label, model, credential_env: credentialEnv, endpoint_url: endpointUrl }, + () => probeOpenAiLikeEndpoint(endpointUrl, model, apiKey, options), + ); if (!probe.ok) { console.error(` ${label} endpoint validation failed.`); console.error(` ${probe.message}`); @@ -2168,7 +2182,11 @@ async function validateAnthropicSelectionWithRetryMessage( helpUrl: string | null = null, ): Promise { const apiKey = getCredential(credentialEnv); - const probe = probeAnthropicEndpoint(endpointUrl, model, apiKey); + const probe = trace.withTraceSpan( + "nemoclaw.inference.validation.anthropic", + { provider_label: label, model, credential_env: credentialEnv, endpoint_url: endpointUrl }, + () => probeAnthropicEndpoint(endpointUrl, model, apiKey), + ); if (!probe.ok) { console.error(` ${label} endpoint validation failed.`); console.error(` ${probe.message}`); @@ -2199,11 +2217,16 @@ async function validateCustomOpenAiLikeSelection( helpUrl: string | null = null, ): Promise { const apiKey = getCredential(credentialEnv); - const probe = probeOpenAiLikeEndpoint(endpointUrl, model, apiKey, { - requireResponsesToolCalling: true, - skipResponsesProbe: shouldForceCompletionsApi(process.env.NEMOCLAW_PREFERRED_API), - probeStreaming: true, - }); + const probe = trace.withTraceSpan( + "nemoclaw.inference.validation.custom_openai_like", + { provider_label: label, model, credential_env: credentialEnv, endpoint_url: endpointUrl }, + () => + probeOpenAiLikeEndpoint(endpointUrl, model, apiKey, { + requireResponsesToolCalling: true, + skipResponsesProbe: shouldForceCompletionsApi(process.env.NEMOCLAW_PREFERRED_API), + probeStreaming: true, + }), + ); if (probe.ok) { if (probe.note) { console.log(` ℹ ${probe.note}`); @@ -2238,7 +2261,11 @@ async function validateCustomAnthropicSelection( helpUrl: string | null = null, ): Promise { const apiKey = getCredential(credentialEnv); - const probe = probeAnthropicEndpoint(endpointUrl, model, apiKey); + const probe = trace.withTraceSpan( + "nemoclaw.inference.validation.custom_anthropic", + { provider_label: label, model, credential_env: credentialEnv, endpoint_url: endpointUrl }, + () => probeAnthropicEndpoint(endpointUrl, model, apiKey), + ); if (probe.ok) { console.log(` ${probe.label} available — ${agentProductName()} will use ${probe.api}.`); return { ok: true, api: probe.api }; @@ -3197,36 +3224,49 @@ async function ensureNamedCredential( } function waitForSandboxReady(sandboxName: string, attempts = 10, delaySeconds = 2): boolean { - for (let i = 0; i < attempts; i += 1) { - const list = runCaptureOpenshell(["sandbox", "list"], { ignoreError: true }); - if (isSandboxReady(list, sandboxName)) return true; + return trace.withTraceSpan( + "nemoclaw.sandbox.readiness_wait", + { sandbox_name: sandboxName, attempts, delay_seconds: delaySeconds }, + () => { + for (let i = 0; i < attempts; i += 1) { + const list = runCaptureOpenshell(["sandbox", "list"], { ignoreError: true }); + if (isSandboxReady(list, sandboxName)) { + trace.addTraceEvent("ready", { attempt: i + 1, source: "sandbox_list" }); + return true; + } - // Package-managed OpenShell gateways report readiness through - // `sandbox list`; legacy Kubernetes gateways may still expose pod state. - if (isLinuxDockerDriverGatewayEnabled()) { - if (i < attempts - 1) sleep(delaySeconds); - continue; - } - const podPhase = runCaptureOpenshell( - [ - "doctor", - "exec", - "--", - "kubectl", - "-n", - "openshell", - "get", - "pod", - sandboxName, - "-o", - "jsonpath={.status.phase}", - ], - { ignoreError: true }, - ); - if (podPhase === "Running") return true; - sleep(delaySeconds); - } - return false; + // Package-managed OpenShell gateways report readiness through + // `sandbox list`; legacy Kubernetes gateways may still expose pod state. + if (isLinuxDockerDriverGatewayEnabled()) { + if (i < attempts - 1) sleep(delaySeconds); + continue; + } + const podPhase = runCaptureOpenshell( + [ + "doctor", + "exec", + "--", + "kubectl", + "-n", + "openshell", + "get", + "pod", + sandboxName, + "-o", + "jsonpath={.status.phase}", + ], + { ignoreError: true }, + ); + if (podPhase === "Running") { + trace.addTraceEvent("ready", { attempt: i + 1, source: "pod_phase" }); + return true; + } + sleep(delaySeconds); + } + trace.addTraceEvent("not_ready", { attempts }); + return false; + }, + ); } // parsePolicyPresetEnv — see urlUtils import above @@ -5498,15 +5538,27 @@ async function createSandbox( timeoutSecs: sandboxReadyTimeoutSecs, deps: { runOpenshell, runCaptureOpenshell, sleep }, }); - const createResult = await streamSandboxCreate(createCommand, sandboxEnv, { - readyCheck: () => { - const list = runCaptureOpenshell(["sandbox", "list"], { ignoreError: true }); - if (isSandboxReady(list, sandboxName)) return true; - dockerGpuCreatePatch.maybeApplyDuringCreate(); - return false; + const createResult = await trace.withTraceSpan( + "nemoclaw.sandbox.create_stream", + { + sandbox_name: sandboxName, + provider, + model, + timeout_seconds: sandboxReadyTimeoutSecs, + from_dockerfile: Boolean(fromDockerfile), + gpu_enabled: effectiveSandboxGpuConfig.sandboxGpuEnabled, }, - failureCheck: dockerGpuCreatePatch.createFailureMessage, - }); + () => + streamSandboxCreate(createCommand, sandboxEnv, { + readyCheck: () => { + const list = runCaptureOpenshell(["sandbox", "list"], { ignoreError: true }); + if (isSandboxReady(list, sandboxName)) return true; + dockerGpuCreatePatch.maybeApplyDuringCreate(); + return false; + }, + failureCheck: dockerGpuCreatePatch.createFailureMessage, + }), + ); if (initialSandboxPolicy.cleanup && initialSandboxPolicy.cleanup()) { process.removeListener("exit", initialSandboxPolicy.cleanup); @@ -5555,16 +5607,23 @@ async function createSandbox( // without this gate, NemoClaw registers a phantom sandbox that // causes "sandbox not found" on every subsequent connect/status call. console.log(" Waiting for sandbox to become ready..."); - let ready = false; - const readyAttempts = Math.max(1, Math.ceil(sandboxReadyTimeoutSecs / 2)); - for (let i = 0; i < readyAttempts; i++) { - const list = runCaptureOpenshell(["sandbox", "list"], { ignoreError: true }); - if (isSandboxReady(list, sandboxName)) { - ready = true; - break; - } - if (i < readyAttempts - 1) sleep(2); - } + const ready = trace.withTraceSpan( + "nemoclaw.sandbox.readiness_wait", + { sandbox_name: sandboxName, timeout_seconds: sandboxReadyTimeoutSecs }, + () => { + const readyAttempts = Math.max(1, Math.ceil(sandboxReadyTimeoutSecs / 2)); + for (let i = 0; i < readyAttempts; i++) { + const list = runCaptureOpenshell(["sandbox", "list"], { ignoreError: true }); + if (isSandboxReady(list, sandboxName)) { + trace.addTraceEvent("ready", { attempt: i + 1 }); + return true; + } + if (i < readyAttempts - 1) sleep(2); + } + trace.addTraceEvent("not_ready", { attempts: readyAttempts }); + return false; + }, + ); const restoreBackupPath = pendingStateRestore?.manifest?.backupPath ?? pendingStateRestoreBackupPath; @@ -5617,23 +5676,30 @@ async function createSandbox( // Probes /health endpoint and accepts 200 or 401 (device auth) as "alive". // Previously used `curl -sf` which failed on 401, causing false negatives. Fixes #2342. console.log(" Waiting for NemoClaw dashboard to become ready..."); - for (let i = 0; i < 15; i++) { - const readyOutput = runCaptureOpenshell( - ["sandbox", "exec", "-n", sandboxName, "--", "curl", "-so", "/dev/null", "-w", "%{http_code}", - "--max-time", "3", `http://localhost:${effectiveDashboardPort}/health`], - { ignoreError: true }, - ); - const readyCode = parseInt((readyOutput || "").trim(), 10) || 0; - if (readyCode === 200 || readyCode === 401) { - console.log(" ✓ Dashboard is live"); - break; - } - if (i === 14) { - console.warn(" Dashboard taking longer than expected to start. Continuing..."); - } else { - sleep(2); - } - } + trace.withTraceSpan( + "nemoclaw.dashboard.readiness_wait", + { sandbox_name: sandboxName, port: effectiveDashboardPort, attempts: 15 }, + () => { + for (let i = 0; i < 15; i++) { + const readyOutput = runCaptureOpenshell( + ["sandbox", "exec", "-n", sandboxName, "--", "curl", "-so", "/dev/null", "-w", "%{http_code}", + "--max-time", "3", `http://localhost:${effectiveDashboardPort}/health`], + { ignoreError: true }, + ); + const readyCode = parseInt((readyOutput || "").trim(), 10) || 0; + trace.addTraceEvent("dashboard_probe", { attempt: i + 1, http_status: readyCode }); + if (readyCode === 200 || readyCode === 401) { + console.log(" ✓ Dashboard is live"); + return; + } + if (i === 14) { + console.warn(" Dashboard taking longer than expected to start. Continuing..."); + } else { + sleep(2); + } + } + }, + ); if (effectiveSandboxGpuConfig.sandboxGpuEnabled) { try { @@ -8453,24 +8519,34 @@ async function setupPoliciesWithSelection( sandboxName: string, options: SetupPolicySelectionOptions = {}, ) { - const selectedTier = await setupPoliciesWithSelectionImpl( + const selectedTier = await trace.withTraceSpan( + "nemoclaw.policy.application", { - policies, - tiers, - localInferenceProviders: LOCAL_INFERENCE_PROVIDERS, - step, - note, - isNonInteractive, - waitForSandboxReady, - syncPresetSelection, - selectPolicyTier, - setPolicyTier: (sandbox, tierName) => registry.updateSandbox(sandbox, { policyTier: tierName }), - selectTierPresetsAndAccess, - parsePolicyPresetEnv, - env: process.env, + sandbox_name: sandboxName, + selected_presets: options.selectedPresets ?? null, + provider: options.provider ?? null, + web_search_supported: options.webSearchSupported ?? null, }, - sandboxName, - options, + () => + setupPoliciesWithSelectionImpl( + { + policies, + tiers, + localInferenceProviders: LOCAL_INFERENCE_PROVIDERS, + step, + note, + isNonInteractive, + waitForSandboxReady, + syncPresetSelection, + selectPolicyTier, + setPolicyTier: (sandbox, tierName) => registry.updateSandbox(sandbox, { policyTier: tierName }), + selectTierPresetsAndAccess, + parsePolicyPresetEnv, + env: process.env, + }, + sandboxName, + options, + ), ); return selectedTier; } @@ -8968,6 +9044,15 @@ function skippedStepMessage( async function onboard(opts: OnboardOptions = {}): Promise { setOnboardBrandingAgent(opts.agent || process.env.NEMOCLAW_AGENT || null); + const traceCollector = trace.getTraceCollector(); + const onboardTraceSpan = traceCollector?.startSpan("nemoclaw.onboard", { + resume: opts.resume === true, + fresh: opts.fresh === true, + non_interactive: opts.nonInteractive === true || process.env.NEMOCLAW_NON_INTERACTIVE === "1", + agent: opts.agent || process.env.NEMOCLAW_AGENT || null, + trace_file_enabled: Boolean(process.env.NEMOCLAW_TRACE_FILE), + trace_dir_enabled: Boolean(process.env.NEMOCLAW_TRACE_DIR), + }); NON_INTERACTIVE = opts.nonInteractive || process.env.NEMOCLAW_NON_INTERACTIVE === "1"; RECREATE_SANDBOX = opts.recreateSandbox || process.env.NEMOCLAW_RECREATE_SANDBOX === "1"; AUTO_YES = opts.autoYes === true || process.env.NEMOCLAW_YES === "1"; @@ -9118,6 +9203,8 @@ async function onboard(opts: OnboardOptions = {}): Promise { }; process.once("exit", releaseOnboardLock); + let completed = false; + let traceCompleted = false; try { let session: Session | null; let selectedMessagingChannels: string[] = []; @@ -9249,7 +9336,6 @@ async function onboard(opts: OnboardOptions = {}): Promise { process.exit(1); } - let completed = false; process.once("exit", (code) => { if (!completed && code !== 0) { const current = onboardSession.loadSession(); @@ -9342,7 +9428,9 @@ async function onboard(opts: OnboardOptions = {}): Promise { validateSandboxGpuPreflight(resumeSandboxGpuConfig); } else { startRecordedStep("preflight"); - gpu = await preflight({ ...opts, optedOutGpuPassthrough: opts.noGpu === true }); + gpu = await trace.withTraceSpan("nemoclaw.onboard.phase.preflight", {}, () => + preflight({ ...opts, optedOutGpuPassthrough: opts.noGpu === true }), + ); onboardSession.markStepComplete("preflight"); } const sandboxGpuConfig = resolveSandboxGpuConfig(gpu, { @@ -9503,7 +9591,11 @@ async function onboard(opts: OnboardOptions = {}): Promise { gatewayReuseState = "missing"; } startRecordedStep("gateway"); - await startGateway(gpu, { gpuPassthrough }); + await trace.withTraceSpan( + "nemoclaw.onboard.phase.gateway", + { reuse_state: gatewayReuseState, gpu_passthrough: gpuPassthrough }, + () => startGateway(gpu, { gpuPassthrough }), + ); onboardSession.markStepComplete("gateway"); } @@ -9556,7 +9648,11 @@ async function onboard(opts: OnboardOptions = {}): Promise { // otherwise leave a phantom that `nemoclaw list` resurrects until // manually destroyed. startRecordedStep("provider_selection"); - const selection = await setupNim(gpu, sandboxName, agent); + const selection = await trace.withTraceSpan( + "nemoclaw.onboard.phase.provider_selection", + { sandbox_name: sandboxName, agent: agent?.name ?? null }, + () => setupNim(gpu, sandboxName, agent), + ); model = selection.model; provider = selection.provider; endpointUrl = selection.endpointUrl; @@ -9584,29 +9680,36 @@ async function onboard(opts: OnboardOptions = {}): Promise { console.error(" Inference selection did not yield a provider/model."); process.exit(1); } + const selectedProvider = provider; + const selectedModel = model; process.env.NEMOCLAW_OPENSHELL_BIN = getOpenshellBinary(); const needsBedrockRuntimeAdapter = - provider === "compatible-anthropic-endpoint" && + selectedProvider === "compatible-anthropic-endpoint" && bedrockRuntimeOnboard.needsBedrockRuntimeAdapter(endpointUrl); const resumeInference = !needsBedrockRuntimeAdapter && !forceProviderSelection && resume && - isInferenceRouteReady(provider, model); + isInferenceRouteReady(selectedProvider, selectedModel); if (resumeInference) { - if (provider === hermesProviderAuth.HERMES_PROVIDER_NAME) { + if (selectedProvider === hermesProviderAuth.HERMES_PROVIDER_NAME) { if (!sandboxName) { sandboxName = await promptValidatedSandboxName(agent); } - startRecordedStep("inference", { provider, model }); - const inferenceResult = await setupInference( - sandboxName, - model, - provider, - endpointUrl, - credentialEnv, - hermesAuthMethod, - hermesToolGateways, + startRecordedStep("inference", { provider: selectedProvider, model: selectedModel }); + const inferenceResult = await trace.withTraceSpan( + "nemoclaw.onboard.phase.inference", + { sandbox_name: sandboxName, provider: selectedProvider, model: selectedModel, credential_env: credentialEnv }, + () => + setupInference( + sandboxName, + selectedModel, + selectedProvider, + endpointUrl, + credentialEnv, + hermesAuthMethod, + hermesToolGateways, + ), ); if (inferenceResult?.retry === "selection") { forceProviderSelection = true; @@ -9614,11 +9717,11 @@ async function onboard(opts: OnboardOptions = {}): Promise { } onboardSession.markStepComplete( "inference", - toSessionUpdates({ provider, model, hermesAuthMethod, nimContainer, hermesToolGateways }), + toSessionUpdates({ provider: selectedProvider, model: selectedModel, hermesAuthMethod, nimContainer, hermesToolGateways }), ); break; } - if (isRoutedInferenceProvider(provider)) { + if (isRoutedInferenceProvider(selectedProvider)) { try { await reconcileModelRouter(); } catch (err) { @@ -9628,13 +9731,13 @@ async function onboard(opts: OnboardOptions = {}): Promise { process.exit(1); } } - skippedStepMessage("inference", `${provider} / ${model}`); + skippedStepMessage("inference", `${selectedProvider} / ${selectedModel}`); if (nimContainer && sandboxName) { registry.updateSandbox(sandboxName, { nimContainer }); } onboardSession.markStepComplete( "inference", - toSessionUpdates({ provider, model, hermesAuthMethod, nimContainer, hermesToolGateways }), + toSessionUpdates({ provider: selectedProvider, model: selectedModel, hermesAuthMethod, nimContainer, hermesToolGateways }), ); break; } @@ -9648,8 +9751,8 @@ async function onboard(opts: OnboardOptions = {}): Promise { : formatSandboxBuildEstimateNote(assessHost()); console.log( formatOnboardConfigSummary({ - provider, - model, + provider: selectedProvider, + model: selectedModel, credentialEnv, hermesAuthMethod, webSearchConfig, @@ -9672,14 +9775,19 @@ async function onboard(opts: OnboardOptions = {}): Promise { } startRecordedStep("inference", { provider, model }); - const inferenceResult = await setupInference( - sandboxName, - model, - provider, - endpointUrl, - credentialEnv, - hermesAuthMethod, - hermesToolGateways, + const inferenceResult = await trace.withTraceSpan( + "nemoclaw.onboard.phase.inference", + { sandbox_name: sandboxName, provider: selectedProvider, model: selectedModel, credential_env: credentialEnv }, + () => + setupInference( + sandboxName, + selectedModel, + selectedProvider, + endpointUrl, + credentialEnv, + hermesAuthMethod, + hermesToolGateways, + ), ); delete process.env.NVIDIA_API_KEY; if (inferenceResult?.retry === "selection") { @@ -9864,19 +9972,24 @@ async function onboard(opts: OnboardOptions = {}): Promise { if (fresh) { stopStaleDashboardListenersForSandbox(registry.listSandboxes().sandboxes, sandboxName); } - sandboxName = await createSandbox( - gpu, - model, - provider, - preferredInferenceApi, - sandboxName, - nextWebSearchConfig, - selectedMessagingChannels, - fromDockerfile, - agent, - opts.controlUiPort || null, - sandboxGpuConfig, - hermesToolGateways, + sandboxName = await trace.withTraceSpan( + "nemoclaw.onboard.phase.sandbox", + { sandbox_name: sandboxName, provider, model, agent: agent?.name ?? null }, + () => + createSandbox( + gpu, + model, + provider, + preferredInferenceApi, + sandboxName, + nextWebSearchConfig, + selectedMessagingChannels, + fromDockerfile, + agent, + opts.controlUiPort || null, + sandboxGpuConfig, + hermesToolGateways, + ), ); webSearchConfig = nextWebSearchConfig; registry.updateSandbox(sandboxName, { @@ -10123,8 +10236,13 @@ async function onboard(opts: OnboardOptions = {}): Promise { } printDashboard(sandboxName, model, provider, nimContainer, agent); + traceCompleted = true; } finally { releaseOnboardLock(); + if (onboardTraceSpan) { + traceCollector?.endSpan(onboardTraceSpan, traceCompleted ? "OK" : "ERROR"); + trace.flushTrace(traceCompleted ? "OK" : "ERROR"); + } } } diff --git a/src/lib/onboard/gateway-http-readiness.ts b/src/lib/onboard/gateway-http-readiness.ts index 4647e9cb7c..bec301c289 100644 --- a/src/lib/onboard/gateway-http-readiness.ts +++ b/src/lib/onboard/gateway-http-readiness.ts @@ -16,6 +16,7 @@ import http2 from "node:http2"; import { getGatewayHttpEndpoint } from "../core/gateway-address"; import { GATEWAY_PORT } from "../core/ports"; import { sleepSeconds } from "../core/wait"; +import { addTraceEvent, withTraceSpan } from "../trace"; import { envInt } from "./env"; /** @@ -73,6 +74,18 @@ export function isGatewayHttpReady( timeoutMs = ISGATEWAY_HTTP_READY_DEFAULT_TIMEOUT_MS, url = `${getGatewayHttpEndpoint(GATEWAY_PORT)}/`, method: "GET" | "POST" = "GET", +): Promise { + return withTraceSpan( + "nemoclaw.gateway.http_probe", + { timeout_ms: timeoutMs, url, method }, + () => isGatewayHttpReadyImpl(timeoutMs, url, method), + ); +} + +function isGatewayHttpReadyImpl( + timeoutMs = ISGATEWAY_HTTP_READY_DEFAULT_TIMEOUT_MS, + url = `${getGatewayHttpEndpoint(GATEWAY_PORT)}/`, + method: "GET" | "POST" = "GET", ): Promise { const effectiveTimeout = Number.isFinite(timeoutMs) && timeoutMs > 0 @@ -103,6 +116,17 @@ export function isGatewayHttpReady( export function isDockerDriverGatewayHttpReady( timeoutMs = ISGATEWAY_HTTP_READY_DEFAULT_TIMEOUT_MS, url = `${getGatewayHttpEndpoint(GATEWAY_PORT)}/openshell.v1.OpenShell/Health`, +): Promise { + return withTraceSpan( + "nemoclaw.gateway.docker_driver_http_probe", + { timeout_ms: timeoutMs, url }, + () => isDockerDriverGatewayHttpReadyImpl(timeoutMs, url), + ); +} + +function isDockerDriverGatewayHttpReadyImpl( + timeoutMs = ISGATEWAY_HTTP_READY_DEFAULT_TIMEOUT_MS, + url = `${getGatewayHttpEndpoint(GATEWAY_PORT)}/openshell.v1.OpenShell/Health`, ): Promise { const effectiveTimeout = Number.isFinite(timeoutMs) && timeoutMs > 0 @@ -205,33 +229,43 @@ export function isDockerDriverGatewayHttpReady( export async function waitForGatewayHttpReady( opts: WaitForGatewayHttpReadyOpts = {}, ): Promise { - const probe = opts.probe ?? (() => isGatewayHttpReady()); - const sleeper = opts.sleeper ?? sleepSeconds; - const config = getGatewayReuseHealthWaitConfig(); - // Always probe at least once, even if the caller passed a non-positive - // maxAttempts. Non-finite (NaN, Infinity) values fall back to safe defaults - // — Math.max alone would let Infinity through and hang the loop, and NaN - // would propagate into sleeper(). - const rawAttempts = opts.maxAttempts ?? config.count; - const maxAttempts = Number.isFinite(rawAttempts) ? Math.max(1, Math.round(rawAttempts)) : 1; - const rawInterval = opts.intervalSeconds ?? config.interval; - const intervalSeconds = Number.isFinite(rawInterval) ? Math.max(0, rawInterval) : 0; - - // The default probe (isGatewayHttpReady) never rejects, but injected probes - // can. Treat a rejection as "not ready this attempt" so we exhaust the - // budget instead of bailing on the first transient failure. - const safeProbe = async (): Promise => { - try { - return await probe(); - } catch { - return false; - } - }; + return withTraceSpan("nemoclaw.gateway.http_readiness_wait", {}, async () => { + const probe = opts.probe ?? (() => isGatewayHttpReady()); + const sleeper = opts.sleeper ?? sleepSeconds; + const config = getGatewayReuseHealthWaitConfig(); + // Always probe at least once, even if the caller passed a non-positive + // maxAttempts. Non-finite (NaN, Infinity) values fall back to safe defaults + // — Math.max alone would let Infinity through and hang the loop, and NaN + // would propagate into sleeper(). + const rawAttempts = opts.maxAttempts ?? config.count; + const maxAttempts = Number.isFinite(rawAttempts) ? Math.max(1, Math.round(rawAttempts)) : 1; + const rawInterval = opts.intervalSeconds ?? config.interval; + const intervalSeconds = Number.isFinite(rawInterval) ? Math.max(0, rawInterval) : 0; + addTraceEvent("wait_config", { max_attempts: maxAttempts, interval_seconds: intervalSeconds }); - if (await safeProbe()) return true; - for (let attempt = 1; attempt < maxAttempts; attempt++) { - sleeper(intervalSeconds); - if (await safeProbe()) return true; - } - return false; + // The default probe (isGatewayHttpReady) never rejects, but injected probes + // can. Treat a rejection as "not ready this attempt" so we exhaust the + // budget instead of bailing on the first transient failure. + const safeProbe = async (): Promise => { + try { + return await probe(); + } catch { + return false; + } + }; + + if (await safeProbe()) { + addTraceEvent("ready", { attempt: 1 }); + return true; + } + for (let attempt = 1; attempt < maxAttempts; attempt++) { + sleeper(intervalSeconds); + if (await safeProbe()) { + addTraceEvent("ready", { attempt: attempt + 1 }); + return true; + } + } + addTraceEvent("not_ready", { attempts: maxAttempts }); + return false; + }); } diff --git a/src/lib/onboard/gateway-tcp-readiness.ts b/src/lib/onboard/gateway-tcp-readiness.ts index f0a01e3ec1..0a8a0e3bce 100644 --- a/src/lib/onboard/gateway-tcp-readiness.ts +++ b/src/lib/onboard/gateway-tcp-readiness.ts @@ -52,6 +52,7 @@ import net from "node:net"; import { getGatewayConnectHost } from "../core/gateway-address"; import { GATEWAY_PORT } from "../core/ports"; +import { withTraceSpan } from "../trace"; const ISGATEWAY_TCP_READY_DEFAULT_TIMEOUT_MS = 500; @@ -83,6 +84,18 @@ export function isGatewayTcpReady( port: number = GATEWAY_PORT, timeoutMs: number = ISGATEWAY_TCP_READY_DEFAULT_TIMEOUT_MS, host = getGatewayConnectHost(), +): Promise { + return withTraceSpan( + "nemoclaw.gateway.tcp_probe", + { host, port, timeout_ms: timeoutMs }, + () => isGatewayTcpReadyImpl(port, timeoutMs, host), + ); +} + +function isGatewayTcpReadyImpl( + port: number = GATEWAY_PORT, + timeoutMs: number = ISGATEWAY_TCP_READY_DEFAULT_TIMEOUT_MS, + host = getGatewayConnectHost(), ): Promise { const effectiveTimeout = Number.isFinite(timeoutMs) && timeoutMs > ISGATEWAY_TCP_READY_MIN_TIMEOUT_MS diff --git a/src/lib/trace.test.ts b/src/lib/trace.test.ts new file mode 100644 index 0000000000..583d6ee778 --- /dev/null +++ b/src/lib/trace.test.ts @@ -0,0 +1,127 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { afterEach, describe, expect, it } from "vitest"; +import { runCurlProbe } from "./adapters/http/probe"; +import { + addTraceEvent, + flushTrace, + resetTraceForTests, + sanitizeTraceAttributes, + withTraceSpan, + type TraceArtifact, +} from "./trace"; + +function withTraceFile(fn: (traceFile: string) => T): T { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-trace-test-")); + const traceFile = path.join(tmpDir, "trace.json"); + process.env.NEMOCLAW_TRACE_FILE = traceFile; + resetTraceForTests(); + return fn(traceFile); +} + +afterEach(() => { + delete process.env.NEMOCLAW_TRACE_FILE; + delete process.env.NEMOCLAW_TRACE_DIR; + resetTraceForTests(); +}); + +describe("onboard trace artifacts", () => { + it("writes OpenTelemetry-style spans and a slowest-span summary", () => { + withTraceFile((traceFile) => { + withTraceSpan("nemoclaw.onboard.phase.gateway", { provider: "nvidia-prod" }, () => { + addTraceEvent("ready", { attempt: 1 }); + }); + + expect(flushTrace()).toBe(traceFile); + const artifact = JSON.parse(fs.readFileSync(traceFile, "utf8")) as TraceArtifact; + const spans = artifact.resource_spans[0].scope_spans[0].spans; + + expect(artifact.resource_spans[0].scope_spans[0].scope.name).toBe("nemoclaw.onboard"); + expect(spans).toHaveLength(1); + expect(spans[0].trace_id).toBe(artifact.summary.trace_id); + expect(spans[0].span_id).toMatch(/^[0-9a-f]{16}$/); + expect(spans[0].duration_ms).toBeGreaterThanOrEqual(0); + expect(spans[0].events[0]).toMatchObject({ + name: "ready", + attributes: { attempt: 1 }, + }); + expect(artifact.summary.slowest_spans[0].name).toBe("nemoclaw.onboard.phase.gateway"); + }); + }); + + it("redacts secret-like metadata before writing artifacts", () => { + withTraceFile((traceFile) => { + withTraceSpan( + "nemoclaw.inference.validation_probe", + { + api_key: "nvapi-secret-value", + authorization: "Bearer sk-secret-value", + endpoint_url: "https://user:pass@example.test/v1?token=secret", + }, + () => undefined, + ); + + flushTrace(); + const text = fs.readFileSync(traceFile, "utf8"); + + expect(text).not.toContain("nvapi-secret-value"); + expect(text).not.toContain("sk-secret-value"); + expect(text).not.toContain("user:pass"); + expect(text).not.toContain("token=secret"); + expect(text).toContain(""); + }); + }); + + it("sanitizes curl probe URLs and records status metadata", () => { + withTraceFile((traceFile) => { + const result = runCurlProbe( + [ + "-sS", + "-H", + "Authorization: Bearer should-not-appear", + "https://example.test/v1/chat/completions?key=secret", + ], + { + spawnSyncImpl: () => + ({ + status: 0, + stdout: "200", + stderr: "", + error: undefined, + }) as never, + }, + ); + + expect(result.ok).toBe(true); + flushTrace(); + const text = fs.readFileSync(traceFile, "utf8"); + const artifact = JSON.parse(text) as TraceArtifact; + const span = artifact.resource_spans[0].scope_spans[0].spans.find( + (entry) => entry.name === "nemoclaw.inference.curl_probe", + ); + + expect(text).not.toContain("should-not-appear"); + expect(text).not.toContain("key=secret"); + expect(span?.attributes["http.url"]).toBe( + "https://example.test/v1/chat/completions?key=%3CREDACTED%3E", + ); + expect(span?.events[0].attributes).toMatchObject({ ok: true, http_status: 200 }); + }); + }); + + it("redacts nested sensitive attributes", () => { + expect( + sanitizeTraceAttributes({ + nested: { token: "xoxb-secret", ok: true }, + credential_env: "NVIDIA_API_KEY", + }), + ).toMatchObject({ + nested: '{"token":"","ok":true}', + credential_env: "NVIDIA_API_KEY", + }); + }); +}); diff --git a/src/lib/trace.ts b/src/lib/trace.ts new file mode 100644 index 0000000000..3a48437194 --- /dev/null +++ b/src/lib/trace.ts @@ -0,0 +1,300 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import fs from "node:fs"; +import crypto from "node:crypto"; +import os from "node:os"; +import path from "node:path"; +import { redactForLog, redactFull, redactUrl } from "./security/redact"; + +type TraceStatusCode = "OK" | "ERROR" | "UNSET"; +type TraceAttribute = string | number | boolean | null | string[] | number[] | boolean[]; +type TraceAttributes = Record; + +export interface TraceEvent { + name: string; + time_unix_nano: string; + attributes?: TraceAttributes; +} + +export interface TraceSpan { + trace_id: string; + span_id: string; + parent_span_id?: string; + name: string; + kind: "INTERNAL"; + start_time_unix_nano: string; + end_time_unix_nano?: string; + duration_ms?: number; + status: { + code: TraceStatusCode; + message?: string; + }; + attributes: TraceAttributes; + events: TraceEvent[]; +} + +export interface TraceArtifact { + resource_spans: [ + { + resource: { + attributes: TraceAttributes; + }; + scope_spans: [ + { + scope: { + name: string; + version: string; + }; + spans: TraceSpan[]; + }, + ]; + }, + ]; + summary: { + trace_id: string; + generated_at: string; + total_duration_ms: number; + slowest_spans: Array<{ name: string; duration_ms: number; status: TraceStatusCode }>; + output_path: string; + }; +} + +const TRACE_SCOPE_NAME = "nemoclaw.onboard"; +const TRACE_SCOPE_VERSION = "1.0.0"; +const MAX_ATTRIBUTE_STRING_LENGTH = 240; +const SENSITIVE_ATTRIBUTE_KEY = + /(?:api[_-]?key|token|secret|password|authorization|bearer|cookie|set-cookie)/i; + +function nowNs(): bigint { + return BigInt(Date.now()) * 1_000_000n; +} + +function durationMs(startNs: bigint, endNs: bigint): number { + return Number(endNs - startNs) / 1_000_000; +} + +function randomHex(bytes: number): string { + return crypto.randomBytes(bytes).toString("hex"); +} + +function sanitizeAttributeValue(key: string, value: unknown): TraceAttribute | undefined { + if (value === undefined) return undefined; + if (value === null || typeof value === "boolean") return value; + if (typeof value === "number") return Number.isFinite(value) ? value : String(value); + if (typeof value === "bigint") return String(value); + if (typeof value === "string") { + if (SENSITIVE_ATTRIBUTE_KEY.test(key)) return ""; + const urlValue = redactUrl(value); + const redacted = redactFull(urlValue ?? value); + return redacted.slice(0, MAX_ATTRIBUTE_STRING_LENGTH); + } + if (Array.isArray(value)) { + const entries = value + .map((entry) => sanitizeAttributeValue(key, entry)) + .filter((entry): entry is string | number | boolean => { + return typeof entry === "string" || typeof entry === "number" || typeof entry === "boolean"; + }); + if (entries.every((entry) => typeof entry === "string")) return entries as string[]; + if (entries.every((entry) => typeof entry === "number")) return entries as number[]; + if (entries.every((entry) => typeof entry === "boolean")) return entries as boolean[]; + return entries.map((entry) => String(entry)); + } + const redacted = redactForLog(value); + try { + return JSON.stringify(redacted).slice(0, MAX_ATTRIBUTE_STRING_LENGTH); + } catch { + return "[unserializable]"; + } +} + +export function sanitizeTraceAttributes(attributes: Record = {}): TraceAttributes { + const safe: TraceAttributes = {}; + for (const [key, value] of Object.entries(attributes)) { + const sanitized = sanitizeAttributeValue(key, value); + if (sanitized !== undefined) safe[key] = sanitized; + } + return safe; +} + +function resolveTracePath(env: NodeJS.ProcessEnv): string | null { + const traceFile = env.NEMOCLAW_TRACE_FILE?.trim(); + if (traceFile) return path.resolve(traceFile); + const traceDir = env.NEMOCLAW_TRACE_DIR?.trim(); + if (!traceDir) return null; + const stamp = new Date().toISOString().replace(/[:.]/g, "-"); + return path.resolve(traceDir, `nemoclaw-trace-${stamp}-${process.pid}.json`); +} + +export class TraceCollector { + readonly traceId = randomHex(16); + readonly outputPath: string; + private readonly spans: TraceSpan[] = []; + private readonly spanStack: TraceSpan[] = []; + private readonly startNs = nowNs(); + private flushed = false; + + constructor(outputPath: string) { + this.outputPath = outputPath; + } + + startSpan(name: string, attributes: Record = {}): TraceSpan { + const parent = this.spanStack.at(-1); + const span: TraceSpan & { _startNs?: bigint } = { + trace_id: this.traceId, + span_id: randomHex(8), + ...(parent ? { parent_span_id: parent.span_id } : {}), + name, + kind: "INTERNAL", + start_time_unix_nano: nowNs().toString(), + status: { code: "UNSET" }, + attributes: sanitizeTraceAttributes(attributes), + events: [], + }; + span._startNs = BigInt(span.start_time_unix_nano); + this.spans.push(span); + this.spanStack.push(span); + return span; + } + + endSpan(span: TraceSpan, status: TraceStatusCode = "OK", message?: string): void { + const internal = span as TraceSpan & { _startNs?: bigint }; + if (span.end_time_unix_nano) return; + const endNs = nowNs(); + span.end_time_unix_nano = endNs.toString(); + span.duration_ms = durationMs(internal._startNs ?? BigInt(span.start_time_unix_nano), endNs); + span.status = message ? { code: status, message: redactFull(message).slice(0, 200) } : { code: status }; + const index = this.spanStack.lastIndexOf(span); + if (index >= 0) this.spanStack.splice(index, 1); + delete internal._startNs; + } + + addEvent(span: TraceSpan, name: string, attributes: Record = {}): void { + span.events.push({ + name, + time_unix_nano: nowNs().toString(), + attributes: sanitizeTraceAttributes(attributes), + }); + } + + activeSpan(): TraceSpan | null { + return this.spanStack.at(-1) ?? null; + } + + flush(finalStatus: TraceStatusCode = "OK", message?: string): string | null { + if (this.flushed) return this.outputPath; + this.flushed = true; + while (this.spanStack.length > 0) { + const span = this.spanStack.pop(); + if (span) this.endSpan(span, finalStatus, message); + } + const endNs = nowNs(); + const slowest = [...this.spans] + .filter((span) => span.name !== "nemoclaw.onboard") + .filter((span) => typeof span.duration_ms === "number") + .sort((a, b) => (b.duration_ms ?? 0) - (a.duration_ms ?? 0)) + .slice(0, 10) + .map((span) => ({ + name: span.name, + duration_ms: Number((span.duration_ms ?? 0).toFixed(3)), + status: span.status.code, + })); + const artifact: TraceArtifact = { + resource_spans: [ + { + resource: { + attributes: sanitizeTraceAttributes({ + "service.name": "nemoclaw", + "service.version": process.env.npm_package_version || "unknown", + "host.type": os.type(), + "os.platform": process.platform, + "process.pid": process.pid, + "ci": process.env.CI === "true" || process.env.GITHUB_ACTIONS === "true", + }), + }, + scope_spans: [ + { + scope: { + name: TRACE_SCOPE_NAME, + version: TRACE_SCOPE_VERSION, + }, + spans: this.spans, + }, + ], + }, + ], + summary: { + trace_id: this.traceId, + generated_at: new Date().toISOString(), + total_duration_ms: Number(durationMs(this.startNs, endNs).toFixed(3)), + slowest_spans: slowest, + output_path: this.outputPath, + }, + }; + fs.mkdirSync(path.dirname(this.outputPath), { recursive: true, mode: 0o700 }); + fs.writeFileSync(this.outputPath, `${JSON.stringify(artifact, null, 2)}\n`, { mode: 0o600 }); + return this.outputPath; + } +} + +let collector: TraceCollector | null | undefined; + +export function getTraceCollector(): TraceCollector | null { + if (collector !== undefined) return collector; + const tracePath = resolveTracePath(process.env); + collector = tracePath ? new TraceCollector(tracePath) : null; + if (collector) { + process.once("exit", (code) => { + collector?.flush(code === 0 ? "OK" : "ERROR", `process exited with code ${code}`); + }); + } + return collector; +} + +export function resetTraceForTests(): void { + collector = undefined; +} + +export function withTraceSpan( + name: string, + attributes: Record, + fn: () => T, +): T { + const trace = getTraceCollector(); + if (!trace) return fn(); + const span = trace.startSpan(name, attributes); + try { + const result = fn(); + const maybePromise = result as unknown; + if ( + maybePromise !== null && + typeof maybePromise === "object" && + typeof (maybePromise as { then?: unknown }).then === "function" + ) { + return (maybePromise as Promise) + .then((value) => { + trace.endSpan(span, "OK"); + return value; + }) + .catch((error) => { + trace.endSpan(span, "ERROR", error instanceof Error ? error.message : String(error)); + throw error; + }) as T; + } + trace.endSpan(span, "OK"); + return result; + } catch (error) { + trace.endSpan(span, "ERROR", error instanceof Error ? error.message : String(error)); + throw error; + } +} + +export function addTraceEvent(name: string, attributes: Record = {}): void { + const trace = getTraceCollector(); + const span = trace?.activeSpan() ?? null; + if (trace && span) trace.addEvent(span, name, attributes); +} + +export function flushTrace(status: TraceStatusCode = "OK", message?: string): string | null { + return getTraceCollector()?.flush(status, message) ?? null; +} diff --git a/test/e2e/brev-e2e.test.ts b/test/e2e/brev-e2e.test.ts index 278ca69c42..2037ee2c19 100644 --- a/test/e2e/brev-e2e.test.ts +++ b/test/e2e/brev-e2e.test.ts @@ -270,6 +270,7 @@ function sshEnv( `export NEMOCLAW_NON_INTERACTIVE=1`, `export NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE=1`, `export NEMOCLAW_SANDBOX_NAME=e2e-test`, + `export NEMOCLAW_TRACE_DIR=/tmp/nemoclaw-traces`, ]; if (GPU_TEST_SUITE) { // This suite validates Docker GPU passthrough and sandbox inference wiring. diff --git a/trace-demo.json b/trace-demo.json new file mode 100644 index 0000000000..ca0ba2389d --- /dev/null +++ b/trace-demo.json @@ -0,0 +1,132 @@ +{ + "resource_spans": [ + { + "resource": { + "attributes": { + "service.name": "nemoclaw", + "service.version": "unknown", + "host.type": "Darwin", + "os.platform": "darwin", + "process.pid": 42693, + "ci": false + } + }, + "scope_spans": [ + { + "scope": { + "name": "nemoclaw.onboard", + "version": "1.0.0" + }, + "spans": [ + { + "trace_id": "5436e257a15bcb8da6bcde2e1eb823fb", + "span_id": "6b2da256ad014b3a", + "name": "nemoclaw.onboard", + "kind": "INTERNAL", + "start_time_unix_nano": "1779404593899000000", + "status": { + "code": "OK" + }, + "attributes": { + "non_interactive": true, + "agent": "openclaw" + }, + "events": [], + "end_time_unix_nano": "1779404593903000000", + "duration_ms": 4 + }, + { + "trace_id": "5436e257a15bcb8da6bcde2e1eb823fb", + "span_id": "1f01775665b0cc44", + "parent_span_id": "6b2da256ad014b3a", + "name": "nemoclaw.onboard.phase.inference", + "kind": "INTERNAL", + "start_time_unix_nano": "1779404593900000000", + "status": { + "code": "OK" + }, + "attributes": { + "provider": "compatible-endpoint", + "model": "demo-model", + "credential_env": "DEMO_API_KEY" + }, + "events": [], + "end_time_unix_nano": "1779404593903000000", + "duration_ms": 3 + }, + { + "trace_id": "5436e257a15bcb8da6bcde2e1eb823fb", + "span_id": "653c1e1aba4eb46b", + "parent_span_id": "1f01775665b0cc44", + "name": "nemoclaw.inference.validation_probe", + "kind": "INTERNAL", + "start_time_unix_nano": "1779404593901000000", + "status": { + "code": "OK" + }, + "attributes": { + "probe_name": "Chat Completions API", + "api": "openai-completions" + }, + "events": [], + "end_time_unix_nano": "1779404593903000000", + "duration_ms": 2 + }, + { + "trace_id": "5436e257a15bcb8da6bcde2e1eb823fb", + "span_id": "f840c39f5415e23d", + "parent_span_id": "653c1e1aba4eb46b", + "name": "nemoclaw.inference.curl_probe", + "kind": "INTERNAL", + "start_time_unix_nano": "1779404593901000000", + "status": { + "code": "OK" + }, + "attributes": { + "http.url": "https://example.test/v1/chat/completions?key=%3CREDACTED%3E", + "http.request.method": "POST", + "process.timeout_ms": 30000 + }, + "events": [ + { + "name": "curl_result", + "time_unix_nano": "1779404593902000000", + "attributes": { + "ok": true, + "http_status": 200, + "curl_status": 0 + } + } + ], + "end_time_unix_nano": "1779404593903000000", + "duration_ms": 2 + } + ] + } + ] + } + ], + "summary": { + "trace_id": "5436e257a15bcb8da6bcde2e1eb823fb", + "generated_at": "2026-05-21T23:03:13.903Z", + "total_duration_ms": 4, + "slowest_spans": [ + { + "name": "nemoclaw.onboard.phase.inference", + "duration_ms": 3, + "status": "OK" + }, + { + "name": "nemoclaw.inference.validation_probe", + "duration_ms": 2, + "status": "OK" + }, + { + "name": "nemoclaw.inference.curl_probe", + "duration_ms": 2, + "status": "OK" + } + ], + "output_path": "/Users/amata/a/nemoclaw/NemoClaw/trace-demo.json" + } +} From 4857ed9084ba0ad82d55560b561937f854ab4048 Mon Sep 17 00:00:00 2001 From: "Angel A. Mata" Date: Fri, 22 May 2026 11:13:28 -0700 Subject: [PATCH 2/6] perf(onboard): trace sandbox build stages Record sandbox create stream phases and Docker build progress events so slow onboard runs can identify which sandbox build stage dominates latency. --- src/lib/onboard.ts | 1 + src/lib/sandbox/create-stream.test.ts | 58 +++++++++++++++++++++++++++ src/lib/sandbox/create-stream.ts | 48 ++++++++++++++++++++-- 3 files changed, 104 insertions(+), 3 deletions(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 9a51f1aadc..f76d8ee673 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -5557,6 +5557,7 @@ async function createSandbox( return false; }, failureCheck: dockerGpuCreatePatch.createFailureMessage, + traceEvent: trace.addTraceEvent, }), ); diff --git a/src/lib/sandbox/create-stream.test.ts b/src/lib/sandbox/create-stream.test.ts index 96d210de7b..55e22bda63 100644 --- a/src/lib/sandbox/create-stream.test.ts +++ b/src/lib/sandbox/create-stream.test.ts @@ -74,8 +74,10 @@ describe("sandbox-create-stream", () => { it("streams BuildKit progress lines as build output", async () => { const child = new FakeChild(); const logLine = vi.fn(); + const traceEvent = vi.fn(); const promise = streamSandboxCreate("echo create", process.env, { logLine, + traceEvent, spawnImpl: () => child as never, heartbeatIntervalMs: 1_000, silentPhaseMs: 10_000, @@ -95,6 +97,62 @@ describe("sandbox-create-stream", () => { expect(logLine).toHaveBeenCalledWith("#1 [internal] load build definition from Dockerfile"); expect(logLine).toHaveBeenCalledWith("#2 CACHED"); expect(logLine).toHaveBeenCalledWith("#3 DONE 0.1s"); + expect(traceEvent).toHaveBeenCalledWith( + "docker_buildkit_progress", + expect.objectContaining({ + step: 1, + detail: "[internal] load build definition from Dockerfile", + }), + ); + expect(traceEvent).toHaveBeenCalledWith( + "docker_buildkit_progress", + expect.objectContaining({ step: 2, detail: "CACHED" }), + ); + expect(traceEvent).toHaveBeenCalledWith( + "docker_buildkit_progress", + expect.objectContaining({ step: 3, detail: "DONE 0.1s" }), + ); + }); + + it("records classic Docker build steps as trace events", async () => { + const child = new FakeChild(); + const traceEvent = vi.fn(); + const promise = streamSandboxCreate("echo create", process.env, { + traceEvent, + logLine: vi.fn(), + spawnImpl: () => child as never, + heartbeatIntervalMs: 1_000, + silentPhaseMs: 10_000, + }); + + child.stdout.emit( + "data", + Buffer.from( + " Step 1/3 : FROM base\n" + + " Step 2/3 : RUN npm ci\n" + + " Step 3/3 : COPY . /workspace\n" + + "Successfully built abc123\n", + ), + ); + child.emit("close", 0); + + await expect(promise).resolves.toMatchObject({ status: 0, sawProgress: true }); + expect(traceEvent).toHaveBeenCalledWith( + "sandbox_create_phase", + expect.objectContaining({ phase: "build" }), + ); + expect(traceEvent).toHaveBeenCalledWith( + "docker_build_step_start", + expect.objectContaining({ step: "Step 1/3", index: 1, total: 3, instruction: "FROM base" }), + ); + expect(traceEvent).toHaveBeenCalledWith( + "docker_build_step_end", + expect.objectContaining({ status: "completed", step: "Step 1/3", instruction: "FROM base" }), + ); + expect(traceEvent).toHaveBeenCalledWith( + "docker_build_end", + expect.objectContaining({ status: "completed" }), + ); }); it("forces success when the sandbox becomes ready before the stream exits", async () => { diff --git a/src/lib/sandbox/create-stream.ts b/src/lib/sandbox/create-stream.ts index f489b033cb..1e93cf74dd 100644 --- a/src/lib/sandbox/create-stream.ts +++ b/src/lib/sandbox/create-stream.ts @@ -1,7 +1,7 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -import { spawn, type SpawnOptions } from "node:child_process"; +import { type SpawnOptions, spawn } from "node:child_process"; import { ROOT } from "../state/paths"; @@ -19,6 +19,7 @@ export interface StreamSandboxCreateOptions { heartbeatIntervalMs?: number; silentPhaseMs?: number; logLine?: (line: string) => void; + traceEvent?: (name: string, attributes?: Record) => void; // Optional guard for the early-ready escape hatch. When set, readyCheck() // alone cannot detach the create stream until at least one streamed output // line matches a configured pattern. @@ -97,6 +98,7 @@ const VISIBLE_PROGRESS_PATTERNS: readonly RegExp[] = [ const VM_READY_DETACH_OUTPUT_PATTERNS: readonly RegExp[] = [/Setting up NemoClaw/]; const CLASSIC_DOCKER_STEP_RE = /^\s*Step (\d+)\/(\d+) : (.+)$/; +const BUILDKIT_STEP_RE = /^#(\d+)\s+(.+)$/; function matchesAny(line: string, patterns: readonly RegExp[]) { return patterns.some((pattern) => pattern.test(line)); @@ -133,6 +135,7 @@ export function streamSandboxCreate( }); const logLine = options.logLine ?? console.log; + const traceEvent = options.traceEvent ?? (() => {}); const lines: string[] = []; let pending = ""; let lastPrintedLine = ""; @@ -198,6 +201,10 @@ export function streamSandboxCreate( printProgressLine(line); } + function emitTraceEvent(name: string, attributes: Record = {}) { + traceEvent(name, attributes); + } + function markBuildStarted(nowMs: number = timingNow()) { if (buildStartedAtMs === null) { buildStartedAtMs = nowMs; @@ -206,11 +213,18 @@ export function streamSandboxCreate( function finishActiveBuildStep(status: "completed" | "stopped", nowMs: number = timingNow()) { if (!activeBuildStep) return; + const elapsedMs = nowMs - activeBuildStep.startedAtMs; const phrase = status === "completed" ? "completed in" : "stopped after"; - const elapsed = formatDuration(nowMs - activeBuildStep.startedAtMs); + const elapsed = formatDuration(elapsedMs); appendTimingLine( ` ${activeBuildStep.label} ${phrase} ${elapsed} (${activeBuildStep.instruction})`, ); + emitTraceEvent("docker_build_step_end", { + status, + step: activeBuildStep.label, + instruction: activeBuildStep.instruction, + duration_ms: elapsedMs, + }); activeBuildStep = null; } @@ -218,10 +232,15 @@ export function streamSandboxCreate( if (buildTimingFinished) return; finishActiveBuildStep(status, nowMs); if (buildStartedAtMs !== null) { + const elapsedMs = nowMs - buildStartedAtMs; const phrase = status === "completed" ? "completed in" : "stopped after"; appendTimingLine( - ` Sandbox image build ${phrase} ${formatDuration(nowMs - buildStartedAtMs)}`, + ` Sandbox image build ${phrase} ${formatDuration(elapsedMs)}`, ); + emitTraceEvent("docker_build_end", { + status, + duration_ms: elapsedMs, + }); } buildTimingFinished = true; } @@ -237,6 +256,24 @@ export function streamSandboxCreate( instruction: match[3].trim().replace(/\s+/g, " "), startedAtMs: nowMs, }; + emitTraceEvent("docker_build_step_start", { + step: activeBuildStep.label, + index: Number(match[1]), + total: Number(match[2]), + instruction: activeBuildStep.instruction, + }); + } + + function maybeRecordBuildKitStep(line: string) { + const match = line.match(BUILDKIT_STEP_RE); + if (!match) return; + if (!matchesAny(line, BUILD_PROGRESS_PATTERNS) && !matchesAny(line, PULL_PROGRESS_PATTERNS)) { + return; + } + emitTraceEvent("docker_buildkit_progress", { + step: Number(match[1]), + detail: match[2].trim().replace(/\s+/g, " "), + }); } function elapsedSeconds() { @@ -260,6 +297,10 @@ export function streamSandboxCreate( : nextPhase === "ready" ? " Waiting for sandbox to become ready..." : null; + emitTraceEvent("sandbox_create_phase", { + phase: nextPhase, + elapsed_seconds: elapsedSeconds(), + }); if (phaseLine) printProgressLine(phaseLine); } @@ -275,6 +316,7 @@ export function streamSandboxCreate( markBuildStarted(); } maybeStartClassicBuildStep(line); + maybeRecordBuildKitStep(line); if (/^(?:Successfully built | {2}Built image )/.test(line)) { finishBuildTiming("completed"); } From 7dbed7aaf56a98e886cc41558564a1cf26d6b14f Mon Sep 17 00:00:00 2001 From: "Angel A. Mata" Date: Fri, 22 May 2026 14:18:34 -0700 Subject: [PATCH 3/6] perf(onboard): add trace enable switch Allow NEMOCLAW_TRACE=1 to create a timestamped trace artifact automatically while preserving explicit file and directory overrides. --- src/lib/onboard.ts | 1 + src/lib/trace.test.ts | 42 ++++++++++++++++++++++++++++++++++++++---- src/lib/trace.ts | 29 +++++++++++++++++++++++------ 3 files changed, 62 insertions(+), 10 deletions(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index f76d8ee673..b6666c1b86 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -9051,6 +9051,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { fresh: opts.fresh === true, non_interactive: opts.nonInteractive === true || process.env.NEMOCLAW_NON_INTERACTIVE === "1", agent: opts.agent || process.env.NEMOCLAW_AGENT || null, + trace_enabled: Boolean(process.env.NEMOCLAW_TRACE), trace_file_enabled: Boolean(process.env.NEMOCLAW_TRACE_FILE), trace_dir_enabled: Boolean(process.env.NEMOCLAW_TRACE_DIR), }); diff --git a/src/lib/trace.test.ts b/src/lib/trace.test.ts index 583d6ee778..4d854d88d6 100644 --- a/src/lib/trace.test.ts +++ b/src/lib/trace.test.ts @@ -9,23 +9,28 @@ import { runCurlProbe } from "./adapters/http/probe"; import { addTraceEvent, flushTrace, + getTraceCollector, resetTraceForTests, sanitizeTraceAttributes, - withTraceSpan, + TRACE_DIR_ENV, + TRACE_ENABLED_ENV, + TRACE_FILE_ENV, type TraceArtifact, + withTraceSpan, } from "./trace"; function withTraceFile(fn: (traceFile: string) => T): T { const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-trace-test-")); const traceFile = path.join(tmpDir, "trace.json"); - process.env.NEMOCLAW_TRACE_FILE = traceFile; + process.env[TRACE_FILE_ENV] = traceFile; resetTraceForTests(); return fn(traceFile); } afterEach(() => { - delete process.env.NEMOCLAW_TRACE_FILE; - delete process.env.NEMOCLAW_TRACE_DIR; + delete process.env[TRACE_ENABLED_ENV]; + delete process.env[TRACE_FILE_ENV]; + delete process.env[TRACE_DIR_ENV]; resetTraceForTests(); }); @@ -124,4 +129,33 @@ describe("onboard trace artifacts", () => { credential_env: "NVIDIA_API_KEY", }); }); + + it("creates a readable timestamped trace file when NEMOCLAW_TRACE is enabled", () => { + const originalCwd = process.cwd(); + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-trace-enabled-")); + try { + process.chdir(tmpDir); + process.env[TRACE_ENABLED_ENV] = "1"; + resetTraceForTests(); + + const collector = getTraceCollector(); + expect(collector?.outputPath).toMatch( + /[/\\]\.e2e[/\\]traces[/\\]nemoclaw-trace-\d{4}-\d{2}-\d{2}-\d{2}-\d{2}-\d{2}-\d{3}Z-pid-\d+\.json$/, + ); + + withTraceSpan("nemoclaw.onboard.phase.preflight", {}, () => undefined); + const outputPath = flushTrace(); + expect(outputPath).toBe(collector?.outputPath); + expect(fs.existsSync(String(outputPath))).toBe(true); + } finally { + process.chdir(originalCwd); + } + }); + + it("treats false-like NEMOCLAW_TRACE values as disabled", () => { + process.env[TRACE_ENABLED_ENV] = "false"; + resetTraceForTests(); + + expect(getTraceCollector()).toBeNull(); + }); }); diff --git a/src/lib/trace.ts b/src/lib/trace.ts index 3a48437194..0209856996 100644 --- a/src/lib/trace.ts +++ b/src/lib/trace.ts @@ -1,8 +1,8 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -import fs from "node:fs"; import crypto from "node:crypto"; +import fs from "node:fs"; import os from "node:os"; import path from "node:path"; import { redactForLog, redactFull, redactUrl } from "./security/redact"; @@ -62,6 +62,10 @@ export interface TraceArtifact { const TRACE_SCOPE_NAME = "nemoclaw.onboard"; const TRACE_SCOPE_VERSION = "1.0.0"; +export const TRACE_ENABLED_ENV = "NEMOCLAW_TRACE"; +export const TRACE_FILE_ENV = "NEMOCLAW_TRACE_FILE"; +export const TRACE_DIR_ENV = "NEMOCLAW_TRACE_DIR"; +const DEFAULT_TRACE_DIR = path.join(".e2e", "traces"); const MAX_ATTRIBUTE_STRING_LENGTH = 240; const SENSITIVE_ATTRIBUTE_KEY = /(?:api[_-]?key|token|secret|password|authorization|bearer|cookie|set-cookie)/i; @@ -117,13 +121,26 @@ export function sanitizeTraceAttributes(attributes: Record = {} return safe; } +function isTraceFlagEnabled(value: string | undefined): boolean { + const normalized = value?.trim().toLowerCase(); + if (!normalized) return false; + return !["0", "false", "no", "off"].includes(normalized); +} + +function traceFileName(): string { + const stamp = new Date().toISOString().replace("T", "-").replace(/[:.]/g, "-"); + return `nemoclaw-trace-${stamp}-pid-${process.pid}.json`; +} + function resolveTracePath(env: NodeJS.ProcessEnv): string | null { - const traceFile = env.NEMOCLAW_TRACE_FILE?.trim(); + const traceFile = env[TRACE_FILE_ENV]?.trim(); if (traceFile) return path.resolve(traceFile); - const traceDir = env.NEMOCLAW_TRACE_DIR?.trim(); - if (!traceDir) return null; - const stamp = new Date().toISOString().replace(/[:.]/g, "-"); - return path.resolve(traceDir, `nemoclaw-trace-${stamp}-${process.pid}.json`); + const traceDir = env[TRACE_DIR_ENV]?.trim(); + if (traceDir) return path.resolve(traceDir, traceFileName()); + if (isTraceFlagEnabled(env[TRACE_ENABLED_ENV])) { + return path.resolve(DEFAULT_TRACE_DIR, traceFileName()); + } + return null; } export class TraceCollector { From d087a3afd7a0642af3eb863c3c3e78b5af36b950 Mon Sep 17 00:00:00 2001 From: "Angel A. Mata" Date: Fri, 22 May 2026 17:15:41 -0700 Subject: [PATCH 4/6] fix(onboard): address tracing review feedback --- .../references/commands.md | 15 + .github/workflows/regression-e2e.yaml | 2 +- docs/reference/commands.mdx | 15 + src/lib/adapters/http/probe.test.ts | 59 +++- src/lib/adapters/http/probe.ts | 46 ++- src/lib/onboard.ts | 310 +++++------------- src/lib/onboard/machine/handlers/gateway.ts | 9 +- src/lib/onboard/machine/handlers/preflight.ts | 3 +- src/lib/onboard/policy-selection.ts | 11 + src/lib/onboard/sandbox-readiness-tracing.ts | 133 ++++++++ src/lib/onboard/tracing.ts | 173 ++++++++++ src/lib/trace.test.ts | 22 ++ src/lib/trace.ts | 12 +- trace-demo.json | 132 -------- 14 files changed, 569 insertions(+), 373 deletions(-) create mode 100644 src/lib/onboard/sandbox-readiness-tracing.ts create mode 100644 src/lib/onboard/tracing.ts delete mode 100644 trace-demo.json diff --git a/.agents/skills/nemoclaw-user-reference/references/commands.md b/.agents/skills/nemoclaw-user-reference/references/commands.md index 74d1d3944b..3a0975da59 100644 --- a/.agents/skills/nemoclaw-user-reference/references/commands.md +++ b/.agents/skills/nemoclaw-user-reference/references/commands.md @@ -1257,6 +1257,21 @@ These flags toggle optional behaviors during onboarding; set them before running | `NEMOCLAW_OPENSHELL_SANDBOX_BIN` | path | Advanced override for the `openshell-sandbox` binary passed to the Linux Docker-driver gateway supervisor. Defaults to the binary next to `openshell`, then common install paths. | | `NEMOCLAW_OPENSHELL_GATEWAY_STATE_DIR` | path | Advanced override for the Linux Docker-driver gateway pid file and SQLite state directory. Defaults to `~/.local/state/nemoclaw/openshell-docker-gateway`. | +### Onboard Profiling Traces + +Set `NEMOCLAW_TRACE=1` before `nemoclaw onboard` to write an OpenTelemetry-style JSON trace for the run. +When no explicit path is provided, NemoClaw writes a timestamped file under `.e2e/traces/` in the current working directory. +Use `NEMOCLAW_TRACE_DIR` to choose the output directory, or `NEMOCLAW_TRACE_FILE` to choose the exact output file. + +```console +$ NEMOCLAW_TRACE=1 nemoclaw onboard +$ NEMOCLAW_TRACE_DIR=/tmp/nemoclaw-traces nemoclaw onboard +$ NEMOCLAW_TRACE_FILE=/tmp/nemoclaw-onboard-trace.json nemoclaw onboard +``` + +Trace artifacts include onboard phase timing, sandbox and dashboard readiness waits, policy application, inference validation probes, curl probe results, and sandbox build progress events. +Secret-like metadata such as API keys, bearer tokens, cookies, and credentials is redacted before the file is written. + ### Probe Timeouts These tune how long internal probes wait before giving up. diff --git a/.github/workflows/regression-e2e.yaml b/.github/workflows/regression-e2e.yaml index 3a18ae879a..8a6574e155 100644 --- a/.github/workflows/regression-e2e.yaml +++ b/.github/workflows/regression-e2e.yaml @@ -242,7 +242,7 @@ jobs: - name: Upload onboard profiling traces if: always() - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: onboard-inference-smoke-traces path: /tmp/nemoclaw-traces/ diff --git a/docs/reference/commands.mdx b/docs/reference/commands.mdx index 58b0846d9e..42850e1706 100644 --- a/docs/reference/commands.mdx +++ b/docs/reference/commands.mdx @@ -1264,6 +1264,21 @@ These flags toggle optional behaviors during onboarding; set them before running | `NEMOCLAW_OPENSHELL_SANDBOX_BIN` | path | Advanced override for the `openshell-sandbox` binary passed to the Linux Docker-driver gateway supervisor. Defaults to the binary next to `openshell`, then common install paths. | | `NEMOCLAW_OPENSHELL_GATEWAY_STATE_DIR` | path | Advanced override for the Linux Docker-driver gateway pid file and SQLite state directory. Defaults to `~/.local/state/nemoclaw/openshell-docker-gateway`. | +### Onboard Profiling Traces + +Set `NEMOCLAW_TRACE=1` before `nemoclaw onboard` to write an OpenTelemetry-style JSON trace for the run. +When no explicit path is provided, NemoClaw writes a timestamped file under `.e2e/traces/` in the current working directory. +Use `NEMOCLAW_TRACE_DIR` to choose the output directory, or `NEMOCLAW_TRACE_FILE` to choose the exact output file. + +```console +$ NEMOCLAW_TRACE=1 nemoclaw onboard +$ NEMOCLAW_TRACE_DIR=/tmp/nemoclaw-traces nemoclaw onboard +$ NEMOCLAW_TRACE_FILE=/tmp/nemoclaw-onboard-trace.json nemoclaw onboard +``` + +Trace artifacts include onboard phase timing, sandbox and dashboard readiness waits, policy application, inference validation probes, curl probe results, and sandbox build progress events. +Secret-like metadata such as API keys, bearer tokens, cookies, and credentials is redacted before the file is written. + ### Probe Timeouts These tune how long internal probes wait before giving up. diff --git a/src/lib/adapters/http/probe.test.ts b/src/lib/adapters/http/probe.test.ts index 0669c8181e..ad07ed6f55 100644 --- a/src/lib/adapters/http/probe.test.ts +++ b/src/lib/adapters/http/probe.test.ts @@ -2,9 +2,11 @@ // SPDX-License-Identifier: Apache-2.0 import fs from "node:fs"; +import os from "node:os"; import path from "node:path"; -import { describe, expect, it } from "vitest"; +import { afterEach, describe, expect, it } from "vitest"; +import { flushTrace, resetTraceForTests, TRACE_FILE_ENV, type TraceArtifact } from "../../trace"; import { getCurlTimingArgs, runChatCompletionsStreamingProbe, @@ -15,6 +17,19 @@ import { summarizeProbeFailure, } from "./probe"; +function withTraceFile(fn: (traceFile: string) => T): T { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-probe-trace-test-")); + const traceFile = path.join(tmpDir, "trace.json"); + process.env[TRACE_FILE_ENV] = traceFile; + resetTraceForTests(); + return fn(traceFile); +} + +afterEach(() => { + delete process.env[TRACE_FILE_ENV]; + resetTraceForTests(); +}); + describe("http-probe helpers", () => { it("returns explicit curl timeouts", () => { expect(getCurlTimingArgs()).toEqual(["--connect-timeout", "10", "--max-time", "60"]); @@ -157,6 +172,27 @@ describe("runChatCompletionsStreamingProbe", () => { expect(result.ok).toBe(false); expect(result.message).toContain("did not return SSE data"); }); + + it("records curl_result metadata for chat streaming probes", () => { + withTraceFile((traceFile) => { + const result = runChatCompletionsStreamingProbe( + ["-sS", "--max-time", "120", "https://example.test/v1/chat/completions"], + { spawnSyncImpl: mockStreaming("", 28, "200") }, + ); + + expect(result.ok).toBe(false); + flushTrace(); + const artifact = JSON.parse(fs.readFileSync(traceFile, "utf8")) as TraceArtifact; + const span = artifact.resource_spans[0].scope_spans[0].spans.find( + (entry) => entry.name === "nemoclaw.inference.curl_streaming_probe", + ); + expect(span?.events[0].attributes).toMatchObject({ + ok: false, + http_status: 200, + curl_status: 28, + }); + }); + }); }); describe("runStreamingEventProbe", () => { @@ -307,4 +343,25 @@ describe("runStreamingEventProbe", () => { expect(fs.existsSync(outputPath)).toBe(false); expect(fs.existsSync(path.dirname(outputPath))).toBe(false); }); + + it("records curl_result metadata for responses streaming probes", () => { + withTraceFile((traceFile) => { + const result = runStreamingEventProbe( + ["-sS", "--max-time", "15", "https://example.test/v1/responses"], + { spawnSyncImpl: mockStreaming("event: response.created\ndata: {}\n") }, + ); + + expect(result.ok).toBe(false); + flushTrace(); + const artifact = JSON.parse(fs.readFileSync(traceFile, "utf8")) as TraceArtifact; + const span = artifact.resource_spans[0].scope_spans[0].spans.find( + (entry) => entry.name === "nemoclaw.inference.curl_streaming_event_probe", + ); + expect(span?.events[0].attributes).toMatchObject({ + ok: false, + missing_events_count: 1, + curl_status: 0, + }); + }); + }); }); diff --git a/src/lib/adapters/http/probe.ts b/src/lib/adapters/http/probe.ts index c92a32fde6..e787e0dcad 100644 --- a/src/lib/adapters/http/probe.ts +++ b/src/lib/adapters/http/probe.ts @@ -96,6 +96,10 @@ function getCurlProbeTraceAttributes(argv: string[], opts: CurlProbeOptions): Re }; } +function emitCurlResultTraceEvent(attributes: Record): void { + addTraceEvent("curl_result", attributes); +} + export function summarizeCurlFailure(curlStatus = 0, stderr = "", body = ""): string { const detail = compactText(stderr || body); return detail @@ -196,7 +200,7 @@ function runCurlProbeImpl(argv: string[], opts: CurlProbeOptions = {}): CurlProb stderr: errorMessage, message: summarizeProbeFailure(body, 0, errorCode, errorMessage), }; - addTraceEvent("curl_result", { ok: false, http_status: 0, curl_status: errorCode }); + emitCurlResultTraceEvent({ ok: false, http_status: 0, curl_status: errorCode }); return failure; } const status = Number(String(result.stdout || "").trim()); @@ -213,7 +217,7 @@ function runCurlProbeImpl(argv: string[], opts: CurlProbeOptions = {}): CurlProb String(result.stderr || ""), ), }; - addTraceEvent("curl_result", { + emitCurlResultTraceEvent({ ok: probeResult.ok, http_status: probeResult.httpStatus, curl_status: probeResult.curlStatus, @@ -233,7 +237,7 @@ function runCurlProbeImpl(argv: string[], opts: CurlProbeOptions = {}): CurlProb detail, ), }; - addTraceEvent("curl_result", { ok: false, http_status: 0, curl_status: probeResult.curlStatus }); + emitCurlResultTraceEvent({ ok: false, http_status: 0, curl_status: probeResult.curlStatus }); return probeResult; } finally { cleanupTempDir(bodyFile, "nemoclaw-curl-probe"); @@ -302,6 +306,7 @@ function runChatCompletionsStreamingProbeImpl( const errorMessage = compactText( `${result.error.message || String(result.error)} ${String(result.stderr || "")}`, ); + emitCurlResultTraceEvent({ ok: false, http_status: 0, curl_status: errorCode }); return { ok: false, httpStatus: 0, @@ -317,6 +322,7 @@ function runChatCompletionsStreamingProbeImpl( const hasStreamingData = hasChatCompletionsStreamingData(body); const httpOk = Number.isFinite(status) && status >= 200 && status < 300; if (httpOk && hasStreamingData && (curlStatus === 0 || curlStatus === 28)) { + emitCurlResultTraceEvent({ ok: true, http_status: status, curl_status: curlStatus }); return { ok: true, httpStatus: status, @@ -331,6 +337,11 @@ function runChatCompletionsStreamingProbeImpl( httpOk && !hasStreamingData ? `HTTP ${status}: chat completions stream did not return SSE data` : summarizeProbeFailure(body, status || 0, curlStatus, String(result.stderr || "")); + emitCurlResultTraceEvent({ + ok: false, + http_status: Number.isFinite(status) ? status : 0, + curl_status: curlStatus, + }); return { ok: false, httpStatus: Number.isFinite(status) ? status : 0, @@ -341,17 +352,16 @@ function runChatCompletionsStreamingProbeImpl( }; } catch (error) { const detail = error instanceof Error ? error.message : String(error); + const curlStatus = + typeof error === "object" && error && "status" in error ? Number(error.status) || 1 : 1; + emitCurlResultTraceEvent({ ok: false, http_status: 0, curl_status: curlStatus }); return { ok: false, httpStatus: 0, - curlStatus: - typeof error === "object" && error && "status" in error ? Number(error.status) || 1 : 1, + curlStatus, body: "", stderr: detail, - message: summarizeCurlFailure( - typeof error === "object" && error && "status" in error ? Number(error.status) || 1 : 1, - detail, - ), + message: summarizeCurlFailure(curlStatus, detail), }; } finally { cleanupTempDir(bodyFile, "nemoclaw-chat-streaming-probe"); @@ -413,6 +423,11 @@ function runStreamingEventProbeImpl( const detail = result.error ? String(result.error.message || result.error) : String(result.stderr || ""); + emitCurlResultTraceEvent({ + ok: false, + missing_events_count: REQUIRED_STREAMING_EVENTS.length, + curl_status: result.status ?? 1, + }); return { ok: false, missingEvents: REQUIRED_STREAMING_EVENTS, @@ -432,6 +447,11 @@ function runStreamingEventProbeImpl( const missing = REQUIRED_STREAMING_EVENTS.filter((e) => !eventTypes.has(e)); if (missing.length > 0) { + emitCurlResultTraceEvent({ + ok: false, + missing_events_count: missing.length, + curl_status: result.status ?? 0, + }); return { ok: false, missingEvents: missing, @@ -441,9 +461,17 @@ function runStreamingEventProbeImpl( }; } + emitCurlResultTraceEvent({ ok: true, missing_events_count: 0, curl_status: result.status ?? 0 }); return { ok: true, missingEvents: [], message: "" }; } catch (error) { const detail = error instanceof Error ? error.message : String(error); + const curlStatus = + typeof error === "object" && error && "status" in error ? Number(error.status) || 1 : 1; + emitCurlResultTraceEvent({ + ok: false, + missing_events_count: REQUIRED_STREAMING_EVENTS.length, + curl_status: curlStatus, + }); return { ok: false, missingEvents: REQUIRED_STREAMING_EVENTS, diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index cf3c8bbfa4..e0ed441b64 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -78,7 +78,8 @@ const { }: typeof import("./onboard/gateway-gpu-passthrough") = require("./onboard/gateway-gpu-passthrough"); const { syncPresetSelection }: typeof import("./onboard/policy-preset-sync") = require("./onboard/policy-preset-sync"); const { maybeForceE2eStepFailure }: typeof import("./onboard/e2e-failure-injection") = require("./onboard/e2e-failure-injection"); -const trace: typeof import("./trace") = require("./trace"); +const onboardTracing: typeof import("./onboard/tracing") = require("./onboard/tracing"); +const sandboxReadinessTracing: typeof import("./onboard/sandbox-readiness-tracing") = require("./onboard/sandbox-readiness-tracing"); const { gatherWechatConfig, hasWechatConfigDrift, @@ -1694,18 +1695,13 @@ function upsertProvider( baseUrl: string | null, env: NodeJS.ProcessEnv = {}, ) { - const result = trace.withTraceSpan( - "nemoclaw.policy.provider_upsert", - { provider: name, provider_type: type, credential_env: credentialEnv, base_url: baseUrl }, - () => - onboardProviders.upsertProvider( - name, - type, - credentialEnv, - baseUrl, - env, - runOpenshell, - ), + const result = onboardProviders.upsertProvider( + name, + type, + credentialEnv, + baseUrl, + env, + runOpenshell, ); if (result.ok && credentialEnv) { const stagedValue = stagedLegacyValues.get(credentialEnv); @@ -1768,11 +1764,7 @@ function verifyDirectSandboxGpu(sandboxName: string): void { } function upsertMessagingProviders(tokenDefs: MessagingTokenDef[]) { - const upserted = trace.withTraceSpan( - "nemoclaw.policy.messaging_provider_upsert", - { provider_count: tokenDefs.length, providers: tokenDefs.map(({ name }) => name) }, - () => onboardProviders.upsertMessagingProviders(tokenDefs, runOpenshell), - ); + const upserted = onboardProviders.upsertMessagingProviders(tokenDefs, runOpenshell); // upsertMessagingProviders process.exits on failure, so reaching this // point means every entry in tokenDefs that had a token was registered. // Mark migrated only when the registered token equals the staged legacy @@ -2145,11 +2137,7 @@ async function validateOpenAiLikeSelection( } = {}, ): Promise { const apiKey = credentialEnv ? getCredential(credentialEnv) : ""; - const probe = trace.withTraceSpan( - "nemoclaw.inference.validation.openai_like", - { provider_label: label, model, credential_env: credentialEnv, endpoint_url: endpointUrl }, - () => probeOpenAiLikeEndpoint(endpointUrl, model, apiKey, options), - ); + const probe = probeOpenAiLikeEndpoint(endpointUrl, model, apiKey, options); if (!probe.ok) { console.error(` ${label} endpoint validation failed.`); console.error(` ${probe.message}`); @@ -2185,11 +2173,7 @@ async function validateAnthropicSelectionWithRetryMessage( helpUrl: string | null = null, ): Promise { const apiKey = getCredential(credentialEnv); - const probe = trace.withTraceSpan( - "nemoclaw.inference.validation.anthropic", - { provider_label: label, model, credential_env: credentialEnv, endpoint_url: endpointUrl }, - () => probeAnthropicEndpoint(endpointUrl, model, apiKey), - ); + const probe = probeAnthropicEndpoint(endpointUrl, model, apiKey); if (!probe.ok) { console.error(` ${label} endpoint validation failed.`); console.error(` ${probe.message}`); @@ -2220,16 +2204,11 @@ async function validateCustomOpenAiLikeSelection( helpUrl: string | null = null, ): Promise { const apiKey = getCredential(credentialEnv); - const probe = trace.withTraceSpan( - "nemoclaw.inference.validation.custom_openai_like", - { provider_label: label, model, credential_env: credentialEnv, endpoint_url: endpointUrl }, - () => - probeOpenAiLikeEndpoint(endpointUrl, model, apiKey, { - requireResponsesToolCalling: true, - skipResponsesProbe: shouldForceCompletionsApi(process.env.NEMOCLAW_PREFERRED_API), - probeStreaming: true, - }), - ); + const probe = probeOpenAiLikeEndpoint(endpointUrl, model, apiKey, { + requireResponsesToolCalling: true, + skipResponsesProbe: shouldForceCompletionsApi(process.env.NEMOCLAW_PREFERRED_API), + probeStreaming: true, + }); if (probe.ok) { if (probe.note) { console.log(` ℹ ${probe.note}`); @@ -2264,11 +2243,7 @@ async function validateCustomAnthropicSelection( helpUrl: string | null = null, ): Promise { const apiKey = getCredential(credentialEnv); - const probe = trace.withTraceSpan( - "nemoclaw.inference.validation.custom_anthropic", - { provider_label: label, model, credential_env: credentialEnv, endpoint_url: endpointUrl }, - () => probeAnthropicEndpoint(endpointUrl, model, apiKey), - ); + const probe = probeAnthropicEndpoint(endpointUrl, model, apiKey); if (probe.ok) { console.log(` ${probe.label} available — ${agentProductName()} will use ${probe.api}.`); return { ok: true, api: probe.api }; @@ -3227,49 +3202,15 @@ async function ensureNamedCredential( } function waitForSandboxReady(sandboxName: string, attempts = 10, delaySeconds = 2): boolean { - return trace.withTraceSpan( - "nemoclaw.sandbox.readiness_wait", - { sandbox_name: sandboxName, attempts, delay_seconds: delaySeconds }, - () => { - for (let i = 0; i < attempts; i += 1) { - const list = runCaptureOpenshell(["sandbox", "list"], { ignoreError: true }); - if (isSandboxReady(list, sandboxName)) { - trace.addTraceEvent("ready", { attempt: i + 1, source: "sandbox_list" }); - return true; - } - - // Package-managed OpenShell gateways report readiness through - // `sandbox list`; legacy Kubernetes gateways may still expose pod state. - if (isLinuxDockerDriverGatewayEnabled()) { - if (i < attempts - 1) sleep(delaySeconds); - continue; - } - const podPhase = runCaptureOpenshell( - [ - "doctor", - "exec", - "--", - "kubectl", - "-n", - "openshell", - "get", - "pod", - sandboxName, - "-o", - "jsonpath={.status.phase}", - ], - { ignoreError: true }, - ); - if (podPhase === "Running") { - trace.addTraceEvent("ready", { attempt: i + 1, source: "pod_phase" }); - return true; - } - sleep(delaySeconds); - } - trace.addTraceEvent("not_ready", { attempts }); - return false; - }, - ); + return sandboxReadinessTracing.waitForSandboxReadyWithTrace({ + sandboxName, + attempts, + delaySeconds, + runCaptureOpenshell, + isSandboxReady, + isLinuxDockerDriverGatewayEnabled, + sleep, + }); } // parsePolicyPresetEnv — see urlUtils import above @@ -5541,28 +5482,16 @@ async function createSandbox( timeoutSecs: sandboxReadyTimeoutSecs, deps: { runOpenshell, runCaptureOpenshell, sleep }, }); - const createResult = await trace.withTraceSpan( - "nemoclaw.sandbox.create_stream", - { - sandbox_name: sandboxName, - provider, - model, - timeout_seconds: sandboxReadyTimeoutSecs, - from_dockerfile: Boolean(fromDockerfile), - gpu_enabled: effectiveSandboxGpuConfig.sandboxGpuEnabled, + const createResult = await streamSandboxCreate(createCommand, sandboxEnv, { + readyCheck: () => { + const list = runCaptureOpenshell(["sandbox", "list"], { ignoreError: true }); + if (isSandboxReady(list, sandboxName)) return true; + dockerGpuCreatePatch.maybeApplyDuringCreate(); + return false; }, - () => - streamSandboxCreate(createCommand, sandboxEnv, { - readyCheck: () => { - const list = runCaptureOpenshell(["sandbox", "list"], { ignoreError: true }); - if (isSandboxReady(list, sandboxName)) return true; - dockerGpuCreatePatch.maybeApplyDuringCreate(); - return false; - }, - failureCheck: dockerGpuCreatePatch.createFailureMessage, - traceEvent: trace.addTraceEvent, - }), - ); + failureCheck: dockerGpuCreatePatch.createFailureMessage, + traceEvent: onboardTracing.addTraceEvent, + }); if (initialSandboxPolicy.cleanup && initialSandboxPolicy.cleanup()) { process.removeListener("exit", initialSandboxPolicy.cleanup); @@ -5611,23 +5540,13 @@ async function createSandbox( // without this gate, NemoClaw registers a phantom sandbox that // causes "sandbox not found" on every subsequent connect/status call. console.log(" Waiting for sandbox to become ready..."); - const ready = trace.withTraceSpan( - "nemoclaw.sandbox.readiness_wait", - { sandbox_name: sandboxName, timeout_seconds: sandboxReadyTimeoutSecs }, - () => { - const readyAttempts = Math.max(1, Math.ceil(sandboxReadyTimeoutSecs / 2)); - for (let i = 0; i < readyAttempts; i++) { - const list = runCaptureOpenshell(["sandbox", "list"], { ignoreError: true }); - if (isSandboxReady(list, sandboxName)) { - trace.addTraceEvent("ready", { attempt: i + 1 }); - return true; - } - if (i < readyAttempts - 1) sleep(2); - } - trace.addTraceEvent("not_ready", { attempts: readyAttempts }); - return false; - }, - ); + const ready = sandboxReadinessTracing.waitForCreatedSandboxReadyWithTrace({ + sandboxName, + timeoutSecs: sandboxReadyTimeoutSecs, + runCaptureOpenshell, + isSandboxReady, + sleep, + }); const restoreBackupPath = pendingStateRestore?.manifest?.backupPath ?? pendingStateRestoreBackupPath; @@ -5680,30 +5599,12 @@ async function createSandbox( // Probes /health endpoint and accepts 200 or 401 (device auth) as "alive". // Previously used `curl -sf` which failed on 401, causing false negatives. Fixes #2342. console.log(" Waiting for NemoClaw dashboard to become ready..."); - trace.withTraceSpan( - "nemoclaw.dashboard.readiness_wait", - { sandbox_name: sandboxName, port: effectiveDashboardPort, attempts: 15 }, - () => { - for (let i = 0; i < 15; i++) { - const readyOutput = runCaptureOpenshell( - ["sandbox", "exec", "-n", sandboxName, "--", "curl", "-so", "/dev/null", "-w", "%{http_code}", - "--max-time", "3", `http://localhost:${effectiveDashboardPort}/health`], - { ignoreError: true }, - ); - const readyCode = parseInt((readyOutput || "").trim(), 10) || 0; - trace.addTraceEvent("dashboard_probe", { attempt: i + 1, http_status: readyCode }); - if (readyCode === 200 || readyCode === 401) { - console.log(" ✓ Dashboard is live"); - return; - } - if (i === 14) { - console.warn(" Dashboard taking longer than expected to start. Continuing..."); - } else { - sleep(2); - } - } - }, - ); + sandboxReadinessTracing.waitForDashboardReadyWithTrace({ + sandboxName, + port: effectiveDashboardPort, + runCaptureOpenshell, + sleep, + }); if (effectiveSandboxGpuConfig.sandboxGpuEnabled) { try { @@ -8523,34 +8424,24 @@ async function setupPoliciesWithSelection( sandboxName: string, options: SetupPolicySelectionOptions = {}, ) { - const selectedTier = await trace.withTraceSpan( - "nemoclaw.policy.application", + const selectedTier = await setupPoliciesWithSelectionImpl( { - sandbox_name: sandboxName, - selected_presets: options.selectedPresets ?? null, - provider: options.provider ?? null, - web_search_supported: options.webSearchSupported ?? null, + policies, + tiers, + localInferenceProviders: LOCAL_INFERENCE_PROVIDERS, + step, + note, + isNonInteractive, + waitForSandboxReady, + syncPresetSelection, + selectPolicyTier, + setPolicyTier: (sandbox, tierName) => registry.updateSandbox(sandbox, { policyTier: tierName }), + selectTierPresetsAndAccess, + parsePolicyPresetEnv, + env: process.env, }, - () => - setupPoliciesWithSelectionImpl( - { - policies, - tiers, - localInferenceProviders: LOCAL_INFERENCE_PROVIDERS, - step, - note, - isNonInteractive, - waitForSandboxReady, - syncPresetSelection, - selectPolicyTier, - setPolicyTier: (sandbox, tierName) => registry.updateSandbox(sandbox, { policyTier: tierName }), - selectTierPresetsAndAccess, - parsePolicyPresetEnv, - env: process.env, - }, - sandboxName, - options, - ), + sandboxName, + options, ); return selectedTier; } @@ -9036,16 +8927,7 @@ function skippedStepMessage( async function onboard(opts: OnboardOptions = {}): Promise { setOnboardBrandingAgent(opts.agent || process.env.NEMOCLAW_AGENT || null); - const traceCollector = trace.getTraceCollector(); - const onboardTraceSpan = traceCollector?.startSpan("nemoclaw.onboard", { - resume: opts.resume === true, - fresh: opts.fresh === true, - non_interactive: opts.nonInteractive === true || process.env.NEMOCLAW_NON_INTERACTIVE === "1", - agent: opts.agent || process.env.NEMOCLAW_AGENT || null, - trace_enabled: Boolean(process.env.NEMOCLAW_TRACE), - trace_file_enabled: Boolean(process.env.NEMOCLAW_TRACE_FILE), - trace_dir_enabled: Boolean(process.env.NEMOCLAW_TRACE_DIR), - }); + const onboardTrace = onboardTracing.startOnboardTrace(opts, process.env); NON_INTERACTIVE = opts.nonInteractive || process.env.NEMOCLAW_NON_INTERACTIVE === "1"; RECREATE_SANDBOX = opts.recreateSandbox || process.env.NEMOCLAW_RECREATE_SANDBOX === "1"; AUTO_YES = opts.autoYes === true || process.env.NEMOCLAW_YES === "1"; @@ -9396,10 +9278,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { getSandbox: registry.getSandbox.bind(registry), getResumeSandboxGpuOverrides, detectGpu: nim.detectGpu, - runPreflight: (preflightOptions) => - trace.withTraceSpan("nemoclaw.onboard.phase.preflight", {}, () => - preflight({ ...opts, ...preflightOptions }), - ), + runPreflight: (preflightOptions) => preflight({ ...opts, ...preflightOptions }), assessHost, assertCdiNvidiaGpuSpecPresent, resolveSandboxGpuConfig, @@ -9476,15 +9355,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { skippedStepMessage, note, startRecordedStep, - startGateway: (gatewayGpu, gatewayOptions) => - trace.withTraceSpan( - "nemoclaw.onboard.phase.gateway", - { - reuse_state: gatewaySnapshot.gatewayReuseState, - gpu_passthrough: gatewayOptions.gpuPassthrough, - }, - () => startGateway(gatewayGpu, gatewayOptions), - ), + startGateway, recordStepComplete, exitProcess: (code) => process.exit(code), }, @@ -9540,9 +9411,9 @@ async function onboard(opts: OnboardOptions = {}): Promise { // otherwise leave a phantom that `nemoclaw list` resurrects until // manually destroyed. await startRecordedStep("provider_selection"); - const selection = await trace.withTraceSpan( - "nemoclaw.onboard.phase.provider_selection", - { sandbox_name: sandboxName, agent: agent?.name ?? null }, + const selection = await onboardTracing.withProviderSelectionTrace( + sandboxName, + agent?.name, () => setupNim(gpu, sandboxName, agent), ); model = selection.model; @@ -9589,14 +9460,11 @@ async function onboard(opts: OnboardOptions = {}): Promise { sandboxName = await promptValidatedSandboxName(agent); } await startRecordedStep("inference", { provider: selectedProvider, model: selectedModel }); - const inferenceResult = await trace.withTraceSpan( - "nemoclaw.onboard.phase.inference", - { - sandbox_name: sandboxName, - provider: selectedProvider, - model: selectedModel, - credential_env: credentialEnv, - }, + const inferenceResult = await onboardTracing.withInferenceTrace( + sandboxName, + selectedProvider, + selectedModel, + credentialEnv, () => setupInference( sandboxName, @@ -9672,14 +9540,11 @@ async function onboard(opts: OnboardOptions = {}): Promise { } await startRecordedStep("inference", { provider, model }); - const inferenceResult = await trace.withTraceSpan( - "nemoclaw.onboard.phase.inference", - { - sandbox_name: sandboxName, - provider: selectedProvider, - model: selectedModel, - credential_env: credentialEnv, - }, + const inferenceResult = await onboardTracing.withInferenceTrace( + sandboxName, + selectedProvider, + selectedModel, + credentialEnv, () => setupInference( sandboxName, @@ -9874,9 +9739,11 @@ async function onboard(opts: OnboardOptions = {}): Promise { if (fresh) { stopStaleDashboardListenersForSandbox(registry.listSandboxes().sandboxes, sandboxName); } - sandboxName = await trace.withTraceSpan( - "nemoclaw.onboard.phase.sandbox", - { sandbox_name: sandboxName, provider, model, agent: agent?.name ?? null }, + sandboxName = await onboardTracing.withSandboxPhaseTrace( + sandboxName, + provider, + model, + agent?.name, () => createSandbox( gpu, @@ -10144,10 +10011,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { } finally { releaseOnboardLock(); onboardRuntimeBoundary.clear(); - if (onboardTraceSpan) { - traceCollector?.endSpan(onboardTraceSpan, traceCompleted ? "OK" : "ERROR"); - trace.flushTrace(traceCompleted ? "OK" : "ERROR"); - } + onboardTracing.finishOnboardTrace(onboardTrace, traceCompleted); } } diff --git a/src/lib/onboard/machine/handlers/gateway.ts b/src/lib/onboard/machine/handlers/gateway.ts index 2a56bd88e5..520bb94bbd 100644 --- a/src/lib/onboard/machine/handlers/gateway.ts +++ b/src/lib/onboard/machine/handlers/gateway.ts @@ -2,9 +2,10 @@ // SPDX-License-Identifier: Apache-2.0 import type { NvidiaPlatform } from "../../../inference/nim"; -import type { GatewayContainerState } from "../../gateway-container-running"; -import type { Session } from "../../../state/onboard-session"; import type { GatewayReuseState } from "../../../state/gateway"; +import type { Session } from "../../../state/onboard-session"; +import type { GatewayContainerState } from "../../gateway-container-running"; +import { withGatewayTrace } from "../../tracing"; export interface GatewayStateOptions { resume: boolean; @@ -173,7 +174,9 @@ export async function handleGatewayState({ gatewayReuseState = "missing"; } await deps.startRecordedStep("gateway"); - await deps.startGateway(gpu, { gpuPassthrough }); + await withGatewayTrace(gatewayReuseState, gpuPassthrough, () => + deps.startGateway(gpu, { gpuPassthrough }), + ); session = await deps.recordStepComplete("gateway"); } diff --git a/src/lib/onboard/machine/handlers/preflight.ts b/src/lib/onboard/machine/handlers/preflight.ts index ccecf7860d..df063154b5 100644 --- a/src/lib/onboard/machine/handlers/preflight.ts +++ b/src/lib/onboard/machine/handlers/preflight.ts @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 import type { Session } from "../../../state/onboard-session"; +import { withPreflightTrace } from "../../tracing"; export type PreflightSandboxGpuFlag = "enable" | "disable" | null; @@ -117,7 +118,7 @@ export async function handlePreflightState< deps.assertCdiNvidiaGpuSpecPresent(deps.assessHost(), resumeOptedOutGpuPassthrough); } else { await deps.startRecordedStep("preflight"); - gpu = await deps.runPreflight({ optedOutGpuPassthrough: noGpu }); + gpu = await withPreflightTrace(() => deps.runPreflight({ optedOutGpuPassthrough: noGpu })); session = await deps.recordStepComplete("preflight"); } diff --git a/src/lib/onboard/policy-selection.ts b/src/lib/onboard/policy-selection.ts index a05ec9677f..cade72870a 100644 --- a/src/lib/onboard/policy-selection.ts +++ b/src/lib/onboard/policy-selection.ts @@ -6,6 +6,7 @@ import { HERMES_TOOL_GATEWAY_PRESET_NAMES, mergeRequiredHermesToolGatewayPolicyPresets, } from "./hermes-managed-tools"; +import { withPolicyApplicationTrace } from "./tracing"; type Preset = { name: string; access?: string }; type SupportOptions = { webSearchSupported?: boolean | null }; @@ -126,6 +127,16 @@ export async function setupPoliciesWithSelection( deps: SetupPolicySelectionDeps, sandboxName: string, options: SetupPolicySelectionOptions = {}, +): Promise { + return withPolicyApplicationTrace(sandboxName, options, () => + setupPoliciesWithSelectionInner(deps, sandboxName, options), + ); +} + +async function setupPoliciesWithSelectionInner( + deps: SetupPolicySelectionDeps, + sandboxName: string, + options: SetupPolicySelectionOptions = {}, ): Promise { const selectedPresets = Array.isArray(options.selectedPresets) ? options.selectedPresets : null; const onSelection = typeof options.onSelection === "function" ? options.onSelection : null; diff --git a/src/lib/onboard/sandbox-readiness-tracing.ts b/src/lib/onboard/sandbox-readiness-tracing.ts new file mode 100644 index 0000000000..7723e13f04 --- /dev/null +++ b/src/lib/onboard/sandbox-readiness-tracing.ts @@ -0,0 +1,133 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { addTraceEvent, withDashboardReadinessTrace, withSandboxReadinessTrace } from "./tracing"; + +type RunCaptureOpenshell = ( + args: string[], + options?: { ignoreError?: boolean }, +) => string; + +export function waitForSandboxReadyWithTrace(options: { + sandboxName: string; + attempts: number; + delaySeconds: number; + runCaptureOpenshell: RunCaptureOpenshell; + isSandboxReady: (output: string, sandboxName: string) => boolean; + isLinuxDockerDriverGatewayEnabled: () => boolean; + sleep: (seconds: number) => void; +}): boolean { + const { + sandboxName, + attempts, + delaySeconds, + runCaptureOpenshell, + isSandboxReady, + isLinuxDockerDriverGatewayEnabled, + sleep, + } = options; + return withSandboxReadinessTrace(sandboxName, { attempts, delay_seconds: delaySeconds }, () => { + for (let i = 0; i < attempts; i += 1) { + const list = runCaptureOpenshell(["sandbox", "list"], { ignoreError: true }); + if (isSandboxReady(list, sandboxName)) { + addTraceEvent("ready", { attempt: i + 1, source: "sandbox_list" }); + return true; + } + + // Package-managed OpenShell gateways report readiness through + // `sandbox list`; legacy Kubernetes gateways may still expose pod state. + if (isLinuxDockerDriverGatewayEnabled()) { + if (i < attempts - 1) sleep(delaySeconds); + continue; + } + const podPhase = runCaptureOpenshell( + [ + "doctor", + "exec", + "--", + "kubectl", + "-n", + "openshell", + "get", + "pod", + sandboxName, + "-o", + "jsonpath={.status.phase}", + ], + { ignoreError: true }, + ); + if (podPhase === "Running") { + addTraceEvent("ready", { attempt: i + 1, source: "pod_phase" }); + return true; + } + sleep(delaySeconds); + } + addTraceEvent("not_ready", { attempts }); + return false; + }); +} + +export function waitForCreatedSandboxReadyWithTrace(options: { + sandboxName: string; + timeoutSecs: number; + runCaptureOpenshell: RunCaptureOpenshell; + isSandboxReady: (output: string, sandboxName: string) => boolean; + sleep: (seconds: number) => void; +}): boolean { + const { sandboxName, timeoutSecs, runCaptureOpenshell, isSandboxReady, sleep } = options; + return withSandboxReadinessTrace(sandboxName, { timeout_seconds: timeoutSecs }, () => { + const readyAttempts = Math.max(1, Math.ceil(timeoutSecs / 2)); + for (let i = 0; i < readyAttempts; i++) { + const list = runCaptureOpenshell(["sandbox", "list"], { ignoreError: true }); + if (isSandboxReady(list, sandboxName)) { + addTraceEvent("ready", { attempt: i + 1 }); + return true; + } + if (i < readyAttempts - 1) sleep(2); + } + addTraceEvent("not_ready", { attempts: readyAttempts }); + return false; + }); +} + +export function waitForDashboardReadyWithTrace(options: { + sandboxName: string; + port: string | number; + runCaptureOpenshell: RunCaptureOpenshell; + sleep: (seconds: number) => void; +}): void { + const { sandboxName, port, runCaptureOpenshell, sleep } = options; + withDashboardReadinessTrace(sandboxName, port, 15, () => { + for (let i = 0; i < 15; i++) { + const readyOutput = runCaptureOpenshell( + [ + "sandbox", + "exec", + "-n", + sandboxName, + "--", + "curl", + "-so", + "/dev/null", + "-w", + "%{http_code}", + "--max-time", + "3", + `http://localhost:${port}/health`, + ], + { ignoreError: true }, + ); + const readyCode = parseInt((readyOutput || "").trim(), 10) || 0; + addTraceEvent("dashboard_probe", { attempt: i + 1, http_status: readyCode }); + if (readyCode === 200 || readyCode === 401) { + console.log(" ✓ Dashboard is live"); + return; + } + if (i === 14) { + console.warn(" Dashboard taking longer than expected to start. Continuing..."); + } else { + sleep(2); + } + } + }); +} diff --git a/src/lib/onboard/tracing.ts b/src/lib/onboard/tracing.ts new file mode 100644 index 0000000000..95b80d0fe9 --- /dev/null +++ b/src/lib/onboard/tracing.ts @@ -0,0 +1,173 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import type { GatewayReuseState } from "../state/gateway"; +import * as trace from "../trace"; + +type TraceFn = () => T; + +const TRACE_TRUTHY_VALUES = new Set(["1", "true", "yes", "on"]); + +export interface OnboardTraceOptions { + resume?: boolean; + fresh?: boolean; + nonInteractive?: boolean; + agent?: string | null; +} + +export interface OnboardTraceHandle { + collector: ReturnType; + span: ReturnType>["startSpan"]> | null; +} + +export function isTruthyTraceEnv(value: unknown): boolean { + return TRACE_TRUTHY_VALUES.has(String(value ?? "").trim().toLowerCase()); +} + +export function startOnboardTrace( + opts: OnboardTraceOptions, + env: NodeJS.ProcessEnv, +): OnboardTraceHandle { + const collector = trace.getTraceCollector(); + const span = collector?.startSpan("nemoclaw.onboard", { + resume: opts.resume === true, + fresh: opts.fresh === true, + non_interactive: opts.nonInteractive === true || env.NEMOCLAW_NON_INTERACTIVE === "1", + agent: opts.agent || env.NEMOCLAW_AGENT || null, + trace_enabled: isTruthyTraceEnv(env.NEMOCLAW_TRACE), + trace_file_enabled: isTruthyTraceEnv(env.NEMOCLAW_TRACE_FILE), + trace_dir_enabled: isTruthyTraceEnv(env.NEMOCLAW_TRACE_DIR), + }); + return { collector, span: span ?? null }; +} + +export function finishOnboardTrace(handle: OnboardTraceHandle, completed: boolean): void { + if (!handle.span) return; + handle.collector?.endSpan(handle.span, completed ? "OK" : "ERROR"); + trace.flushTrace(completed ? "OK" : "ERROR"); +} + +export function withPreflightTrace(fn: TraceFn): T { + return trace.withTraceSpan("nemoclaw.onboard.phase.preflight", {}, fn); +} + +export function withGatewayTrace( + reuseState: GatewayReuseState, + gpuPassthrough: boolean, + fn: TraceFn, +): T { + return trace.withTraceSpan( + "nemoclaw.onboard.phase.gateway", + { reuse_state: reuseState, gpu_passthrough: gpuPassthrough }, + fn, + ); +} + +export function withProviderSelectionTrace( + sandboxName: string | null, + agentName: string | null | undefined, + fn: TraceFn, +): T { + return trace.withTraceSpan( + "nemoclaw.onboard.phase.provider_selection", + { sandbox_name: sandboxName, agent: agentName ?? null }, + fn, + ); +} + +export function withInferenceTrace( + sandboxName: string, + provider: string, + model: string, + credentialEnv: string | null, + fn: TraceFn, +): T { + return trace.withTraceSpan( + "nemoclaw.onboard.phase.inference", + { sandbox_name: sandboxName, provider, model, credential_env: credentialEnv }, + fn, + ); +} + +export function withSandboxPhaseTrace( + sandboxName: string, + provider: string, + model: string, + agentName: string | null | undefined, + fn: TraceFn, +): T { + return trace.withTraceSpan( + "nemoclaw.onboard.phase.sandbox", + { sandbox_name: sandboxName, provider, model, agent: agentName ?? null }, + fn, + ); +} + +export function withSandboxCreateStreamTrace( + attrs: { + sandboxName: string; + provider: string; + model: string; + timeoutSeconds: number; + fromDockerfile: boolean; + gpuEnabled: boolean; + }, + fn: TraceFn, +): T { + return trace.withTraceSpan( + "nemoclaw.sandbox.create_stream", + { + sandbox_name: attrs.sandboxName, + provider: attrs.provider, + model: attrs.model, + timeout_seconds: attrs.timeoutSeconds, + from_dockerfile: attrs.fromDockerfile, + gpu_enabled: attrs.gpuEnabled, + }, + fn, + ); +} + +export function withSandboxReadinessTrace( + sandboxName: string, + attrs: Record, + fn: TraceFn, +): T { + return trace.withTraceSpan( + "nemoclaw.sandbox.readiness_wait", + { sandbox_name: sandboxName, ...attrs }, + fn, + ); +} + +export function withDashboardReadinessTrace( + sandboxName: string, + port: string | number, + attempts: number, + fn: TraceFn, +): T { + return trace.withTraceSpan( + "nemoclaw.dashboard.readiness_wait", + { sandbox_name: sandboxName, port, attempts }, + fn, + ); +} + +export function withPolicyApplicationTrace( + sandboxName: string, + options: { selectedPresets?: unknown; provider?: unknown; webSearchSupported?: unknown }, + fn: TraceFn, +): T { + return trace.withTraceSpan( + "nemoclaw.policy.application", + { + sandbox_name: sandboxName, + selected_presets: options.selectedPresets ?? null, + provider: options.provider ?? null, + web_search_supported: options.webSearchSupported ?? null, + }, + fn, + ); +} + +export const addTraceEvent = trace.addTraceEvent; diff --git a/src/lib/trace.test.ts b/src/lib/trace.test.ts index 4d854d88d6..4bf0954e34 100644 --- a/src/lib/trace.test.ts +++ b/src/lib/trace.test.ts @@ -158,4 +158,26 @@ describe("onboard trace artifacts", () => { expect(getTraceCollector()).toBeNull(); }); + + it("removes the registered exit listener when resetting tests", () => { + const before = process.listenerCount("exit"); + withTraceFile(() => { + expect(getTraceCollector()).not.toBeNull(); + expect(process.listenerCount("exit")).toBe(before + 1); + resetTraceForTests(); + expect(process.listenerCount("exit")).toBe(before); + }); + }); + + it("does not mark traces flushed when artifact writes fail", () => { + const blocker = path.join(os.tmpdir(), `nemoclaw-trace-blocker-${Date.now()}`); + fs.writeFileSync(blocker, "not a directory"); + process.env[TRACE_FILE_ENV] = path.join(blocker, "trace.json"); + resetTraceForTests(); + const collector = getTraceCollector(); + expect(collector).not.toBeNull(); + + expect(() => flushTrace()).toThrow(); + expect(() => collector?.flush()).toThrow(); + }); }); diff --git a/src/lib/trace.ts b/src/lib/trace.ts index 0209856996..f0cd0dc5c6 100644 --- a/src/lib/trace.ts +++ b/src/lib/trace.ts @@ -200,7 +200,6 @@ export class TraceCollector { flush(finalStatus: TraceStatusCode = "OK", message?: string): string | null { if (this.flushed) return this.outputPath; - this.flushed = true; while (this.spanStack.length > 0) { const span = this.spanStack.pop(); if (span) this.endSpan(span, finalStatus, message); @@ -250,25 +249,32 @@ export class TraceCollector { }; fs.mkdirSync(path.dirname(this.outputPath), { recursive: true, mode: 0o700 }); fs.writeFileSync(this.outputPath, `${JSON.stringify(artifact, null, 2)}\n`, { mode: 0o600 }); + this.flushed = true; return this.outputPath; } } let collector: TraceCollector | null | undefined; +let exitHandler: ((code: number) => void) | null = null; export function getTraceCollector(): TraceCollector | null { if (collector !== undefined) return collector; const tracePath = resolveTracePath(process.env); collector = tracePath ? new TraceCollector(tracePath) : null; if (collector) { - process.once("exit", (code) => { + exitHandler = (code) => { collector?.flush(code === 0 ? "OK" : "ERROR", `process exited with code ${code}`); - }); + }; + process.once("exit", exitHandler); } return collector; } export function resetTraceForTests(): void { + if (exitHandler) { + process.removeListener("exit", exitHandler); + } + exitHandler = null; collector = undefined; } diff --git a/trace-demo.json b/trace-demo.json deleted file mode 100644 index ca0ba2389d..0000000000 --- a/trace-demo.json +++ /dev/null @@ -1,132 +0,0 @@ -{ - "resource_spans": [ - { - "resource": { - "attributes": { - "service.name": "nemoclaw", - "service.version": "unknown", - "host.type": "Darwin", - "os.platform": "darwin", - "process.pid": 42693, - "ci": false - } - }, - "scope_spans": [ - { - "scope": { - "name": "nemoclaw.onboard", - "version": "1.0.0" - }, - "spans": [ - { - "trace_id": "5436e257a15bcb8da6bcde2e1eb823fb", - "span_id": "6b2da256ad014b3a", - "name": "nemoclaw.onboard", - "kind": "INTERNAL", - "start_time_unix_nano": "1779404593899000000", - "status": { - "code": "OK" - }, - "attributes": { - "non_interactive": true, - "agent": "openclaw" - }, - "events": [], - "end_time_unix_nano": "1779404593903000000", - "duration_ms": 4 - }, - { - "trace_id": "5436e257a15bcb8da6bcde2e1eb823fb", - "span_id": "1f01775665b0cc44", - "parent_span_id": "6b2da256ad014b3a", - "name": "nemoclaw.onboard.phase.inference", - "kind": "INTERNAL", - "start_time_unix_nano": "1779404593900000000", - "status": { - "code": "OK" - }, - "attributes": { - "provider": "compatible-endpoint", - "model": "demo-model", - "credential_env": "DEMO_API_KEY" - }, - "events": [], - "end_time_unix_nano": "1779404593903000000", - "duration_ms": 3 - }, - { - "trace_id": "5436e257a15bcb8da6bcde2e1eb823fb", - "span_id": "653c1e1aba4eb46b", - "parent_span_id": "1f01775665b0cc44", - "name": "nemoclaw.inference.validation_probe", - "kind": "INTERNAL", - "start_time_unix_nano": "1779404593901000000", - "status": { - "code": "OK" - }, - "attributes": { - "probe_name": "Chat Completions API", - "api": "openai-completions" - }, - "events": [], - "end_time_unix_nano": "1779404593903000000", - "duration_ms": 2 - }, - { - "trace_id": "5436e257a15bcb8da6bcde2e1eb823fb", - "span_id": "f840c39f5415e23d", - "parent_span_id": "653c1e1aba4eb46b", - "name": "nemoclaw.inference.curl_probe", - "kind": "INTERNAL", - "start_time_unix_nano": "1779404593901000000", - "status": { - "code": "OK" - }, - "attributes": { - "http.url": "https://example.test/v1/chat/completions?key=%3CREDACTED%3E", - "http.request.method": "POST", - "process.timeout_ms": 30000 - }, - "events": [ - { - "name": "curl_result", - "time_unix_nano": "1779404593902000000", - "attributes": { - "ok": true, - "http_status": 200, - "curl_status": 0 - } - } - ], - "end_time_unix_nano": "1779404593903000000", - "duration_ms": 2 - } - ] - } - ] - } - ], - "summary": { - "trace_id": "5436e257a15bcb8da6bcde2e1eb823fb", - "generated_at": "2026-05-21T23:03:13.903Z", - "total_duration_ms": 4, - "slowest_spans": [ - { - "name": "nemoclaw.onboard.phase.inference", - "duration_ms": 3, - "status": "OK" - }, - { - "name": "nemoclaw.inference.validation_probe", - "duration_ms": 2, - "status": "OK" - }, - { - "name": "nemoclaw.inference.curl_probe", - "duration_ms": 2, - "status": "OK" - } - ], - "output_path": "/Users/amata/a/nemoclaw/NemoClaw/trace-demo.json" - } -} From 4011194565a326e7143bd14529b6626ed55793ad Mon Sep 17 00:00:00 2001 From: "Angel A. Mata" Date: Fri, 22 May 2026 17:30:26 -0700 Subject: [PATCH 5/6] fix(onboard): tighten trace review fixes --- src/lib/onboard.ts | 3 ++- src/lib/onboard/sandbox-readiness-tracing.ts | 2 +- src/lib/onboard/tracing.ts | 8 ++++++-- src/lib/trace.test.ts | 4 +++- 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 97f937c928..1889f754fb 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -8949,7 +8949,6 @@ function skippedStepMessage( async function onboard(opts: OnboardOptions = {}): Promise { setOnboardBrandingAgent(opts.agent || process.env.NEMOCLAW_AGENT || null); - const onboardTrace = onboardTracing.startOnboardTrace(opts, process.env); NON_INTERACTIVE = opts.nonInteractive || process.env.NEMOCLAW_NON_INTERACTIVE === "1"; RECREATE_SANDBOX = opts.recreateSandbox || process.env.NEMOCLAW_RECREATE_SANDBOX === "1"; AUTO_YES = opts.autoYes === true || process.env.NEMOCLAW_YES === "1"; @@ -9101,8 +9100,10 @@ async function onboard(opts: OnboardOptions = {}): Promise { }; process.once("exit", releaseOnboardLock); + let onboardTrace: ReturnType = { collector: null, span: null }; let traceCompleted = false; try { + onboardTrace = onboardTracing.startOnboardTrace(opts, process.env); let session: Session | null; let selectedMessagingChannels: string[] = []; // Merged, absolute fromDockerfile: explicit flag/env takes precedence; on diff --git a/src/lib/onboard/sandbox-readiness-tracing.ts b/src/lib/onboard/sandbox-readiness-tracing.ts index 7723e13f04..33d49ef8b9 100644 --- a/src/lib/onboard/sandbox-readiness-tracing.ts +++ b/src/lib/onboard/sandbox-readiness-tracing.ts @@ -60,7 +60,7 @@ export function waitForSandboxReadyWithTrace(options: { addTraceEvent("ready", { attempt: i + 1, source: "pod_phase" }); return true; } - sleep(delaySeconds); + if (i < attempts - 1) sleep(delaySeconds); } addTraceEvent("not_ready", { attempts }); return false; diff --git a/src/lib/onboard/tracing.ts b/src/lib/onboard/tracing.ts index 95b80d0fe9..121710e66f 100644 --- a/src/lib/onboard/tracing.ts +++ b/src/lib/onboard/tracing.ts @@ -24,6 +24,10 @@ export function isTruthyTraceEnv(value: unknown): boolean { return TRACE_TRUTHY_VALUES.has(String(value ?? "").trim().toLowerCase()); } +function hasTracePath(value: unknown): boolean { + return typeof value === "string" && value.trim().length > 0; +} + export function startOnboardTrace( opts: OnboardTraceOptions, env: NodeJS.ProcessEnv, @@ -35,8 +39,8 @@ export function startOnboardTrace( non_interactive: opts.nonInteractive === true || env.NEMOCLAW_NON_INTERACTIVE === "1", agent: opts.agent || env.NEMOCLAW_AGENT || null, trace_enabled: isTruthyTraceEnv(env.NEMOCLAW_TRACE), - trace_file_enabled: isTruthyTraceEnv(env.NEMOCLAW_TRACE_FILE), - trace_dir_enabled: isTruthyTraceEnv(env.NEMOCLAW_TRACE_DIR), + trace_file_enabled: hasTracePath(env.NEMOCLAW_TRACE_FILE), + trace_dir_enabled: hasTracePath(env.NEMOCLAW_TRACE_DIR), }); return { collector, span: span ?? null }; } diff --git a/src/lib/trace.test.ts b/src/lib/trace.test.ts index 4bf0954e34..430cd59213 100644 --- a/src/lib/trace.test.ts +++ b/src/lib/trace.test.ts @@ -170,7 +170,9 @@ describe("onboard trace artifacts", () => { }); it("does not mark traces flushed when artifact writes fail", () => { - const blocker = path.join(os.tmpdir(), `nemoclaw-trace-blocker-${Date.now()}`); + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-trace-blocker-")); + fs.chmodSync(tmpDir, 0o700); + const blocker = path.join(tmpDir, "not-a-directory"); fs.writeFileSync(blocker, "not a directory"); process.env[TRACE_FILE_ENV] = path.join(blocker, "trace.json"); resetTraceForTests(); From 686aa7b352c7341ea33b0ab61a180257514d39f7 Mon Sep 17 00:00:00 2001 From: "Angel A. Mata" Date: Tue, 26 May 2026 13:50:58 -0700 Subject: [PATCH 6/6] fix(onboard): preserve non-null traced sandbox inputs Signed-off-by: Angel Mata amata@nvidia.com --- .../machine/handlers/provider-inference.ts | 34 +++++++++++-------- src/lib/onboard/machine/handlers/sandbox.ts | 7 ++-- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/src/lib/onboard/machine/handlers/provider-inference.ts b/src/lib/onboard/machine/handlers/provider-inference.ts index 2a6856c0b8..f84d682943 100644 --- a/src/lib/onboard/machine/handlers/provider-inference.ts +++ b/src/lib/onboard/machine/handlers/provider-inference.ts @@ -229,8 +229,10 @@ export async function handleProviderInferenceState({ } const selected = requireSelection(provider, model, deps); - provider = selected.provider; - model = selected.model; + const selectedProvider = selected.provider; + const selectedModel = selected.model; + provider = selectedProvider; + model = selectedModel; if (shouldRecordProviderSelection) { session = await deps.recordStepComplete( "provider_selection", @@ -259,17 +261,18 @@ export async function handleProviderInferenceState({ let inferenceResult: ProviderInferenceRetry; try { if (!sandboxName) sandboxName = await deps.promptValidatedSandboxName(agent); + const confirmedSandboxName = sandboxName; await deps.startRecordedStep("inference", { provider, model }); inferenceResult = await withInferenceTrace( - sandboxName, - provider, - model, + confirmedSandboxName, + selectedProvider, + selectedModel, credentialEnv, () => deps.setupInference( - sandboxName, - model, - provider, + confirmedSandboxName, + selectedModel, + selectedProvider, endpointUrl, credentialEnv, hermesAuthMethod, @@ -314,6 +317,7 @@ export async function handleProviderInferenceState({ let inferenceResult: ProviderInferenceRetry; try { if (!sandboxName) sandboxName = await deps.promptValidatedSandboxName(agent); + const confirmedSandboxName = sandboxName; const buildEstimateNote = env.NEMOCLAW_IGNORE_RUNTIME_RESOURCES === "1" ? null @@ -327,7 +331,7 @@ export async function handleProviderInferenceState({ webSearchConfig, hermesToolGateways, enabledChannels: selectedMessagingChannels.length > 0 ? selectedMessagingChannels : null, - sandboxName, + sandboxName: confirmedSandboxName, notes: buildEstimateNote ? [buildEstimateNote] : [], }), ); @@ -343,15 +347,15 @@ export async function handleProviderInferenceState({ await deps.startRecordedStep("inference", { provider, model }); inferenceResult = await withInferenceTrace( - sandboxName, - provider, - model, + confirmedSandboxName, + selectedProvider, + selectedModel, credentialEnv, () => deps.setupInference( - sandboxName, - model, - provider, + confirmedSandboxName, + selectedModel, + selectedProvider, endpointUrl, credentialEnv, hermesAuthMethod, diff --git a/src/lib/onboard/machine/handlers/sandbox.ts b/src/lib/onboard/machine/handlers/sandbox.ts index 15190e5035..efa5cf0adb 100644 --- a/src/lib/onboard/machine/handlers/sandbox.ts +++ b/src/lib/onboard/machine/handlers/sandbox.ts @@ -276,10 +276,11 @@ export async function handleSandboxState