Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions .agents/skills/nemoclaw-user-reference/references/commands.md
Original file line number Diff line number Diff line change
Expand Up @@ -1299,6 +1299,21 @@ These flags toggle optional behaviors during onboarding; set them before running
| `NEMOCLAW_OPENSHELL_SANDBOX_BIN` | path | Advanced override for the `openshell-sandbox` binary passed to the Linux Docker-driver gateway supervisor. Defaults to the binary next to `openshell`, then common install paths. |
| `NEMOCLAW_OPENSHELL_GATEWAY_STATE_DIR` | path | Advanced override for the Linux Docker-driver gateway pid file and SQLite state directory. Defaults to `~/.local/state/nemoclaw/openshell-docker-gateway`. |

### Onboard Profiling Traces

Set `NEMOCLAW_TRACE=1` before `nemoclaw onboard` to write an OpenTelemetry-style JSON trace for the run.
When no explicit path is provided, NemoClaw writes a timestamped file under `.e2e/traces/` in the current working directory.
Use `NEMOCLAW_TRACE_DIR` to choose the output directory, or `NEMOCLAW_TRACE_FILE` to choose the exact output file.

```console
$ NEMOCLAW_TRACE=1 nemoclaw onboard
$ NEMOCLAW_TRACE_DIR=/tmp/nemoclaw-traces nemoclaw onboard
$ NEMOCLAW_TRACE_FILE=/tmp/nemoclaw-onboard-trace.json nemoclaw onboard
```

Trace artifacts include onboard phase timing, sandbox and dashboard readiness waits, policy application, inference validation probes, curl probe results, and sandbox build progress events.
Secret-like metadata such as API keys, bearer tokens, cookies, and credentials is redacted before the file is written.

### Probe Timeouts

These tune how long internal probes wait before giving up.
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/e2e-branch-validation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,7 @@ jobs:
'set +e
mkdir -p /tmp/nc-debug
cp /tmp/nemoclaw-onboard.log /tmp/nc-debug/ 2>/dev/null || true
cp -R /tmp/nemoclaw-traces /tmp/nc-debug/traces 2>/dev/null || true
timeout 15s openshell sandbox list > /tmp/nc-debug/sandbox-list.txt 2>&1
timeout 15s openshell gateway status > /tmp/nc-debug/gateway-status.txt 2>&1
timeout 15s docker ps -a > /tmp/nc-debug/docker-ps.txt 2>&1
Expand Down
9 changes: 9 additions & 0 deletions .github/workflows/regression-e2e.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,7 @@ jobs:
env:
NEMOCLAW_NON_INTERACTIVE: "1"
NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE: "1"
NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces
run: bash test/e2e/test-onboard-inference-smoke.sh

- name: Upload onboard inference smoke logs on failure
Expand All @@ -239,6 +240,14 @@ jobs:
/tmp/nemoclaw-e2e-onboard-inference-smoke-node.log
if-no-files-found: ignore

- name: Upload onboard profiling traces
if: always()
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
name: onboard-inference-smoke-traces
path: /tmp/nemoclaw-traces/
if-no-files-found: ignore
Comment thread
coderabbitai[bot] marked this conversation as resolved.

# ── Gateway drift preflight E2E ─────────────────────────────
# Coverage guard for #3399 / #3423. A stale OpenShell gateway image can
# make sandbox-state RPCs fail with protobuf invalid-wire decode errors.
Expand Down
15 changes: 15 additions & 0 deletions docs/reference/commands.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -1307,6 +1307,21 @@ These flags toggle optional behaviors during onboarding; set them before running
| `NEMOCLAW_OPENSHELL_SANDBOX_BIN` | path | Advanced override for the `openshell-sandbox` binary passed to the Linux Docker-driver gateway supervisor. Defaults to the binary next to `openshell`, then common install paths. |
| `NEMOCLAW_OPENSHELL_GATEWAY_STATE_DIR` | path | Advanced override for the Linux Docker-driver gateway pid file and SQLite state directory. Defaults to `~/.local/state/nemoclaw/openshell-docker-gateway`. |

### Onboard Profiling Traces

Set `NEMOCLAW_TRACE=1` before `nemoclaw onboard` to write an OpenTelemetry-style JSON trace for the run.
When no explicit path is provided, NemoClaw writes a timestamped file under `.e2e/traces/` in the current working directory.
Use `NEMOCLAW_TRACE_DIR` to choose the output directory, or `NEMOCLAW_TRACE_FILE` to choose the exact output file.

```console
$ NEMOCLAW_TRACE=1 nemoclaw onboard
$ NEMOCLAW_TRACE_DIR=/tmp/nemoclaw-traces nemoclaw onboard
$ NEMOCLAW_TRACE_FILE=/tmp/nemoclaw-onboard-trace.json nemoclaw onboard
```

Trace artifacts include onboard phase timing, sandbox and dashboard readiness waits, policy application, inference validation probes, curl probe results, and sandbox build progress events.
Secret-like metadata such as API keys, bearer tokens, cookies, and credentials is redacted before the file is written.

### Probe Timeouts

These tune how long internal probes wait before giving up.
Expand Down
59 changes: 58 additions & 1 deletion src/lib/adapters/http/probe.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
// SPDX-License-Identifier: Apache-2.0

import fs from "node:fs";
import os from "node:os";
import path from "node:path";
import { describe, expect, it } from "vitest";
import { afterEach, describe, expect, it } from "vitest";

import { flushTrace, resetTraceForTests, TRACE_FILE_ENV, type TraceArtifact } from "../../trace";
import {
getCurlTimingArgs,
runChatCompletionsStreamingProbe,
Expand All @@ -15,6 +17,19 @@ import {
summarizeProbeFailure,
} from "./probe";

/**
 * Runs `fn` with trace output directed at a freshly created temporary file.
 *
 * A unique directory is created under the OS temp dir, `TRACE_FILE_ENV` is
 * pointed at `<dir>/trace.json`, and the trace state is reset before `fn` is
 * invoked. The temporary directory is not removed by this helper.
 *
 * @param fn - Callback that receives the trace file path.
 * @returns Whatever `fn` returns.
 */
function withTraceFile<T>(fn: (traceFile: string) => T): T {
  const dirPrefix = path.join(os.tmpdir(), "nemoclaw-probe-trace-test-");
  const scratchDir = fs.mkdtempSync(dirPrefix);
  const tracePath = path.join(scratchDir, "trace.json");

  // The env var must be in place before the trace state is reset.
  process.env[TRACE_FILE_ENV] = tracePath;
  resetTraceForTests();

  const outcome = fn(tracePath);
  return outcome;
}

// After every test: drop the trace-file override from the environment and
// reset the trace state so one test's tracing setup cannot leak into the next.
afterEach(function clearTraceEnvironment() {
  delete process.env[TRACE_FILE_ENV];
  resetTraceForTests();
});

describe("http-probe helpers", () => {
it("returns explicit curl timeouts", () => {
expect(getCurlTimingArgs()).toEqual(["--connect-timeout", "10", "--max-time", "60"]);
Expand Down Expand Up @@ -157,6 +172,27 @@ describe("runChatCompletionsStreamingProbe", () => {
expect(result.ok).toBe(false);
expect(result.message).toContain("did not return SSE data");
});

// The chat streaming probe must attach a `curl_result` event to its span.
// NOTE(review): mockStreaming("", 28, "200") presumably simulates an empty
// body, curl exit 28 and HTTP 200 — confirm against the helper's definition.
it("records curl_result metadata for chat streaming probes", () => {
  withTraceFile((traceFile) => {
    const probeArgv = ["-sS", "--max-time", "120", "https://example.test/v1/chat/completions"];
    const outcome = runChatCompletionsStreamingProbe(probeArgv, {
      spawnSyncImpl: mockStreaming("", 28, "200"),
    });
    expect(outcome.ok).toBe(false);

    // Flush so the artifact is on disk before it is read back.
    flushTrace();
    const rawArtifact = fs.readFileSync(traceFile, "utf8");
    const parsed = JSON.parse(rawArtifact) as TraceArtifact;
    const recordedSpans = parsed.resource_spans[0].scope_spans[0].spans;
    const probeSpan = recordedSpans.find(
      (candidate) => candidate.name === "nemoclaw.inference.curl_streaming_probe",
    );

    expect(probeSpan?.events[0].attributes).toMatchObject({
      ok: false,
      http_status: 200,
      curl_status: 28,
    });
  });
});
});

describe("runStreamingEventProbe", () => {
Expand Down Expand Up @@ -307,4 +343,25 @@ describe("runStreamingEventProbe", () => {
expect(fs.existsSync(outputPath)).toBe(false);
expect(fs.existsSync(path.dirname(outputPath))).toBe(false);
});

// The responses streaming probe must attach a `curl_result` event to its span.
// The mocked stream only carries `response.created`, so one required event is
// reported missing while the recorded curl status is 0.
it("records curl_result metadata for responses streaming probes", () => {
  withTraceFile((traceFile) => {
    const probeArgv = ["-sS", "--max-time", "15", "https://example.test/v1/responses"];
    const outcome = runStreamingEventProbe(probeArgv, {
      spawnSyncImpl: mockStreaming("event: response.created\ndata: {}\n"),
    });
    expect(outcome.ok).toBe(false);

    // Flush so the artifact is on disk before it is read back.
    flushTrace();
    const rawArtifact = fs.readFileSync(traceFile, "utf8");
    const parsed = JSON.parse(rawArtifact) as TraceArtifact;
    const recordedSpans = parsed.resource_spans[0].scope_spans[0].spans;
    const probeSpan = recordedSpans.find(
      (candidate) => candidate.name === "nemoclaw.inference.curl_streaming_event_probe",
    );

    expect(probeSpan?.events[0].attributes).toMatchObject({
      ok: false,
      missing_events_count: 1,
      curl_status: 0,
    });
  });
});
});
112 changes: 103 additions & 9 deletions src/lib/adapters/http/probe.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import { isErrnoException } from "../../core/errno";
import { compactText } from "../../core/url-utils";
import type { ProbeResult } from "../../onboard/types";
import { ROOT } from "../../state/paths";
import { addTraceEvent, withTraceSpan } from "../../trace";

export type CurlProbeResult = ProbeResult;

Expand Down Expand Up @@ -68,6 +69,37 @@ export function getCurlTimingArgs(): string[] {
return ["--connect-timeout", "10", "--max-time", "60"];
}

/**
 * Produce a trace-safe rendering of a curl target URL.
 *
 * Userinfo (username/password) and the fragment are removed, and every query
 * parameter value is replaced with `<REDACTED>` (percent-encoded by the URL
 * serializer). Parameter names are kept so the trace stays useful.
 *
 * curl accepts URLs without a scheme and treats them as HTTP, but the WHATWG
 * parser either rejects such strings (`host/path?key=…`) or misreads them as
 * an opaque URL (`user:pw@host/path` parses with scheme `user:`). Both cases
 * would let credentials through, so scheme-less values are parsed with a
 * temporary `http://` prefix that is stripped again from the result.
 *
 * @param value - The raw URL argument as passed to curl.
 * @returns The sanitized URL, or a best-effort text redaction of `value` when
 *   it cannot be parsed as a URL at all.
 */
function sanitizeCurlUrl(value: string): string {
  const REDACTED = "<REDACTED>";
  const SCHEME_PLACEHOLDER = "http://";

  const redactParsed = (raw: string): string => {
    const url = new URL(raw);
    url.username = "";
    url.password = "";
    // Snapshot the keys first: set() mutates the params while we iterate.
    for (const key of [...url.searchParams.keys()]) {
      url.searchParams.set(key, REDACTED);
    }
    url.hash = "";
    return url.toString();
  };

  const hasAuthorityScheme = /^\s*[a-z][a-z0-9+.-]*:\/\//i.test(value);
  // Only host-like values are retried with a scheme; paths, bare queries and
  // empty strings go straight to the text fallback.
  const startsLikeHost = /^[^\s/?#]/.test(value);

  try {
    if (hasAuthorityScheme) {
      return redactParsed(value);
    }
    if (startsLikeHost) {
      return redactParsed(SCHEME_PLACEHOLDER + value).slice(SCHEME_PLACEHOLDER.length);
    }
  } catch {
    // Not parseable as a URL; fall through to the text-level redaction below.
  }

  return value
    .replace(/(Bearer\s+)\S+/gi, `$1${REDACTED}`)
    .replace(/([?&][^=&#\s]+=)[^&#\s]*/g, `$1${REDACTED}`);
}

/**
 * Build the span attributes recorded for a curl probe invocation.
 *
 * The last element of `argv` is taken as the target URL and is sanitized
 * before being recorded. The HTTP method is the upper-cased value that follows
 * the first `-X` / `--request` flag, or `"POST"` when no such flag (or no
 * value after it) is present.
 * NOTE(review): the `"POST"` and `30_000` fallbacks are presumably chosen to
 * mirror how the probes are actually invoked — confirm against the callers.
 *
 * @param argv - curl argument vector.
 * @param opts - Probe options; only `timeoutMs` is read here.
 * @returns Attribute map with `http.url`, `http.request.method` and
 *   `process.timeout_ms`.
 */
function getCurlProbeTraceAttributes(argv: string[], opts: CurlProbeOptions): Record<string, unknown> {
  const target = argv[argv.length - 1] || "";

  let verb = "POST";
  for (let i = 0; i < argv.length; i += 1) {
    if (argv[i] !== "-X" && argv[i] !== "--request") {
      continue;
    }
    // Only the first method flag is considered, even if its value is missing.
    const requested = argv[i + 1];
    if (requested) {
      verb = requested.toUpperCase();
    }
    break;
  }

  const timeoutMs = opts.timeoutMs ?? 30_000;

  return {
    "http.url": sanitizeCurlUrl(target),
    "http.request.method": verb,
    "process.timeout_ms": timeoutMs,
  };
}

/**
 * Record the outcome of a curl invocation as a `curl_result` trace event.
 *
 * @param attributes - Outcome fields to attach to the event.
 */
function emitCurlResultTraceEvent(attributes: Record<string, unknown>): void {
  const eventName = "curl_result";
  addTraceEvent(eventName, attributes);
}

export function summarizeCurlFailure(curlStatus = 0, stderr = "", body = ""): string {
const detail = compactText(stderr || body);
return detail
Expand Down Expand Up @@ -130,6 +162,12 @@ export function summarizeProbeFailure(body = "", status = 0, curlStatus = 0, std
}

/**
 * Run a curl probe inside a `nemoclaw.inference.curl_probe` trace span.
 *
 * The span attributes are derived from `argv` and `opts`; the probe itself is
 * delegated to `runCurlProbeImpl`.
 *
 * @param argv - curl argument vector.
 * @param opts - Probe options; defaults to an empty object.
 * @returns The value `withTraceSpan` returns for the probe callback.
 */
export function runCurlProbe(argv: string[], opts: CurlProbeOptions = {}): CurlProbeResult {
  const spanAttributes = getCurlProbeTraceAttributes(argv, opts);
  const executeProbe = (): CurlProbeResult => runCurlProbeImpl(argv, opts);
  return withTraceSpan("nemoclaw.inference.curl_probe", spanAttributes, executeProbe);
}

function runCurlProbeImpl(argv: string[], opts: CurlProbeOptions = {}): CurlProbeResult {
const bodyFile = secureTempFile("nemoclaw-curl-probe", ".json");
try {
const args = [...argv];
Expand All @@ -154,17 +192,19 @@ export function runCurlProbe(argv: string[], opts: CurlProbeOptions = {}): CurlP
const errorMessage = compactText(
`${result.error.message || String(result.error)} ${String(result.stderr || "")}`,
);
return {
const failure = {
ok: false,
httpStatus: 0,
curlStatus: errorCode,
body,
stderr: errorMessage,
message: summarizeProbeFailure(body, 0, errorCode, errorMessage),
};
emitCurlResultTraceEvent({ ok: false, http_status: 0, curl_status: errorCode });
return failure;
}
const status = Number(String(result.stdout || "").trim());
return {
const probeResult = {
ok: result.status === 0 && status >= 200 && status < 300,
httpStatus: Number.isFinite(status) ? status : 0,
curlStatus: result.status || 0,
Expand All @@ -177,9 +217,15 @@ export function runCurlProbe(argv: string[], opts: CurlProbeOptions = {}): CurlP
String(result.stderr || ""),
),
};
emitCurlResultTraceEvent({
ok: probeResult.ok,
http_status: probeResult.httpStatus,
curl_status: probeResult.curlStatus,
});
return probeResult;
} catch (error) {
const detail = error instanceof Error ? error.message : String(error);
return {
const probeResult = {
ok: false,
httpStatus: 0,
curlStatus:
Expand All @@ -191,6 +237,8 @@ export function runCurlProbe(argv: string[], opts: CurlProbeOptions = {}): CurlP
detail,
),
};
emitCurlResultTraceEvent({ ok: false, http_status: 0, curl_status: probeResult.curlStatus });
return probeResult;
} finally {
cleanupTempDir(bodyFile, "nemoclaw-curl-probe");
}
Expand Down Expand Up @@ -218,6 +266,17 @@ function hasChatCompletionsStreamingData(body: string): boolean {
/**
 * Run the chat-completions streaming probe inside a
 * `nemoclaw.inference.curl_streaming_probe` trace span.
 *
 * The span attributes are derived from `argv` and `opts`; the probe itself is
 * delegated to `runChatCompletionsStreamingProbeImpl`.
 *
 * @param argv - curl argument vector.
 * @param opts - Probe options; defaults to an empty object.
 * @returns The value `withTraceSpan` returns for the probe callback.
 */
export function runChatCompletionsStreamingProbe(
  argv: string[],
  opts: CurlProbeOptions = {},
): CurlProbeResult {
  const spanAttributes = getCurlProbeTraceAttributes(argv, opts);
  const executeProbe = (): CurlProbeResult => runChatCompletionsStreamingProbeImpl(argv, opts);
  return withTraceSpan("nemoclaw.inference.curl_streaming_probe", spanAttributes, executeProbe);
}

function runChatCompletionsStreamingProbeImpl(
argv: string[],
opts: CurlProbeOptions = {},
): CurlProbeResult {
const bodyFile = secureTempFile("nemoclaw-chat-streaming-probe", ".sse");
try {
Expand Down Expand Up @@ -247,6 +306,7 @@ export function runChatCompletionsStreamingProbe(
const errorMessage = compactText(
`${result.error.message || String(result.error)} ${String(result.stderr || "")}`,
);
emitCurlResultTraceEvent({ ok: false, http_status: 0, curl_status: errorCode });
return {
ok: false,
httpStatus: 0,
Expand All @@ -262,6 +322,7 @@ export function runChatCompletionsStreamingProbe(
const hasStreamingData = hasChatCompletionsStreamingData(body);
const httpOk = Number.isFinite(status) && status >= 200 && status < 300;
if (httpOk && hasStreamingData && (curlStatus === 0 || curlStatus === 28)) {
emitCurlResultTraceEvent({ ok: true, http_status: status, curl_status: curlStatus });
return {
ok: true,
httpStatus: status,
Expand All @@ -276,6 +337,11 @@ export function runChatCompletionsStreamingProbe(
httpOk && !hasStreamingData
? `HTTP ${status}: chat completions stream did not return SSE data`
: summarizeProbeFailure(body, status || 0, curlStatus, String(result.stderr || ""));
emitCurlResultTraceEvent({
ok: false,
http_status: Number.isFinite(status) ? status : 0,
curl_status: curlStatus,
});
return {
ok: false,
httpStatus: Number.isFinite(status) ? status : 0,
Expand All @@ -286,17 +352,16 @@ export function runChatCompletionsStreamingProbe(
};
} catch (error) {
const detail = error instanceof Error ? error.message : String(error);
const curlStatus =
typeof error === "object" && error && "status" in error ? Number(error.status) || 1 : 1;
emitCurlResultTraceEvent({ ok: false, http_status: 0, curl_status: curlStatus });
return {
ok: false,
httpStatus: 0,
curlStatus:
typeof error === "object" && error && "status" in error ? Number(error.status) || 1 : 1,
curlStatus,
body: "",
stderr: detail,
message: summarizeCurlFailure(
typeof error === "object" && error && "status" in error ? Number(error.status) || 1 : 1,
detail,
),
message: summarizeCurlFailure(curlStatus, detail),
};
} finally {
cleanupTempDir(bodyFile, "nemoclaw-chat-streaming-probe");
Expand All @@ -323,6 +388,17 @@ const REQUIRED_STREAMING_EVENTS = ["response.output_text.delta"];
/**
 * Run the streaming-event probe inside a
 * `nemoclaw.inference.curl_streaming_event_probe` trace span.
 *
 * The span attributes are derived from `argv` and `opts`; the probe itself is
 * delegated to `runStreamingEventProbeImpl`.
 *
 * @param argv - curl argument vector.
 * @param opts - Probe options; defaults to an empty object.
 * @returns The value `withTraceSpan` returns for the probe callback.
 */
export function runStreamingEventProbe(
  argv: string[],
  opts: CurlProbeOptions = {},
): StreamingProbeResult {
  const spanAttributes = getCurlProbeTraceAttributes(argv, opts);
  const executeProbe = (): StreamingProbeResult => runStreamingEventProbeImpl(argv, opts);
  return withTraceSpan(
    "nemoclaw.inference.curl_streaming_event_probe",
    spanAttributes,
    executeProbe,
  );
}

function runStreamingEventProbeImpl(
argv: string[],
opts: CurlProbeOptions = {},
): StreamingProbeResult {
const bodyFile = secureTempFile("nemoclaw-streaming-probe", ".sse");
try {
Expand All @@ -347,6 +423,11 @@ export function runStreamingEventProbe(
const detail = result.error
? String(result.error.message || result.error)
: String(result.stderr || "");
emitCurlResultTraceEvent({
ok: false,
missing_events_count: REQUIRED_STREAMING_EVENTS.length,
curl_status: result.status ?? 1,
});
return {
ok: false,
missingEvents: REQUIRED_STREAMING_EVENTS,
Expand All @@ -366,6 +447,11 @@ export function runStreamingEventProbe(

const missing = REQUIRED_STREAMING_EVENTS.filter((e) => !eventTypes.has(e));
if (missing.length > 0) {
emitCurlResultTraceEvent({
ok: false,
missing_events_count: missing.length,
curl_status: result.status ?? 0,
});
return {
ok: false,
missingEvents: missing,
Expand All @@ -375,9 +461,17 @@ export function runStreamingEventProbe(
};
}

emitCurlResultTraceEvent({ ok: true, missing_events_count: 0, curl_status: result.status ?? 0 });
return { ok: true, missingEvents: [], message: "" };
} catch (error) {
const detail = error instanceof Error ? error.message : String(error);
const curlStatus =
typeof error === "object" && error && "status" in error ? Number(error.status) || 1 : 1;
emitCurlResultTraceEvent({
ok: false,
missing_events_count: REQUIRED_STREAMING_EVENTS.length,
curl_status: curlStatus,
});
return {
ok: false,
missingEvents: REQUIRED_STREAMING_EVENTS,
Expand Down
Loading
Loading