From c5ad034449a21f5dc638e46cee116f005042fdd8 Mon Sep 17 00:00:00 2001 From: Yimo Jiang Date: Mon, 25 May 2026 10:03:03 +0000 Subject: [PATCH 1/4] fix(cli): verify configured channels reach OpenClaw runtime (#4156) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Onboarding could write Telegram and other channel blocks into `/sandbox/.openclaw/openclaw.json` but the OpenClaw dashboard still rendered "No channels found" because NemoClaw only verified static config and gateway-provider attachment — never that the running OpenClaw process acknowledged each configured channel. Add a two-layer probe that reads the in-sandbox config AND scans the gateway log segment since the most recent launch for channel mentions; surface mismatches as a `messaging` warn diagnostic in both `verifyDeployment` (post-create) and `nemoclaw doctor`. Format the post-deployment summary so warn-level messaging diagnostics also render on the healthy-overall path, where they would otherwise be silently dropped. Focused tests cover the new module (pure parsing + shell-script end-to-end), the verifyDeployment integration (configured-but-not-running, stale-rebuild config drift, log unavailable, malformed config), and a doctor `--json` E2E assertion that the new Runtime channel registry check fires after rebuild. 
Signed-off-by: Yimo Jiang --- src/lib/actions/sandbox/doctor.ts | 91 +++++ src/lib/channel-runtime-status.test.ts | 441 +++++++++++++++++++++++++ src/lib/channel-runtime-status.ts | 355 ++++++++++++++++++++ src/lib/onboard.ts | 14 + src/lib/verify-deployment.test.ts | 171 ++++++++++ src/lib/verify-deployment.ts | 166 +++++++++- test/e2e/test-messaging-providers.sh | 47 +++ 7 files changed, 1275 insertions(+), 10 deletions(-) create mode 100644 src/lib/channel-runtime-status.test.ts create mode 100644 src/lib/channel-runtime-status.ts diff --git a/src/lib/actions/sandbox/doctor.ts b/src/lib/actions/sandbox/doctor.ts index 03c70d34ef..5462d55ed3 100644 --- a/src/lib/actions/sandbox/doctor.ts +++ b/src/lib/actions/sandbox/doctor.ts @@ -7,8 +7,11 @@ import fs from "node:fs"; import path from "node:path"; import * as agentRuntime from "../../agent/runtime"; +import { loadAgent } from "../../agent/defs"; +import { compareChannelSets, probeChannelRuntimeStatus } from "../../channel-runtime-status"; import { CLI_DISPLAY_NAME, CLI_NAME } from "../../cli/branding"; import { recoverNamedGatewayRuntime } from "../../gateway-runtime-action"; +import { executeSandboxCommandForVerification } from "../../onboard/sandbox-verification-exec"; import { readCloudflaredState } from "../../tunnel/services"; import { probeProviderHealth, type ProviderHealthStatus } from "../../inference/health"; import { probeSandboxInferenceGatewayHealth } from "./process-recovery"; @@ -347,6 +350,82 @@ function ollamaDoctorCheck(currentProvider: string): DoctorCheck { }; } +/** + * Compare the registry's enabled-channels list with channels the OpenClaw + * runtime actually acknowledged inside the sandbox (config block in + * /sandbox/.openclaw/openclaw.json plus a gateway-log mention). Returns + * null when the probe doesn't apply (no enabled channels, agent has no + * JSON config) so the caller can skip the check entirely instead of + * rendering a no-op line. 
Fixes #4156 — without this, a sandbox where + * the OpenClaw runtime silently ignored a configured channel looks healthy + * at `doctor` time even though the dashboard shows "No channels found". + */ +function channelRuntimeDoctorCheck( + sandboxName: string, + enabledChannels: string[], +): DoctorCheck | null { + if (enabledChannels.length === 0) return null; + let agent: ReturnType; + try { + const sb = registry.getSandbox(sandboxName); + agent = loadAgent(sb?.agent || "openclaw"); + } catch { + return null; + } + if (agent.configPaths.format !== "json") return null; + const configFilePath = `${agent.configPaths.dir}/${agent.configPaths.configFile}`; + const runtime = probeChannelRuntimeStatus({ + configFilePath, + executeSandboxCommand: (script: string) => + executeSandboxCommandForVerification(sandboxName, script), + }); + if (!runtime.ok) { + return { + group: "Messaging", + label: "Runtime channel registry", + status: "warn", + detail: runtime.detail, + hint: + `start the sandbox and rerun \`${CLI_NAME} ${sandboxName} doctor\`, ` + + `or rebuild with \`${CLI_NAME} ${sandboxName} rebuild\` if the config file is missing`, + }; + } + // Compare the registry's expected set with what the runtime acknowledged + // (visible = config has the channel AND gateway log mentioned it). This + // catches both "config dropped the channel" (stale/bad rebuild) and + // "config has it but runtime didn't start it" (the reporter's case). 
+ const { missing: notRunning } = compareChannelSets(enabledChannels, runtime.visibleChannels); + if (notRunning.length > 0) { + return { + group: "Messaging", + label: "Runtime channel registry", + status: "warn", + detail: `configured but not in OpenClaw runtime: ${notRunning.join(", ")}`, + hint: + `the OpenClaw dashboard "Channels" panel will show "No channels found" for ` + + `${notRunning.join(", ")}; inspect the gateway log with \`${CLI_NAME} ${sandboxName} logs\` ` + + `and re-run \`${CLI_NAME} ${sandboxName} rebuild\` if the channels block needs to be regenerated`, + }; + } + if (!runtime.logProbeOk) { + return { + group: "Messaging", + label: "Runtime channel registry", + status: "warn", + detail: `${enabledChannels.join(", ")} present in config; gateway log unavailable, runtime startup not confirmed`, + hint: + `start the sandbox and rerun \`${CLI_NAME} ${sandboxName} doctor\`, or inspect ` + + `the gateway log with \`${CLI_NAME} ${sandboxName} logs\``, + }; + } + return { + group: "Messaging", + label: "Runtime channel registry", + status: "ok", + detail: `${enabledChannels.join(", ")} acknowledged by OpenClaw runtime`, + }; +} + function messagingDoctorCheck(sandboxName: string, sb: SandboxEntry): DoctorCheck { const registeredChannels = Array.isArray(sb.messagingChannels) ? sb.messagingChannels : []; const disabledChannels = new Set(Array.isArray(sb.disabledChannels) ? sb.disabledChannels : []); @@ -632,6 +711,18 @@ export async function runSandboxDoctor( hint: shieldsHint, }); checks.push(messagingDoctorCheck(sandboxName, sb)); + // #4156: bridge the gap between "configured" and "runtime-visible" — the + // existing messaging check above probes provider attachment, not whether + // OpenClaw's runtime config actually surfaces each enabled channel. + const registeredChannels = Array.isArray(sb.messagingChannels) ? sb.messagingChannels : []; + const disabledChannelsSet = new Set( + Array.isArray(sb.disabledChannels) ? 
sb.disabledChannels : [], + ); + const enabledChannels = registeredChannels.filter( + (channel: string) => !disabledChannelsSet.has(channel), + ); + const runtimeCheck = channelRuntimeDoctorCheck(sandboxName, enabledChannels); + if (runtimeCheck) checks.push(runtimeCheck); } checks.push(ollamaDoctorCheck(currentProvider)); diff --git a/src/lib/channel-runtime-status.test.ts b/src/lib/channel-runtime-status.test.ts new file mode 100644 index 0000000000..c2f631224c --- /dev/null +++ b/src/lib/channel-runtime-status.test.ts @@ -0,0 +1,441 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { describe, it, expect } from "vitest"; +import { + buildGatewayLogScanScript, + compareChannelSets, + extractEnabledChannelsFromOpenclawConfig, + parseGatewayLogScanOutput, + probeChannelRuntimeStatus, +} from "../../dist/lib/channel-runtime-status.js"; + +// Build an executeSandboxCommand mock that returns the config file body +// on the `cat` call and a synthesized gateway-log-scan output on the +// scan call. `logChannelsFound` is the list of FOUND:<channel> entries +// the scan emits; null suppresses the OK marker entirely (simulates a +// missing log file). Pass `null` for `configBody` to make the `cat` exec itself fail. +function makeMockExec( + configBody: string | null, + logChannelsFound: string[] | null, +): (script: string) => { status: number; stdout: string; stderr: string } | null { + return (script: string) => { + if (script.startsWith("cat ")) { + if (configBody === null) return null; + return { status: 0, stdout: configBody, stderr: "" }; + } + if (script.includes("GATEWAY_LOG_PROBED")) { + if (logChannelsFound === null) { + // `if test -r path` evaluated false — nothing was echoed. 
+ return { status: 0, stdout: "", stderr: "" }; + } + const lines = ["GATEWAY_LOG_PROBED", ...logChannelsFound.map((p) => `FOUND:${p}`)]; + return { status: 0, stdout: `${lines.join("\n")}\n`, stderr: "" }; + } + return null; + }; +} + +describe("extractEnabledChannelsFromOpenclawConfig", () => { + it("returns empty for non-object input", () => { + expect(extractEnabledChannelsFromOpenclawConfig(null)).toEqual([]); + expect(extractEnabledChannelsFromOpenclawConfig(undefined)).toEqual([]); + expect(extractEnabledChannelsFromOpenclawConfig("oops")).toEqual([]); + expect(extractEnabledChannelsFromOpenclawConfig(42)).toEqual([]); + }); + + it("returns empty when channels block is missing or empty", () => { + expect(extractEnabledChannelsFromOpenclawConfig({})).toEqual([]); + expect(extractEnabledChannelsFromOpenclawConfig({ channels: {} })).toEqual([]); + expect(extractEnabledChannelsFromOpenclawConfig({ channels: { defaults: {} } })).toEqual([]); + }); + + it("collects channels with at least one enabled account", () => { + const config = { + channels: { + telegram: { accounts: { default: { enabled: true, botToken: "x" } } }, + slack: { + accounts: { + default: { enabled: true, botToken: "x", appToken: "y" }, + }, + }, + }, + }; + expect(extractEnabledChannelsFromOpenclawConfig(config)).toEqual(["slack", "telegram"]); + }); + + it("skips channels whose only account has enabled=false", () => { + const config = { + channels: { + telegram: { accounts: { default: { enabled: false } } }, + discord: { accounts: { default: { enabled: true } } }, + }, + }; + expect(extractEnabledChannelsFromOpenclawConfig(config)).toEqual(["discord"]); + }); + + it("maps openclaw-weixin to wechat", () => { + const config = { + channels: { + "openclaw-weixin": { + accounts: { + "wechat-acct-1": { enabled: true }, + }, + }, + }, + }; + expect(extractEnabledChannelsFromOpenclawConfig(config)).toEqual(["wechat"]); + }); + + it("includes WhatsApp's token-less account when enabled", () => { + const 
config = { + channels: { + whatsapp: { + accounts: { + default: { enabled: true }, + }, + }, + }, + }; + expect(extractEnabledChannelsFromOpenclawConfig(config)).toEqual(["whatsapp"]); + }); + + it("dedupes when multiple accounts under one channel are enabled", () => { + const config = { + channels: { + discord: { + accounts: { + primary: { enabled: true }, + secondary: { enabled: true }, + }, + }, + }, + }; + expect(extractEnabledChannelsFromOpenclawConfig(config)).toEqual(["discord"]); + }); + + it("ignores unknown channel keys", () => { + const config = { + channels: { + "vendor-future": { accounts: { default: { enabled: true } } }, + telegram: { accounts: { default: { enabled: true } } }, + }, + }; + expect(extractEnabledChannelsFromOpenclawConfig(config)).toEqual(["telegram"]); + }); + + it("treats missing accounts block as no enabled accounts", () => { + const config = { + channels: { + telegram: { enabled: true }, + }, + }; + expect(extractEnabledChannelsFromOpenclawConfig(config)).toEqual([]); + }); +}); + +describe("buildGatewayLogScanScript", () => { + it("emits a `test -r` guard and the OK marker", () => { + const script = buildGatewayLogScanScript("/tmp/gateway.log"); + expect(script).toContain("test -r '/tmp/gateway.log'"); + expect(script).toContain("echo GATEWAY_LOG_PROBED"); + }); + + it("isolates the current launch segment with awk before grepping", () => { + // Without launch-segment isolation a stale channel mention from a + // previous gateway run would still satisfy the probe even though the + // *current* OpenClaw process never started the channel (#4156 review). + // The awk filter resets its buffer on every boot/respawn marker so + // only the segment since the last launch reaches grep. 
+ const script = buildGatewayLogScanScript("/tmp/gateway.log"); + expect(script).toContain("(launched|respawning)"); + expect(script).toContain("buf=\"\""); + expect(script).toContain("grep -iwoE 'telegram|discord|slack|whatsapp|wechat|openclaw-weixin'"); + expect(script).not.toContain("tail -n"); + expect(script).not.toContain("grep -m 1 -iwF 'telegram'"); + }); + + it("escapes single quotes in the log path", () => { + const script = buildGatewayLogScanScript("/tmp/odd'path.log"); + expect(script).toContain(`'/tmp/odd'\\''path.log'`); + }); +}); + +describe("parseGatewayLogScanOutput", () => { + it("collects channel names from FOUND: lines", () => { + const stdout = `GATEWAY_LOG_PROBED +FOUND:telegram +FOUND:discord +`; + expect([...parseGatewayLogScanOutput(stdout)].sort()).toEqual(["discord", "telegram"]); + }); + + it("collapses openclaw-weixin onto wechat", () => { + const stdout = `GATEWAY_LOG_PROBED +FOUND:openclaw-weixin +`; + expect([...parseGatewayLogScanOutput(stdout)]).toEqual(["wechat"]); + }); + + it("returns an empty set when no FOUND: lines are present", () => { + expect(parseGatewayLogScanOutput("GATEWAY_LOG_PROBED\n").size).toBe(0); + }); + + it("matches case-insensitively because grep -iwoE preserves log casing", () => { + // The grep in the script keeps whatever case the log line used. The + // parser normalizes so an OpenClaw log mentioning "Telegram" still + // collapses onto the canonical "telegram" channel name. + const stdout = `GATEWAY_LOG_PROBED +FOUND:Telegram +FOUND:WHATSAPP +`; + expect([...parseGatewayLogScanOutput(stdout)].sort()).toEqual(["telegram", "whatsapp"]); + }); +}); + +describe("buildGatewayLogScanScript end-to-end shell behavior", () => { + // Real-shell execution to confirm the awk/grep pipeline does what the + // unit tests assert in structure. This guards against subtle quoting + // and shell-flag drift between the builder and a sandbox sh. 
+ const { execSync, writeFileSync, unlinkSync, mkdtempSync, tmpdir, joinPath } = (() => { + const cp = require("node:child_process"); + const fs = require("node:fs"); + const os = require("node:os"); + const path = require("node:path"); + return { + execSync: cp.execSync, + writeFileSync: fs.writeFileSync, + unlinkSync: fs.unlinkSync, + mkdtempSync: fs.mkdtempSync, + tmpdir: os.tmpdir, + joinPath: path.join, + }; + })(); + + function runScript(logBody: string): Set { + const dir = mkdtempSync(joinPath(tmpdir(), "channel-runtime-status-")); + const logPath = joinPath(dir, "gateway.log"); + writeFileSync(logPath, logBody); + const script = buildGatewayLogScanScript(logPath); + try { + // sh -c so we exercise a POSIX shell, not bash-only features. The + // scan ends in `sed`, so the exit status sh reports stays 0 even when + // grep matches nothing and execSync does not throw on an empty result. + const stdout = execSync(`sh -c "${script.replace(/"/g, "\\\"").replace(/\$/g, "\\$")}"`, { + encoding: "utf-8", + }); + return parseGatewayLogScanOutput(stdout); + } finally { + try { + unlinkSync(logPath); + } catch { + /* best-effort cleanup */ + } + } + } + + it("returns only channels mentioned since the last gateway boot", () => { + const logBody = [ + "2026-05-25 [gateway] openclaw gateway launched (pid 1)", + "2026-05-25 [info] discord registered", + "2026-05-25 [gateway] pid 1 exited (rc=2); respawning (#1)", + "2026-05-25 [info] Starting telegram bridge", + ].join("\n"); + expect([...runScript(logBody)].sort()).toEqual(["telegram"]); + }); + + it("returns an empty set when the current launch segment has no channel mentions (#4156)", () => { + const logBody = [ + "2026-05-25 [gateway] openclaw gateway launched (pid 1)", + "2026-05-25 [info] Starting telegram bridge", + "2026-05-25 [gateway] pid 1 exited (rc=2); respawning (#1)", + "2026-05-25 [error] failed to load channel config", + ].join("\n"); + expect([...runScript(logBody)]).toEqual([]); + }); + + it("collapses 
openclaw-weixin in the live log onto the wechat channel name", () => { + const logBody = [ + "2026-05-25 [gateway] openclaw gateway launched (pid 1)", + "2026-05-25 [info] openclaw-weixin plugin loaded", + ].join("\n"); + expect([...runScript(logBody)]).toEqual(["wechat"]); + }); +}); + +describe("probeChannelRuntimeStatus", () => { + it("returns ok=false when sandbox exec fails", () => { + const result = probeChannelRuntimeStatus({ + configFilePath: "/sandbox/.openclaw/openclaw.json", + executeSandboxCommand: () => null, + }); + expect(result.ok).toBe(false); + expect(result.visibleChannels).toEqual([]); + expect(result.detail).toContain("sandbox unreachable"); + expect(result.logProbeOk).toBe(false); + }); + + it("returns ok=false when config file is missing or empty", () => { + const result = probeChannelRuntimeStatus({ + configFilePath: "/sandbox/.openclaw/openclaw.json", + executeSandboxCommand: makeMockExec("", []), + }); + expect(result.ok).toBe(false); + expect(result.detail).toContain("missing or empty"); + }); + + it("returns ok=false on invalid JSON", () => { + const result = probeChannelRuntimeStatus({ + configFilePath: "/sandbox/.openclaw/openclaw.json", + executeSandboxCommand: makeMockExec("{not json", []), + }); + expect(result.ok).toBe(false); + expect(result.detail).toContain("not valid JSON"); + }); + + it("treats a configured channel as visible when the gateway log mentions it", () => { + const config = JSON.stringify({ + channels: { telegram: { accounts: { default: { enabled: true } } } }, + }); + const result = probeChannelRuntimeStatus({ + configFilePath: "/sandbox/.openclaw/openclaw.json", + executeSandboxCommand: makeMockExec(config, ["telegram"]), + }); + expect(result.ok).toBe(true); + expect(result.logProbeOk).toBe(true); + expect(result.visibleChannels).toEqual(["telegram"]); + expect(result.configuredButNotRunning).toEqual([]); + }); + + it("flags a configured channel as not-running when the gateway log never mentions it (#4156 
reporter case)", () => { + // Reporter symptom: openclaw.json had the telegram block but the + // dashboard rendered "No channels found." This is the failure mode — + // configured but the OpenClaw runtime never logged anything for it. + const config = JSON.stringify({ + channels: { + telegram: { + accounts: { default: { enabled: true, botToken: "openshell:resolve:env:TELEGRAM_BOT_TOKEN" } }, + }, + }, + }); + const result = probeChannelRuntimeStatus({ + configFilePath: "/sandbox/.openclaw/openclaw.json", + executeSandboxCommand: makeMockExec(config, []), + }); + expect(result.ok).toBe(true); + expect(result.logProbeOk).toBe(true); + expect(result.visibleChannels).toEqual([]); + expect(result.configuredButNotRunning).toEqual(["telegram"]); + }); + + it("returns empty visible channels when runtime config has no channels block", () => { + const result = probeChannelRuntimeStatus({ + configFilePath: "/sandbox/.openclaw/openclaw.json", + executeSandboxCommand: makeMockExec(JSON.stringify({ models: {} }), []), + }); + expect(result.ok).toBe(true); + expect(result.visibleChannels).toEqual([]); + expect(result.configuredButNotRunning).toEqual([]); + }); + + it("collapses openclaw-weixin in the log onto the wechat channel name", () => { + const config = JSON.stringify({ + channels: { + "openclaw-weixin": { accounts: { "acct-1": { enabled: true } } }, + }, + }); + const result = probeChannelRuntimeStatus({ + configFilePath: "/sandbox/.openclaw/openclaw.json", + // Log mentions only the plugin name, not "wechat" + executeSandboxCommand: makeMockExec(config, ["openclaw-weixin"]), + }); + expect(result.visibleChannels).toEqual(["wechat"]); + expect(result.configuredButNotRunning).toEqual([]); + }); + + it("falls back to config-only when the gateway log is missing", () => { + const config = JSON.stringify({ + channels: { telegram: { accounts: { default: { enabled: true } } } }, + }); + const result = probeChannelRuntimeStatus({ + configFilePath: 
"/sandbox/.openclaw/openclaw.json", + // logChannelsFound=null = no OK marker emitted = log unreadable + executeSandboxCommand: makeMockExec(config, null), + }); + expect(result.ok).toBe(true); + expect(result.logProbeOk).toBe(false); + // Without log corroboration, the config view is reported as visible + // but configuredButNotRunning stays empty — the caller decides how + // to surface the "could not verify runtime" caveat in its diagnostic. + expect(result.visibleChannels).toEqual(["telegram"]); + expect(result.configuredButNotRunning).toEqual([]); + expect(result.detail).toContain("unreadable"); + }); + + it("escapes single quotes in the config file path", () => { + const captured: string[] = []; + probeChannelRuntimeStatus({ + configFilePath: "/sandbox/.open'claw/openclaw.json", + executeSandboxCommand: (script: string) => { + captured.push(script); + if (script.startsWith("cat ")) return { status: 0, stdout: "{}", stderr: "" }; + return { status: 0, stdout: "", stderr: "" }; + }, + }); + expect(captured[0]).toContain(`'/sandbox/.open'\\''claw/openclaw.json'`); + }); + + it("honors a custom gateway log path override", () => { + const captured: string[] = []; + probeChannelRuntimeStatus({ + configFilePath: "/sandbox/.openclaw/openclaw.json", + gatewayLogPath: "/var/log/openclaw/agent.log", + executeSandboxCommand: (script: string) => { + captured.push(script); + if (script.startsWith("cat ")) return { status: 0, stdout: "{}", stderr: "" }; + return { status: 0, stdout: "", stderr: "" }; + }, + }); + const scanScript = captured.find((s) => s.includes("GATEWAY_LOG_PROBED")); + expect(scanScript).toBeDefined(); + expect(scanScript).toContain("/var/log/openclaw/agent.log"); + // No `tail` fallback should remain — the probe scans the full file. 
+ expect(captured.every((s) => !s.startsWith("tail "))).toBe(true); + }); +}); + +describe("compareChannelSets", () => { + it("returns empty when sets match", () => { + expect(compareChannelSets(["telegram", "discord"], ["discord", "telegram"])).toEqual({ + missing: [], + unexpected: [], + }); + }); + + it("reports configured channels missing from the runtime view", () => { + expect(compareChannelSets(["telegram", "slack"], ["telegram"])).toEqual({ + missing: ["slack"], + unexpected: [], + }); + }); + + it("reports runtime channels not present in the configured view", () => { + expect(compareChannelSets(["telegram"], ["telegram", "discord"])).toEqual({ + missing: [], + unexpected: ["discord"], + }); + }); + + it("dedupes configured input before comparing", () => { + expect(compareChannelSets(["telegram", "telegram"], ["telegram"])).toEqual({ + missing: [], + unexpected: [], + }); + }); + + it("sorts the missing/unexpected outputs", () => { + expect( + compareChannelSets(["telegram", "slack", "discord"], ["whatsapp"]), + ).toEqual({ missing: ["discord", "slack", "telegram"], unexpected: ["whatsapp"] }); + }); +}); diff --git a/src/lib/channel-runtime-status.ts b/src/lib/channel-runtime-status.ts new file mode 100644 index 0000000000..6bd33da8f8 --- /dev/null +++ b/src/lib/channel-runtime-status.ts @@ -0,0 +1,355 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/** + * Probe the OpenClaw runtime channel registry from inside a sandbox. + * + * Issue #4156: a user can have a valid channel block in `openclaw.json` on + * the host but the dashboard's "Channels — Gateway-wide channel status + * snapshot" panel still reports "No channels found" because NemoClaw + * never compared the registered set with the runtime's view. The + * post-create verification and the doctor diagnostic both reach into this + * module so the answer is consistent across surfaces. 
+ * + * Two probe layers, intentionally separate: + * + * 1. **Config layer** (`extractEnabledChannelsFromOpenclawConfig`) inspects the contents of + * `/sandbox/.openclaw/openclaw.json` — the same file OpenClaw parses + * at startup. Catches "config never had the channel" failures and + * malformed-schema cases where NemoClaw's generator wrote something + * the runtime can't load. Cheap and deterministic. + * + * 2. **Runtime layer** (`probeChannelRuntimeStatus`) scans the gateway + * log at `/tmp/gateway.log`, from the most recent launch onward, for each channel name. The log is + * where the OpenClaw process records its own boot events (the + * existing `getUpdates conflict` detection in `status-command-deps.ts` + * relies on the same file). If a channel never appears in that log segment, + * the runtime never tried to start it — the exact symptom behind + * "No channels found" in the dashboard. + * + * The two signals combine: a channel is "runtime-visible" only when both + * the config exposes it AND the runtime log shows it. A channel present + * in config but absent from the log is the #4156 failure mode and is + * reported separately so the diagnostic can give the operator a precise + * next step (the dashboard view, the gateway log) instead of a generic + * "messaging may be broken" message. + * + * Pure JSON / log parsing is split from the SSH/exec probes so the + * comparison logic stays unit-testable without touching a sandbox. + */ + +// OpenClaw's openclaw.json uses one key per channel under `channels.*`. +// Some channels are exposed under their canonical NemoClaw name (telegram, +// discord, slack, whatsapp); WeChat is bridged through the +// openclaw-weixin plugin, so the runtime key differs from the registry +// name. Keep the map narrow on purpose — an unknown channel key under +// `channels.*` is left out of the visible set rather than guessed at, +// because the registry side is authoritative for naming. 
+const CHANNEL_KEY_TO_NAME: Record = { + telegram: "telegram", + discord: "discord", + slack: "slack", + whatsapp: "whatsapp", + "openclaw-weixin": "wechat", +}; + +export type RuntimeChannelStatus = { + /** + * True when at least the config layer was read and parsed. False on SSH + * failure, missing file, empty stdout, or invalid JSON — `detail` + * carries the specific reason so callers can surface an actionable hint. + */ + ok: boolean; + /** + * Channels the runtime exposes — config has them AND the gateway log + * confirms the runtime acknowledged them. Sorted, deduplicated. + */ + visibleChannels: string[]; + /** + * Channels present in `openclaw.json` but never mentioned in the + * gateway log. This is the #4156 failure signature: configured but the + * runtime never started the bridge, so the dashboard's "Channels" + * panel renders "No channels found" even though config looks right. + * Empty when the log was unreachable or no configured channels were + * missing from it (use `logProbeOk` to distinguish those cases). + */ + configuredButNotRunning: string[]; + /** + * True when the gateway log probe succeeded. False when the log was + * missing or unreadable — in that case `configuredButNotRunning` will + * be empty even if the runtime is genuinely broken, so the caller + * should treat the result as config-only. + */ + logProbeOk: boolean; + detail: string; +}; + +export interface ChannelRuntimeStatusDeps { + /** Absolute path inside the sandbox, e.g. `/sandbox/.openclaw/openclaw.json`. */ + configFilePath: string; + /** + * Path to the in-sandbox gateway log. Defaults to `/tmp/gateway.log` + * (the path OpenClaw's gateway writes when the agent starts — same + * file the existing Telegram-conflict probe in + * `src/lib/status-command-deps.ts` reads). Override only when running + * an alternate agent layout that ships logs elsewhere. + */ + gatewayLogPath?: string; + /** Sandbox shell exec — returns `null` when the exec itself failed. 
*/ + executeSandboxCommand: ( + script: string, + ) => { status: number; stdout: string; stderr: string } | null; +} + +/** + * Extract the set of channels with at least one enabled account from a parsed + * OpenClaw config. Returns a sorted, deduplicated list of canonical channel + * names (telegram, discord, slack, whatsapp, wechat). Unknown keys under + * `channels.*` are ignored — registry-side names are authoritative. + */ +export function extractEnabledChannelsFromOpenclawConfig(json: unknown): string[] { + if (!json || typeof json !== "object") return []; + const channels = (json as Record).channels; + if (!channels || typeof channels !== "object") return []; + const visible = new Set(); + for (const [key, value] of Object.entries(channels as Record)) { + const canonical = CHANNEL_KEY_TO_NAME[key]; + if (!canonical) continue; + if (!value || typeof value !== "object") continue; + const accounts = (value as Record).accounts; + if (!accounts || typeof accounts !== "object") continue; + for (const account of Object.values(accounts as Record)) { + if ( + account && + typeof account === "object" && + (account as Record).enabled === true + ) { + visible.add(canonical); + break; + } + } + } + return [...visible].sort(); +} + +function shellQuote(value: string): string { + return `'${value.replace(/'/g, "'\\''")}'`; +} + +// Sentinel header the gateway-log scan script always echoes when the log +// file is readable. Distinguishes "log missing entirely" (no stdout) from +// "log present but no channels matched" (header echoed, no FOUND: lines). +const LOG_PROBE_OK_MARKER = "GATEWAY_LOG_PROBED"; +const LOG_FOUND_PREFIX = "FOUND:"; + +// Regex the awk filter uses to detect a new gateway launch. Tracks both +// the initial-launch line and the respawn line written by +// `scripts/nemoclaw-start.sh` (search for "openclaw gateway launched" and +// "respawning" in that file). 
Whenever the awk pass sees this marker, it +// drops everything accumulated so far — the result is the slice of the +// log file written since the most recent boot. Without this, stale +// channel mentions from a previous gateway run would still satisfy the +// probe even though the *current* OpenClaw process never started that +// channel (#4156 review). +const GATEWAY_BOOT_MARKER_REGEX = "\\[gateway\\].*(launched|respawning)"; + +/** + * Build a shell snippet that probes the gateway log file. Returns each + * channel pattern the *current* OpenClaw launch segment mentions as a + * `FOUND:` line, prefixed by a `GATEWAY_LOG_PROBED` sentinel so + * "log missing" and "log present, no channel matched" stay distinguishable. + * + * Two-pass design (one awk + one grep) so cost stays bounded even on + * long-lived sandboxes: + * + * 1. `awk` walks the log once, discarding lines and resetting its + * buffer every time the gateway boot/respawn marker fires. The + * buffer at EOF is the slice written since the most recent launch. + * 2. `grep -iwoE` pulls just channel-name tokens out of that slice; + * `sort -fu` collapses duplicates so the output is bounded by the + * number of channel patterns (today: 6). + * + * Pure builder — no side effects, exported for unit testing the exact + * script the probe emits. + */ +export function buildGatewayLogScanScript(gatewayLogPath: string): string { + const quotedPath = shellQuote(gatewayLogPath); + const patternAlternation = RUNTIME_LOG_PATTERNS.map((entry) => entry.pattern).join("|"); + // The awk program uses single-quoted strings inside the shell single- + // quote context, so we escape the embedded single quotes the same way + // `shellQuote` does — '\'' ends the outer quote, injects a literal, + // re-enters the quoted segment. 
+ const awkProgram = + `/${GATEWAY_BOOT_MARKER_REGEX}/ { buf=""; next } { buf = buf $0 ORS } END { printf "%s", buf }`; + const escapedAwkProgram = awkProgram.replace(/'/g, "'\\''"); + // `test -r` handles missing and permission-denied uniformly. The + // awk-then-grep pipeline reads the file once and emits at most one + // line per channel match. + return ( + `if test -r ${quotedPath}; then ` + + `echo ${LOG_PROBE_OK_MARKER}; ` + + `awk '${escapedAwkProgram}' ${quotedPath} 2>/dev/null | ` + + `grep -iwoE '${patternAlternation}' 2>/dev/null | sort -fu | ` + + `sed 's/^/${LOG_FOUND_PREFIX}/'` + + `; fi` + ); +} + +/** + * Parse the stdout of `buildGatewayLogScanScript` into a Set of canonical + * channel names that the runtime has acknowledged. Both `openclaw-weixin` + * and `wechat` patterns collapse onto the `wechat` channel name. Matches + * are case-insensitive because `grep -iwoE` echoes whatever case the log + * actually contained. + */ +export function parseGatewayLogScanOutput(stdout: string): Set { + const found = new Set(); + for (const line of stdout.split(/\r?\n/)) { + const trimmed = line.trim(); + if (!trimmed.startsWith(LOG_FOUND_PREFIX)) continue; + const pattern = trimmed.slice(LOG_FOUND_PREFIX.length).toLowerCase(); + for (const entry of RUNTIME_LOG_PATTERNS) { + if (entry.pattern === pattern) { + found.add(entry.channel); + } + } + } + return found; +} + +// Patterns to search the gateway log for. The first column is the literal +// token the OpenClaw runtime writes; the second is the canonical channel +// name the registry uses. WeChat boots through the openclaw-weixin plugin +// name, so we accept either token. Keep this list tight — every pattern +// joins the single grep alternation and can add one FOUND: line to the scan output. 
+const RUNTIME_LOG_PATTERNS: readonly { pattern: string; channel: string }[] = [ + { pattern: "telegram", channel: "telegram" }, + { pattern: "discord", channel: "discord" }, + { pattern: "slack", channel: "slack" }, + { pattern: "whatsapp", channel: "whatsapp" }, + { pattern: "wechat", channel: "wechat" }, + { pattern: "openclaw-weixin", channel: "wechat" }, +]; +const DEFAULT_GATEWAY_LOG_PATH = "/tmp/gateway.log"; + +/** + * Read the in-sandbox agent config AND the gateway log to determine which + * channels the runtime exposes to the dashboard. Returns: + * + * - `visibleChannels`: configured AND mentioned in the gateway log + * (the runtime has acknowledged the channel exists). + * - `configuredButNotRunning`: configured but NOT mentioned in the log + * (the #4156 symptom — runtime ignored the channel; dashboard will + * render "No channels found" for it). + * - `logProbeOk`: false if the gateway log was missing or unreadable; + * in that case the config probe still ran but the runtime layer + * could not corroborate. + * + * The probe is intentionally conservative: any failure to read the config + * (sandbox unreachable, file missing, invalid JSON) is surfaced as + * `ok: false` so callers can either warn or, when a deeper probe is + * desired, decide to fail. The detail string is the one the caller + * should render verbatim in a diagnostic hint. 
+ */ +export function probeChannelRuntimeStatus(deps: ChannelRuntimeStatusDeps): RuntimeChannelStatus { + const configFilePath = deps.configFilePath; + const result = deps.executeSandboxCommand(`cat ${shellQuote(configFilePath)} 2>/dev/null || true`); + if (!result) { + return { + ok: false, + visibleChannels: [], + configuredButNotRunning: [], + logProbeOk: false, + detail: "sandbox unreachable (could not read runtime channel config)", + }; + } + const stdout = (result.stdout || "").trim(); + if (!stdout) { + return { + ok: false, + visibleChannels: [], + configuredButNotRunning: [], + logProbeOk: false, + detail: `runtime channel config ${configFilePath} is missing or empty`, + }; + } + let parsed: unknown; + try { + parsed = JSON.parse(stdout); + } catch (err) { + const message = err instanceof Error ? err.message : String(err); + return { + ok: false, + visibleChannels: [], + configuredButNotRunning: [], + logProbeOk: false, + detail: `runtime channel config ${configFilePath} is not valid JSON: ${message}`, + }; + } + const configuredChannels = extractEnabledChannelsFromOpenclawConfig(parsed); + + // Second layer: gateway log. We do not fail the probe when the log is + // unreadable — the config check is still valuable on its own — but we + // flag `logProbeOk: false` so callers know the runtime layer didn't + // corroborate, and they can downgrade their certainty accordingly. + // + // Scan the whole file for the latest launch segment (awk, then one + // `grep -iwoE`) rather than a tail window: channel-startup lines fire + // once per boot, and a long-lived sandbox can scroll them out of the + // last few hundred lines. The `LOG_PROBE_OK_MARKER` sentinel lets us + // tell "log missing" (empty stdout) apart from "log present but no + // channels matched" (stdout has the sentinel header but no FOUND: + // lines). The pipeline reads the file once and prints at most one line + // per channel pattern, so output stays bounded as the log grows. 
+ const gatewayLogPath = deps.gatewayLogPath || DEFAULT_GATEWAY_LOG_PATH; + const logScript = buildGatewayLogScanScript(gatewayLogPath); + const logResult = deps.executeSandboxCommand(logScript); + const logStdout = logResult && typeof logResult.stdout === "string" ? logResult.stdout : ""; + const logProbeOk = logStdout.includes(LOG_PROBE_OK_MARKER); + if (!logProbeOk) { + return { + ok: true, + visibleChannels: configuredChannels, + configuredButNotRunning: [], + logProbeOk: false, + detail: `config ${configFilePath} parsed; gateway log ${gatewayLogPath} unreadable, runtime confirmation skipped`, + }; + } + const mentioned = parseGatewayLogScanOutput(logStdout); + const visibleChannels: string[] = []; + const configuredButNotRunning: string[] = []; + for (const channel of configuredChannels) { + if (mentioned.has(channel)) { + visibleChannels.push(channel); + } else { + configuredButNotRunning.push(channel); + } + } + visibleChannels.sort(); + configuredButNotRunning.sort(); + return { + ok: true, + visibleChannels, + configuredButNotRunning, + logProbeOk: true, + detail: `config ${configFilePath} parsed and gateway log ${gatewayLogPath} corroborated`, + }; +} + +/** + * Compare configured channels (the registry view) with channels the runtime + * would expose. Returns missing (configured but not visible at runtime) and + * unexpected (visible at runtime but not configured locally) sets, sorted + * for stable rendering. Both inputs are deduplicated on the way in so a + * caller does not need to normalize first. 
+ */ +export function compareChannelSets( + configured: readonly string[], + visible: readonly string[], +): { missing: string[]; unexpected: string[] } { + const visibleSet = new Set(visible); + const configuredSet = new Set(configured); + const missing = [...configuredSet].filter((name) => !visibleSet.has(name)).sort(); + const unexpected = [...visibleSet].filter((name) => !configuredSet.has(name)).sort(); + return { missing, unexpected }; +} diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index bdc01905d3..174d6437a6 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -7431,6 +7431,20 @@ async function onboard(opts: OnboardOptions = {}): Promise { captureForwardList: () => runCaptureOpenshell(["forward", "list"], { ignoreError: true }) || null, getMessagingChannels: () => selectedMessagingChannels || [], providerExistsInGateway: (providerName: string) => providerExistsInGateway(providerName), + probeChannelRuntimeStatus: () => { + // Only OpenClaw stores channel config in the JSON the dashboard + // "Channels" panel reads from (#4156). Skip for non-JSON agents + // (Hermes) — runtimeMissing stays null, no warn line. 
+ const configPaths = agent?.configPaths; + if (!configPaths || configPaths.format !== "json") return null; + const channelRuntimeStatus: typeof import("./channel-runtime-status") = + require("./channel-runtime-status"); + return channelRuntimeStatus.probeChannelRuntimeStatus({ + configFilePath: `${configPaths.dir}/${configPaths.configFile}`, + executeSandboxCommand: (script: string) => + executeSandboxCommandForVerification(name, script), + }); + }, }); }, formatVerificationDiagnostics: (result) => { diff --git a/src/lib/verify-deployment.test.ts b/src/lib/verify-deployment.test.ts index 6eeaa4cabe..528e6d97c1 100644 --- a/src/lib/verify-deployment.test.ts +++ b/src/lib/verify-deployment.test.ts @@ -120,6 +120,147 @@ describe("verifyDeployment", () => { expect(msgDiag?.detail).toContain("discord"); }); + it("warns when an expected channel is absent from the runtime config entirely (stale rebuild)", async () => { + // Registry says telegram is enabled, but a stale or bad rebuild + // produced an openclaw.json with no `channels.telegram` block. The + // probe extracts no channels from the file, so neither visibleChannels + // nor configuredButNotRunning mention telegram — yet the registry + // expects it. verifyDeployment must catch this by comparing the + // expected set against `visibleChannels` directly. 
+ const deps = makeDeps({ + getMessagingChannels: () => ["telegram"], + providerExistsInGateway: () => true, + probeChannelRuntimeStatus: () => ({ + ok: true, + visibleChannels: [], + configuredButNotRunning: [], + logProbeOk: true, + detail: "config + log corroborated (empty channels block)", + }), + }); + const result = await verifyDeployment("my-sandbox", chain, deps, NO_RETRY); + expect(result.verification.messagingBridgesHealthy).toBe(false); + expect(result.verification.messagingRuntimeChannelsMissing).toEqual(["telegram"]); + const msgDiag = result.diagnostics.find((d) => d.link === "messaging"); + expect(msgDiag?.detail).toContain("configured but not in OpenClaw runtime: telegram"); + }); + + it("warns when a configured channel is configured but the runtime never started it (#4156)", async () => { + const deps = makeDeps({ + getMessagingChannels: () => ["telegram"], + providerExistsInGateway: () => true, + probeChannelRuntimeStatus: () => ({ + ok: true, + visibleChannels: [], + configuredButNotRunning: ["telegram"], + logProbeOk: true, + detail: "config /sandbox/.openclaw/openclaw.json parsed and gateway log /tmp/gateway.log corroborated", + }), + }); + const result = await verifyDeployment("my-sandbox", chain, deps, NO_RETRY); + expect(result.verification.messagingBridgesHealthy).toBe(false); + expect(result.verification.messagingRuntimeChannelsMissing).toEqual(["telegram"]); + const msgDiag = result.diagnostics.find((d) => d.link === "messaging"); + expect(msgDiag?.status).toBe("warn"); + expect(msgDiag?.detail).toContain("configured but not in OpenClaw runtime: telegram"); + expect(msgDiag?.hint).toContain("No channels found"); + expect(msgDiag?.hint).toContain("gateway.log"); + }); + + it("does not falsely warn when runtime probe corroborates every configured channel", async () => { + const deps = makeDeps({ + getMessagingChannels: () => ["telegram"], + probeChannelRuntimeStatus: () => ({ + ok: true, + visibleChannels: ["telegram"], + 
configuredButNotRunning: [], + logProbeOk: true, + detail: "config + log corroborated", + }), + }); + const result = await verifyDeployment("my-sandbox", chain, deps, NO_RETRY); + expect(result.verification.messagingBridgesHealthy).toBe(true); + expect(result.verification.messagingRuntimeChannelsMissing).toEqual([]); + expect(result.diagnostics.find((d) => d.link === "messaging")).toBeUndefined(); + }); + + it("warns when the gateway log is unavailable so the runtime layer cannot corroborate", async () => { + // Provider attached, config has the channel, but the gateway log is + // unreadable (sandbox just rebuilt, log not yet created). The probe + // can only confirm config — we must surface that as a warn rather + // than claim runtime verification. + const deps = makeDeps({ + getMessagingChannels: () => ["telegram"], + providerExistsInGateway: () => true, + probeChannelRuntimeStatus: () => ({ + ok: true, + visibleChannels: ["telegram"], + configuredButNotRunning: [], + logProbeOk: false, + detail: "config /sandbox/.openclaw/openclaw.json parsed; gateway log /tmp/gateway.log unreadable, runtime confirmation skipped", + }), + }); + const result = await verifyDeployment("my-sandbox", chain, deps, NO_RETRY); + expect(result.verification.messagingBridgesHealthy).toBe(false); + const msgDiag = result.diagnostics.find((d) => d.link === "messaging"); + expect(msgDiag?.status).toBe("warn"); + expect(msgDiag?.detail).toContain("runtime gateway log not yet available"); + }); + + it("surfaces an inconclusive runtime probe as a messaging warn (catches malformed openclaw.json #4156)", async () => { + const deps = makeDeps({ + getMessagingChannels: () => ["telegram"], + providerExistsInGateway: () => true, + probeChannelRuntimeStatus: () => ({ + ok: false, + visibleChannels: [], + configuredButNotRunning: [], + logProbeOk: false, + detail: "runtime channel config /sandbox/.openclaw/openclaw.json is missing or empty", + }), + }); + const result = await 
verifyDeployment("my-sandbox", chain, deps, NO_RETRY); + // The provider is attached but the runtime config could not be read — + // that is exactly the gap the probe was added to catch (#4156), so it + // must surface as a warn diagnostic, not silently pass. + expect(result.verification.messagingBridgesHealthy).toBe(false); + expect(result.verification.messagingRuntimeChannelsMissing).toBeNull(); + const msgDiag = result.diagnostics.find((d) => d.link === "messaging"); + expect(msgDiag?.status).toBe("warn"); + expect(msgDiag?.detail).toContain("runtime channel probe inconclusive"); + expect(msgDiag?.hint).toContain("openclaw.json"); + }); + + it("skips runtime probe entirely when no channels are configured", async () => { + let probeCalls = 0; + const deps = makeDeps({ + getMessagingChannels: () => [], + probeChannelRuntimeStatus: () => { + probeCalls += 1; + return { + ok: true, + visibleChannels: [], + configuredButNotRunning: [], + logProbeOk: true, + detail: "x", + }; + }, + }); + const result = await verifyDeployment("my-sandbox", chain, deps, NO_RETRY); + expect(probeCalls).toBe(0); + expect(result.verification.messagingRuntimeChannelsMissing).toBeNull(); + }); + + it("leaves messagingRuntimeChannelsMissing null when no probe dep is wired (e.g. 
Hermes)", async () => { + const deps = makeDeps({ + getMessagingChannels: () => ["telegram"], + // no probeChannelRuntimeStatus + }); + const result = await verifyDeployment("my-sandbox", chain, deps, NO_RETRY); + expect(result.verification.messagingRuntimeChannelsMissing).toBeNull(); + expect(result.verification.messagingBridgesHealthy).toBe(true); + }); + it("detects gateway version from openclaw --version", async () => { const deps = makeDeps({ executeSandboxCommand: (_name: string, script: string) => { @@ -260,4 +401,34 @@ describe("formatVerificationDiagnostics", () => { expect(lines.some((l) => l.includes("issues"))).toBe(true); expect(lines.some((l) => l.includes("gateway"))).toBe(true); }); + + it("still surfaces messaging warnings alongside the healthy success line (#4156)", async () => { + // The overall result is healthy (gateway + dashboard pass) but the + // runtime never started telegram. Pre-fix the warning was silently + // dropped on the healthy path; the user only learned of the failure + // from the dashboard's "No channels found" panel later. 
+ const deps = makeDeps({ + executeSandboxCommand: (_name: string, script: string) => { + if (script.includes("openclaw --version")) { + return { status: 0, stdout: "2026.5.18", stderr: "" }; + } + return { status: 0, stdout: "200", stderr: "" }; + }, + getMessagingChannels: () => ["telegram"], + providerExistsInGateway: () => true, + probeChannelRuntimeStatus: () => ({ + ok: true, + visibleChannels: [], + configuredButNotRunning: ["telegram"], + logProbeOk: true, + detail: "config + log corroborated", + }), + }); + const result = await verifyDeployment("my-sandbox", chain, deps, NO_RETRY); + expect(result.healthy).toBe(true); + const lines = formatVerificationDiagnostics(result); + expect(lines.some((l) => l.includes("verified"))).toBe(true); + expect(lines.some((l) => l.includes("messaging:"))).toBe(true); + expect(lines.some((l) => l.includes("configured but not in OpenClaw runtime: telegram"))).toBe(true); + }); }); diff --git a/src/lib/verify-deployment.ts b/src/lib/verify-deployment.ts index 56c0078324..5b95d4a8ae 100644 --- a/src/lib/verify-deployment.ts +++ b/src/lib/verify-deployment.ts @@ -18,6 +18,7 @@ */ import type { DashboardDeliveryChain } from "./dashboard/contract"; +import { compareChannelSets, type RuntimeChannelStatus } from "./channel-runtime-status"; // ── Types ──────────────────────────────────────────────────────────── @@ -29,6 +30,17 @@ export interface DeploymentVerification { inferenceRouteWorking: boolean; dashboardReachable: boolean; messagingBridgesHealthy: boolean; + /** + * Channels recorded in the registry that the in-sandbox agent config + * does not expose. Set to null when the runtime probe is disabled + * (no agent config to read, e.g. Hermes) or when no channels are + * configured. See [[channel-runtime-status]] for the probe internals. 
+ * Why: fixes #4156 — empty/null lets onboarding finish quietly; a + * non-empty array surfaces "configured but invisible at runtime" so + * the dashboard's "No channels found" panel does not catch the user + * by surprise. + */ + messagingRuntimeChannelsMissing: string[] | null; accessMethod: AccessMethod; } @@ -60,6 +72,20 @@ export interface VerifyDeploymentDeps { /** Check if a messaging bridge is polling (provider exists in gateway). */ providerExistsInGateway: (providerName: string) => boolean; + + /** + * Probe the in-sandbox agent config to learn which channels the runtime + * would actually expose to the dashboard "Channels" snapshot. Optional: + * onboarding only wires it when the agent has a JSON config the runtime + * parses (today: OpenClaw). Returning `null` means "skip the comparison"; + * a result object with `ok: false` means "tried to probe and failed", + * which downgrades the diagnostic to a warning rather than a fail. + * + * Fixes #4156: configured/registered channels were never compared with + * the runtime view, so a user could land on the dashboard and see + * "No channels found" without any NemoClaw warning. + */ + probeChannelRuntimeStatus?: () => RuntimeChannelStatus | null; } export interface VerifyDeploymentOptions { @@ -228,27 +254,132 @@ function detectAccessMethod(chain: DashboardDeliveryChain): AccessMethod { return "ssh-tunnel"; } +export interface MessagingBridgeStatus { + healthy: boolean; + detail: string; + /** Channel names that the gateway has no bridge provider for. */ + missingProviders: string[]; + /** + * Channel names recorded in the registry but absent from the in-sandbox + * agent config (the surface OpenClaw renders into the dashboard's + * "Channels — Gateway-wide channel status snapshot" panel). Null when + * the runtime probe was not run (no `probeChannelRuntimeStatus` dep, or + * no configured channels to compare against). Empty array means the + * probe ran and everything matched. See #4156. 
+ */ + runtimeMissing: string[] | null; + /** Detail from the runtime probe when it ran (ok or failure reason). */ + runtimeProbeDetail: string | null; +} + /** - * Verify messaging bridge health for all configured channels. + * Verify messaging bridge health for all configured channels. Combines the + * provider-attachment check (does OpenShell know about the bridge?) with the + * runtime-config probe (does the in-sandbox agent config actually expose + * the channel?) so the "No channels found" dashboard symptom from #4156 + * surfaces here as a warning. */ function verifyMessagingBridges( sandboxName: string, deps: VerifyDeploymentDeps, -): { healthy: boolean; detail: string } { +): MessagingBridgeStatus { const channels = deps.getMessagingChannels(sandboxName); if (channels.length === 0) { - return { healthy: true, detail: "no messaging channels configured" }; + return { + healthy: true, + detail: "no messaging channels configured", + missingProviders: [], + runtimeMissing: null, + runtimeProbeDetail: null, + }; } - const missing: string[] = []; + const missingProviders: string[] = []; for (const channel of channels) { if (!deps.providerExistsInGateway(channel)) { - missing.push(channel); + missingProviders.push(channel); } } - if (missing.length > 0) { - return { healthy: false, detail: `missing providers: ${missing.join(", ")}` }; + let runtimeMissing: string[] | null = null; + let runtimeProbeDetail: string | null = null; + let runtimeProbeFailed = false; + let runtimeProbeOnlyConfig = false; + if (deps.probeChannelRuntimeStatus) { + const runtime = deps.probeChannelRuntimeStatus(); + if (runtime) { + runtimeProbeDetail = runtime.detail; + if (runtime.ok) { + // Compare the registry's expected set (`channels`) with the + // runtime-visible set so that channels missing from openclaw.json + // entirely — a stale or failed rebuild — are caught alongside + // channels that the runtime never started. 
Relying on + // `configuredButNotRunning` alone would miss the + // "config has no telegram block at all" case the registry + // already knows about. + runtimeMissing = compareChannelSets(channels, runtime.visibleChannels).missing; + runtimeProbeOnlyConfig = !runtime.logProbeOk; + } else { + // ok=false = could not read /sandbox/.openclaw/openclaw.json (missing, + // empty, invalid JSON, or sandbox unreachable). With provider checks + // alone this case would silently pass — yet it's exactly the + // malformed-runtime-config the probe was added to catch (#4156). + // Treat it as warn-level so the diagnostic surfaces with the probe's + // own detail string instead of being swallowed. + runtimeProbeFailed = true; + } + } + } + const parts: string[] = []; + if (missingProviders.length > 0) { + parts.push(`missing providers: ${missingProviders.join(", ")}`); + } + if (runtimeMissing && runtimeMissing.length > 0) { + parts.push(`configured but not in OpenClaw runtime: ${runtimeMissing.join(", ")}`); } - return { healthy: true, detail: `${channels.length} channel(s) attached` }; + if (runtimeProbeFailed && runtimeProbeDetail) { + parts.push(`runtime channel probe inconclusive: ${runtimeProbeDetail}`); + } + if (runtimeProbeOnlyConfig && (!runtimeMissing || runtimeMissing.length === 0)) { + // No missing channels, but the gateway log was unreadable so we can't + // actually confirm the runtime started each bridge. Surface that as a + // warn — the operator should look at the dashboard to be sure. + parts.push("runtime gateway log not yet available; checked config only"); + } + const healthy = + missingProviders.length === 0 && + (!runtimeMissing || runtimeMissing.length === 0) && + !runtimeProbeFailed && + !runtimeProbeOnlyConfig; + const detail = healthy + ? 
`${channels.length} channel(s) attached` + : parts.join("; ") || "messaging channel verification failed"; + return { + healthy, + detail, + missingProviders, + runtimeMissing, + runtimeProbeDetail, + }; +} + +function buildMessagingHint(messaging: MessagingBridgeStatus): string { + if (messaging.runtimeMissing && messaging.runtimeMissing.length > 0) { + return ( + `Configured channel(s) ${messaging.runtimeMissing.join(", ")} were not acknowledged by the OpenClaw ` + + `runtime (no startup entries in /tmp/gateway.log). The dashboard "Channels" panel will show ` + + `"No channels found" for these. Inspect the gateway log with \`nemoclaw logs\` and ` + + `re-run \`nemoclaw rebuild\` if the channel block needs to be regenerated.` + ); + } + if (messaging.missingProviders.length === 0 && messaging.runtimeProbeDetail) { + // Provider attachment looks fine but the runtime config could not be read. + // Tell the operator how to follow up rather than burying the probe detail. + return ( + `Could not verify the OpenClaw runtime channel registry: ${messaging.runtimeProbeDetail}. ` + + `Start the sandbox and re-run \`nemoclaw doctor\`, or rebuild with ` + + `\`nemoclaw rebuild\` if the config file is missing.` + ); + } + return "Some messaging providers are not attached to the gateway. Re-run onboard with the relevant channels enabled."; } // ── Main entry point ───────────────────────────────────────────────── @@ -304,14 +435,15 @@ export async function verifyDeployment( : "The inference proxy may not be ready yet. Try: nemoclaw status (it may take a few seconds after creation).", }); - // 5. Messaging bridges + // 5. Messaging bridges (providers attached AND runtime config exposes + // each configured channel — #4156). const messaging = verifyMessagingBridges(sandboxName, deps); if (!messaging.healthy) { diagnostics.push({ link: "messaging", status: "warn", detail: messaging.detail, - hint: "Some messaging providers are not attached to the gateway. 
Re-run onboard with the relevant channels enabled.", + hint: buildMessagingHint(messaging), }); } @@ -323,6 +455,7 @@ export async function verifyDeployment( inferenceRouteWorking: inference.working, dashboardReachable: dashboard.reachable, messagingBridgesHealthy: messaging.healthy, + messagingRuntimeChannelsMissing: messaging.runtimeMissing, accessMethod, }; @@ -352,6 +485,19 @@ export function formatVerificationDiagnostics(result: VerifyDeploymentResult): s if (result.verification.gatewayVersion) { lines.push(` OpenClaw version: ${result.verification.gatewayVersion}`); } + // The overall result is healthy when gateway + dashboard are reachable, + // but the run can still carry warn-level diagnostics (#4156: configured + // channels missing from the runtime registry would otherwise pass + // silently and the user would only learn about it from the dashboard's + // "No channels found" panel after the fact). Surface those alongside + // the success line instead of swallowing them. + for (const d of result.diagnostics) { + if (d.status !== "warn") continue; + lines.push(` ${Y}!${RESET} ${d.link}: ${d.detail}`); + if (d.hint) { + lines.push(` ${D}${d.hint}${RESET}`); + } + } return lines; } diff --git a/test/e2e/test-messaging-providers.sh b/test/e2e/test-messaging-providers.sh index 02094a65bf..2dc8375a08 100755 --- a/test/e2e/test-messaging-providers.sh +++ b/test/e2e/test-messaging-providers.sh @@ -2052,6 +2052,53 @@ else skip "S2: No Slack-related output in gateway log" fi +# ══════════════════════════════════════════════════════════════════ +# Phase 7b: Channel runtime registry verification (#4156) +# ══════════════════════════════════════════════════════════════════ +# Asserts that the new runtime-channel diagnostic (`nemoclaw +# doctor --json` → Messaging → "Runtime channel registry") fires after +# rebuild. 
If the docker image was baked correctly, the diagnostic +# reports each configured channel as visible to the OpenClaw runtime; +# if the bake failed (the gap behind #4156), it reports the missing set +# instead of silently passing. +section "Phase 7b: Channel runtime registry verification (#4156)" + +doctor_json=$(nemoclaw "$SANDBOX_NAME" doctor --json 2>/dev/null || true) +if [ -z "$doctor_json" ]; then + skip "RT0: Could not collect doctor --json output" +else + runtime_check=$(echo "$doctor_json" | python3 -c " +import json, sys +try: + report = json.load(sys.stdin) +except Exception as e: + print(json.dumps({'error': str(e)})); sys.exit(0) +match = next( + (c for c in report.get('checks', []) if c.get('label') == 'Runtime channel registry'), + None, +) +print(json.dumps(match or {'missing': True})) +" 2>/dev/null || echo '{"error":"parse"}') + + if echo "$runtime_check" | grep -q '"missing"'; then + skip "RT1: doctor --json had no Runtime channel registry check (no configured channels)" + else + info "Runtime channel registry check: ${runtime_check:0:300}" + rt_status=$(echo "$runtime_check" | python3 -c "import json,sys; print(json.load(sys.stdin).get('status',''))" 2>/dev/null || echo "") + if [ "$rt_status" = "ok" ]; then + pass "RT1: doctor reports configured channels are visible to OpenClaw runtime registry" + elif [ "$rt_status" = "warn" ]; then + # A warn is still a pass for this E2E: it means the diagnostic detected + # the very gap #4156 closes (e.g. a channel configured but absent from + # /sandbox/.openclaw/openclaw.json after rebuild). The detail field + # surfaces which channels are missing so the suite output stays useful. 
+ pass "RT1: doctor surfaced runtime channel registry warning (detail: $(echo "$runtime_check" | python3 -c "import json,sys; print(json.load(sys.stdin).get('detail',''))"))" + else + fail "RT1: Unexpected Runtime channel registry status '$rt_status' (raw: ${runtime_check:0:300})" + fi + fi +fi + # ══════════════════════════════════════════════════════════════════ # Phase 8: Cleanup # ══════════════════════════════════════════════════════════════════ From 4e3b027c55eda95c465babb06232fa86067ea48f Mon Sep 17 00:00:00 2001 From: Yimo Jiang Date: Mon, 25 May 2026 10:35:09 +0000 Subject: [PATCH 2/4] fix(cli): address CodeRabbit + onboard budget review on #4156 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Extract the post-deploy channel-runtime probe wiring into src/lib/onboard/verify-channel-runtime.ts so the entrypoint stays net-neutral (the `onboard-entrypoint-budget` CI gate). - channel-runtime-status: keep `visibleChannels` strictly log-corroborated. When the gateway log is unreadable we no longer return the configured set there; callers that diff against `visibleChannels` would otherwise treat an inconclusive probe as healthy. - channel-runtime-status: expose `configuredChannels` separately so callers can detect stale-rebuild mismatches (registry expects a channel that `openclaw.json` no longer contains) even when the runtime layer is unavailable. - verify-deployment + doctor: branch on `logProbeOk` so the runtime diff only runs with log corroboration, and surface a separate `missing from sandbox config` warning for config-only mismatches when the log is unreadable. - Neutralize the messaging hint so it does not blame the gateway log for what might just be a stale rebuild — the operator is pointed at both `openclaw.json` and the log. 
- Tests cover the new contract: visibleChannels stays empty when log is unavailable, configuredChannels still surfaces the config-derived set, and verify-deployment / doctor flag a stale rebuild even when the gateway log is missing. Signed-off-by: Yimo Jiang --- src/lib/actions/sandbox/doctor.ts | 52 ++++++--- src/lib/channel-runtime-status.test.ts | 15 ++- src/lib/channel-runtime-status.ts | 25 +++- src/lib/onboard.ts | 15 +-- .../onboard/verify-channel-runtime.test.ts | 48 ++++++++ src/lib/onboard/verify-channel-runtime.ts | 51 +++++++++ src/lib/verify-deployment.test.ts | 56 ++++++++- src/lib/verify-deployment.ts | 108 +++++++++++++----- 8 files changed, 303 insertions(+), 67 deletions(-) create mode 100644 src/lib/onboard/verify-channel-runtime.test.ts create mode 100644 src/lib/onboard/verify-channel-runtime.ts diff --git a/src/lib/actions/sandbox/doctor.ts b/src/lib/actions/sandbox/doctor.ts index 5462d55ed3..971126cf61 100644 --- a/src/lib/actions/sandbox/doctor.ts +++ b/src/lib/actions/sandbox/doctor.ts @@ -390,22 +390,42 @@ function channelRuntimeDoctorCheck( `or rebuild with \`${CLI_NAME} ${sandboxName} rebuild\` if the config file is missing`, }; } - // Compare the registry's expected set with what the runtime acknowledged - // (visible = config has the channel AND gateway log mentioned it). This - // catches both "config dropped the channel" (stale/bad rebuild) and - // "config has it but runtime didn't start it" (the reporter's case). 
- const { missing: notRunning } = compareChannelSets(enabledChannels, runtime.visibleChannels); - if (notRunning.length > 0) { - return { - group: "Messaging", - label: "Runtime channel registry", - status: "warn", - detail: `configured but not in OpenClaw runtime: ${notRunning.join(", ")}`, - hint: - `the OpenClaw dashboard "Channels" panel will show "No channels found" for ` + - `${notRunning.join(", ")}; inspect the gateway log with \`${CLI_NAME} ${sandboxName} logs\` ` + - `and re-run \`${CLI_NAME} ${sandboxName} rebuild\` if the channels block needs to be regenerated`, - }; + if (runtime.logProbeOk) { + // Diff against the log-corroborated runtime view. Catches both the + // stale-rebuild path (channel block missing) and the runtime-startup + // path (config has it, log doesn't). + const { missing: notRunning } = compareChannelSets(enabledChannels, runtime.visibleChannels); + if (notRunning.length > 0) { + return { + group: "Messaging", + label: "Runtime channel registry", + status: "warn", + detail: `not visible to OpenClaw runtime: ${notRunning.join(", ")}`, + hint: + `the OpenClaw dashboard "Channels" panel will show "No channels found" for ` + + `${notRunning.join(", ")}; inspect \`${agent.configPaths.dir}/${agent.configPaths.configFile}\` ` + + `and the gateway log with \`${CLI_NAME} ${sandboxName} logs\`, then re-run ` + + `\`${CLI_NAME} ${sandboxName} rebuild\` if the channels block needs to be regenerated`, + }; + } + } else { + // Log unavailable: we can still detect a config-only mismatch + // (registry expects telegram but openclaw.json doesn't have it). + // Surface that as a warn so a stale rebuild isn't masked by an + // unreadable log (CodeRabbit on PR #4182). The log-unavailable + // warning below still runs when configMissing is empty. 
+ const { missing: configMissing } = compareChannelSets(enabledChannels, runtime.configuredChannels); + if (configMissing.length > 0) { + return { + group: "Messaging", + label: "Runtime channel registry", + status: "warn", + detail: `missing from sandbox config: ${configMissing.join(", ")}`, + hint: + `\`${agent.configPaths.dir}/${agent.configPaths.configFile}\` is missing the channel block ` + + `for ${configMissing.join(", ")}; re-run \`${CLI_NAME} ${sandboxName} rebuild\` so the config is regenerated`, + }; + } } if (!runtime.logProbeOk) { return { diff --git a/src/lib/channel-runtime-status.test.ts b/src/lib/channel-runtime-status.test.ts index c2f631224c..0e7221dace 100644 --- a/src/lib/channel-runtime-status.test.ts +++ b/src/lib/channel-runtime-status.test.ts @@ -353,7 +353,7 @@ describe("probeChannelRuntimeStatus", () => { expect(result.configuredButNotRunning).toEqual([]); }); - it("falls back to config-only when the gateway log is missing", () => { + it("keeps visibleChannels empty when the gateway log is missing, so callers do not treat an inconclusive probe as healthy", () => { const config = JSON.stringify({ channels: { telegram: { accounts: { default: { enabled: true } } } }, }); @@ -364,11 +364,16 @@ describe("probeChannelRuntimeStatus", () => { }); expect(result.ok).toBe(true); expect(result.logProbeOk).toBe(false); - // Without log corroboration, the config view is reported as visible - // but configuredButNotRunning stays empty — the caller decides how - // to surface the "could not verify runtime" caveat in its diagnostic. - expect(result.visibleChannels).toEqual(["telegram"]); + // `visibleChannels` is documented as "config + log corroborated". When + // the log layer is unavailable, the runtime view is unknown — keep it + // empty so callers must consult `logProbeOk` and decide how to render + // the caveat instead of treating config-only as healthy + // (CodeRabbit catch on PR #4182). 
+ expect(result.visibleChannels).toEqual([]); expect(result.configuredButNotRunning).toEqual([]); + // But the config-derived set is still exposed so callers can detect + // stale-rebuild mismatches without runtime corroboration. + expect(result.configuredChannels).toEqual(["telegram"]); expect(result.detail).toContain("unreadable"); }); diff --git a/src/lib/channel-runtime-status.ts b/src/lib/channel-runtime-status.ts index 6bd33da8f8..4d4fca13a3 100644 --- a/src/lib/channel-runtime-status.ts +++ b/src/lib/channel-runtime-status.ts @@ -62,9 +62,18 @@ export type RuntimeChannelStatus = { ok: boolean; /** * Channels the runtime exposes — config has them AND the gateway log - * confirms the runtime acknowledged them. Sorted, deduplicated. + * confirms the runtime acknowledged them. Sorted, deduplicated. Empty + * when `logProbeOk` is false, since we have no log to corroborate. */ visibleChannels: string[]; + /** + * Channels that the in-sandbox config (the file at `configFilePath`) + * has marked as enabled. Always populated when `ok` is true, regardless + * of the gateway log layer — gives callers a way to detect stale + * rebuilds (registry expects telegram, but `openclaw.json` dropped it) + * even when the runtime layer cannot corroborate. + */ + configuredChannels: string[]; /** * Channels present in `openclaw.json` but never mentioned in the * gateway log. 
This is the #4156 failure signature: configured but the @@ -258,6 +267,7 @@ export function probeChannelRuntimeStatus(deps: ChannelRuntimeStatusDeps): Runti return { ok: false, visibleChannels: [], + configuredChannels: [], configuredButNotRunning: [], logProbeOk: false, detail: "sandbox unreachable (could not read runtime channel config)", @@ -268,6 +278,7 @@ export function probeChannelRuntimeStatus(deps: ChannelRuntimeStatusDeps): Runti return { ok: false, visibleChannels: [], + configuredChannels: [], configuredButNotRunning: [], logProbeOk: false, detail: `runtime channel config ${configFilePath} is missing or empty`, @@ -281,6 +292,7 @@ export function probeChannelRuntimeStatus(deps: ChannelRuntimeStatusDeps): Runti return { ok: false, visibleChannels: [], + configuredChannels: [], configuredButNotRunning: [], logProbeOk: false, detail: `runtime channel config ${configFilePath} is not valid JSON: ${message}`, @@ -307,9 +319,17 @@ export function probeChannelRuntimeStatus(deps: ChannelRuntimeStatusDeps): Runti const logStdout = logResult && typeof logResult.stdout === "string" ? logResult.stdout : ""; const logProbeOk = logStdout.includes(LOG_PROBE_OK_MARKER); if (!logProbeOk) { + // Keep `visibleChannels` strictly log-corroborated — returning the + // configured set there would let any caller diffing against it + // treat an inconclusive probe as healthy (CodeRabbit catch on PR + // #4182). `configuredChannels` still carries the config-derived + // set so the caller can detect stale rebuilds (registry expects + // a channel that `openclaw.json` no longer contains) even when the + // log layer is unavailable. 
return { ok: true, - visibleChannels: configuredChannels, + visibleChannels: [], + configuredChannels, configuredButNotRunning: [], logProbeOk: false, detail: `config ${configFilePath} parsed; gateway log ${gatewayLogPath} unreadable, runtime confirmation skipped`, @@ -330,6 +350,7 @@ export function probeChannelRuntimeStatus(deps: ChannelRuntimeStatusDeps): Runti return { ok: true, visibleChannels, + configuredChannels, configuredButNotRunning, logProbeOk: true, detail: `config ${configFilePath} parsed and gateway log ${gatewayLogPath} corroborated`, diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 174d6437a6..688109894b 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -7431,20 +7431,11 @@ async function onboard(opts: OnboardOptions = {}): Promise { captureForwardList: () => runCaptureOpenshell(["forward", "list"], { ignoreError: true }) || null, getMessagingChannels: () => selectedMessagingChannels || [], providerExistsInGateway: (providerName: string) => providerExistsInGateway(providerName), - probeChannelRuntimeStatus: () => { - // Only OpenClaw stores channel config in the JSON the dashboard - // "Channels" panel reads from (#4156). Skip for non-JSON agents - // (Hermes) — runtimeMissing stays null, no warn line. - const configPaths = agent?.configPaths; - if (!configPaths || configPaths.format !== "json") return null; - const channelRuntimeStatus: typeof import("./channel-runtime-status") = - require("./channel-runtime-status"); - return channelRuntimeStatus.probeChannelRuntimeStatus({ - configFilePath: `${configPaths.dir}/${configPaths.configFile}`, + probeChannelRuntimeStatus: + require("./onboard/verify-channel-runtime").buildChannelRuntimeProbe(agent, { executeSandboxCommand: (script: string) => executeSandboxCommandForVerification(name, script), - }); - }, + }) ?? 
undefined, }); }, formatVerificationDiagnostics: (result) => { diff --git a/src/lib/onboard/verify-channel-runtime.test.ts b/src/lib/onboard/verify-channel-runtime.test.ts new file mode 100644 index 0000000000..42e3417532 --- /dev/null +++ b/src/lib/onboard/verify-channel-runtime.test.ts @@ -0,0 +1,48 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import { describe, it, expect } from "vitest"; +import { buildChannelRuntimeProbe } from "../../../dist/lib/onboard/verify-channel-runtime.js"; + +// The helper only inspects `agent.configPaths`, so a minimal stub is enough. +function fakeAgent(format: string | null) { + if (format === null) return null; + return { + configPaths: { + dir: "/sandbox/.openclaw", + configFile: "openclaw.json", + envFile: null, + format, + }, + } as unknown as Parameters<typeof buildChannelRuntimeProbe>[0]; +} + +describe("buildChannelRuntimeProbe", () => { + it("returns null when no agent is selected", () => { + expect(buildChannelRuntimeProbe(null, { executeSandboxCommand: () => null })).toBeNull(); + }); + + it("returns null for non-JSON agents (e.g. Hermes uses yaml)", () => { + expect( + buildChannelRuntimeProbe(fakeAgent("yaml"), { executeSandboxCommand: () => null }), + ).toBeNull(); + }); + + it("returns a probe function for JSON agents (OpenClaw) that targets /", () => { + const captured: string[] = []; + const probe = buildChannelRuntimeProbe(fakeAgent("json"), { + executeSandboxCommand: (script: string) => { + captured.push(script); + // First call reads the config; second call scans the gateway log. + return { status: 0, stdout: script.startsWith("cat ") ? "{}" : "", stderr: "" }; + }, + }); + expect(probe).toBeTypeOf("function"); + const result = probe!(); + expect(result).not.toBeNull(); + // The first exec call must be against the agent's config path (the + // `cat ` snippet from probeChannelRuntimeStatus).
+ expect(captured[0]).toContain("/sandbox/.openclaw/openclaw.json"); + expect(captured[0].startsWith("cat ")).toBe(true); + }); +}); diff --git a/src/lib/onboard/verify-channel-runtime.ts b/src/lib/onboard/verify-channel-runtime.ts new file mode 100644 index 0000000000..8a35d943ba --- /dev/null +++ b/src/lib/onboard/verify-channel-runtime.ts @@ -0,0 +1,51 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +/** + * Build the `probeChannelRuntimeStatus` dep that the post-deployment + * verifier wires through to `verifyDeployment`. Kept out of `onboard.ts` + * itself so the entrypoint stays net-neutral (see the + * `onboard-entrypoint-budget` CI gate) and so the wiring can be + * unit-tested without spinning up the whole onboarding state machine. + */ + +import type { AgentDefinition } from "../agent/defs"; +import { + probeChannelRuntimeStatus, + type ChannelRuntimeStatusDeps, + type RuntimeChannelStatus, +} from "../channel-runtime-status"; + +export interface ChannelRuntimeProbeDeps { + /** + * Execute a shell snippet inside the named sandbox. Returns null when + * the openshell exec itself failed to spawn or timed out (matching + * the contract in `onboard/sandbox-verification-exec.ts`). + */ + executeSandboxCommand: ChannelRuntimeStatusDeps["executeSandboxCommand"]; +} + +/** + * Return a no-arg probe function suitable for `verifyDeployment`'s + * optional `probeChannelRuntimeStatus` dep. Returns `null` when the + * agent does not store channel config in a JSON file the OpenClaw + * runtime parses (today: only OpenClaw qualifies — Hermes uses env/yaml). + * + * Fixes #4156: NemoClaw onboarding never compared configured channels + * to the runtime view, so a baked image with a missing or unloaded + * channel block produced the dashboard's "No channels found" panel + * without any host-side warning. 
+ */ +export function buildChannelRuntimeProbe( + agent: AgentDefinition | null, + deps: ChannelRuntimeProbeDeps, +): (() => RuntimeChannelStatus | null) | null { + const configPaths = agent?.configPaths; + if (!configPaths || configPaths.format !== "json") return null; + const configFilePath = `${configPaths.dir}/${configPaths.configFile}`; + return () => + probeChannelRuntimeStatus({ + configFilePath, + executeSandboxCommand: deps.executeSandboxCommand, + }); +} diff --git a/src/lib/verify-deployment.test.ts b/src/lib/verify-deployment.test.ts index 528e6d97c1..71164748a2 100644 --- a/src/lib/verify-deployment.test.ts +++ b/src/lib/verify-deployment.test.ts @@ -133,6 +133,7 @@ describe("verifyDeployment", () => { probeChannelRuntimeStatus: () => ({ ok: true, visibleChannels: [], + configuredChannels: [], configuredButNotRunning: [], logProbeOk: true, detail: "config + log corroborated (empty channels block)", @@ -152,6 +153,7 @@ describe("verifyDeployment", () => { probeChannelRuntimeStatus: () => ({ ok: true, visibleChannels: [], + configuredChannels: ["telegram"], configuredButNotRunning: ["telegram"], logProbeOk: true, detail: "config /sandbox/.openclaw/openclaw.json parsed and gateway log /tmp/gateway.log corroborated", @@ -164,7 +166,12 @@ describe("verifyDeployment", () => { expect(msgDiag?.status).toBe("warn"); expect(msgDiag?.detail).toContain("configured but not in OpenClaw runtime: telegram"); expect(msgDiag?.hint).toContain("No channels found"); - expect(msgDiag?.hint).toContain("gateway.log"); + // Hint should mention both layers neutrally (config file + log) since + // the cause could be either a stale rebuild or a runtime failure + // (CodeRabbit catch on PR #4182). It must not point at only the log. 
+ expect(msgDiag?.hint).toContain("openclaw.json"); + expect(msgDiag?.hint).toContain("logs"); + expect(msgDiag?.hint).not.toContain("no startup entries"); }); it("does not falsely warn when runtime probe corroborates every configured channel", async () => { @@ -173,6 +180,7 @@ describe("verifyDeployment", () => { probeChannelRuntimeStatus: () => ({ ok: true, visibleChannels: ["telegram"], + configuredChannels: ["telegram"], configuredButNotRunning: [], logProbeOk: true, detail: "config + log corroborated", @@ -188,13 +196,17 @@ describe("verifyDeployment", () => { // Provider attached, config has the channel, but the gateway log is // unreadable (sandbox just rebuilt, log not yet created). The probe // can only confirm config — we must surface that as a warn rather - // than claim runtime verification. + // than claim runtime verification. The probe now returns + // `visibleChannels: []` when `logProbeOk` is false so callers cannot + // accidentally treat config-only as healthy, and verifyDeployment + // must NOT then flag every configured channel as missing. const deps = makeDeps({ getMessagingChannels: () => ["telegram"], providerExistsInGateway: () => true, probeChannelRuntimeStatus: () => ({ ok: true, - visibleChannels: ["telegram"], + visibleChannels: [], + configuredChannels: ["telegram"], configuredButNotRunning: [], logProbeOk: false, detail: "config /sandbox/.openclaw/openclaw.json parsed; gateway log /tmp/gateway.log unreadable, runtime confirmation skipped", @@ -202,9 +214,44 @@ describe("verifyDeployment", () => { }); const result = await verifyDeployment("my-sandbox", chain, deps, NO_RETRY); expect(result.verification.messagingBridgesHealthy).toBe(false); + // No false-positive "configured but not in OpenClaw runtime" — we + // simply do not have enough evidence to make that claim. 
+ expect(result.verification.messagingRuntimeChannelsMissing).toBeNull(); + expect(result.verification.messagingConfigChannelsMissing).toEqual([]); const msgDiag = result.diagnostics.find((d) => d.link === "messaging"); expect(msgDiag?.status).toBe("warn"); expect(msgDiag?.detail).toContain("runtime gateway log not yet available"); + expect(msgDiag?.detail).not.toContain("configured but not in OpenClaw runtime"); + }); + + it("flags a stale rebuild even when the gateway log is unavailable (config-only diff)", async () => { + // Registry expects telegram but openclaw.json never had the channel + // block — and the gateway log is unreadable, so the runtime layer + // cannot corroborate. Earlier revisions of this fix masked the + // mismatch behind the log warning; this test pins the new + // configMissing surface that exposes config-only mismatches even + // without log corroboration (CodeRabbit on PR #4182). + const deps = makeDeps({ + getMessagingChannels: () => ["telegram"], + providerExistsInGateway: () => true, + probeChannelRuntimeStatus: () => ({ + ok: true, + visibleChannels: [], + configuredChannels: [], + configuredButNotRunning: [], + logProbeOk: false, + detail: "config /sandbox/.openclaw/openclaw.json parsed; gateway log /tmp/gateway.log unreadable, runtime confirmation skipped", + }), + }); + const result = await verifyDeployment("my-sandbox", chain, deps, NO_RETRY); + expect(result.verification.messagingBridgesHealthy).toBe(false); + expect(result.verification.messagingRuntimeChannelsMissing).toBeNull(); + expect(result.verification.messagingConfigChannelsMissing).toEqual(["telegram"]); + const msgDiag = result.diagnostics.find((d) => d.link === "messaging"); + expect(msgDiag?.status).toBe("warn"); + expect(msgDiag?.detail).toContain("missing from sandbox config: telegram"); + expect(msgDiag?.hint).toContain("openclaw.json"); + expect(msgDiag?.hint).toContain("rebuild"); }); it("surfaces an inconclusive runtime probe as a messaging warn (catches 
malformed openclaw.json #4156)", async () => { @@ -214,6 +261,7 @@ describe("verifyDeployment", () => { probeChannelRuntimeStatus: () => ({ ok: false, visibleChannels: [], + configuredChannels: [], configuredButNotRunning: [], logProbeOk: false, detail: "runtime channel config /sandbox/.openclaw/openclaw.json is missing or empty", @@ -240,6 +288,7 @@ describe("verifyDeployment", () => { return { ok: true, visibleChannels: [], + configuredChannels: [], configuredButNotRunning: [], logProbeOk: true, detail: "x", @@ -419,6 +468,7 @@ describe("formatVerificationDiagnostics", () => { probeChannelRuntimeStatus: () => ({ ok: true, visibleChannels: [], + configuredChannels: ["telegram"], configuredButNotRunning: ["telegram"], logProbeOk: true, detail: "config + log corroborated", diff --git a/src/lib/verify-deployment.ts b/src/lib/verify-deployment.ts index 5b95d4a8ae..f26432f16c 100644 --- a/src/lib/verify-deployment.ts +++ b/src/lib/verify-deployment.ts @@ -33,14 +33,25 @@ export interface DeploymentVerification { /** * Channels recorded in the registry that the in-sandbox agent config * does not expose. Set to null when the runtime probe is disabled - * (no agent config to read, e.g. Hermes) or when no channels are - * configured. See [[channel-runtime-status]] for the probe internals. - * Why: fixes #4156 — empty/null lets onboarding finish quietly; a - * non-empty array surfaces "configured but invisible at runtime" so - * the dashboard's "No channels found" panel does not catch the user - * by surprise. + * (no agent config to read, e.g. Hermes), when the gateway log layer + * was unavailable so the runtime view could not be corroborated, or + * when no channels are configured. See [[channel-runtime-status]] for + * the probe internals. Why: fixes #4156 — empty/null lets onboarding + * finish quietly; a non-empty array surfaces "configured but invisible + * at runtime" so the dashboard's "No channels found" panel does not + * catch the user by surprise. 
*/ messagingRuntimeChannelsMissing: string[] | null; + /** + * Channels expected by the registry that are missing from the + * in-sandbox agent config file (`openclaw.json`). Distinct from + * `messagingRuntimeChannelsMissing`: this surfaces stale-rebuild + * mismatches even when the gateway log isn't readable, while the + * runtime field requires log corroboration. Null when no channels + * are configured or the probe is disabled; empty array when the + * config has every expected channel. + */ + messagingConfigChannelsMissing: string[] | null; accessMethod: AccessMethod; } @@ -260,14 +271,19 @@ export interface MessagingBridgeStatus { /** Channel names that the gateway has no bridge provider for. */ missingProviders: string[]; /** - * Channel names recorded in the registry but absent from the in-sandbox - * agent config (the surface OpenClaw renders into the dashboard's - * "Channels — Gateway-wide channel status snapshot" panel). Null when - * the runtime probe was not run (no `probeChannelRuntimeStatus` dep, or - * no configured channels to compare against). Empty array means the - * probe ran and everything matched. See #4156. + * Channel names recorded in the registry but not corroborated by the + * OpenClaw runtime log. Null when the probe was not run or the log + * layer was unavailable. Empty array means the probe ran with log + * corroboration and everything matched. See #4156. */ runtimeMissing: string[] | null; + /** + * Channel names recorded in the registry but absent from the in-sandbox + * config file. Surfaced even when the log layer is unavailable so a + * stale rebuild can be detected without runtime corroboration. Null + * when the probe was not run or no config-only diff was performed. + */ + configMissing: string[] | null; /** Detail from the runtime probe when it ran (ok or failure reason). 
*/ runtimeProbeDetail: string | null; } @@ -290,6 +306,7 @@ function verifyMessagingBridges( detail: "no messaging channels configured", missingProviders: [], runtimeMissing: null, + configMissing: null, runtimeProbeDetail: null, }; } @@ -300,6 +317,7 @@ function verifyMessagingBridges( } } let runtimeMissing: string[] | null = null; + let configMissing: string[] | null = null; let runtimeProbeDetail: string | null = null; let runtimeProbeFailed = false; let runtimeProbeOnlyConfig = false; @@ -308,15 +326,22 @@ function verifyMessagingBridges( if (runtime) { runtimeProbeDetail = runtime.detail; if (runtime.ok) { - // Compare the registry's expected set (`channels`) with the - // runtime-visible set so that channels missing from openclaw.json - // entirely — a stale or failed rebuild — are caught alongside - // channels that the runtime never started. Relying on - // `configuredButNotRunning` alone would miss the - // "config has no telegram block at all" case the registry - // already knows about. - runtimeMissing = compareChannelSets(channels, runtime.visibleChannels).missing; - runtimeProbeOnlyConfig = !runtime.logProbeOk; + if (runtime.logProbeOk) { + // Log corroboration is available — compare the registry's + // expected set with what the runtime actually acknowledged. + // Catches both "config drops the channel" (stale/bad rebuild) + // and "config has it but runtime never started it" (#4156). + runtimeMissing = compareChannelSets(channels, runtime.visibleChannels).missing; + } else { + // No log to corroborate; we cannot honestly claim which channels + // are missing at runtime, so do not populate `runtimeMissing`. + // We CAN still detect a config-only mismatch — registry expects + // telegram but openclaw.json never had the channel block — so + // diff against the config-derived set and surface that separately + // (CodeRabbit catch on PR #4182). 
+ configMissing = compareChannelSets(channels, runtime.configuredChannels).missing; + runtimeProbeOnlyConfig = true; + } } else { // ok=false = could not read /sandbox/.openclaw/openclaw.json (missing, // empty, invalid JSON, or sandbox unreachable). With provider checks @@ -335,18 +360,26 @@ function verifyMessagingBridges( if (runtimeMissing && runtimeMissing.length > 0) { parts.push(`configured but not in OpenClaw runtime: ${runtimeMissing.join(", ")}`); } + if (configMissing && configMissing.length > 0) { + // Specific to the log-unavailable branch: registry expected channels + // are absent from the in-sandbox config altogether, so we know they + // can't possibly load at runtime regardless of the missing log. + parts.push(`missing from sandbox config: ${configMissing.join(", ")}`); + } if (runtimeProbeFailed && runtimeProbeDetail) { parts.push(`runtime channel probe inconclusive: ${runtimeProbeDetail}`); } - if (runtimeProbeOnlyConfig && (!runtimeMissing || runtimeMissing.length === 0)) { - // No missing channels, but the gateway log was unreadable so we can't - // actually confirm the runtime started each bridge. Surface that as a - // warn — the operator should look at the dashboard to be sure. + if (runtimeProbeOnlyConfig) { + // The gateway log was unreadable, so we can't actually confirm the + // runtime started each bridge. `runtimeMissing` stays null in this + // branch (see above) — surface the "checked config only" caveat so + // the operator inspects the dashboard. 
parts.push("runtime gateway log not yet available; checked config only"); } const healthy = missingProviders.length === 0 && (!runtimeMissing || runtimeMissing.length === 0) && + (!configMissing || configMissing.length === 0) && !runtimeProbeFailed && !runtimeProbeOnlyConfig; const detail = healthy @@ -357,17 +390,33 @@ function verifyMessagingBridges( detail, missingProviders, runtimeMissing, + configMissing, runtimeProbeDetail, }; } function buildMessagingHint(messaging: MessagingBridgeStatus): string { if (messaging.runtimeMissing && messaging.runtimeMissing.length > 0) { + // Either cause — missing from openclaw.json (stale rebuild) or + // present in config but never logged by the runtime — produces this + // diff. Keep the copy neutral so the operator checks both layers + // rather than chasing only the log path (CodeRabbit on PR #4182). + return ( + `Configured channel(s) ${messaging.runtimeMissing.join(", ")} were not visible to the OpenClaw ` + + `runtime. The dashboard "Channels" panel will show "No channels found" for these. Inspect ` + + `\`/sandbox/.openclaw/openclaw.json\` and the gateway log with \`nemoclaw logs\`, ` + + `then re-run \`nemoclaw rebuild\` if the channel block needs to be regenerated.` + ); + } + if (messaging.configMissing && messaging.configMissing.length > 0) { + // Config-only branch: we couldn't read the runtime log, but we can + // still see that the registry expects channels that openclaw.json + // doesn't have. That's a stale rebuild — the runtime cannot possibly + // start them. return ( - `Configured channel(s) ${messaging.runtimeMissing.join(", ")} were not acknowledged by the OpenClaw ` + - `runtime (no startup entries in /tmp/gateway.log). The dashboard "Channels" panel will show ` + - `"No channels found" for these. 
Inspect the gateway log with \`nemoclaw logs\` and ` + - `re-run \`nemoclaw rebuild\` if the channel block needs to be regenerated.` + `Configured channel(s) ${messaging.configMissing.join(", ")} are missing from ` + + `\`/sandbox/.openclaw/openclaw.json\` — the runtime cannot start them. Re-run ` + + `\`nemoclaw rebuild\` so the channel block is regenerated.` ); } if (messaging.missingProviders.length === 0 && messaging.runtimeProbeDetail) { @@ -456,6 +505,7 @@ export async function verifyDeployment( dashboardReachable: dashboard.reachable, messagingBridgesHealthy: messaging.healthy, messagingRuntimeChannelsMissing: messaging.runtimeMissing, + messagingConfigChannelsMissing: messaging.configMissing, accessMethod, }; From c757b542229f1c9351f18d525121df58b4a7aa43 Mon Sep 17 00:00:00 2001 From: Yimo Jiang Date: Mon, 25 May 2026 10:53:50 +0000 Subject: [PATCH 3/4] fix(cli): keep onboard.ts net-neutral for #4156 channel probe wiring Collapse the post-deploy channel-runtime probe wiring into a single call site in `onboard.ts` by adding `buildOnboardChannelRuntimeProbe` (which internally binds `executeSandboxCommandForVerification` to the sandbox name) and folding the still-needed `executeSandboxCommand` dep to a direct function reference. Result: `src/lib/onboard.ts` is now +2/-2 instead of +5/-0, satisfying the `onboard-entrypoint-budget` CI gate while preserving behavior. 
Signed-off-by: Yimo Jiang --- src/lib/onboard.ts | 9 ++------- src/lib/onboard/verify-channel-runtime.ts | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index 688109894b..30d7ef8bbd 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -7419,8 +7419,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { verifyDeployment: async (name, chain) => { const verifyDeploymentModule: typeof import("./verify-deployment") = require("./verify-deployment"); return verifyDeploymentModule.verifyDeployment(name, chain, { - executeSandboxCommand: (sandbox: string, script: string) => - executeSandboxCommandForVerification(sandbox, script), + executeSandboxCommand: executeSandboxCommandForVerification, probeHostPort: (port: number, probePath: string) => { const result = runCapture( ["curl", "-so", "/dev/null", "-w", "%{http_code}", "--max-time", "3", `http://127.0.0.1:${port}${probePath}`], @@ -7431,11 +7430,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { captureForwardList: () => runCaptureOpenshell(["forward", "list"], { ignoreError: true }) || null, getMessagingChannels: () => selectedMessagingChannels || [], providerExistsInGateway: (providerName: string) => providerExistsInGateway(providerName), - probeChannelRuntimeStatus: - require("./onboard/verify-channel-runtime").buildChannelRuntimeProbe(agent, { - executeSandboxCommand: (script: string) => - executeSandboxCommandForVerification(name, script), - }) ?? 
undefined, + probeChannelRuntimeStatus: require("./onboard/verify-channel-runtime").buildOnboardChannelRuntimeProbe(agent, name), }); }, formatVerificationDiagnostics: (result) => { diff --git a/src/lib/onboard/verify-channel-runtime.ts b/src/lib/onboard/verify-channel-runtime.ts index 8a35d943ba..e1b3c8186e 100644 --- a/src/lib/onboard/verify-channel-runtime.ts +++ b/src/lib/onboard/verify-channel-runtime.ts @@ -15,6 +15,7 @@ import { type ChannelRuntimeStatusDeps, type RuntimeChannelStatus, } from "../channel-runtime-status"; +import { executeSandboxCommandForVerification } from "./sandbox-verification-exec"; export interface ChannelRuntimeProbeDeps { /** @@ -49,3 +50,21 @@ export function buildChannelRuntimeProbe( executeSandboxCommand: deps.executeSandboxCommand, }); } + +/** + * Onboard-specific convenience wrapper: binds the sandbox name so the + * call site in `onboard.ts` is a single line and the entrypoint stays + * within its size budget. Pre-fills `executeSandboxCommand` with the + * SSH-based exec helper onboarding already uses for verification probes. + */ +export function buildOnboardChannelRuntimeProbe( + agent: AgentDefinition | null, + sandboxName: string, +): (() => RuntimeChannelStatus | null) | undefined { + return ( + buildChannelRuntimeProbe(agent, { + executeSandboxCommand: (script: string) => + executeSandboxCommandForVerification(sandboxName, script), + }) ?? undefined + ); +} From 825366d67559b9837aad5562543890d54ffa02d0 Mon Sep 17 00:00:00 2001 From: Yimo Jiang Date: Mon, 25 May 2026 11:00:03 +0000 Subject: [PATCH 4/4] test(cli): swap execSync for spawnSync in channel runtime probe test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CodeQL flagged the previous `execSync` invocation in `channel-runtime-status.test.ts` for incomplete string escaping — backslashes in the embedded gateway-log-scan script were not being escaped before being re-quoted into a `sh -c "..."` argv. 
Switch to `spawnSync("sh", ["-c", script])`, which passes the script verbatim as an argv element and avoids the per-character escape gymnastics entirely. Behavior is unchanged and all 34 tests still pass. Signed-off-by: Yimo Jiang --- src/lib/channel-runtime-status.test.ts | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/lib/channel-runtime-status.test.ts b/src/lib/channel-runtime-status.test.ts index 0e7221dace..ef743d2e63 100644 --- a/src/lib/channel-runtime-status.test.ts +++ b/src/lib/channel-runtime-status.test.ts @@ -197,13 +197,13 @@ describe("buildGatewayLogScanScript end-to-end shell behavior", () => { // Real-shell execution to confirm the awk/grep pipeline does what the // unit tests assert in structure. This guards against subtle quoting // and shell-flag drift between the builder and a sandbox sh. - const { execSync, writeFileSync, unlinkSync, mkdtempSync, tmpdir, joinPath } = (() => { + const { spawnSync, writeFileSync, unlinkSync, mkdtempSync, tmpdir, joinPath } = (() => { const cp = require("node:child_process"); const fs = require("node:fs"); const os = require("node:os"); const path = require("node:path"); return { - execSync: cp.execSync, + spawnSync: cp.spawnSync, writeFileSync: fs.writeFileSync, unlinkSync: fs.unlinkSync, mkdtempSync: fs.mkdtempSync, @@ -218,13 +218,14 @@ describe("buildGatewayLogScanScript end-to-end shell behavior", () => { writeFileSync(logPath, logBody); const script = buildGatewayLogScanScript(logPath); try { - // sh -c so we exercise a POSIX shell, not bash-only features. The - // pipeline can legitimately exit non-zero when no channel matches - // (sed gets empty input); pipe through `cat` to swallow the exit. 
- const stdout = execSync(`sh -c "${script.replace(/"/g, "\\\"").replace(/\$/g, "\\$")}"`, { - encoding: "utf-8", - }); - return parseGatewayLogScanOutput(stdout); + // spawnSync with `shell: false` (default) and the script passed as + // an explicit argv element so the OS receives it verbatim. Avoids + // the brittle/CodeQL-flagged double-escape gymnastics that + // `execSync(\`sh -c "${...}"\`)` would force. The pipeline can + // legitimately exit non-zero when no channel matches, but we only + // care about stdout regardless of exit status. + const result = spawnSync("sh", ["-c", script], { encoding: "utf-8" }); + return parseGatewayLogScanOutput(String(result.stdout || "")); } finally { try { unlinkSync(logPath);