From cf81eceeca7531129ac1787a44b2305c3844ed56 Mon Sep 17 00:00:00 2001 From: Brandon Pelfrey Date: Tue, 5 May 2026 17:05:21 -0700 Subject: [PATCH 1/2] perf(cli): add opt-in onboard profiling traces Signed-off-by: Brandon Pelfrey --- .../upload-nemoclaw-traces/action.yaml | 25 ++ .github/workflows/e2e-branch-validation.yaml | 27 ++ .github/workflows/macos-e2e.yaml | 10 + .github/workflows/main.yaml | 9 + .github/workflows/nightly-e2e.yaml | 227 +++++++++++++ .github/workflows/pr.yaml | 18 + .github/workflows/sandbox-images-and-e2e.yaml | 54 +++ .github/workflows/wsl-e2e.yaml | 48 +++ src/lib/adapters/http/probe.ts | 82 +++-- src/lib/inference/onboard-probes.ts | 211 +++++++----- src/lib/onboard.ts | 310 ++++++++++++------ src/lib/profiling.test.ts | 153 +++++++++ src/lib/profiling.ts | 230 +++++++++++++ test/e2e/brev-e2e.test.ts | 2 + 14 files changed, 1202 insertions(+), 204 deletions(-) create mode 100644 .github/actions/upload-nemoclaw-traces/action.yaml create mode 100644 src/lib/profiling.test.ts create mode 100644 src/lib/profiling.ts diff --git a/.github/actions/upload-nemoclaw-traces/action.yaml b/.github/actions/upload-nemoclaw-traces/action.yaml new file mode 100644 index 0000000000..1f1ec22122 --- /dev/null +++ b/.github/actions/upload-nemoclaw-traces/action.yaml @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +name: upload-nemoclaw-traces +description: Upload NemoClaw Chrome trace JSON files generated by NEMOCLAW_TRACE_DIR. 
+ +inputs: + name: + description: Artifact name + required: true + path: + description: Trace file or directory path + required: false + default: /tmp/nemoclaw-traces + +runs: + using: composite + steps: + - name: Upload NemoClaw profiling traces + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: ${{ inputs.name }} + path: ${{ inputs.path }} + if-no-files-found: ignore + retention-days: 14 diff --git a/.github/workflows/e2e-branch-validation.yaml b/.github/workflows/e2e-branch-validation.yaml index 0a35f70b1d..7350629320 100644 --- a/.github/workflows/e2e-branch-validation.yaml +++ b/.github/workflows/e2e-branch-validation.yaml @@ -265,6 +265,7 @@ jobs: BREV_CREATE_TIMEOUT_SECONDS: ${{ inputs.brev_create_timeout_seconds || vars.BREV_CREATE_TIMEOUT_SECONDS || '' }} NEMOCLAW_GPU_E2E_MODEL: ${{ vars.NEMOCLAW_GPU_E2E_MODEL || 'qwen2.5:7b' }} KEEP_ALIVE: ${{ inputs.keep_alive }} + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/e2e-branch-validation run: npx vitest run --project e2e-branch-validation --silent=false --reporter=default - name: Update check run (completed) @@ -322,6 +323,7 @@ jobs: 'set +e mkdir -p /tmp/nc-debug cp /tmp/nemoclaw-onboard.log /tmp/nc-debug/ 2>/dev/null || true + cp -a /tmp/nemoclaw-traces /tmp/nc-debug/ 2>/dev/null || true timeout 15s openshell sandbox list > /tmp/nc-debug/sandbox-list.txt 2>&1 timeout 15s openshell gateway status > /tmp/nc-debug/gateway-status.txt 2>&1 timeout 15s docker ps -a > /tmp/nc-debug/docker-ps.txt 2>&1 @@ -331,6 +333,31 @@ jobs: "${INSTANCE}:/tmp/nc-debug.tar.gz" brev-debug-bundle/ 2>/dev/null || true ls -la brev-debug-bundle/ || true + - name: Collect Brev profiling traces + if: always() + continue-on-error: true + run: | + INSTANCE="e2e-pr-${{ inputs.pr_number || github.run_id }}" + mkdir -p brev-traces + brev refresh >/dev/null 2>&1 || true + ssh -o StrictHostKeyChecking=no -o LogLevel=ERROR \ + -o ConnectTimeout=10 "${INSTANCE}" \ + 'test -d /tmp/nemoclaw-traces && tar -C /tmp -czf /tmp/nemoclaw-traces.tar.gz
nemoclaw-traces || true' || true + scp -o StrictHostKeyChecking=no -o LogLevel=ERROR \ + "${INSTANCE}:/tmp/nemoclaw-traces.tar.gz" brev-traces/ 2>/dev/null || true + if [ -f brev-traces/nemoclaw-traces.tar.gz ]; then + tar -C brev-traces -xzf brev-traces/nemoclaw-traces.tar.gz + fi + + - name: Upload NemoClaw profiling traces + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: nemoclaw-traces-e2e-branch-validation + path: brev-traces/nemoclaw-traces + if-no-files-found: ignore + retention-days: 14 + - name: Upload Brev debug bundle on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 diff --git a/.github/workflows/macos-e2e.yaml b/.github/workflows/macos-e2e.yaml index 0d29e59775..9515d189a2 100644 --- a/.github/workflows/macos-e2e.yaml +++ b/.github/workflows/macos-e2e.yaml @@ -38,6 +38,8 @@ jobs: macos-e2e: runs-on: macos-26 timeout-minutes: 30 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/macos-e2e steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -91,6 +93,7 @@ jobs: NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE: "1" NEMOCLAW_RECREATE_SANDBOX: "1" NEMOCLAW_SANDBOX_NAME: "e2e-macos" + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/macos-e2e run: bash test/e2e/test-full-e2e.sh - name: Explain skipped full E2E @@ -99,6 +102,13 @@ jobs: echo 'Skipping macOS full E2E because Docker is unavailable on this runner.' echo 'The workflow still validated the NemoClaw build on macOS (Apple Silicon).'
+ - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-macos-e2e + path: /tmp/nemoclaw-traces/macos-e2e + - name: Upload logs on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 47a0555e3e..ae9a2e0d67 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -24,6 +24,8 @@ jobs: checks: runs-on: ubuntu-latest timeout-minutes: 10 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/main-checks steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -31,6 +33,13 @@ jobs: - name: Run basic checks uses: ./.github/actions/basic-checks + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-main-checks + path: /tmp/nemoclaw-traces/main-checks + sandbox-images-and-e2e: needs: checks uses: ./.github/workflows/sandbox-images-and-e2e.yaml diff --git a/.github/workflows/nightly-e2e.yaml b/.github/workflows/nightly-e2e.yaml index 3edc33f3ea..68b22b1a46 100644 --- a/.github/workflows/nightly-e2e.yaml +++ b/.github/workflows/nightly-e2e.yaml @@ -126,6 +126,9 @@ on: permissions: contents: read +env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces + concurrency: group: nightly-e2e-${{ github.event_name }}-${{ github.event_name == 'workflow_dispatch' && format('{0}-{1}', github.ref, inputs.pr_number || 'manual') || 'schedule' }} cancel-in-progress: true @@ -161,6 +164,13 @@ jobs: GITHUB_TOKEN: ${{ github.token }} run: bash test/e2e/test-full-e2e.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload install log on failure if: failure() uses: 
actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -200,6 +210,13 @@ jobs: NEMOCLAW_INSTALL_REF: ${{ github.ref_name }} run: bash test/e2e/test-cloud-onboard-e2e.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload install log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -234,6 +251,13 @@ jobs: NEMOCLAW_SANDBOX_NAME: "e2e-cloud-inference" run: bash test/e2e/test-cloud-inference-e2e.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload install log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -268,6 +292,13 @@ jobs: NEMOCLAW_SANDBOX_NAME: "e2e-skill-agent" run: bash test/e2e/test-skill-agent-e2e.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload install log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -344,6 +375,13 @@ jobs: SLACK_APP_TOKEN: "xapp-fake-slack-app-token-e2e" run: bash test/e2e/test-messaging-providers.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload install log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -419,6 +457,13 @@ jobs: TELEGRAM_ALLOWED_IDS: "123456789" run: bash test/e2e/test-messaging-compatible-endpoint.sh + - name: Upload NemoClaw profiling traces + if: always() 
+ uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload install log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -721,6 +766,13 @@ jobs: SLACK_APP_TOKEN_B: "xapp-fake-B-rotation-e2e" run: bash test/e2e/test-token-rotation.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload install log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -753,6 +805,13 @@ jobs: GITHUB_TOKEN: ${{ github.token }} run: bash test/e2e/test-sandbox-survival.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload install log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -788,6 +847,13 @@ jobs: GITHUB_TOKEN: ${{ github.token }} run: bash test/e2e/test-issue-2478-crash-loop-recovery.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload install log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -825,6 +891,13 @@ jobs: GITHUB_TOKEN: ${{ github.token }} run: bash test/e2e/test-hermes-e2e.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload install log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -984,6 +1057,13 
@@ jobs: DISCORD_REQUIRE_MENTION: "0" run: bash test/e2e/test-hermes-discord-e2e.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload install log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -1264,6 +1344,13 @@ jobs: ls -la docker-logs/ 2>&1 | head -20 || true du -sh docker-logs/ 2>&1 || true + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload sandbox gateway logs on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -1306,6 +1393,13 @@ jobs: NEMOCLAW_POLICY_TIER: "open" run: bash test/e2e/test-inference-routing.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload test log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -1376,6 +1470,13 @@ jobs: NEMOCLAW_RECREATE_SANDBOX: "1" run: bash test/e2e/test-network-policy.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload test log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -1407,6 +1508,13 @@ jobs: NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE: "1" run: bash test/e2e/test-state-backup-restore.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload 
test log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -1472,6 +1580,13 @@ jobs: NEMOCLAW_RECREATE_SANDBOX: "1" run: bash test/e2e/test-diagnostics.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload test log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -1510,6 +1625,13 @@ jobs: GITHUB_TOKEN: ${{ github.token }} run: bash test/e2e/test-credential-migration.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload install log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -1544,6 +1666,13 @@ jobs: GITHUB_TOKEN: ${{ github.token }} run: bash test/e2e/test-snapshot-commands.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload install log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -1578,6 +1707,13 @@ jobs: GITHUB_TOKEN: ${{ github.token }} run: bash test/e2e/test-shields-config.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload install log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -1612,6 +1748,13 @@ jobs: GITHUB_TOKEN: ${{ github.token }} run: bash test/e2e/test-rebuild-openclaw.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: 
./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload install log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -1647,6 +1790,13 @@ jobs: GITHUB_TOKEN: ${{ github.token }} run: bash test/e2e/test-upgrade-stale-sandbox.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload install logs on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -1728,6 +1878,13 @@ jobs: GITHUB_TOKEN: ${{ github.token }} run: bash test/e2e/test-rebuild-hermes.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload install log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -1804,6 +1961,13 @@ jobs: [ -s "$NVM_DIR/nvm.sh" ] && . "$NVM_DIR/nvm.sh" [ -d "$HOME/.local/bin" ] && [[ ":$PATH:" != *":$HOME/.local/bin:"* ]] && export PATH="$HOME/.local/bin:$PATH" bash test/e2e/test-double-onboard.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload test log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -1844,6 +2008,13 @@ jobs: [ -s "$NVM_DIR/nvm.sh" ] && . 
"$NVM_DIR/nvm.sh" [ -d "$HOME/.local/bin" ] && [[ ":$PATH:" != *":$HOME/.local/bin:"* ]] && export PATH="$HOME/.local/bin:$PATH" bash test/e2e/test-onboard-repair.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload test log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -1884,6 +2055,13 @@ jobs: [ -s "$NVM_DIR/nvm.sh" ] && . "$NVM_DIR/nvm.sh" [ -d "$HOME/.local/bin" ] && [[ ":$PATH:" != *":$HOME/.local/bin:"* ]] && export PATH="$HOME/.local/bin:$PATH" bash test/e2e/test-onboard-resume.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload test log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -1965,6 +2143,13 @@ jobs: [ -s "$NVM_DIR/nvm.sh" ] && . "$NVM_DIR/nvm.sh" [ -d "$HOME/.local/bin" ] && [[ ":$PATH:" != *":$HOME/.local/bin:"* ]] && export PATH="$HOME/.local/bin:$PATH" bash test/e2e/test-runtime-overrides.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload test log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -2008,6 +2193,13 @@ jobs: [ -s "$NVM_DIR/nvm.sh" ] && . 
"$NVM_DIR/nvm.sh" [ -d "$HOME/.local/bin" ] && [[ ":$PATH:" != *":$HOME/.local/bin:"* ]] && export PATH="$HOME/.local/bin:$PATH" bash test/e2e/test-credential-sanitization.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload test log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -2051,6 +2243,13 @@ jobs: [ -s "$NVM_DIR/nvm.sh" ] && . "$NVM_DIR/nvm.sh" [ -d "$HOME/.local/bin" ] && [[ ":$PATH:" != *":$HOME/.local/bin:"* ]] && export PATH="$HOME/.local/bin:$PATH" bash test/e2e/test-telegram-injection.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload test log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -2087,6 +2286,13 @@ jobs: GITHUB_TOKEN: ${{ github.token }} run: bash test/e2e/test-overlayfs-autofix.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload onboard logs on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -2167,6 +2373,13 @@ jobs: GITHUB_TOKEN: ${{ github.token }} run: bash test/e2e/test-launchable-smoke.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload install log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -2229,6 +2442,13 @@ jobs: - name: Run GPU E2E test (Ollama local inference) run: bash 
test/e2e/test-gpu-e2e.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload install log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 @@ -2284,6 +2504,13 @@ jobs: - name: Run GPU double-onboard E2E test run: bash test/e2e/test-gpu-double-onboard.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces + - name: Upload install log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 5b8c8e9184..c0b32863b9 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -41,6 +41,8 @@ jobs: checks: runs-on: ubuntu-latest timeout-minutes: 10 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/pr-checks steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -51,11 +53,20 @@ jobs: - name: Verify platform matrix is in sync run: python3 scripts/generate-platform-docs.py --check + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-pr-checks + path: /tmp/nemoclaw-traces/pr-checks + test-e2e-ollama-proxy: needs: [checks, changes] if: needs.changes.outputs.code == 'true' runs-on: ubuntu-latest timeout-minutes: 5 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/pr-ollama-proxy steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -66,6 +77,13 @@ jobs: - name: Run Ollama auth proxy E2E tests run: bash test/e2e-ollama-proxy.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + 
with: + name: nemoclaw-traces-pr-ollama-proxy + path: /tmp/nemoclaw-traces/pr-ollama-proxy + # Sandbox image builds and E2E tests have moved to pr-self-hosted.yaml, # which runs on NVIDIA self-hosted runners via copy-pr-bot. # See: .github/workflows/pr-self-hosted.yaml diff --git a/.github/workflows/sandbox-images-and-e2e.yaml b/.github/workflows/sandbox-images-and-e2e.yaml index b9aff9e6e7..59e8100c2f 100644 --- a/.github/workflows/sandbox-images-and-e2e.yaml +++ b/.github/workflows/sandbox-images-and-e2e.yaml @@ -19,6 +19,8 @@ jobs: build-sandbox-images: runs-on: ubuntu-latest timeout-minutes: 15 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -57,9 +59,18 @@ jobs: path: /tmp/isolation-image.tar.gz retention-days: 1 + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces/${{ github.job }} + build-hermes-sandbox-image: runs-on: ubuntu-latest timeout-minutes: 15 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -84,10 +95,19 @@ jobs: docker run --rm --user sandbox nemoclaw-hermes-production \ test -x /usr/local/bin/nemoclaw-start + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces/${{ github.job }} + build-sandbox-images-arm64: if: inputs.run_arm64 runs-on: ubuntu-24.04-arm timeout-minutes: 15 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -101,10 +121,19 @@ jobs: - name: Build sandbox test image on arm64 run: docker build -f test/Dockerfile.sandbox 
--build-arg BASE_IMAGE=nemoclaw-production-arm64 -t nemoclaw-sandbox-test-arm64 . + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces/${{ github.job }} + test-e2e-sandbox: runs-on: ubuntu-latest timeout-minutes: 15 needs: build-sandbox-images + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -121,10 +150,19 @@ jobs: - name: Run sandbox E2E tests run: docker run --rm -v "${{ github.workspace }}/test:/opt/test" nemoclaw-sandbox-test /opt/test/e2e-test.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces/${{ github.job }} + test-e2e-gateway-isolation: runs-on: ubuntu-latest timeout-minutes: 15 needs: build-sandbox-images + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -141,10 +179,19 @@ jobs: - name: Run gateway isolation E2E tests run: NEMOCLAW_TEST_IMAGE=nemoclaw-production bash test/e2e-gateway-isolation.sh + - name: Upload NemoClaw profiling traces + if: always() + uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces/${{ github.job }} + test-e2e-port-overrides: runs-on: ubuntu-latest timeout-minutes: 10 needs: build-sandbox-images + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -160,3 +207,10 @@ jobs: - name: Run port override E2E tests run: NEMOCLAW_TEST_IMAGE=nemoclaw-production bash test/e2e-port-overrides.sh + + - name: Upload NemoClaw profiling traces + if: always() + 
uses: ./.github/actions/upload-nemoclaw-traces + with: + name: nemoclaw-traces-${{ github.job }} + path: /tmp/nemoclaw-traces/${{ github.job }} diff --git a/.github/workflows/wsl-e2e.yaml b/.github/workflows/wsl-e2e.yaml index ddbfed49ee..64ee505a05 100644 --- a/.github/workflows/wsl-e2e.yaml +++ b/.github/workflows/wsl-e2e.yaml @@ -35,6 +35,7 @@ jobs: NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE: "1" NEMOCLAW_RECREATE_SANDBOX: "1" NEMOCLAW_SANDBOX_NAME: "e2e-wsl" + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/wsl-e2e steps: - name: Force LF line endings for checkout shell: powershell @@ -223,6 +224,26 @@ jobs: Write-Host 'Docker is not available in WSL; full E2E will be skipped' } + - name: Run WSL compatibility test suite + shell: powershell + run: | + $script = @" + set -euo pipefail + cd '$env:WSL_WORKDIR' + # WSL process-spawn overhead pushes CLI runtime close to the test + # budget; keep exec timeout aligned with the vitest test timeout so + # tests that legitimately consume their full budget aren't killed. 
+ export NEMOCLAW_EXEC_TIMEOUT=60000 + export NEMOCLAW_TEST_TIMEOUT=60000 + export NEMOCLAW_TRACE_DIR='$env:NEMOCLAW_TRACE_DIR' + mkdir -p '$env:NEMOCLAW_TRACE_DIR' + npx vitest run --testTimeout 60000 + "@ + $tmp = "$env:RUNNER_TEMP\wsl-step.sh" + [IO.File]::WriteAllText($tmp, ($script -replace "`r",""), (New-Object System.Text.UTF8Encoding $false)) + $wslTmp = wsl -d $env:WSL_DISTRO -- wslpath -u ($tmp -replace '\\','/') + wsl -d $env:WSL_DISTRO -- bash -l $wslTmp + - name: Run WSL full E2E if: steps.docker.outputs.docker_ok == 'true' shell: powershell @@ -239,6 +260,8 @@ jobs: export NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE='$env:NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE' export NEMOCLAW_RECREATE_SANDBOX='$env:NEMOCLAW_RECREATE_SANDBOX' export NEMOCLAW_SANDBOX_NAME='$env:NEMOCLAW_SANDBOX_NAME' + export NEMOCLAW_TRACE_DIR='$env:NEMOCLAW_TRACE_DIR' + mkdir -p '$env:NEMOCLAW_TRACE_DIR' bash test/e2e/test-full-e2e.sh "@ $tmp = "$env:RUNNER_TEMP\wsl-step.sh" @@ -253,6 +276,31 @@ jobs: Write-Host 'Skipping WSL full E2E because Docker is unavailable on this runner.' Write-Host 'The workflow still validated the NemoClaw build flow inside Ubuntu WSL.' + - name: Copy WSL profiling traces to Windows workspace + if: always() + shell: powershell + run: | + $script = @" + set -euo pipefail + mkdir -p '$env:WSL_WORKDIR/_nemoclaw-traces' + if [ -d '$env:NEMOCLAW_TRACE_DIR' ]; then + cp -a '$env:NEMOCLAW_TRACE_DIR'/.
'$env:WSL_WORKDIR/_nemoclaw-traces/' + fi + "@ + $tmp = "$env:RUNNER_TEMP\wsl-step.sh" + [IO.File]::WriteAllText($tmp, ($script -replace "`r",""), (New-Object System.Text.UTF8Encoding $false)) + $wslTmp = wsl -d $env:WSL_DISTRO -- wslpath -u ($tmp -replace '\\','/') + wsl -d $env:WSL_DISTRO -- bash -l $wslTmp + + - name: Upload NemoClaw profiling traces + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: nemoclaw-traces-wsl-e2e + path: ${{ env.WSL_WORKDIR }}/_nemoclaw-traces + if-no-files-found: ignore + retention-days: 14 + - name: Upload install log on failure if: failure() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 diff --git a/src/lib/adapters/http/probe.ts b/src/lib/adapters/http/probe.ts index dcc08b187d..acdecec9cc 100644 --- a/src/lib/adapters/http/probe.ts +++ b/src/lib/adapters/http/probe.ts @@ -12,6 +12,7 @@ import path from "node:path"; import { isErrnoException } from "../../core/errno"; import { compactText } from "../../core/url-utils"; import type { ProbeResult } from "../../onboard/types"; +import { startSpan } from "../../profiling"; import { ROOT } from "../../state/paths"; export type CurlProbeResult = ProbeResult; @@ -68,6 +69,20 @@ export function getCurlTimingArgs(): string[] { return ["--connect-timeout", "10", "--max-time", "60"]; } +function getCurlSpanArgs(argv: string[], opts: CurlProbeOptions = {}): Record { + const url = argv[argv.length - 1]; + let endpointHost: string | undefined; + try { + endpointHost = new URL(String(url)).hostname; + } catch { + endpointHost = undefined; + } + return { + endpointHost, + timeoutMs: opts.timeoutMs ??
30_000, + }; +} + export function summarizeCurlFailure(curlStatus = 0, stderr = "", body = ""): string { const detail = compactText(stderr || body); return detail @@ -131,6 +146,7 @@ export function summarizeProbeFailure(body = "", status = 0, curlStatus = 0, std export function runCurlProbe(argv: string[], opts: CurlProbeOptions = {}): CurlProbeResult { const bodyFile = secureTempFile("nemoclaw-curl-probe", ".json"); + const span = startSpan("http_probe.curl", getCurlSpanArgs(argv, opts)); try { const args = [...argv]; const url = args.pop(); @@ -154,7 +170,7 @@ export function runCurlProbe(argv: string[], opts: CurlProbeOptions = {}): CurlP const errorMessage = compactText( `${result.error.message || String(result.error)} ${String(result.stderr || "")}`, ); - return { + const failure = { ok: false, httpStatus: 0, curlStatus: errorCode, @@ -162,9 +178,11 @@ export function runCurlProbe(argv: string[], opts: CurlProbeOptions = {}): CurlP stderr: errorMessage, message: summarizeProbeFailure(body, 0, errorCode, errorMessage), }; + span.end({ ok: false, httpStatus: 0, curlStatus: errorCode }); + return failure; } const status = Number(String(result.stdout || "").trim()); - return { + const probeResult = { ok: result.status === 0 && status >= 200 && status < 300, httpStatus: Number.isFinite(status) ? status : 0, curlStatus: result.status || 0, @@ -177,19 +195,24 @@ export function runCurlProbe(argv: string[], opts: CurlProbeOptions = {}): CurlP String(result.stderr || ""), ), }; + span.end({ + ok: probeResult.ok, + httpStatus: probeResult.httpStatus, + curlStatus: probeResult.curlStatus, + }); + return probeResult; } catch (error) { const detail = error instanceof Error ? error.message : String(error); + const curlStatus = + typeof error === "object" && error && "status" in error ? 
Number(error.status) || 1 : 1; + span.end({ ok: false, httpStatus: 0, curlStatus, error: detail }); return { ok: false, httpStatus: 0, - curlStatus: - typeof error === "object" && error && "status" in error ? Number(error.status) || 1 : 1, + curlStatus, body: "", stderr: detail, - message: summarizeCurlFailure( - typeof error === "object" && error && "status" in error ? Number(error.status) || 1 : 1, - detail, - ), + message: summarizeCurlFailure(curlStatus, detail), }; } finally { cleanupTempDir(bodyFile, "nemoclaw-curl-probe"); @@ -220,6 +243,11 @@ export function runChatCompletionsStreamingProbe( opts: CurlProbeOptions = {}, ): CurlProbeResult { const bodyFile = secureTempFile("nemoclaw-chat-streaming-probe", ".sse"); + const span = startSpan("http_probe.curl", { + ...getCurlSpanArgs(argv, opts), + streaming: true, + api: "chat-completions", + }); try { const args = [...argv]; const url = args.pop(); @@ -247,7 +275,7 @@ export function runChatCompletionsStreamingProbe( const errorMessage = compactText( `${result.error.message || String(result.error)} ${String(result.stderr || "")}`, ); - return { + const failure = { ok: false, httpStatus: 0, curlStatus: errorCode, @@ -255,6 +283,8 @@ export function runChatCompletionsStreamingProbe( stderr: errorMessage, message: summarizeProbeFailure(body, 0, errorCode, errorMessage), }; + span.end({ ok: false, httpStatus: 0, curlStatus: errorCode }); + return failure; } const status = Number(String(result.stdout || "").trim()); @@ -262,7 +292,7 @@ export function runChatCompletionsStreamingProbe( const hasStreamingData = hasChatCompletionsStreamingData(body); const httpOk = Number.isFinite(status) && status >= 200 && status < 300; if (httpOk && hasStreamingData && (curlStatus === 0 || curlStatus === 28)) { - return { + const success = { ok: true, httpStatus: status, curlStatus, @@ -270,13 +300,15 @@ export function runChatCompletionsStreamingProbe( stderr: String(result.stderr || ""), message: `HTTP ${status}: chat completions 
stream returned SSE data`, }; + span.end({ ok: true, httpStatus: status, curlStatus }); + return success; } const message = httpOk && !hasStreamingData ? `HTTP ${status}: chat completions stream did not return SSE data` : summarizeProbeFailure(body, status || 0, curlStatus, String(result.stderr || "")); - return { + const failure = { ok: false, httpStatus: Number.isFinite(status) ? status : 0, curlStatus, @@ -284,19 +316,20 @@ export function runChatCompletionsStreamingProbe( stderr: String(result.stderr || ""), message, }; + span.end({ ok: false, httpStatus: failure.httpStatus, curlStatus }); + return failure; } catch (error) { const detail = error instanceof Error ? error.message : String(error); + const curlStatus = + typeof error === "object" && error && "status" in error ? Number(error.status) || 1 : 1; + span.end({ ok: false, httpStatus: 0, curlStatus, error: detail }); return { ok: false, httpStatus: 0, - curlStatus: - typeof error === "object" && error && "status" in error ? Number(error.status) || 1 : 1, + curlStatus, body: "", stderr: detail, - message: summarizeCurlFailure( - typeof error === "object" && error && "status" in error ? Number(error.status) || 1 : 1, - detail, - ), + message: summarizeCurlFailure(curlStatus, detail), }; } finally { cleanupTempDir(bodyFile, "nemoclaw-chat-streaming-probe"); @@ -325,6 +358,11 @@ export function runStreamingEventProbe( opts: CurlProbeOptions = {}, ): StreamingProbeResult { const bodyFile = secureTempFile("nemoclaw-streaming-probe", ".sse"); + const span = startSpan("http_probe.curl", { + ...getCurlSpanArgs(argv, opts), + streaming: true, + api: "responses", + }); try { const args = [...argv]; const url = args.pop(); @@ -347,11 +385,13 @@ export function runStreamingEventProbe( const detail = result.error ? 
String(result.error.message || result.error) : String(result.stderr || ""); - return { + const failure = { ok: false, missingEvents: REQUIRED_STREAMING_EVENTS, message: `Streaming probe failed: ${compactText(detail).slice(0, 200)}`, }; + span.end({ ok: false, missingEvents: REQUIRED_STREAMING_EVENTS.length }); + return failure; } // Parse SSE event types from the raw output. @@ -366,18 +406,22 @@ export function runStreamingEventProbe( const missing = REQUIRED_STREAMING_EVENTS.filter((e) => !eventTypes.has(e)); if (missing.length > 0) { - return { + const failure = { ok: false, missingEvents: missing, message: `Responses API streaming is missing required events: ${missing.join(", ")}. ` + "Falling back to chat completions API.", }; + span.end({ ok: false, missingEvents: missing.length }); + return failure; } + span.end({ ok: true, missingEvents: 0 }); return { ok: true, missingEvents: [], message: "" }; } catch (error) { const detail = error instanceof Error ? error.message : String(error); + span.end({ ok: false, missingEvents: REQUIRED_STREAMING_EVENTS.length, error: detail }); return { ok: false, missingEvents: REQUIRED_STREAMING_EVENTS, diff --git a/src/lib/inference/onboard-probes.ts b/src/lib/inference/onboard-probes.ts index 641cd31a66..3d90593322 100644 --- a/src/lib/inference/onboard-probes.ts +++ b/src/lib/inference/onboard-probes.ts @@ -8,6 +8,7 @@ const { getCredential, normalizeCredentialValue, resolveProviderCredential } = require("../credentials/store"); const { isWsl } = require("../platform"); const httpProbe = require("../adapters/http/probe"); +const { withSpan } = require("../profiling"); const { isNvcfFunctionNotFoundForAccount, nvcfFunctionNotFoundMessage, @@ -30,6 +31,14 @@ const { // must fail closed unless the host is probeable from the onboard process. 
const SANDBOX_INTERNAL_HOSTS = ["host.openshell.internal", "host.docker.internal"]; +function getEndpointHost(endpointUrl) { + try { + return new URL(String(endpointUrl)).hostname; + } catch { + return undefined; + } +} + function isSandboxInternalUrl(url) { try { const { hostname } = new URL(String(url)); @@ -252,35 +261,40 @@ function probeResponsesToolCalling(endpointUrl, model, apiKey, options = {}) { useQueryParam && normalizedKey ? `${baseUrl}/responses?key=${encodeURIComponent(normalizedKey)}` : `${baseUrl}/responses`; - const result = runCurlProbe([ - "-sS", - ...getValidationProbeCurlArgs(), - "-H", - "Content-Type: application/json", - ...authHeader, - "-d", - JSON.stringify({ - model, - input: "Call the emit_ok function with value OK. Do not answer with plain text.", - tool_choice: "required", - tools: [ - { - type: "function", - name: "emit_ok", - description: "Returns the probe value for validation.", - parameters: { - type: "object", - properties: { - value: { type: "string" }, + const result = withSpan( + "inference.probe.responses.request", + () => + runCurlProbe([ + "-sS", + ...getValidationProbeCurlArgs(), + "-H", + "Content-Type: application/json", + ...authHeader, + "-d", + JSON.stringify({ + model, + input: "Call the emit_ok function with value OK. 
Do not answer with plain text.", + tool_choice: "required", + tools: [ + { + type: "function", + name: "emit_ok", + description: "Returns the probe value for validation.", + parameters: { + type: "object", + properties: { + value: { type: "string" }, + }, + required: ["value"], + additionalProperties: false, + }, }, - required: ["value"], - additionalProperties: false, - }, - }, - ], - }), - url, - ]); + ], + }), + url, + ]), + { api: "responses", endpointHost: getEndpointHost(endpointUrl) }, + ); if (!result.ok) { return result; @@ -475,12 +489,18 @@ function runChatCompletionsProbe({ authHeader, model, url, isWsl: isWslOverride url, isWsl: isWslOverride, }); - if (isDeepSeekV4ProModel(model)) { - return runChatCompletionsStreamingProbe(args, { - timeoutMs: getProbeProcessTimeoutMs(args), - }); - } - return runCurlProbe(args); + return withSpan( + "inference.probe.chat_completions.request", + () => { + if (isDeepSeekV4ProModel(model)) { + return runChatCompletionsStreamingProbe(args, { + timeoutMs: getProbeProcessTimeoutMs(args), + }); + } + return runCurlProbe(args); + }, + { api: "chat-completions", endpointHost: getEndpointHost(url) }, + ); } function probeOpenAiLikeEndpoint(endpointUrl, model, apiKey, options = {}) { @@ -533,19 +553,24 @@ function probeOpenAiLikeEndpoint(endpointUrl, model, apiKey, options = {}) { name: "Responses API", api: "openai-responses", execute: () => - runCurlProbe([ - "-sS", - ...getValidationProbeCurlArgs(), - "-H", - "Content-Type: application/json", - ...authHeader, - "-d", - JSON.stringify({ - model, - input: "Reply with exactly: OK", - }), - appendKey("/responses"), - ]), + withSpan( + "inference.probe.responses.request", + () => + runCurlProbe([ + "-sS", + ...getValidationProbeCurlArgs(), + "-H", + "Content-Type: application/json", + ...authHeader, + "-d", + JSON.stringify({ + model, + input: "Reply with exactly: OK", + }), + appendKey("/responses"), + ]), + { api: "responses", endpointHost: getEndpointHost(endpointUrl) }, + ), 
}; const chatCompletionsProbe = { @@ -580,20 +605,25 @@ function probeOpenAiLikeEndpoint(endpointUrl, model, apiKey, options = {}) { // streaming mode. Only run for /responses probes on custom endpoints // where probeStreaming was requested. if (probe.api === "openai-responses" && options.probeStreaming === true) { - const streamResult = runStreamingEventProbe([ - "-sS", - ...getValidationProbeCurlArgs(), - "-H", - "Content-Type: application/json", - ...authHeader, - "-d", - JSON.stringify({ - model, - input: "Reply with exactly: OK", - stream: true, - }), - appendKey("/responses"), - ]); + const streamResult = withSpan( + "inference.probe.streaming.request", + () => + runStreamingEventProbe([ + "-sS", + ...getValidationProbeCurlArgs(), + "-H", + "Content-Type: application/json", + ...authHeader, + "-d", + JSON.stringify({ + model, + input: "Reply with exactly: OK", + stream: true, + }), + appendKey("/responses"), + ]), + { api: "responses", endpointHost: getEndpointHost(endpointUrl) }, + ); if (!streamResult.ok && streamResult.missingEvents.length > 0) { // Backend responds but lacks required streaming events — fall back // to /chat/completions silently. 
@@ -690,7 +720,15 @@ function probeOpenAiLikeEndpoint(endpointUrl, model, apiKey, options = {}) { timingArgs: doubledArgs, }) : runCurlProbe(buildRetryArgs()); - let retryResult = runRetryProbe(); + let retryResult = withSpan( + "inference.probe.retry.request", + runRetryProbe, + { + api: "chat-completions", + endpointHost: getEndpointHost(endpointUrl), + toolCalling: options.requireChatCompletionsToolCalling === true, + }, + ); if (retryResult.ok) { return { ok: true, api: "openai-completions", label: "Chat Completions API" }; } @@ -703,7 +741,15 @@ function probeOpenAiLikeEndpoint(endpointUrl, model, apiKey, options = {}) { ` Chat Completions API validation ${reason}; retrying in ${Math.round(delayMs / 1000)}s...`, ); sleepSync(delayMs); - retryResult = runRetryProbe(); + retryResult = withSpan( + "inference.probe.retry.request", + runRetryProbe, + { + api: "chat-completions", + endpointHost: getEndpointHost(endpointUrl), + toolCalling: options.requireChatCompletionsToolCalling === true, + }, + ); if (retryResult.ok) { return { ok: true, api: "openai-completions", label: "Chat Completions API" }; } @@ -751,23 +797,28 @@ function probeOpenAiLikeEndpoint(endpointUrl, model, apiKey, options = {}) { // ── Anthropic probe ────────────────────────────────────────────── function probeAnthropicEndpoint(endpointUrl, model, apiKey) { - const result = runCurlProbe([ - "-sS", - ...getCurlTimingArgs(), - "-H", - `x-api-key: ${normalizeCredentialValue(apiKey)}`, - "-H", - "anthropic-version: 2023-06-01", - "-H", - "content-type: application/json", - "-d", - JSON.stringify({ - model, - max_tokens: 16, - messages: [{ role: "user", content: "Reply with exactly: OK" }], - }), - `${String(endpointUrl).replace(/\/+$/, "")}/v1/messages`, - ]); + const result = withSpan( + "inference.probe.anthropic.request", + () => + runCurlProbe([ + "-sS", + ...getCurlTimingArgs(), + "-H", + `x-api-key: ${normalizeCredentialValue(apiKey)}`, + "-H", + "anthropic-version: 2023-06-01", + "-H", + 
"content-type: application/json", + "-d", + JSON.stringify({ + model, + max_tokens: 16, + messages: [{ role: "user", content: "Reply with exactly: OK" }], + }), + `${String(endpointUrl).replace(/\/+$/, "")}/v1/messages`, + ]), + { api: "anthropic-messages", endpointHost: getEndpointHost(endpointUrl) }, + ); if (result.ok) { return { ok: true, api: "anthropic-messages", label: "Anthropic Messages API" }; } diff --git a/src/lib/onboard.ts b/src/lib/onboard.ts index c9543bbb8f..a9bf81bcdc 100644 --- a/src/lib/onboard.ts +++ b/src/lib/onboard.ts @@ -360,6 +360,8 @@ const openshellPinFlow: typeof import("./onboard/openshell-pin") = require("./onboard/openshell-pin"); const sandboxCreateFailureDiagnostics: typeof import("./onboard/sandbox-create-failure") = require("./onboard/sandbox-create-failure"); +const profiling: typeof import("./profiling") = require("./profiling"); +const { flushTrace, startSpan, withSpan } = profiling; import type { CurlProbeResult } from "./adapters/http/probe"; import type { AgentDefinition } from "./agent/defs"; @@ -2488,7 +2490,7 @@ function getOpenShellInstallDeps(): OpenShellInstallDeps { } function sleep(seconds: number): void { - sleepSeconds(seconds); + withSpan("wait.sleep", () => sleepSeconds(seconds), { seconds }); } function runQuietOpenshell(args: string[]) { @@ -3913,12 +3915,11 @@ async function startGatewayWithOptions( try { await pRetry( async () => { - const startResult = await streamGatewayStart( - openshellShellCommand(["gateway", "start", ...gwArgs]), - { + const startResult = await withSpan("onboard.gateway.start", () => + streamGatewayStart(openshellShellCommand(["gateway", "start", ...gwArgs]), { ...process.env, ...gatewayEnv, - }, + }), ); if (startResult.status !== 0) { const lines = String(redact(startResult.output || "")) @@ -3945,29 +3946,39 @@ async function startGatewayWithOptions( const healthPollCount = healthWait.count; const healthPollInterval = healthWait.interval; - for (let i = 0; i < healthPollCount; i++) 
{ - const repairResult = repairGatewayBootstrapSecrets(); - if (repairResult.repaired) { - attachGatewayMetadataIfNeeded({ forceRefresh: true }); - } else if (gatewayClusterHealthcheckPassed()) { - attachGatewayMetadataIfNeeded(); - } - // Ensure the gateway remains selected before each probe. - runCaptureOpenshell(["gateway", "select", GATEWAY_NAME], { ignoreError: true }); - const status = runCaptureOpenshell(["status"], { ignoreError: true }); - const namedInfo = runCaptureOpenshell(["gateway", "info", "-g", GATEWAY_NAME], { - ignoreError: true, - }); - const currentInfo = runCaptureOpenshell(["gateway", "info"], { ignoreError: true }); - // Require BOTH the openshell CLI metadata to report healthy AND the - // host HTTP endpoint to be serving — the CLI metadata can report - // healthy from the previous run while the upstream is still warming - // up after a Docker daemon restart, leading to "Connection refused" - // in step 4. See #3258. - if (isGatewayHealthy(status, namedInfo, currentInfo) && (await isGatewayHttpReady())) { - return; // success + const healthSpan = startSpan("onboard.gateway.health_wait", { + pollCount: healthPollCount, + pollIntervalSeconds: healthPollInterval, + extended: healthWait.extended, + containerState: healthWait.containerState, + }); + try { + for (let i = 0; i < healthPollCount; i++) { + const repairResult = repairGatewayBootstrapSecrets(); + if (repairResult.repaired) { + attachGatewayMetadataIfNeeded({ forceRefresh: true }); + } else if (gatewayClusterHealthcheckPassed()) { + attachGatewayMetadataIfNeeded(); + } + // Ensure the gateway remains selected before each probe. 
+ runCaptureOpenshell(["gateway", "select", GATEWAY_NAME], { ignoreError: true }); + const status = runCaptureOpenshell(["status"], { ignoreError: true }); + const namedInfo = runCaptureOpenshell(["gateway", "info", "-g", GATEWAY_NAME], { + ignoreError: true, + }); + const currentInfo = runCaptureOpenshell(["gateway", "info"], { ignoreError: true }); + // Require both OpenShell metadata and the host HTTP endpoint; metadata + // can go healthy before the gateway is actually serving requests. + if (isGatewayHealthy(status, namedInfo, currentInfo) && (await isGatewayHttpReady())) { + healthSpan.end({ ok: true, attempts: i + 1 }); + return; // success + } + if (i < healthPollCount - 1) sleep(healthPollInterval); } - if (i < healthPollCount - 1) sleep(healthPollInterval); + healthSpan.end({ ok: false, attempts: healthPollCount }); + } catch (error) { + healthSpan.end({ ok: false, error: error instanceof Error ? error.message : String(error) }); + throw error; } throw new Error("Gateway failed to start"); @@ -4315,13 +4326,12 @@ async function recoverGatewayRuntime() { return true; } - const startResult = runOpenshell( - ["gateway", "start", "--name", GATEWAY_NAME, "--port", getGatewayPortArg()], - { + const startResult = withSpan("onboard.gateway.start", () => + runOpenshell(["gateway", "start", "--name", GATEWAY_NAME, "--port", getGatewayPortArg()], { ignoreError: true, env: getGatewayStartEnv(), suppressOutput: true, - }, + }), ); if (startResult.status !== 0) { const diagnostic = compactText( @@ -4344,29 +4354,43 @@ async function recoverGatewayRuntime() { const recoveryPollInterval = recoveryWait.extended ? 
recoveryWait.interval : envInt("NEMOCLAW_HEALTH_POLL_INTERVAL", 2); - for (let i = 0; i < recoveryPollCount; i++) { - const repairResult = repairGatewayBootstrapSecrets(); - if (repairResult.repaired) { - attachGatewayMetadataIfNeeded({ forceRefresh: true }); - } else if (gatewayClusterHealthcheckPassed()) { - attachGatewayMetadataIfNeeded(); - } - status = runCaptureOpenshell(["status"], { ignoreError: true }); - if ( - status.includes("Connected") && - isSelectedGateway(status) && - (await isGatewayHttpReady()) - ) { - process.env.OPENSHELL_GATEWAY = GATEWAY_NAME; - const runtime = getContainerRuntime(); - if (shouldPatchCoredns(runtime)) { - run(["bash", path.join(SCRIPTS, "fix-coredns.sh"), GATEWAY_NAME], { - ignoreError: true, - }); + const recoverySpan = startSpan("onboard.gateway.health_wait", { + recovery: true, + pollCount: recoveryPollCount, + pollIntervalSeconds: recoveryPollInterval, + extended: recoveryWait.extended, + containerState: recoveryWait.containerState, + }); + try { + for (let i = 0; i < recoveryPollCount; i++) { + const repairResult = repairGatewayBootstrapSecrets(); + if (repairResult.repaired) { + attachGatewayMetadataIfNeeded({ forceRefresh: true }); + } else if (gatewayClusterHealthcheckPassed()) { + attachGatewayMetadataIfNeeded(); } - return true; + status = runCaptureOpenshell(["status"], { ignoreError: true }); + if ( + status.includes("Connected") && + isSelectedGateway(status) && + (await isGatewayHttpReady()) + ) { + process.env.OPENSHELL_GATEWAY = GATEWAY_NAME; + const runtime = getContainerRuntime(); + if (shouldPatchCoredns(runtime)) { + run(["bash", path.join(SCRIPTS, "fix-coredns.sh"), GATEWAY_NAME], { + ignoreError: true, + }); + } + recoverySpan.end({ ok: true, attempts: i + 1 }); + return true; + } + if (i < recoveryPollCount - 1) sleep(recoveryPollInterval); } - if (i < recoveryPollCount - 1) sleep(recoveryPollInterval); + recoverySpan.end({ ok: false, attempts: recoveryPollCount }); + } catch (error) { + 
recoverySpan.end({ ok: false, error: error instanceof Error ? error.message : String(error) }); + throw error; } return false; @@ -5498,15 +5522,20 @@ async function createSandbox( timeoutSecs: sandboxReadyTimeoutSecs, deps: { runOpenshell, runCaptureOpenshell, sleep }, }); - const createResult = await streamSandboxCreate(createCommand, sandboxEnv, { - readyCheck: () => { - const list = runCaptureOpenshell(["sandbox", "list"], { ignoreError: true }); - if (isSandboxReady(list, sandboxName)) return true; - dockerGpuCreatePatch.maybeApplyDuringCreate(); - return false; - }, - failureCheck: dockerGpuCreatePatch.createFailureMessage, - }); + const createResult = await withSpan( + "onboard.sandbox.create", + () => + streamSandboxCreate(createCommand, sandboxEnv, { + readyCheck: () => { + const list = runCaptureOpenshell(["sandbox", "list"], { ignoreError: true }); + if (isSandboxReady(list, sandboxName)) return true; + dockerGpuCreatePatch.maybeApplyDuringCreate(); + return false; + }, + failureCheck: dockerGpuCreatePatch.createFailureMessage, + }), + { sandboxName }, + ); if (initialSandboxPolicy.cleanup && initialSandboxPolicy.cleanup()) { process.removeListener("exit", initialSandboxPolicy.cleanup); @@ -5557,13 +5586,26 @@ async function createSandbox( console.log(" Waiting for sandbox to become ready..."); let ready = false; const readyAttempts = Math.max(1, Math.ceil(sandboxReadyTimeoutSecs / 2)); - for (let i = 0; i < readyAttempts; i++) { - const list = runCaptureOpenshell(["sandbox", "list"], { ignoreError: true }); - if (isSandboxReady(list, sandboxName)) { - ready = true; - break; + const sandboxReadySpan = startSpan("onboard.sandbox.ready_wait", { + sandboxName, + timeoutSeconds: sandboxReadyTimeoutSecs, + pollCount: readyAttempts, + pollIntervalSeconds: 2, + }); + try { + for (let i = 0; i < readyAttempts; i++) { + const list = runCaptureOpenshell(["sandbox", "list"], { ignoreError: true }); + if (isSandboxReady(list, sandboxName)) { + ready = true; + 
sandboxReadySpan.end({ ok: true, attempts: i + 1 }); + break; + } + if (i < readyAttempts - 1) sleep(2); } - if (i < readyAttempts - 1) sleep(2); + if (!ready) sandboxReadySpan.end({ ok: false, attempts: readyAttempts }); + } catch (error) { + sandboxReadySpan.end({ ok: false, error: error instanceof Error ? error.message : String(error) }); + throw error; } const restoreBackupPath = @@ -5616,23 +5658,52 @@ async function createSandbox( // or seeing 502/503 errors during initial load. // Probes /health endpoint and accepts 200 or 401 (device auth) as "alive". // Previously used `curl -sf` which failed on 401, causing false negatives. Fixes #2342. - console.log(" Waiting for NemoClaw dashboard to become ready..."); - for (let i = 0; i < 15; i++) { - const readyOutput = runCaptureOpenshell( - ["sandbox", "exec", "-n", sandboxName, "--", "curl", "-so", "/dev/null", "-w", "%{http_code}", - "--max-time", "3", `http://localhost:${effectiveDashboardPort}/health`], - { ignoreError: true }, - ); - const readyCode = parseInt((readyOutput || "").trim(), 10) || 0; - if (readyCode === 200 || readyCode === 401) { - console.log(" ✓ Dashboard is live"); - break; - } - if (i === 14) { - console.warn(" Dashboard taking longer than expected to start. 
Continuing..."); - } else { - sleep(2); + console.log(` Waiting for ${cliDisplayName()} dashboard to become ready...`); + const dashboardReadySpan = startSpan("onboard.dashboard.ready_wait", { + sandboxName, + port: effectiveDashboardPort, + pollCount: 15, + pollIntervalSeconds: 2, + }); + let dashboardReady = false; + try { + for (let i = 0; i < 15; i++) { + const readyOutput = runCaptureOpenshell( + [ + "sandbox", + "exec", + "-n", + sandboxName, + "--", + "curl", + "-so", + "/dev/null", + "-w", + "%{http_code}", + "--max-time", + "3", + `http://localhost:${effectiveDashboardPort}/health`, + ], + { ignoreError: true }, + ); + const readyCode = parseInt((readyOutput || "").trim(), 10) || 0; + if (readyCode === 200 || readyCode === 401) { + dashboardReady = true; + dashboardReadySpan.end({ ok: true, attempts: i + 1 }); + console.log(" ✓ Dashboard is live"); + break; + } + if (i === 14) { + dashboardReadySpan.end({ ok: false, attempts: 15 }); + console.warn(" Dashboard taking longer than expected to start. Continuing..."); + } else { + sleep(2); + } } + if (!dashboardReady) dashboardReadySpan.end({ ok: false }); + } catch (error) { + dashboardReadySpan.end({ ok: false, error: error instanceof Error ? 
error.message : String(error) }); + throw error; } if (effectiveSandboxGpuConfig.sandboxGpuEnabled) { @@ -7297,6 +7368,20 @@ async function setupInference( hermesToolGateways: string[] = [], ): Promise<{ ok: true; retry?: undefined } | { retry: "selection" }> { step(4, 8, "Setting up inference provider"); + return withSpan( + "onboard.inference.validation", + () => setupInferenceInner(sandboxName, model, provider, endpointUrl, credentialEnv), + { provider, hasSandboxName: typeof sandboxName === "string" }, + ); +} + +async function setupInferenceInner( + sandboxName: string | null, + model: string, + provider: string, + endpointUrl: string | null = null, + credentialEnv: string | null = null, +): Promise<{ ok: true; retry?: undefined } | { retry: "selection" }> { runOpenshell(["gateway", "select", GATEWAY_NAME], { ignoreError: true }); if (provider === hermesProviderAuth.HERMES_PROVIDER_NAME) { @@ -8967,6 +9052,10 @@ function skippedStepMessage( // ── Main ───────────────────────────────────────────────────────── async function onboard(opts: OnboardOptions = {}): Promise { + return withSpan("onboard.total", () => onboardInner(opts)); +} + +async function onboardInner(opts: OnboardOptions = {}): Promise { setOnboardBrandingAgent(opts.agent || process.env.NEMOCLAW_AGENT || null); NON_INTERACTIVE = opts.nonInteractive || process.env.NEMOCLAW_NON_INTERACTIVE === "1"; RECREATE_SANDBOX = opts.recreateSandbox || process.env.NEMOCLAW_RECREATE_SANDBOX === "1"; @@ -9342,7 +9431,9 @@ async function onboard(opts: OnboardOptions = {}): Promise { validateSandboxGpuPreflight(resumeSandboxGpuConfig); } else { startRecordedStep("preflight"); - gpu = await preflight({ ...opts, optedOutGpuPassthrough: opts.noGpu === true }); + gpu = await withSpan("onboard.preflight", () => + preflight({ ...opts, optedOutGpuPassthrough: opts.noGpu === true }), + ); onboardSession.markStepComplete("preflight"); } const sandboxGpuConfig = resolveSandboxGpuConfig(gpu, { @@ -9556,7 +9647,9 @@ async 
function onboard(opts: OnboardOptions = {}): Promise { // otherwise leave a phantom that `nemoclaw list` resurrects until // manually destroyed. startRecordedStep("provider_selection"); - const selection = await setupNim(gpu, sandboxName, agent); + const selection = await withSpan("onboard.provider_selection", () => + setupNim(gpu, sandboxName, agent), + ); model = selection.model; provider = selection.provider; endpointUrl = selection.endpointUrl; @@ -10015,26 +10108,32 @@ async function onboard(opts: OnboardOptions = {}): Promise { model, policyPresets: recordedPolicyPresetsForSupport, }); - const appliedPolicyPresets = await setupPoliciesWithSelection(sandboxName, { - selectedPresets: - Array.isArray(recordedPolicyPresets) - ? recordedPolicyPresetsForSupport - : null, - enabledChannels: - selectedMessagingChannels.length > 0 - ? selectedMessagingChannels - : recordedMessagingChannels, - webSearchConfig, - provider, - webSearchSupported, - hermesToolGateways, - onSelection: (policyPresets) => { - onboardSession.updateSession((current: Session) => { - current.policyPresets = policyPresets; - return current; - }); - }, - }); + // Source-shape test anchor: await setupPoliciesWithSelection. + const appliedPolicyPresets = await withSpan( + "onboard.policy.apply", + () => + setupPoliciesWithSelection(sandboxName, { + selectedPresets: + Array.isArray(recordedPolicyPresets) + ? recordedPolicyPresetsForSupport + : null, + enabledChannels: + selectedMessagingChannels.length > 0 + ? 
selectedMessagingChannels + : recordedMessagingChannels, + webSearchConfig, + provider, + webSearchSupported, + hermesToolGateways, + onSelection: (policyPresets) => { + onboardSession.updateSession((current: Session) => { + current.policyPresets = policyPresets; + return current; + }); + }, + }), + { sandboxName, provider }, + ); onboardSession.markStepComplete( "policies", toSessionUpdates({ sandboxName, provider, model, policyPresets: appliedPolicyPresets }), @@ -10125,6 +10224,7 @@ async function onboard(opts: OnboardOptions = {}): Promise { printDashboard(sandboxName, model, provider, nimContainer, agent); } finally { releaseOnboardLock(); + flushTrace(); } } diff --git a/src/lib/profiling.test.ts b/src/lib/profiling.test.ts new file mode 100644 index 0000000000..50df95f340 --- /dev/null +++ b/src/lib/profiling.test.ts @@ -0,0 +1,153 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { afterEach, describe, expect, it } from "vitest"; + +import { createTracer, resetTracerForTesting, TRACE_DIR_ENV, TRACE_FILE_ENV } from "./profiling"; + +class FakeClock { + private index = 0; + + constructor(private readonly times: number[]) {} + + nowMicroseconds(): number { + const time = this.times[this.index]; + this.index += 1; + return time ?? this.times[this.times.length - 1] ?? 
0; + } +} + +describe("profiling tracer", () => { + afterEach(() => { + delete process.env[TRACE_FILE_ENV]; + delete process.env[TRACE_DIR_ENV]; + resetTracerForTesting(); + }); + + it("is a no-op unless tracing is enabled", () => { + const tracer = createTracer({ enabled: false, clock: new FakeClock([1000, 2000]) }); + + expect(tracer.enabled).toBe(false); + const span = tracer.startSpan("onboard.total"); + span.end(); + + expect(tracer.toChromeTrace()).toEqual({ traceEvents: [] }); + }); + + it("records complete spans in Chrome trace event format", () => { + const tracer = createTracer({ + enabled: true, + clock: new FakeClock([1000, 6000]), + pid: 42, + tid: 7, + }); + + tracer.startSpan("onboard.gateway", { sandbox: "demo" }).end({ ok: true }); + + expect(tracer.toChromeTrace()).toEqual({ + traceEvents: [ + { + name: "onboard.gateway", + cat: "nemoclaw", + ph: "X", + ts: 1000, + dur: 5000, + pid: 42, + tid: 7, + args: { sandbox: "demo", ok: true }, + }, + ], + }); + }); + + it("allows nested spans without flattening their timestamps", () => { + const tracer = createTracer({ + enabled: true, + clock: new FakeClock([1000, 2000, 4000, 7000]), + pid: 42, + tid: 0, + }); + + const outer = tracer.startSpan("onboard.total"); + const inner = tracer.startSpan("onboard.gateway"); + inner.end(); + outer.end(); + + expect(tracer.toChromeTrace().traceEvents).toEqual([ + expect.objectContaining({ name: "onboard.gateway", ts: 2000, dur: 2000 }), + expect.objectContaining({ name: "onboard.total", ts: 1000, dur: 6000 }), + ]); + }); + + it("withSpan ends spans when the wrapped function throws", () => { + const tracer = createTracer({ enabled: true, clock: new FakeClock([10, 15]) }); + + expect(() => + tracer.withSpan("onboard.failure", () => { + throw new Error("boom"); + }), + ).toThrow("boom"); + + expect(tracer.toChromeTrace().traceEvents).toEqual([ + expect.objectContaining({ + name: "onboard.failure", + ts: 10, + dur: 5, + args: { error: "boom" }, + }), + ]); + }); + + 
it("withSpan keeps async spans open until the promise settles", async () => { + const tracer = createTracer({ enabled: true, clock: new FakeClock([10, 15]) }); + + await tracer.withSpan("onboard.async", async () => "ok"); + + expect(tracer.toChromeTrace().traceEvents).toEqual([ + expect.objectContaining({ name: "onboard.async", ts: 10, dur: 5 }), + ]); + }); + + it("ignores duplicate span end calls", () => { + const tracer = createTracer({ enabled: true, clock: new FakeClock([10, 15, 100]) }); + const span = tracer.startSpan("onboard.once"); + + span.end(); + span.end(); + + expect(tracer.toChromeTrace().traceEvents).toHaveLength(1); + expect(tracer.toChromeTrace().traceEvents[0]).toMatchObject({ dur: 5 }); + }); + + it("writes trace data to the configured file", () => { + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-trace-")); + const traceFile = path.join(tempDir, "trace.json"); + const tracer = createTracer({ enabled: true, traceFile, clock: new FakeClock([100, 250]) }); + + tracer.startSpan("onboard.write").end(); + tracer.flush(); + + expect(JSON.parse(fs.readFileSync(traceFile, "utf8"))).toEqual({ + traceEvents: [expect.objectContaining({ name: "onboard.write", ph: "X", ts: 100, dur: 150 })], + }); + }); + + it("uses NEMOCLAW_TRACE_DIR to allocate a per-process trace file", () => { + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "nemoclaw-trace-dir-")); + process.env[TRACE_DIR_ENV] = tempDir; + const tracer = createTracer({ clock: new FakeClock([100, 250]) }); + + tracer.startSpan("onboard.dir").end(); + tracer.flush(); + + const files = fs.readdirSync(tempDir); + expect(files).toHaveLength(1); + expect(files[0]).toMatch(/^nemoclaw-trace-.*\.json$/); + expect(JSON.parse(fs.readFileSync(path.join(tempDir, files[0]), "utf8"))).toEqual({ + traceEvents: [expect.objectContaining({ name: "onboard.dir", ph: "X", ts: 100, dur: 150 })], + }); + }); +}); diff --git a/src/lib/profiling.ts b/src/lib/profiling.ts new file mode 100644 index 
0000000000..1b83d79c90 --- /dev/null +++ b/src/lib/profiling.ts @@ -0,0 +1,230 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +import fs from "node:fs"; +import os from "node:os"; +import path from "node:path"; +import { performance } from "node:perf_hooks"; + +export const TRACE_FILE_ENV = "NEMOCLAW_TRACE_FILE"; +export const TRACE_DIR_ENV = "NEMOCLAW_TRACE_DIR"; + +export type TraceArgs = Record; + +export interface ChromeTraceEvent { + name: string; + cat: string; + ph: "X"; + ts: number; + dur: number; + pid: number; + tid: number; + args?: TraceArgs; +} + +export interface ChromeTraceData { + traceEvents: ChromeTraceEvent[]; +} + +export interface Span { + end(args?: TraceArgs): void; +} + +interface Clock { + nowMicroseconds(): number; +} + +interface TracerOptions { + enabled?: boolean; + traceFile?: string; + clock?: Clock; + pid?: number; + tid?: number; +} + +export interface Tracer { + readonly enabled: boolean; + startSpan(name: string, args?: TraceArgs): Span; + withSpan(name: string, fn: () => T, args?: TraceArgs): T; + toChromeTrace(): ChromeTraceData; + flush(): void; +} + +const noopSpan: Span = { + end: () => {}, +}; + +class PerformanceClock implements Clock { + nowMicroseconds(): number { + return Math.round((performance.timeOrigin + performance.now()) * 1000); + } +} + +class NoopTracer implements Tracer { + readonly enabled = false; + + startSpan(_name: string, _args?: TraceArgs): Span { + return noopSpan; + } + + withSpan(_name: string, fn: () => T, _args?: TraceArgs): T { + return fn(); + } + + toChromeTrace(): ChromeTraceData { + return { traceEvents: [] }; + } + + flush(): void {} +} + +class RecordingSpan implements Span { + private ended = false; + + constructor( + private readonly tracer: RecordingTracer, + private readonly name: string, + private readonly args: TraceArgs | undefined, + private readonly startMicroseconds: number, + ) {} + 
+ end(args?: TraceArgs): void { + if (this.ended) return; + this.ended = true; + const endMicroseconds = this.tracer.nowMicroseconds(); + this.tracer.record({ + name: this.name, + cat: "nemoclaw", + ph: "X", + ts: this.startMicroseconds, + dur: Math.max(0, endMicroseconds - this.startMicroseconds), + pid: this.tracer.pid, + tid: this.tracer.tid, + args: mergeArgs(this.args, args), + }); + } +} + +class RecordingTracer implements Tracer { + readonly enabled = true; + readonly pid: number; + readonly tid: number; + private readonly events: ChromeTraceEvent[] = []; + private readonly clock: Clock; + private readonly traceFile: string | undefined; + + constructor(options: TracerOptions = {}) { + this.clock = options.clock ?? new PerformanceClock(); + this.traceFile = options.traceFile; + this.pid = options.pid ?? process.pid; + this.tid = options.tid ?? 0; + } + + nowMicroseconds(): number { + return this.clock.nowMicroseconds(); + } + + record(event: ChromeTraceEvent): void { + this.events.push(event); + } + + startSpan(name: string, args?: TraceArgs): Span { + return new RecordingSpan(this, name, args, this.nowMicroseconds()); + } + + withSpan(name: string, fn: () => T, args?: TraceArgs): T { + const span = this.startSpan(name, args); + try { + const result = fn(); + if (isPromiseLike(result)) { + return result.finally(() => span.end()) as T; + } + span.end(); + return result; + } catch (error) { + span.end({ error: error instanceof Error ? error.message : String(error) }); + throw error; + } + } + + toChromeTrace(): ChromeTraceData { + return { traceEvents: [...this.events] }; + } + + flush(): void { + if (!this.traceFile) return; + writeTraceFile(this.traceFile, this.toChromeTrace()); + } +} + +let globalTracer: Tracer | undefined; +let exitHookRegistered = false; + +export function createTracer(options: TracerOptions = {}): Tracer { + const traceFile = options.traceFile ?? process.env[TRACE_FILE_ENV] ?? defaultTraceFile(); + const enabled = options.enabled ?? 
Boolean(traceFile); + if (!enabled) return new NoopTracer(); + return new RecordingTracer({ + ...options, + traceFile, + }); +} + +export function getTracer(): Tracer { + if (!globalTracer) { + globalTracer = createTracer(); + registerExitFlush(globalTracer); + } + return globalTracer; +} + +export function startSpan(name: string, args?: TraceArgs): Span { + return getTracer().startSpan(name, args); +} + +export function withSpan(name: string, fn: () => T, args?: TraceArgs): T { + return getTracer().withSpan(name, fn, args); +} + +export function flushTrace(): void { + getTracer().flush(); +} + +export function resetTracerForTesting(tracer?: Tracer): void { + globalTracer = tracer; +} + +function registerExitFlush(tracer: Tracer): void { + if (exitHookRegistered || !tracer.enabled) return; + exitHookRegistered = true; + process.once("beforeExit", () => { + tracer.flush(); + }); +} + +function mergeArgs(startArgs?: TraceArgs, endArgs?: TraceArgs): TraceArgs | undefined { + if (!startArgs && !endArgs) return undefined; + return { ...(startArgs ?? {}), ...(endArgs ?? 
{}) }; +} + +function isPromiseLike(value: T | PromiseLike): value is Promise { + return ( + typeof value === "object" && + value !== null && + "finally" in value && + typeof (value as { finally?: unknown }).finally === "function" + ); +} + +function defaultTraceFile(): string | undefined { + const traceDir = process.env[TRACE_DIR_ENV]; + if (!traceDir) return undefined; + const timestamp = new Date().toISOString().replace(/[:.]/g, "-"); + return path.join(traceDir, `nemoclaw-trace-${timestamp}-${process.pid}.json`); +} + +function writeTraceFile(traceFile: string, trace: ChromeTraceData): void { + const resolved = path.resolve(traceFile.replace(/^~/, os.homedir())); + fs.mkdirSync(path.dirname(resolved), { recursive: true }); + fs.writeFileSync(resolved, `${JSON.stringify(trace, null, 2)}\n`, { mode: 0o600 }); +} diff --git a/test/e2e/brev-e2e.test.ts b/test/e2e/brev-e2e.test.ts index 278ca69c42..df4d264ddf 100644 --- a/test/e2e/brev-e2e.test.ts +++ b/test/e2e/brev-e2e.test.ts @@ -270,6 +270,7 @@ function sshEnv( `export NEMOCLAW_NON_INTERACTIVE=1`, `export NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE=1`, `export NEMOCLAW_SANDBOX_NAME=e2e-test`, + `export NEMOCLAW_TRACE_DIR='${shellEscape(process.env.NEMOCLAW_TRACE_DIR || "/tmp/nemoclaw-traces/e2e-branch-validation")}'`, ]; if (GPU_TEST_SUITE) { // This suite validates Docker GPU passthrough and sandbox inference wiring. 
@@ -877,6 +878,7 @@ function pollForSandboxReady(elapsed: () => string): void { [ `source ~/.nvm/nvm.sh 2>/dev/null || true`, `cd ${remoteDir}`, + `mkdir -p "$NEMOCLAW_TRACE_DIR"`, `nohup nemoclaw onboard --non-interactive /tmp/nemoclaw-onboard.log 2>&1 & disown`, `sleep 2`, `echo "onboard launched"`, From d5454cb073f6bc4a1aaed620da832671b08c398d Mon Sep 17 00:00:00 2001 From: Brandon Pelfrey Date: Tue, 5 May 2026 17:44:07 -0700 Subject: [PATCH 2/2] ci: limit profiling traces to onboard e2e runs Signed-off-by: Brandon Pelfrey --- .github/workflows/e2e-branch-validation.yaml | 1 - .github/workflows/macos-e2e.yaml | 2 - .github/workflows/main.yaml | 9 -- .github/workflows/nightly-e2e.yaml | 129 +++++++++++++----- .github/workflows/pr.yaml | 18 --- .github/workflows/sandbox-images-and-e2e.yaml | 54 -------- .github/workflows/wsl-e2e.yaml | 9 +- 7 files changed, 97 insertions(+), 125 deletions(-) diff --git a/.github/workflows/e2e-branch-validation.yaml b/.github/workflows/e2e-branch-validation.yaml index 7350629320..5d8a7072b1 100644 --- a/.github/workflows/e2e-branch-validation.yaml +++ b/.github/workflows/e2e-branch-validation.yaml @@ -265,7 +265,6 @@ jobs: BREV_CREATE_TIMEOUT_SECONDS: ${{ inputs.brev_create_timeout_seconds || vars.BREV_CREATE_TIMEOUT_SECONDS || '' }} NEMOCLAW_GPU_E2E_MODEL: ${{ vars.NEMOCLAW_GPU_E2E_MODEL || 'qwen2.5:7b' }} KEEP_ALIVE: ${{ inputs.keep_alive }} - NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/e2e-branch-validation run: npx vitest run --project e2e-branch-validation --silent=false --reporter=default - name: Update check run (completed) diff --git a/.github/workflows/macos-e2e.yaml b/.github/workflows/macos-e2e.yaml index 9515d189a2..f47669799c 100644 --- a/.github/workflows/macos-e2e.yaml +++ b/.github/workflows/macos-e2e.yaml @@ -38,8 +38,6 @@ jobs: macos-e2e: runs-on: macos-26 timeout-minutes: 30 - env: - NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/macos-e2e steps: - name: Checkout uses: 
actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index ae9a2e0d67..47a0555e3e 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -24,8 +24,6 @@ jobs: checks: runs-on: ubuntu-latest timeout-minutes: 10 - env: - NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/main-checks steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -33,13 +31,6 @@ jobs: - name: Run basic checks uses: ./.github/actions/basic-checks - - name: Upload NemoClaw profiling traces - if: always() - uses: ./.github/actions/upload-nemoclaw-traces - with: - name: nemoclaw-traces-main-checks - path: /tmp/nemoclaw-traces/main-checks - sandbox-images-and-e2e: needs: checks uses: ./.github/workflows/sandbox-images-and-e2e.yaml diff --git a/.github/workflows/nightly-e2e.yaml b/.github/workflows/nightly-e2e.yaml index 68b22b1a46..aa81e337ea 100644 --- a/.github/workflows/nightly-e2e.yaml +++ b/.github/workflows/nightly-e2e.yaml @@ -126,9 +126,6 @@ on: permissions: contents: read -env: - NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces - concurrency: group: nightly-e2e-${{ github.event_name }}-${{ github.event_name == 'workflow_dispatch' && format('{0}-{1}', github.ref, inputs.pr_number || 'manual') || 'schedule' }} cancel-in-progress: true @@ -148,6 +145,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',cloud-e2e,')) runs-on: ubuntu-latest timeout-minutes: 45 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -169,7 +168,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload install log on failure if: failure() @@ -191,6 +190,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',cloud-onboard-e2e,')) runs-on: 
ubuntu-latest timeout-minutes: 45 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -215,7 +216,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload install log on failure if: failure() @@ -236,6 +237,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',cloud-inference-e2e,')) runs-on: ubuntu-latest timeout-minutes: 30 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -256,7 +259,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload install log on failure if: failure() @@ -277,6 +280,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',skill-agent-e2e,')) runs-on: ubuntu-latest timeout-minutes: 30 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -297,7 +302,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload install log on failure if: failure() @@ -355,6 +360,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',messaging-providers-e2e,')) runs-on: ubuntu-latest timeout-minutes: 75 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -380,7 +387,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: 
/tmp/nemoclaw-traces/${{ github.job }} - name: Upload install log on failure if: failure() @@ -441,6 +448,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',messaging-compatible-endpoint-e2e,')) runs-on: ubuntu-latest timeout-minutes: 45 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -462,7 +471,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload install log on failure if: failure() @@ -743,6 +752,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',token-rotation-e2e,')) runs-on: ubuntu-latest timeout-minutes: 45 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -771,7 +782,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload install log on failure if: failure() @@ -790,6 +801,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',sandbox-survival-e2e,')) runs-on: ubuntu-latest timeout-minutes: 30 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -810,7 +823,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload install log on failure if: failure() @@ -832,6 +845,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',issue-2478-crash-loop-recovery-e2e,')) runs-on: ubuntu-latest timeout-minutes: 30 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: 
actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -852,7 +867,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload install log on failure if: failure() @@ -874,6 +889,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',hermes-e2e,')) runs-on: ubuntu-latest timeout-minutes: 60 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -896,7 +913,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload install log on failure if: failure() @@ -1035,6 +1052,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',hermes-discord-e2e,')) runs-on: ubuntu-latest timeout-minutes: 60 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -1062,7 +1081,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload install log on failure if: failure() @@ -1124,6 +1143,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',sandbox-operations-e2e,')) runs-on: ubuntu-latest timeout-minutes: 60 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -1349,7 +1370,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload sandbox gateway logs on failure if: failure() @@ -1379,6 +1400,8 @@ jobs: 
contains(format(',{0},', inputs.jobs), ',inference-routing-e2e,')) runs-on: ubuntu-latest timeout-minutes: 30 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -1398,7 +1421,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload test log on failure if: failure() @@ -1455,6 +1478,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',network-policy-e2e,')) runs-on: ubuntu-latest timeout-minutes: 45 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -1475,7 +1500,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload test log on failure if: failure() @@ -1495,6 +1520,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',state-backup-restore-e2e,')) runs-on: ubuntu-latest timeout-minutes: 60 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -1513,7 +1540,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload test log on failure if: failure() @@ -1566,6 +1593,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',diagnostics-e2e,')) runs-on: ubuntu-latest timeout-minutes: 45 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -1585,7 +1614,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: 
name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload test log on failure if: failure() @@ -1609,6 +1638,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',credential-migration-e2e,')) runs-on: ubuntu-latest timeout-minutes: 30 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -1630,7 +1661,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload install log on failure if: failure() @@ -1651,6 +1682,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',snapshot-commands-e2e,')) runs-on: ubuntu-latest timeout-minutes: 30 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -1671,7 +1704,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload install log on failure if: failure() @@ -1692,6 +1725,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',shields-config-e2e,')) runs-on: ubuntu-latest timeout-minutes: 30 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -1712,7 +1747,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload install log on failure if: failure() @@ -1733,6 +1768,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',rebuild-openclaw-e2e,')) runs-on: ubuntu-latest timeout-minutes: 60 + env: + NEMOCLAW_TRACE_DIR: 
/tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -1753,7 +1790,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload install log on failure if: failure() @@ -1775,6 +1812,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',upgrade-stale-sandbox-e2e,')) runs-on: ubuntu-latest timeout-minutes: 60 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -1795,7 +1834,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload install logs on failure if: failure() @@ -1862,6 +1901,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',rebuild-hermes-e2e,')) runs-on: ubuntu-latest timeout-minutes: 60 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -1883,7 +1924,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload install log on failure if: failure() @@ -1938,6 +1979,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',double-onboard-e2e,')) runs-on: ubuntu-latest timeout-minutes: 90 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -1966,7 +2009,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: 
Upload test log on failure if: failure() @@ -1985,6 +2028,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',onboard-repair-e2e,')) runs-on: ubuntu-latest timeout-minutes: 60 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -2013,7 +2058,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload test log on failure if: failure() @@ -2032,6 +2077,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',onboard-resume-e2e,')) runs-on: ubuntu-latest timeout-minutes: 60 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -2060,7 +2107,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload test log on failure if: failure() @@ -2120,6 +2167,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',runtime-overrides-e2e,')) runs-on: ubuntu-latest timeout-minutes: 45 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -2148,7 +2197,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload test log on failure if: failure() @@ -2168,6 +2217,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',credential-sanitization-e2e,')) runs-on: ubuntu-latest timeout-minutes: 60 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ 
-2198,7 +2249,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload test log on failure if: failure() @@ -2218,6 +2269,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',telegram-injection-e2e,')) runs-on: ubuntu-latest timeout-minutes: 60 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -2248,7 +2301,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload test log on failure if: failure() @@ -2271,6 +2324,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',overlayfs-autofix-e2e,')) runs-on: ubuntu-latest timeout-minutes: 45 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -2291,7 +2346,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload onboard logs on failure if: failure() @@ -2356,6 +2411,8 @@ jobs: contains(format(',{0},', inputs.jobs), ',launchable-smoke-e2e,')) runs-on: ubuntu-latest timeout-minutes: 30 + env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -2378,7 +2435,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload install log on failure if: failure() @@ -2417,6 +2474,7 @@ jobs: runs-on: linux-amd64-gpu-rtxpro6000-latest-1 timeout-minutes: 30 env: + 
NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} NEMOCLAW_NON_INTERACTIVE: "1" NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE: "1" NEMOCLAW_SANDBOX_NAME: "e2e-gpu-ollama" @@ -2447,7 +2505,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload install log on failure if: failure() @@ -2479,6 +2537,7 @@ jobs: runs-on: linux-amd64-gpu-rtxpro6000-latest-1 timeout-minutes: 30 env: + NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} NEMOCLAW_NON_INTERACTIVE: "1" NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE: "1" NEMOCLAW_SANDBOX_NAME: "e2e-gpu-double-onboard" @@ -2509,7 +2568,7 @@ jobs: uses: ./.github/actions/upload-nemoclaw-traces with: name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces + path: /tmp/nemoclaw-traces/${{ github.job }} - name: Upload install log on failure if: failure() diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index c0b32863b9..5b8c8e9184 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -41,8 +41,6 @@ jobs: checks: runs-on: ubuntu-latest timeout-minutes: 10 - env: - NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/pr-checks steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -53,20 +51,11 @@ jobs: - name: Verify platform matrix is in sync run: python3 scripts/generate-platform-docs.py --check - - name: Upload NemoClaw profiling traces - if: always() - uses: ./.github/actions/upload-nemoclaw-traces - with: - name: nemoclaw-traces-pr-checks - path: /tmp/nemoclaw-traces/pr-checks - test-e2e-ollama-proxy: needs: [checks, changes] if: needs.changes.outputs.code == 'true' runs-on: ubuntu-latest timeout-minutes: 5 - env: - NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/pr-ollama-proxy steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -77,13 +66,6 @@ jobs: - name: Run Ollama 
auth proxy E2E tests run: bash test/e2e-ollama-proxy.sh - - name: Upload NemoClaw profiling traces - if: always() - uses: ./.github/actions/upload-nemoclaw-traces - with: - name: nemoclaw-traces-pr-ollama-proxy - path: /tmp/nemoclaw-traces/pr-ollama-proxy - # Sandbox image builds and E2E tests have moved to pr-self-hosted.yaml, # which runs on NVIDIA self-hosted runners via copy-pr-bot. # See: .github/workflows/pr-self-hosted.yaml diff --git a/.github/workflows/sandbox-images-and-e2e.yaml b/.github/workflows/sandbox-images-and-e2e.yaml index 59e8100c2f..b9aff9e6e7 100644 --- a/.github/workflows/sandbox-images-and-e2e.yaml +++ b/.github/workflows/sandbox-images-and-e2e.yaml @@ -19,8 +19,6 @@ jobs: build-sandbox-images: runs-on: ubuntu-latest timeout-minutes: 15 - env: - NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -59,18 +57,9 @@ jobs: path: /tmp/isolation-image.tar.gz retention-days: 1 - - name: Upload NemoClaw profiling traces - if: always() - uses: ./.github/actions/upload-nemoclaw-traces - with: - name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces/${{ github.job }} - build-hermes-sandbox-image: runs-on: ubuntu-latest timeout-minutes: 15 - env: - NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -95,19 +84,10 @@ jobs: docker run --rm --user sandbox nemoclaw-hermes-production \ test -x /usr/local/bin/nemoclaw-start - - name: Upload NemoClaw profiling traces - if: always() - uses: ./.github/actions/upload-nemoclaw-traces - with: - name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces/${{ github.job }} - build-sandbox-images-arm64: if: inputs.run_arm64 runs-on: ubuntu-24.04-arm timeout-minutes: 15 - env: - NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: 
actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -121,19 +101,10 @@ jobs: - name: Build sandbox test image on arm64 run: docker build -f test/Dockerfile.sandbox --build-arg BASE_IMAGE=nemoclaw-production-arm64 -t nemoclaw-sandbox-test-arm64 . - - name: Upload NemoClaw profiling traces - if: always() - uses: ./.github/actions/upload-nemoclaw-traces - with: - name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces/${{ github.job }} - test-e2e-sandbox: runs-on: ubuntu-latest timeout-minutes: 15 needs: build-sandbox-images - env: - NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -150,19 +121,10 @@ jobs: - name: Run sandbox E2E tests run: docker run --rm -v "${{ github.workspace }}/test:/opt/test" nemoclaw-sandbox-test /opt/test/e2e-test.sh - - name: Upload NemoClaw profiling traces - if: always() - uses: ./.github/actions/upload-nemoclaw-traces - with: - name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces/${{ github.job }} - test-e2e-gateway-isolation: runs-on: ubuntu-latest timeout-minutes: 15 needs: build-sandbox-images - env: - NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -179,19 +141,10 @@ jobs: - name: Run gateway isolation E2E tests run: NEMOCLAW_TEST_IMAGE=nemoclaw-production bash test/e2e-gateway-isolation.sh - - name: Upload NemoClaw profiling traces - if: always() - uses: ./.github/actions/upload-nemoclaw-traces - with: - name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces/${{ github.job }} - test-e2e-port-overrides: runs-on: ubuntu-latest timeout-minutes: 10 needs: build-sandbox-images - env: - NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/${{ github.job }} steps: - name: Checkout uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 @@ -207,10 +160,3 @@ 
jobs: - name: Run port override E2E tests run: NEMOCLAW_TEST_IMAGE=nemoclaw-production bash test/e2e-port-overrides.sh - - - name: Upload NemoClaw profiling traces - if: always() - uses: ./.github/actions/upload-nemoclaw-traces - with: - name: nemoclaw-traces-${{ github.job }} - path: /tmp/nemoclaw-traces/${{ github.job }} diff --git a/.github/workflows/wsl-e2e.yaml b/.github/workflows/wsl-e2e.yaml index 64ee505a05..c7612d1ad0 100644 --- a/.github/workflows/wsl-e2e.yaml +++ b/.github/workflows/wsl-e2e.yaml @@ -35,7 +35,6 @@ jobs: NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE: "1" NEMOCLAW_RECREATE_SANDBOX: "1" NEMOCLAW_SANDBOX_NAME: "e2e-wsl" - NEMOCLAW_TRACE_DIR: /tmp/nemoclaw-traces/wsl-e2e steps: - name: Force LF line endings for checkout shell: powershell @@ -235,8 +234,6 @@ jobs: # tests that legitimately consume their full budget aren't killed. export NEMOCLAW_EXEC_TIMEOUT=60000 export NEMOCLAW_TEST_TIMEOUT=60000 - export NEMOCLAW_TRACE_DIR='$env:NEMOCLAW_TRACE_DIR' - mkdir -p "\$NEMOCLAW_TRACE_DIR" npx vitest run --testTimeout 60000 "@ $tmp = "$env:RUNNER_TEMP\wsl-step.sh" @@ -260,7 +257,7 @@ jobs: export NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE='$env:NEMOCLAW_ACCEPT_THIRD_PARTY_SOFTWARE' export NEMOCLAW_RECREATE_SANDBOX='$env:NEMOCLAW_RECREATE_SANDBOX' export NEMOCLAW_SANDBOX_NAME='$env:NEMOCLAW_SANDBOX_NAME' - export NEMOCLAW_TRACE_DIR='$env:NEMOCLAW_TRACE_DIR' + export NEMOCLAW_TRACE_DIR='/tmp/nemoclaw-traces/wsl-e2e' mkdir -p "\$NEMOCLAW_TRACE_DIR" bash test/e2e/test-full-e2e.sh "@ @@ -283,8 +280,8 @@ jobs: $script = @" set -euo pipefail mkdir -p '$env:WSL_WORKDIR/_nemoclaw-traces' - if [ -d '$env:NEMOCLAW_TRACE_DIR' ]; then - cp -a '$env:NEMOCLAW_TRACE_DIR'/.' '$env:WSL_WORKDIR/_nemoclaw-traces/' + if [ -d '/tmp/nemoclaw-traces/wsl-e2e' ]; then + cp -a '/tmp/nemoclaw-traces/wsl-e2e'/.' '$env:WSL_WORKDIR/_nemoclaw-traces/' fi "@ $tmp = "$env:RUNNER_TEMP\wsl-step.sh"