diff --git a/.github/scripts/replay-brain-dlq.sh b/.github/scripts/replay-brain-dlq.sh
new file mode 100755
index 0000000..b5eb979
--- /dev/null
+++ b/.github/scripts/replay-brain-dlq.sh
@@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+# Replay dead-letter brain-feed payloads from GitHub Actions artifacts.
+# Usage: BRAIN_INGEST_URL=... GH_TOKEN=... bash replay-brain-dlq.sh [repo]
+#
+# For every unexpired "brain-dlq-*" artifact in the repo: download it, POST
+# its payload.json to BRAIN_INGEST_URL, and delete the artifact only after a
+# 2xx response. Requires gh, curl and unzip.
+set -euo pipefail
+
+REPO="${1:-Mikecranesync/factorylm}"
+BRAIN_URL="${BRAIN_INGEST_URL:?BRAIN_INGEST_URL required}"
+
+# Private scratch directory, removed on every exit path.
+WORKDIR=$(mktemp -d)
+trap 'rm -rf -- "$WORKDIR"' EXIT
+
+echo "=== Brain DLQ Replay ==="
+echo "Repo: $REPO"
+
+# Health check first
+BASE_URL="${BRAIN_URL%/ingest}"
+# curl prints "000" via -w even when it fails, so overwrite the value on
+# error; appending with `|| echo "000"` would produce "000000".
+HTTP=$(curl -s -o /dev/null -w '%{http_code}' \
+  --connect-timeout 5 --max-time 10 "${BASE_URL}/health") || HTTP="000"
+if [ "$HTTP" = "000" ] || [ "$HTTP" -ge 400 ] 2>/dev/null; then
+  echo "ERROR: Brain ingest not healthy (HTTP $HTTP). Aborting replay."
+  exit 1
+fi
+echo "Brain ingest healthy (HTTP $HTTP)"
+
+# List DLQ artifacts. A gh failure (bad token, wrong repo) aborts the script
+# instead of being reported as "0 artifacts".
+ARTIFACTS=$(gh api --paginate "repos/$REPO/actions/artifacts" \
+  --jq '.artifacts[]
+    | select((.name | startswith("brain-dlq-")) and (.expired | not))
+    | .id')
+
+# grep -c prints 0 and exits 1 when nothing matches; ignore only the status.
+COUNT=$(printf '%s\n' "$ARTIFACTS" | grep -c . || true)
+echo "Found $COUNT DLQ artifacts"
+
+# IDs are numeric, so word splitting on $ARTIFACTS is safe here.
+for AID in $ARTIFACTS; do
+  echo "  Replaying artifact $AID..."
+  ZIP="$WORKDIR/$AID.zip"
+  OUT="$WORKDIR/$AID"
+  if ! gh api "repos/$REPO/actions/artifacts/$AID/zip" > "$ZIP" \
+    || ! unzip -o "$ZIP" -d "$OUT" > /dev/null; then
+    echo "    WARN: could not download/extract artifact $AID; skipping" >&2
+    continue
+  fi
+  if [ -f "$OUT/payload.json" ]; then
+    RESULT=$(curl -s -o /dev/null -w '%{http_code}' \
+      --connect-timeout 5 --max-time 10 \
+      -X POST "$BRAIN_URL" -H "Content-Type: application/json" \
+      -d @"$OUT/payload.json") || RESULT="000"
+    echo "    HTTP $RESULT"
+    # Delete only after a confirmed 2xx. "000" (no response) is numerically
+    # below 400, so a bare `-lt 400` test would discard undelivered payloads.
+    if [ "$RESULT" -ge 200 ] 2>/dev/null && [ "$RESULT" -lt 300 ]; then
+      if gh api -X DELETE "repos/$REPO/actions/artifacts/$AID" > /dev/null; then
+        echo "    Replayed + deleted artifact"
+      else
+        echo "    WARN: replayed, but could not delete artifact $AID" >&2
+      fi
+    fi
+  fi
+  rm -rf -- "$ZIP" "$OUT"
+done
+echo "=== Replay complete ==="
diff --git a/.github/workflows/brain-feed.yml b/.github/workflows/brain-feed.yml
index 670d2a9..98663a7 100644
--- a/.github/workflows/brain-feed.yml
+++ b/.github/workflows/brain-feed.yml
@@ -6,15 +6,15 @@ on:
 jobs:
   brain-ingest:
     runs-on: ubuntu-latest
+    continue-on-error: true  # NEVER block pushes for brain ingestion
     steps:
       - uses: actions/checkout@v4
         with:
           fetch-depth: 2
 
-      - name: Send commit to Open Brain
-        if: env.BRAIN_INGEST_URL != ''
+      - name: Build payload
+        id: payload
         env:
-          BRAIN_INGEST_URL: ${{ secrets.BRAIN_INGEST_URL }}
           COMMIT_MSG: ${{ github.event.head_commit.message }}
           COMMIT_AUTHOR: ${{ github.event.head_commit.author.name }}
           COMMIT_REPO: ${{ github.repository }}
@@ -23,7 +23,7 @@ jobs:
         run: |
           FILES=$(git diff-tree --no-commit-id -r --name-only HEAD | tr '\n' ', ')
           FIRST_LINE=$(echo "$COMMIT_MSG" | head -1)
-
+          mkdir -p /tmp/brain-dlq
           jq -n \
             --arg content "Git commit in $COMMIT_REPO on $COMMIT_BRANCH by $COMMIT_AUTHOR: $FIRST_LINE | Changed files: $FILES" \
             --arg sha "$COMMIT_SHA" \
@@ -36,6 +36,47 @@ jobs:
               source: "github_commit",
               tags: ["git", "commit", $repo],
               metadata: { sha: $sha, repo: $repo, branch: $branch, author: $author, files: $files }
-            }' | curl -sf -X POST "$BRAIN_INGEST_URL" \
-              -H "Content-Type: application/json" \
-              -d @- || echo "Brain ingest unavailable (server may be offline)"
+            }' > /tmp/brain-dlq/payload.json
+
+      - name: Send to Open Brain
+        id: send
+        env:
+          BRAIN_INGEST_URL: ${{ secrets.BRAIN_INGEST_URL }}
+        run: |
+          # The secrets context is not available in a step-level `if:`, so
+          # the missing-secret case is handled in the script.
+          if [ -z "$BRAIN_INGEST_URL" ]; then
+            echo "http_code=no_secret" >> "$GITHUB_OUTPUT"
+            echo "dead_letter=true" >> "$GITHUB_OUTPUT"
+            echo "::warning::BRAIN_INGEST_URL secret not configured"
+            exit 0
+          fi
+          # curl prints "000" via -w even when it fails, so overwrite the
+          # value on error; appending would produce "000000".
+          HTTP_CODE=$(curl -s -o /tmp/brain-dlq/response.txt -w '%{http_code}' \
+            --connect-timeout 5 --max-time 10 \
+            -X POST "$BRAIN_INGEST_URL" \
+            -H "Content-Type: application/json" \
+            -d @/tmp/brain-dlq/payload.json) || HTTP_CODE="000"
+          echo "http_code=$HTTP_CODE" >> "$GITHUB_OUTPUT"
+          DEAD_LETTER=false
+          if [ "$HTTP_CODE" = "000" ]; then
+            DEAD_LETTER=true
+            echo "::warning::Brain ingest unreachable (timeout/network error)"
+          elif [ "$HTTP_CODE" -ge 400 ] 2>/dev/null; then
+            # 5xx means the payload was not ingested, so keep it for replay;
+            # a 4xx payload would be rejected again and is not kept.
+            if [ "$HTTP_CODE" -ge 500 ]; then DEAD_LETTER=true; fi
+            echo "::warning::Brain ingest returned HTTP $HTTP_CODE"
+          else
+            echo "Brain ingest OK (HTTP $HTTP_CODE)"
+          fi
+          echo "dead_letter=$DEAD_LETTER" >> "$GITHUB_OUTPUT"
+
+      - name: Save dead letter on failure
+        if: always() && steps.send.outputs.dead_letter == 'true'
+        uses: actions/upload-artifact@v4
+        with:
+          name: brain-dlq-${{ github.sha }}
+          path: /tmp/brain-dlq/
+          retention-days: 7
diff --git a/.github/workflows/ci-watchdog.yml b/.github/workflows/ci-watchdog.yml
new file mode 100644
index 0000000..0a9aab8
--- /dev/null
+++ b/.github/workflows/ci-watchdog.yml
@@ -0,0 +1,122 @@
+name: CI Watchdog
+on:
+  schedule:
+    - cron: '*/30 * * * *'
+  workflow_dispatch:
+
+permissions:
+  issues: write
+  actions: read
+
+jobs:
+  health-check:
+    runs-on: ubuntu-latest
+    env:
+      # No checkout runs in this job, so gh issue/label commands need the
+      # repository named explicitly.
+      GH_REPO: ${{ github.repository }}
+    steps:
+      - name: Check brain-ingest endpoint
+        id: health
+        env:
+          BRAIN_INGEST_URL: ${{ secrets.BRAIN_INGEST_URL }}
+        run: |
+          if [ -z "$BRAIN_INGEST_URL" ]; then
+            echo "status=no_secret" >> "$GITHUB_OUTPUT"
+            echo "::warning::BRAIN_INGEST_URL secret not configured"
+            exit 0
+          fi
+          BASE_URL="${BRAIN_INGEST_URL%/ingest}"
+          # curl prints "000" via -w even when it fails, so overwrite the
+          # value on error; appending would produce "000000".
+          HTTP_CODE=$(curl -s -o /dev/null -w '%{http_code}' \
+            --connect-timeout 5 --max-time 10 \
+            "${BASE_URL}/health") || HTTP_CODE="000"
+          echo "status=$HTTP_CODE" >> "$GITHUB_OUTPUT"
+          echo "Brain ingest health: HTTP $HTTP_CODE"
+
+      - name: Check recent workflow failures
+        id: failures
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          SINCE=$(date -u -d '6 hours ago' +%Y-%m-%dT%H:%M:%SZ 2>/dev/null \
+            || date -u -v-6H +%Y-%m-%dT%H:%M:%SZ)
+          # If gh fails, report a clean 0 rather than appending to whatever
+          # it printed.
+          FAIL_COUNT=$(gh api "repos/${{ github.repository }}/actions/runs" \
+            --jq "[.workflow_runs[] | select(.name==\"Feed Open Brain\" and .conclusion==\"failure\" and .created_at > \"$SINCE\")] | length" \
+            2>/dev/null) || FAIL_COUNT=0
+          echo "fail_count=$FAIL_COUNT" >> "$GITHUB_OUTPUT"
+          echo "Brain-feed failures (last 6h): $FAIL_COUNT"
+
+      - name: Classify severity
+        id: severity
+        run: |
+          STATUS="${{ steps.health.outputs.status }}"
+          FAILS="${{ steps.failures.outputs.fail_count }}"
+          # An HTTP error status means down too, matching the health gate in
+          # replay-brain-dlq.sh.
+          DOWN=false
+          if [ "$STATUS" = "000" ] || [ "$STATUS" -ge 400 ] 2>/dev/null; then
+            DOWN=true
+          fi
+          if [ "$DOWN" = "true" ] && [ "${FAILS:-0}" -gt 10 ]; then
+            echo "level=CRITICAL" >> "$GITHUB_OUTPUT"
+            echo "msg=Brain ingest DOWN, $FAILS failures in 6h" >> "$GITHUB_OUTPUT"
+          elif [ "$DOWN" = "true" ] || [ "$STATUS" = "no_secret" ]; then
+            echo "level=WARN" >> "$GITHUB_OUTPUT"
+            echo "msg=Brain ingest unreachable, $FAILS failures" >> "$GITHUB_OUTPUT"
+          elif [ "${FAILS:-0}" -gt 0 ]; then
+            echo "level=INFO" >> "$GITHUB_OUTPUT"
+            echo "msg=Brain ingest healthy but $FAILS recent failures (recovering)" >> "$GITHUB_OUTPUT"
+          else
+            echo "level=OK" >> "$GITHUB_OUTPUT"
+            echo "msg=All systems nominal" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Create or update issue on failure
+        if: steps.severity.outputs.level == 'CRITICAL' || steps.severity.outputs.level == 'WARN'
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          TITLE="[CI Watchdog] Brain ingest: ${{ steps.severity.outputs.level }}"
+          BODY="**Severity:** ${{ steps.severity.outputs.level }}
+          **Status:** ${{ steps.severity.outputs.msg }}
+          **Health HTTP:** ${{ steps.health.outputs.status }}
+          **Failures (6h):** ${{ steps.failures.outputs.fail_count }}
+          **Detected:** $(date -u +%Y-%m-%dT%H:%M:%SZ)
+
+          ### Playbook
+          1. SSH to VPS: \`ssh root@100.68.120.99\`
+          2. Check service: \`systemctl status brain-ingest\`
+          3. Check logs: \`journalctl -u brain-ingest -n 30\`
+          4. Restart: \`systemctl restart brain-ingest\`
+          5. Verify: \`curl -sf http://localhost:8500/health\`
+          6. If still down, check env: \`cat /opt/openclaw/.env.brain\`
+
+          _Auto-generated by CI Watchdog_"
+          BODY=$(echo "$BODY" | sed 's/^ *//')
+          # gh issue create fails on an unknown label; creating it is a
+          # harmless error once the label exists.
+          gh label create "ci-watchdog" 2>/dev/null || true
+          EXISTING=$(gh issue list --label "ci-watchdog" --state open \
+            --json number --jq '.[0].number' 2>/dev/null || echo "")
+          if [ -n "$EXISTING" ]; then
+            gh issue comment "$EXISTING" --body "$BODY"
+          else
+            gh issue create --title "$TITLE" --body "$BODY" --label "ci-watchdog"
+          fi
+
+      - name: Close issue if recovered
+        if: steps.severity.outputs.level == 'OK'
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          EXISTING=$(gh issue list --label "ci-watchdog" --state open \
+            --json number --jq '.[0].number' 2>/dev/null || echo "")
+          if [ -n "$EXISTING" ]; then
+            gh issue comment "$EXISTING" \
+              --body "Resolved: Brain ingest is healthy. Auto-closing."
+            gh issue close "$EXISTING"
+          fi
diff --git a/docs/ops/incidents/INC-2026-03-09-001.md b/docs/ops/incidents/INC-2026-03-09-001.md
new file mode 100644
index 0000000..b143608
--- /dev/null
+++ b/docs/ops/incidents/INC-2026-03-09-001.md
@@ -0,0 +1,50 @@
+# INC-2026-03-09-001: Brain Ingest Endpoint Down — 14 CI Failures
+
+| Field | Value |
+|-------|-------|
+| **Date** | 2026-03-09 |
+| **Duration** | 6h+ (ongoing at detection) |
+| **Severity** | WARN (data loss, not blocking deploys) |
+| **Services** | brain-ingest, brain-feed.yml |
+| **Repos** | FactoryLM_OS (primary), factorylm (secondary) |
+
+## Timeline
+
+| Time (UTC) | Event |
+|------------|-------|
+| ~01:43 | Brain-ingest endpoint becomes unreachable |
+| 01:43-07:44 | 14 "Feed Open Brain" runs fail with curl exit code 28 (timeout) |
+| 07:44 | Failures detected during manual review |
+| — | Root cause: VPS brain-ingest service down or network issue |
+| — | Circuit breaker patch applied to brain-feed.yml (both repos) |
+| — | CI Watchdog workflow created for ongoing monitoring |
+
+## Root Cause
+
+The brain-ingest HTTP endpoint on the VPS (port 8500) became unreachable.
+`curl` exit code 28 = `CURLE_OPERATION_TIMEDOUT`. The workflow had a
+`|| echo` fallback but GitHub Actions runs bash with `set -eo pipefail`,
+so the `jq | curl` pipe failure propagated despite the `||` catch.
+
+Additionally, `if: env.BRAIN_INGEST_URL != ''` doesn't work for secrets
+in GitHub Actions — the condition may silently pass or fail depending on
+runner version.
+
+No circuit breaker, health check, or alerting existed. Failure was only
+discovered by manual inspection.
+
+## Resolution
+
+1. Rewrote brain-feed.yml: `continue-on-error: true`, fast timeouts, dead letter queue
+2. Created ci-watchdog.yml: scheduled health check with auto-issue management
+3. Created replay-brain-dlq.sh: dead letter replay once endpoint recovers
+4. Applied same fix to FactoryLM_OS repo
+
+## Prevention
+
+- [x] `continue-on-error: true` on non-critical workflows
+- [x] `--connect-timeout 5 --max-time 10` on all curl calls
+- [x] Scheduled health check with severity classification
+- [x] Dead letter queue for payload recovery
+- [ ] Investigate why VPS brain-ingest went down
+- [ ] Add Telegram alerting to watchdog (future)
diff --git a/docs/ops/traces/2026-03-09_ci-watchdog-deploy.md b/docs/ops/traces/2026-03-09_ci-watchdog-deploy.md
new file mode 100644
index 0000000..1fc8190
--- /dev/null
+++ b/docs/ops/traces/2026-03-09_ci-watchdog-deploy.md
@@ -0,0 +1,33 @@
+# TRC-2026-03-09: CI Watchdog Deploy
+
+| Field | Value |
+|-------|-------|
+| **Date** | 2026-03-09 |
+| **Node** | CHARLIE |
+| **Scope** | CI/CD pipeline hardening |
+| **Incident** | INC-2026-03-09-001 |
+
+## Context
+
+14 consecutive brain-feed workflow failures on FactoryLM_OS due to
+brain-ingest endpoint timeout. No monitoring or alerting existed.
+
+## Changes Made
+
+| File | Action |
+|------|--------|
+| `.github/workflows/brain-feed.yml` | Rewrite — circuit breaker + DLQ |
+| `.github/workflows/ci-watchdog.yml` | Create — 30-min health monitor |
+| `.github/scripts/replay-brain-dlq.sh` | Create — DLQ replay utility |
+| `docs/ops/incidents/INC-2026-03-09-001.md` | Create — incident report |
+
+## Outcome
+
+- Brain-feed failures no longer block pushes (green checks even when endpoint is down)
+- Failed payloads stored as artifacts for 7-day replay window
+- Watchdog auto-creates GitHub issues with VPS playbook on failure
+- Watchdog auto-closes issues when endpoint recovers
+
+## Tags
+
+`ci-cd` `brain-feed` `watchdog` `circuit-breaker` `dead-letter-queue`