From 0dc1b38a12412d4db53cc01812819e8ae30ddc29 Mon Sep 17 00:00:00 2001 From: "Kai (via Mike Darlington)" Date: Mon, 16 Feb 2026 14:44:18 +0000 Subject: [PATCH 1/5] feat: rename slack-forwarder to slack-gateway in stack and smoke test Service was renamed but deploy configs still referenced the old name. Updates service name, image, command, vault agent ID, and health check URL. --- deploy/monitoring/smoke-test.sh | 2 +- deploy/stack.yaml | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/deploy/monitoring/smoke-test.sh b/deploy/monitoring/smoke-test.sh index df2b610..7f833e8 100755 --- a/deploy/monitoring/smoke-test.sh +++ b/deploy/monitoring/smoke-test.sh @@ -35,7 +35,7 @@ HOST_SERVICES=( DOCKER_SERVICES=( "Dispatch|http://warren_dispatch:8601/health" "Chronicle|http://warren_chronicle:8700/api/v1/health" - "Slack-forwarder|http://warren_slack-forwarder:8800/api/v1/health" + "Slack-gateway|http://warren_slack-gateway:8800/api/v1/health" ) # Container to exec health checks from (must be on the warren_agents network). diff --git a/deploy/stack.yaml b/deploy/stack.yaml index 4bc2c9e..dfd30ae 100644 --- a/deploy/stack.yaml +++ b/deploy/stack.yaml @@ -196,12 +196,12 @@ services: - PROMPT_FORGE_API_KEY=b01148d4ac474db310ae509fb3f59c1d7322cb9d94cc3288f011011b8fdf0a0d - NATS_URL=nats://58d522c6250fc6deabd33ab3a32de31da37e2d4c752ffd7138cce9bd9aa0535f@warren_hermes:4222 - slack-forwarder: - image: slack-forwarder:latest + slack-gateway: + image: slack-gateway:latest networks: - agents entrypoint: ["/vault-entrypoint.sh"] - command: ["slack-forwarder"] + command: ["slack-gateway"] volumes: - /home/mike/Warren/deploy/vault-entrypoint.sh:/vault-entrypoint.sh:ro deploy: @@ -216,7 +216,7 @@ services: cpus: '0.5' memory: 256M environment: - - VAULT_AGENT_ID=slack-forwarder + - VAULT_AGENT_ID=slack-gateway - VAULT_SECRETS=slack_app_token:SLACK_APP_TOKEN,slack_bot_token:SLACK_BOT_TOKEN - NATS_URL=nats://58d522c6250fc6deabd33ab3a32de31da37e2d4c752ffd7138cce9bd9aa0535f@warren_hermes:4222 - LISTEN_ADDR=:8750 From dfb994075bb74b4324445ebfbe2b35fa2937a747 Mon Sep 17 00:00:00 2001 From: "Kai (via Mike Darlington)" Date: Mon, 16 Feb 2026 14:45:24 +0000 Subject: [PATCH 2/5] feat: update smoke tests to use /health endpoints --- deploy/monitoring/smoke-test.sh | 246 ++++++++++++++++++++++---------- 1 file changed, 168 insertions(+), 78 deletions(-) diff --git a/deploy/monitoring/smoke-test.sh b/deploy/monitoring/smoke-test.sh index 7f833e8..0b6700e 100755 --- a/deploy/monitoring/smoke-test.sh +++ b/deploy/monitoring/smoke-test.sh @@ -22,28 +22,30 @@ ALEXANDRIA_URL="http://localhost:8500" CURL_TIMEOUT=5 # ── Service definitions ────────────────────────────────────────────── -# Services reachable from the host (published ports): -HOST_SERVICES=( + +# Services with /health JSON endpoints (host-reachable published ports). +# Format: name|health_url|fallback_url (fallback is optional) +HEALTH_SERVICES=( + "Alexandria|http://localhost:8500/health|http://localhost:8500/api/v1/health" + "Dispatch|http://localhost:8600/health|" + "Chronicle|http://localhost:8700/health|" + "Dredd|http://localhost:8750/health|" +) + +# Services checked via simple HTTP/TCP (no /health endpoint). +SIMPLE_SERVICES=( "NATS (Hermes)|http://localhost:8222/healthz" - "Alexandria|http://localhost:8500/api/v1/health" - "PromptForge|http://localhost:8083/health" "OpenClaw Gateway|tcp://localhost:8080" ) -# Services only reachable via Docker overlay network (no published ports). -# Checked from inside the Alexandria container. -DOCKER_SERVICES=( - "Dispatch|http://warren_dispatch:8601/health" - "Chronicle|http://warren_chronicle:8700/api/v1/health" - "Slack-gateway|http://warren_slack-gateway:8800/api/v1/health" +# Overlay-only services checked via docker service replica count. +# Format: name|docker_service_name +OVERLAY_SERVICES=( + "PromptForge|warren_promptforge" + "Slack-gateway|warren_slack-forwarder" ) -# Container to exec health checks from (must be on the warren_agents network). -DOCKER_PROBE_CONTAINER="warren_alexandria" - # ── Try to fetch Slack bot token from Alexandria vault ─────────────── -# GET requests skip API key auth; only X-Agent-ID is needed. -# smoke-test agent has a read grant on slack_bot_token. fetch_token_from_vault() { local resp resp=$(curl -sf --max-time "$CURL_TIMEOUT" \ @@ -64,67 +66,126 @@ fi # ── Health check helpers ───────────────────────────────────────────── failures=() +degraded=() total=0 passed=0 -check_result() { - local name="$1" http_code="$2" +check_pass() { + local name="$1" total=$((total + 1)) - if [[ "$http_code" =~ ^2 ]]; then - echo "[PASS] $name" - passed=$((passed + 1)) - else - local reason="HTTP $http_code" - [[ "$http_code" == "000" ]] && reason="connection refused or timeout" - echo "[FAIL] $name - $reason" - failures+=("$name ($reason)") - fi + echo "[PASS] $name" + passed=$((passed + 1)) +} + +check_degraded() { + local name="$1" detail="$2" + total=$((total + 1)) + echo "[DEGRADED] $name - $detail" + passed=$((passed + 1)) # service is up, but degraded + degraded+=("$name ($detail)") +} + +check_fail() { + local name="$1" reason="$2" + total=$((total + 1)) + echo "[FAIL] $name - $reason" + failures+=("$name ($reason)") } -# ── Check host-reachable services ──────────────────────────────────── -for entry in "${HOST_SERVICES[@]}"; do +# ── Check /health endpoint services ───────────────────────────────── +for entry in "${HEALTH_SERVICES[@]}"; do + IFS='|' read -r name health_url fallback_url <<< "$entry" + + # Try /health endpoint first + body=$(curl -sf --max-time "$CURL_TIMEOUT" "$health_url" 2>/dev/null) || body="" + + if [[ -n "$body" ]]; then + status=$(echo "$body" | jq -r '.status // empty' 2>/dev/null) + case "$status" in + ok|healthy|UP) + # Check for degraded DB status in health response + db_status=$(echo "$body" | jq -r '.db // .database // .components.db.status // empty' 2>/dev/null) + if [[ -n "$db_status" && "$db_status" != "ok" && "$db_status" != "healthy" && "$db_status" != "UP" ]]; then + check_degraded "$name" "service up, db: $db_status" + else + check_pass "$name" + fi + ;; + degraded) + detail=$(echo "$body" | jq -r '.message // .reason // "degraded"' 2>/dev/null) + check_degraded "$name" "$detail" + ;; + *) + check_fail "$name" "unhealthy status: ${status:-unknown}" + ;; + esac + continue + fi + + # Fallback to legacy endpoint if configured + if [[ -n "$fallback_url" ]]; then + http_code=$(curl -sf --max-time "$CURL_TIMEOUT" -o /dev/null -w '%{http_code}' "$fallback_url" 2>/dev/null) || http_code="000" + if [[ "$http_code" =~ ^2 ]]; then + check_pass "$name" + else + [[ "$http_code" == "000" ]] && http_code="connection refused or timeout" + check_fail "$name" "HTTP $http_code (fallback)" + fi + continue + fi + + check_fail "$name" "connection refused or timeout" +done + +# ── Check simple (non-/health) services ────────────────────────────── +for entry in "${SIMPLE_SERVICES[@]}"; do name="${entry%%|*}" url="${entry##*|}" if [[ "$url" == tcp://* ]]; then - # TCP connect check (for services with no health route) local_addr="${url#tcp://}" local_host="${local_addr%%:*}" local_port="${local_addr##*:}" if bash -c "echo >/dev/tcp/$local_host/$local_port" 2>/dev/null; then - http_code="200" + check_pass "$name" else - http_code="000" + check_fail "$name" "connection refused or timeout" fi else http_code=$(curl -sf --max-time "$CURL_TIMEOUT" -o /dev/null -w '%{http_code}' "$url" 2>/dev/null) || http_code="000" + if [[ "$http_code" =~ ^2 ]]; then + check_pass "$name" + else + [[ "$http_code" == "000" ]] && http_code="connection refused or timeout" + check_fail "$name" "HTTP $http_code" + fi fi - check_result "$name" "$http_code" done -# ── Check Docker-internal services ─────────────────────────────────── -# Resolve the actual task container name (swarm appends a task suffix). -probe_id=$(docker ps -q -f "name=${DOCKER_PROBE_CONTAINER}" 2>/dev/null | head -1) -if [[ -z "$probe_id" ]]; then - echo "[WARN] Probe container ${DOCKER_PROBE_CONTAINER} not running — skipping Docker service checks" - for entry in "${DOCKER_SERVICES[@]}"; do - name="${entry%%|*}" - total=$((total + 1)) - echo "[FAIL] $name - probe container unavailable" - failures+=("$name (probe container unavailable)") - done -else - for entry in "${DOCKER_SERVICES[@]}"; do - name="${entry%%|*}" - url="${entry##*|}" - http_code=$(docker exec "$probe_id" \ - wget -qO /dev/null --spider -T "$CURL_TIMEOUT" "$url" 2>&1 \ - && echo "200" || echo "000") 2>/dev/null - check_result "$name" "$http_code" - done -fi +# ── Check overlay-only services via docker service replicas ────────── +for entry in "${OVERLAY_SERVICES[@]}"; do + IFS='|' read -r name svc_name <<< "$entry" + replicas=$(docker service ls --filter "name=${svc_name}" --format '{{.Replicas}}' 2>/dev/null | head -1) + if [[ -z "$replicas" ]]; then + check_fail "$name" "service not found" + continue + fi + # Replicas format: "1/1" (running/desired) + running="${replicas%%/*}" + desired="${replicas##*/}" + if [[ "$running" == "$desired" && "$running" -gt 0 ]] 2>/dev/null; then + check_pass "$name" + elif [[ "$running" -gt 0 ]] 2>/dev/null; then + check_degraded "$name" "replicas $replicas" + else + check_fail "$name" "replicas $replicas" + fi +done echo "---" echo "$passed/$total services healthy" +if [[ ${#degraded[@]} -gt 0 ]]; then + echo "${#degraded[@]} service(s) degraded" +fi # ── Slack alert on failure ─────────────────────────────────────────── send_slack_alert() { @@ -143,35 +204,64 @@ send_slack_alert() { fail_lines+="\\n- $f" done + local degraded_lines="" + for d in "${degraded[@]}"; do + degraded_lines+="\\n- $d" + done + + local summary="${#failures[@]} service(s) DOWN" + if [[ ${#degraded[@]} -gt 0 ]]; then + summary+=", ${#degraded[@]} degraded" + fi + + local blocks + blocks=$(cat < Date: Mon, 16 Feb 2026 14:55:04 +0000 Subject: [PATCH 3/5] fix: add Dispatch fallback health check to smoke test Dispatch /health endpoint returns 404 on current image (pre-PR #11). Added /api/v1/backlog as fallback URL with X-Agent-ID header so the smoke test passes until the image is rebuilt with the /health endpoint. --- deploy/monitoring/smoke-test.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/deploy/monitoring/smoke-test.sh b/deploy/monitoring/smoke-test.sh index 0b6700e..4eb2286 100755 --- a/deploy/monitoring/smoke-test.sh +++ b/deploy/monitoring/smoke-test.sh @@ -27,7 +27,7 @@ CURL_TIMEOUT=5 # Format: name|health_url|fallback_url (fallback is optional) HEALTH_SERVICES=( "Alexandria|http://localhost:8500/health|http://localhost:8500/api/v1/health" - "Dispatch|http://localhost:8600/health|" + "Dispatch|http://localhost:8600/health|http://localhost:8600/api/v1/backlog?limit=1" "Chronicle|http://localhost:8700/health|" "Dredd|http://localhost:8750/health|" ) @@ -124,7 +124,8 @@ for entry in "${HEALTH_SERVICES[@]}"; do # Fallback to legacy endpoint if configured if [[ -n "$fallback_url" ]]; then - http_code=$(curl -sf --max-time "$CURL_TIMEOUT" -o /dev/null -w '%{http_code}' "$fallback_url" 2>/dev/null) || http_code="000" + http_code=$(curl -sf --max-time "$CURL_TIMEOUT" -o /dev/null -w '%{http_code}' \ + -H "X-Agent-ID: smoke-test" "$fallback_url" 2>/dev/null) || http_code="000" if [[ "$http_code" =~ ^2 ]]; then check_pass "$name" else From 3e10412a1803a455d15aea8b3c5d9d2476821a75 Mon Sep 17 00:00:00 2001 From: "Kai (via Mike Darlington)" Date: Mon, 16 Feb 2026 15:00:22 +0000 Subject: [PATCH 4/5] refactor(smoke-test): probe overlay services from inside Docker network Instead of trying localhost for overlay-only services (Dispatch, Chronicle, PromptForge, Slack-gateway), use docker exec against a running container on the warren_agents network. - Host checks: Alexandria (:8500), Dredd (:8750), NATS (:8222), OpenClaw (:18789) - Overlay checks via probe: Dispatch, Chronicle, PromptForge, Slack-gateway - Auto-discovers probe container (wget/curl/python3) - Falls back to replica count if no probe available - Slack alerting format unchanged --- deploy/monitoring/smoke-test.sh | 275 +++++++++++++++++++++++++++----- 1 file changed, 234 insertions(+), 41 deletions(-) diff --git a/deploy/monitoring/smoke-test.sh b/deploy/monitoring/smoke-test.sh index 4eb2286..1e9e0e0 100755 --- a/deploy/monitoring/smoke-test.sh +++ b/deploy/monitoring/smoke-test.sh @@ -4,6 +4,10 @@ set -euo pipefail # ── Warren Swarm Smoke Test ───────────────────────────────────────── # Checks health of all Warren services and alerts to Slack on failure. # Run via cron every 15 minutes. +# +# Host-reachable services are checked directly via localhost. +# Overlay-only services (no published ports) are probed from INSIDE +# the Docker network using `docker exec` against a running container. # ───────────────────────────────────────────────────────────────────── SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" @@ -21,28 +25,175 @@ SLACK_CHANNEL="${SLACK_CHANNEL:-C0AE9AMK9MM}" ALEXANDRIA_URL="http://localhost:8500" CURL_TIMEOUT=5 +# ── Overlay probe setup ───────────────────────────────────────────── +# Find a container on the warren_agents network to use as a probe. +# Prefer alexandria (Python image, has urllib). Falls back to wget/curl +# in any warren container, or docker run as last resort. +PROBE_CONTAINER="" +PROBE_CMD="" # "wget" or "curl" or "python" + +find_probe() { + local candidates=("warren_alexandria" "warren_hermes" "warren_dispatch" "warren_chronicle") + for name in "${candidates[@]}"; do + local cid + cid=$(docker ps -q -f "name=${name}" 2>/dev/null | head -1) || continue + [[ -z "$cid" ]] && continue + + # Check for wget (Alpine default) + if docker exec "$cid" which wget >/dev/null 2>&1; then + PROBE_CONTAINER="$cid" + PROBE_CMD="wget" + return 0 + fi + # Check for curl + if docker exec "$cid" which curl >/dev/null 2>&1; then + PROBE_CONTAINER="$cid" + PROBE_CMD="curl" + return 0 + fi + # Check for python3 (can do HTTP) + if docker exec "$cid" which python3 >/dev/null 2>&1; then + PROBE_CONTAINER="$cid" + PROBE_CMD="python" + return 0 + fi + done + return 1 +} + +# Execute an HTTP GET from inside the overlay network. +# Usage: overlay_get [header:value ...] +# Returns: HTTP body on stdout, exits 0 on 2xx, 1 otherwise. +overlay_get() { + local url="$1"; shift + local headers=("$@") + + if [[ -z "$PROBE_CONTAINER" ]]; then + echo "" + return 1 + fi + + case "$PROBE_CMD" in + wget) + local wget_args=("-qO-" "--timeout=$CURL_TIMEOUT") + for h in "${headers[@]}"; do + wget_args+=("--header=$h") + done + wget_args+=("$url") + docker exec "$PROBE_CONTAINER" wget "${wget_args[@]}" 2>/dev/null + ;; + curl) + local curl_args=("-sf" "--max-time" "$CURL_TIMEOUT") + for h in "${headers[@]}"; do + curl_args+=("-H" "$h") + done + curl_args+=("$url") + docker exec "$PROBE_CONTAINER" curl "${curl_args[@]}" 2>/dev/null + ;; + python) + local py_headers="" + for h in "${headers[@]}"; do + local key="${h%%:*}" + local val="${h#*: }" + py_headers+="req.add_header('$key','$val');" + done + docker exec "$PROBE_CONTAINER" python3 -c " +import urllib.request,sys,json +try: + req=urllib.request.Request('$url') + $py_headers + resp=urllib.request.urlopen(req,timeout=$CURL_TIMEOUT) + sys.stdout.write(resp.read().decode()) +except Exception as e: + sys.exit(1) +" 2>/dev/null + ;; + *) + return 1 + ;; + esac +} + +# overlay_check: like overlay_get but returns HTTP status code string +overlay_check() { + local url="$1"; shift + local headers=("$@") + + if [[ -z "$PROBE_CONTAINER" ]]; then + echo "000" + return 0 + fi + + case "$PROBE_CMD" in + wget) + # wget: if it succeeds, it's 2xx + local wget_args=("-qO/dev/null" "--timeout=$CURL_TIMEOUT" "-S") + for h in "${headers[@]}"; do + wget_args+=("--header=$h") + done + wget_args+=("$url") + if docker exec "$PROBE_CONTAINER" wget "${wget_args[@]}" >/dev/null 2>&1; then + echo "200" + else + echo "000" + fi + ;; + curl) + local curl_args=("-sf" "--max-time" "$CURL_TIMEOUT" "-o" "/dev/null" "-w" "%{http_code}") + for h in "${headers[@]}"; do + curl_args+=("-H" "$h") + done + curl_args+=("$url") + docker exec "$PROBE_CONTAINER" curl "${curl_args[@]}" 2>/dev/null || echo "000" + ;; + python) + local py_headers="" + for h in "${headers[@]}"; do + local key="${h%%:*}" + local val="${h#*: }" + py_headers+="req.add_header('$key','$val');" + done + docker exec "$PROBE_CONTAINER" python3 -c " +import urllib.request,sys +try: + req=urllib.request.Request('$url') + $py_headers + resp=urllib.request.urlopen(req,timeout=$CURL_TIMEOUT) + print(resp.status) +except urllib.error.HTTPError as e: + print(e.code) +except: + print('000') +" 2>/dev/null + ;; + *) + echo "000" + ;; + esac +} + # ── Service definitions ────────────────────────────────────────────── -# Services with /health JSON endpoints (host-reachable published ports). +# Host-reachable services (published ports). # Format: name|health_url|fallback_url (fallback is optional) -HEALTH_SERVICES=( +HOST_HEALTH_SERVICES=( "Alexandria|http://localhost:8500/health|http://localhost:8500/api/v1/health" - "Dispatch|http://localhost:8600/health|http://localhost:8600/api/v1/backlog?limit=1" - "Chronicle|http://localhost:8700/health|" "Dredd|http://localhost:8750/health|" ) -# Services checked via simple HTTP/TCP (no /health endpoint). -SIMPLE_SERVICES=( +# Host-reachable simple services (no /health JSON endpoint). +HOST_SIMPLE_SERVICES=( "NATS (Hermes)|http://localhost:8222/healthz" - "OpenClaw Gateway|tcp://localhost:8080" + "OpenClaw Gateway|tcp://localhost:18789" ) -# Overlay-only services checked via docker service replica count. -# Format: name|docker_service_name -OVERLAY_SERVICES=( - "PromptForge|warren_promptforge" - "Slack-gateway|warren_slack-forwarder" +# Overlay-only services — probed from inside the Docker network. +# Format: name|health_url|fallback_url|docker_service_name +OVERLAY_HEALTH_SERVICES=( + "Dispatch|http://warren_dispatch:8600/health|http://warren_dispatch:8600/api/v1/backlog?limit=1|warren_dispatch" + "Chronicle|http://warren_chronicle:8700/health||warren_chronicle" + "PromptForge|http://warren_promptforge:8083/health||warren_promptforge" + "Slack-gateway|http://warren_slack-gateway:8750/health||warren_slack-forwarder" ) # ── Try to fetch Slack bot token from Alexandria vault ─────────────── @@ -92,33 +243,40 @@ check_fail() { failures+=("$name ($reason)") } -# ── Check /health endpoint services ───────────────────────────────── -for entry in "${HEALTH_SERVICES[@]}"; do +# Parse /health JSON body and classify as pass/degraded/fail. +evaluate_health_body() { + local name="$1" body="$2" + local status + status=$(echo "$body" | jq -r '.status // empty' 2>/dev/null) + case "$status" in + ok|healthy|UP) + local db_status + db_status=$(echo "$body" | jq -r '.db // .database // .components.db.status // empty' 2>/dev/null) + if [[ -n "$db_status" && "$db_status" != "ok" && "$db_status" != "healthy" && "$db_status" != "UP" ]]; then + check_degraded "$name" "service up, db: $db_status" + else + check_pass "$name" + fi + ;; + degraded) + local detail + detail=$(echo "$body" | jq -r '.message // .reason // "degraded"' 2>/dev/null) + check_degraded "$name" "$detail" + ;; + *) + check_fail "$name" "unhealthy status: ${status:-unknown}" + ;; + esac +} + +# ── Check host-reachable /health services ──────────────────────────── +for entry in "${HOST_HEALTH_SERVICES[@]}"; do IFS='|' read -r name health_url fallback_url <<< "$entry" - # Try /health endpoint first body=$(curl -sf --max-time "$CURL_TIMEOUT" "$health_url" 2>/dev/null) || body="" if [[ -n "$body" ]]; then - status=$(echo "$body" | jq -r '.status // empty' 2>/dev/null) - case "$status" in - ok|healthy|UP) - # Check for degraded DB status in health response - db_status=$(echo "$body" | jq -r '.db // .database // .components.db.status // empty' 2>/dev/null) - if [[ -n "$db_status" && "$db_status" != "ok" && "$db_status" != "healthy" && "$db_status" != "UP" ]]; then - check_degraded "$name" "service up, db: $db_status" - else - check_pass "$name" - fi - ;; - degraded) - detail=$(echo "$body" | jq -r '.message // .reason // "degraded"' 2>/dev/null) - check_degraded "$name" "$detail" - ;; - *) - check_fail "$name" "unhealthy status: ${status:-unknown}" - ;; - esac + evaluate_health_body "$name" "$body" continue fi @@ -138,8 +296,8 @@ for entry in "${HEALTH_SERVICES[@]}"; do check_fail "$name" "connection refused or timeout" done -# ── Check simple (non-/health) services ────────────────────────────── -for entry in "${SIMPLE_SERVICES[@]}"; do +# ── Check host-reachable simple services ───────────────────────────── +for entry in "${HOST_SIMPLE_SERVICES[@]}"; do name="${entry%%|*}" url="${entry##*|}" if [[ "$url" == tcp://* ]]; then @@ -162,19 +320,54 @@ for entry in "${SIMPLE_SERVICES[@]}"; do fi done -# ── Check overlay-only services via docker service replicas ────────── -for entry in "${OVERLAY_SERVICES[@]}"; do - IFS='|' read -r name svc_name <<< "$entry" +# ── Check overlay-only services via docker exec probe ──────────────── +find_probe || echo "[WARN] No probe container found — overlay checks will use replica count only" + +for entry in "${OVERLAY_HEALTH_SERVICES[@]}"; do + IFS='|' read -r name health_url fallback_url svc_name <<< "$entry" + + if [[ -n "$PROBE_CONTAINER" ]]; then + # Try /health endpoint from inside the network + body=$(overlay_get "$health_url") || body="" + + if [[ -n "$body" ]]; then + evaluate_health_body "$name" "$body" + continue + fi + + # Try fallback URL if configured + if [[ -n "$fallback_url" ]]; then + fb_body=$(overlay_get "$fallback_url" "X-Agent-ID: smoke-test") || fb_body="" + if [[ -n "$fb_body" ]]; then + check_pass "$name" + continue + fi + fi + + # If probe exists but HTTP failed, check if container responds at all + # before falling through to replica count + check_code=$(overlay_check "$health_url") || check_code="000" + if [[ "$check_code" =~ ^2 ]]; then + check_pass "$name" + continue + fi + fi + + # Fallback: check docker service replica count replicas=$(docker service ls --filter "name=${svc_name}" --format '{{.Replicas}}' 2>/dev/null | head -1) if [[ -z "$replicas" ]]; then check_fail "$name" "service not found" continue fi - # Replicas format: "1/1" (running/desired) running="${replicas%%/*}" desired="${replicas##*/}" if [[ "$running" == "$desired" && "$running" -gt 0 ]] 2>/dev/null; then - check_pass "$name" + if [[ -n "$PROBE_CONTAINER" ]]; then + # We had a probe but HTTP failed — service is running but not responding + check_degraded "$name" "replicas $replicas but health endpoint unreachable" + else + check_pass "$name" # no probe available, replicas OK is best we can do + fi elif [[ "$running" -gt 0 ]] 2>/dev/null; then check_degraded "$name" "replicas $replicas" else From d7b5f21778dbafff4857b33e7bba54f999158d2f Mon Sep 17 00:00:00 2001 From: "Kai (via Mike Darlington)" Date: Mon, 16 Feb 2026 15:02:31 +0000 Subject: [PATCH 5/5] fix: smoke test uses overlay probes, fix slack-gateway service name --- deploy/monitoring/smoke-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deploy/monitoring/smoke-test.sh b/deploy/monitoring/smoke-test.sh index 1e9e0e0..41fa819 100755 --- a/deploy/monitoring/smoke-test.sh +++ b/deploy/monitoring/smoke-test.sh @@ -193,7 +193,7 @@ OVERLAY_HEALTH_SERVICES=( "Dispatch|http://warren_dispatch:8600/health|http://warren_dispatch:8600/api/v1/backlog?limit=1|warren_dispatch" "Chronicle|http://warren_chronicle:8700/health||warren_chronicle" "PromptForge|http://warren_promptforge:8083/health||warren_promptforge" - "Slack-gateway|http://warren_slack-gateway:8750/health||warren_slack-forwarder" + "Slack-gateway|http://warren_slack-gateway:8750/health||warren_slack-gateway" ) # ── Try to fetch Slack bot token from Alexandria vault ───────────────