From 45013e8722fe4f2cd4ffc0189378f887be83b621 Mon Sep 17 00:00:00 2001 From: rajashish147 Date: Fri, 3 Apr 2026 19:31:20 +0530 Subject: [PATCH] feat(deploy): improve health checks using in-network curl for Docker containers feat(nginx): add Nginx-level liveness probe for infrastructure health monitoring fix(alertmanager): mask Slack webhook in logs to prevent exposure fix(deploy): enforce immutable SHA tags for production images in blue-green deployment fix(validate): enhance environment variable validation to handle special characters --- .github/workflows/deploy.yml | 21 +++++++++++++-------- infra/nginx/api.conf | 11 +++++++++++ infra/scripts/render-alertmanager.sh | 2 +- scripts/deploy-bluegreen.sh | 27 ++++++++++++++++++--------- scripts/validate-env.sh | 11 ++++++++--- 5 files changed, 51 insertions(+), 21 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index f3d19ed..3c41d71 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -724,12 +724,12 @@ jobs: ACTIVE_CONTAINER="api-${ACTIVE_SLOT}" DEPLOY_STATUS="UNKNOWN" - # Health check via docker exec — NO host port binding required. - # api containers live only on api_network; localhost:3000 here means - # the container's own loopback (executed via docker exec). + # Health check via in-network curl container — exercises Docker DNS + # and bridge routing (same path nginx uses). NO host port binding needed. 
+ FT_CURL_IMG="curlimages/curl:8.7.1" if docker inspect "$ACTIVE_CONTAINER" >/dev/null 2>&1; then - if docker exec "$ACTIVE_CONTAINER" \ - curl -sf --max-time 5 "http://localhost:3000/health" >/dev/null 2>&1; then + if docker run --rm --network api_network "$FT_CURL_IMG" \ + -sf --max-time 5 "http://$ACTIVE_CONTAINER:3000/health" >/dev/null 2>&1; then DEPLOY_STATUS="SUCCESS" else DEPLOY_STATUS="UNHEALTHY" @@ -778,10 +778,15 @@ jobs: exit 1 } - # Poll /ready via Docker service DNS (no host port binding needed) + # Poll /ready via in-network curl (Docker DNS + bridge routing). + # /ready checks Redis, Supabase, and BullMQ — definitive readiness gate. + # Uses docker run rather than docker exec so the check exercises the + # same network path nginx uses, not the container's own loopback. --max-time bounds each probe so a hung backend cannot stall the loop; any failed probe is recorded as exactly "000". + FT_CURL_IMG="curlimages/curl:8.7.1" for i in $(seq 1 15); do - STATUS=$(docker exec "$ACTIVE_CONTAINER" \ - curl -s -o /dev/null -w "%{http_code}" "http://localhost:3000/ready" 2>/dev/null || echo "000") + STATUS=$(docker run --rm --network api_network "$FT_CURL_IMG" \ + -s -o /dev/null --max-time 5 -w "%{http_code}" \ + "http://$ACTIVE_CONTAINER:3000/ready" 2>/dev/null) || STATUS="000" if [ "$STATUS" = "200" ]; then echo "✓ API ready (container $ACTIVE_CONTAINER, attempt $i)" exit 0 diff --git a/infra/nginx/api.conf b/infra/nginx/api.conf index 571089d..d4e445c 100644 --- a/infra/nginx/api.conf +++ b/infra/nginx/api.conf @@ -85,6 +85,17 @@ server { root /var/www/certbot; } + # Nginx-level liveness probe — answered by nginx directly, no upstream needed. + # Used by: in-network post-switch routing checks, CI health gates, and + # monitoring probes. Returns 200 even when the API container is down so that + # nginx infrastructure health never depends on backend readiness. + # This endpoint intentionally does NOT proxy to the API backend. 
+ location = /health { + access_log off; + default_type application/json; # Content-Type for the body below; add_header would append a second Content-Type header + return 200 '{"status":"ok"}'; + } + location / { return 301 https://$host$request_uri; } diff --git a/infra/scripts/render-alertmanager.sh b/infra/scripts/render-alertmanager.sh index 952ac29..692d54f 100644 --- a/infra/scripts/render-alertmanager.sh +++ b/infra/scripts/render-alertmanager.sh @@ -82,7 +82,7 @@ case "${ALERTMANAGER_SLACK_WEBHOOK}" in ;; *) log_error "ALERTMANAGER_SLACK_WEBHOOK does not start with 'https://hooks.slack.com/'." - log_error "Value prefix: $(printf '%s' "${ALERTMANAGER_SLACK_WEBHOOK}" | cut -c1-30)..." + log_error "Value prefix: ***masked*** (redacted to prevent webhook exposure in logs)" exit 1 ;; esac diff --git a/scripts/deploy-bluegreen.sh b/scripts/deploy-bluegreen.sh index 718fd5e..bc81087 100644 --- a/scripts/deploy-bluegreen.sh +++ b/scripts/deploy-bluegreen.sh @@ -100,7 +100,7 @@ _ft_snapshot() { { set +x; } 2>/dev/null printf '[DEPLOY] -- SYSTEM SNAPSHOT ----------------------------------------\n' >&2 printf '[DEPLOY] slot_file = %s\n' "$(cat "${ACTIVE_SLOT_FILE:-/var/run/api/active-slot}" 2>/dev/null || echo 'MISSING')" >&2 - printf '[DEPLOY] nginx_upstream = %s\n' "$(grep -oE 'server (api-blue|api-green):3000' "${NGINX_CONF:-$HOME/api/infra/nginx/live/api.conf}" 2>/dev/null | head -1 || echo 'unreadable')" >&2 + printf '[DEPLOY] nginx_upstream = %s\n' "$(grep -oE 'http://(api-blue|api-green):3000' "${NGINX_CONF:-$HOME/api/infra/nginx/live/api.conf}" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo 'unreadable')" >&2 printf '[DEPLOY] containers =\n' >&2 docker ps --format '[DEPLOY] {{.Names}} -> {{.Status}} ({{.Ports}})' 1>&2 2>/dev/null \ || printf '[DEPLOY] (docker ps unavailable)\n' >&2 @@ -150,8 +150,16 @@ fi # --------------------------------------------------------------------------- # CONSTANTS # --------------------------------------------------------------------------- -IMAGE="ghcr.io/fieldtrack-tech/api:${1:-latest}" 
-IMAGE_SHA="${1:-latest}" +# Immutable SHA tags ONLY — 'latest' is forbidden in production. NOTE(review): the check below rejects only an empty argument and the literal 'latest'; any other tag (e.g. a branch name) is accepted as-is — confirm callers always pass a commit SHA. +# Reject empty and 'latest' before any Docker operation so failures are +# loud and attributed to the caller rather than appearing as pull errors. +IMAGE_SHA="${1:-}" +if [ -z "$IMAGE_SHA" ] || [ "$IMAGE_SHA" = "latest" ]; then + printf '[DEPLOY] ts=%s state=INIT level=ERROR msg="image SHA required -- latest tag is forbidden in production" sha=%s\n' \ + "$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "${IMAGE_SHA:-}" >&2 + exit 2 +fi +IMAGE="ghcr.io/fieldtrack-tech/api:$IMAGE_SHA" BLUE_NAME="api-blue" GREEN_NAME="api-green" @@ -403,7 +411,7 @@ _ft_resolve_slot() { elif [ "$blue_running" = "true" ] && [ "$green_running" = "true" ]; then # Both running -- read nginx upstream container as authoritative tiebreaker. local nginx_upstream - nginx_upstream=$(grep -oE 'server (api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "") + nginx_upstream=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "") if [ "$nginx_upstream" = "api-blue" ]; then recovered_slot="blue" elif [ "$nginx_upstream" = "api-green" ]; then recovered_slot="green" else @@ -943,7 +951,8 @@ _ft_log "msg='nginx reloaded' upstream=$INACTIVE_NAME:$APP_PORT" # Upstream sanity check -- confirm nginx config actually points at the new container. # Catches template substitution failures before traffic is affected. 
-_RELOAD_CONTAINER=$(grep -oE 'server (api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "") +# Upstream sanity: live config must contain http://INACTIVE_NAME:3000 (set $api_backend format). Only the first match in the file is compared. NOTE(review): the pattern hard-codes port 3000 while the 'nginx reloaded' log reports $APP_PORT — confirm APP_PORT is always 3000. +_RELOAD_CONTAINER=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "") if [ "$_RELOAD_CONTAINER" != "$INACTIVE_NAME" ]; then _ft_log "level=ERROR msg='nginx upstream sanity check failed after reload' expected=$INACTIVE_NAME actual=${_RELOAD_CONTAINER:-unreadable}" cp "$NGINX_BACKUP" "$NGINX_CONF" @@ -1030,8 +1039,8 @@ for _attempt in 1 2 3 4 5; do sleep 5 done -# Container alignment check -- live nginx config MUST point at the new container. -_NGINX_CONTAINER=$(grep -oE 'server (api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "") +# Container alignment check -- live nginx config MUST contain http://INACTIVE_NAME:3000. +_NGINX_CONTAINER=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "") if [ -n "$_NGINX_CONTAINER" ] && [ "$_NGINX_CONTAINER" != "$INACTIVE_NAME" ]; then _ft_log "level=ERROR msg='nginx container mismatch -- slot switch did not take effect' expected=$INACTIVE_NAME actual=$_NGINX_CONTAINER" _PUB_PASSED=false @@ -1193,8 +1202,8 @@ else _FT_TRUTH_CHECK_PASSED=false fi -# (2) Verify nginx upstream container matches target -_NGINX_CONTAINER=$(grep -oE 'server (api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "") +# (2) Verify nginx upstream container matches target (set $api_backend format) +_NGINX_CONTAINER=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "") if [ -n "$_NGINX_CONTAINER" ]; then if [ "$_NGINX_CONTAINER" != "$INACTIVE_NAME" ]; then _ft_log "level=ERROR msg='truth check failed: nginx 
container mismatch' expected=$INACTIVE_NAME actual=$_NGINX_CONTAINER" diff --git a/scripts/validate-env.sh b/scripts/validate-env.sh index 50e5b11..ebe8c2d 100644 --- a/scripts/validate-env.sh +++ b/scripts/validate-env.sh @@ -61,10 +61,14 @@ done # ── Helper: read a value from a KEY=VALUE env file ───────────────────────────── # Usage: get_val KEY /path/to/file +# +# Uses grep + cut only — avoids sed quote-stripping which corrupts values +# containing special characters (URLs, tokens, passwords). The value is returned verbatim, so quotes surrounding a value in the file are kept. +# head -1 returns the first occurrence when a key appears more than once; the ^ anchor and the trailing = in the pattern keep +# KEY_EXTRA= from matching KEY=. NOTE(review): a shell that sources the file would see the last assignment — confirm first-wins is intended. get_val() { local key="$1" file="$2" - grep -E "^${key}=" "$file" 2>/dev/null | tail -1 | cut -d'=' -f2- \ - | sed "s/^['\"]//; s/['\"]$//" + grep -E "^${key}=" "$file" 2>/dev/null | head -1 | cut -d'=' -f2- } DERIVED_HOSTNAME="" @@ -223,11 +227,12 @@ else SLACK_WEBHOOK="$(get_val "ALERTMANAGER_SLACK_WEBHOOK" "$MONITORING_ENV_FILE")" if [[ -z "$SLACK_WEBHOOK" ]]; then fail "ALERTMANAGER_SLACK_WEBHOOK not set in infra/.env.monitoring" - elif [[ ! "$SLACK_WEBHOOK" =~ ^https://hooks.slack.com/ ]]; then + elif [[ ! "$SLACK_WEBHOOK" =~ ^https://hooks\.slack\.com/ ]]; then fail "ALERTMANAGER_SLACK_WEBHOOK is not a valid Slack webhook URL" else pass "ALERTMANAGER_SLACK_WEBHOOK is valid" fi + unset SLACK_WEBHOOK # Cross-check 1: API_HOSTNAME must match the hostname derived from API_BASE_URL MON_HOSTNAME="$(get_val "API_HOSTNAME" "$MONITORING_ENV_FILE")"