Skip to content

Commit e9103cf

Browse files
authored
feat(deploy): improve health checks using in-network curl for Docker containers (#72)
feat(nginx): add Nginx-level liveness probe for infrastructure health monitoring
fix(alertmanager): mask Slack webhook in logs to prevent exposure
fix(deploy): enforce immutable SHA tags for production images in blue-green deployment
fix(validate): enhance environment variable validation to handle special characters
1 parent 975c0e8 commit e9103cf

5 files changed

Lines changed: 51 additions & 21 deletions

File tree

.github/workflows/deploy.yml

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -724,12 +724,12 @@ jobs:
724724
ACTIVE_CONTAINER="api-${ACTIVE_SLOT}"
725725
DEPLOY_STATUS="UNKNOWN"
726726
727-
# Health check via docker exec — NO host port binding required.
728-
# api containers live only on api_network; localhost:3000 here means
729-
# the container's own loopback (executed via docker exec).
727+
# Health check via in-network curl container — exercises Docker DNS
728+
# and bridge routing (same path nginx uses). NO host port binding needed.
729+
FT_CURL_IMG="curlimages/curl:8.7.1"
730730
if docker inspect "$ACTIVE_CONTAINER" >/dev/null 2>&1; then
731-
if docker exec "$ACTIVE_CONTAINER" \
732-
curl -sf --max-time 5 "http://localhost:3000/health" >/dev/null 2>&1; then
731+
if docker run --rm --network api_network "$FT_CURL_IMG" \
732+
-sf --max-time 5 "http://$ACTIVE_CONTAINER:3000/health" >/dev/null 2>&1; then
733733
DEPLOY_STATUS="SUCCESS"
734734
else
735735
DEPLOY_STATUS="UNHEALTHY"
@@ -778,10 +778,15 @@ jobs:
778778
exit 1
779779
}
780780
781-
# Poll /ready via Docker service DNS (no host port binding needed)
781+
# Poll /ready via in-network curl (Docker DNS + bridge routing).
782+
# /ready checks Redis, Supabase, and BullMQ — definitive readiness gate.
783+
# Uses docker run rather than docker exec so the check exercises the
784+
# same network path nginx uses, not the container's own loopback.
785+
FT_CURL_IMG="curlimages/curl:8.7.1"
782786
for i in $(seq 1 15); do
783-
STATUS=$(docker exec "$ACTIVE_CONTAINER" \
784-
curl -s -o /dev/null -w "%{http_code}" "http://localhost:3000/ready" 2>/dev/null || echo "000")
787+
STATUS=$(docker run --rm --network api_network "$FT_CURL_IMG" \
788+
-s -o /dev/null -w "%{http_code}" \
789+
"http://$ACTIVE_CONTAINER:3000/ready" 2>/dev/null || echo "000")
785790
if [ "$STATUS" = "200" ]; then
786791
echo "✓ API ready (container $ACTIVE_CONTAINER, attempt $i)"
787792
exit 0

infra/nginx/api.conf

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,17 @@ server {
8585
root /var/www/certbot;
8686
}
8787

88+
# Nginx-level liveness probe — answered by nginx directly, no upstream needed.
89+
# Used by: in-network post-switch routing checks, CI health gates, and
90+
# monitoring probes. Returns 200 even when the API container is down so that
91+
# nginx infrastructure health never depends on backend readiness.
92+
# This endpoint intentionally does NOT proxy to the API backend.
93+
location = /health {
94+
access_log off;
95+
return 200 '{"status":"ok"}';
96+
add_header Content-Type application/json;
97+
}
98+
8899
location / {
89100
return 301 https://$host$request_uri;
90101
}

infra/scripts/render-alertmanager.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ case "${ALERTMANAGER_SLACK_WEBHOOK}" in
8282
;;
8383
*)
8484
log_error "ALERTMANAGER_SLACK_WEBHOOK does not start with 'https://hooks.slack.com/'."
85-
log_error "Value prefix: $(printf '%s' "${ALERTMANAGER_SLACK_WEBHOOK}" | cut -c1-30)..."
85+
log_error "Value prefix: ***masked*** (redacted to prevent webhook exposure in logs)"
8686
exit 1
8787
;;
8888
esac

scripts/deploy-bluegreen.sh

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ _ft_snapshot() {
100100
{ set +x; } 2>/dev/null
101101
printf '[DEPLOY] -- SYSTEM SNAPSHOT ----------------------------------------\n' >&2
102102
printf '[DEPLOY] slot_file = %s\n' "$(cat "${ACTIVE_SLOT_FILE:-/var/run/api/active-slot}" 2>/dev/null || echo 'MISSING')" >&2
103-
printf '[DEPLOY] nginx_upstream = %s\n' "$(grep -oE 'server (api-blue|api-green):3000' "${NGINX_CONF:-$HOME/api/infra/nginx/live/api.conf}" 2>/dev/null | head -1 || echo 'unreadable')" >&2
103+
printf '[DEPLOY] nginx_upstream = %s\n' "$(grep -oE 'http://(api-blue|api-green):3000' "${NGINX_CONF:-$HOME/api/infra/nginx/live/api.conf}" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo 'unreadable')" >&2
104104
printf '[DEPLOY] containers =\n' >&2
105105
docker ps --format '[DEPLOY] {{.Names}} -> {{.Status}} ({{.Ports}})' 1>&2 2>/dev/null \
106106
|| printf '[DEPLOY] (docker ps unavailable)\n' >&2
@@ -150,8 +150,16 @@ fi
150150
# ---------------------------------------------------------------------------
151151
# CONSTANTS
152152
# ---------------------------------------------------------------------------
153-
IMAGE="ghcr.io/fieldtrack-tech/api:${1:-latest}"
154-
IMAGE_SHA="${1:-latest}"
153+
# Immutable SHA tags ONLY — 'latest' is forbidden in production.
154+
# Reject empty and 'latest' before any Docker operation so failures are
155+
# loud and attributed to the caller rather than appearing as pull errors.
156+
IMAGE_SHA="${1:-}"
157+
if [ -z "$IMAGE_SHA" ] || [ "$IMAGE_SHA" = "latest" ]; then
158+
printf '[DEPLOY] ts=%s state=INIT level=ERROR msg="image SHA required -- latest tag is forbidden in production" sha=%s\n' \
159+
"$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "${IMAGE_SHA:-<empty>}" >&2
160+
exit 2
161+
fi
162+
IMAGE="ghcr.io/fieldtrack-tech/api:$IMAGE_SHA"
155163

156164
BLUE_NAME="api-blue"
157165
GREEN_NAME="api-green"
@@ -403,7 +411,7 @@ _ft_resolve_slot() {
403411
elif [ "$blue_running" = "true" ] && [ "$green_running" = "true" ]; then
404412
# Both running -- read nginx upstream container as authoritative tiebreaker.
405413
local nginx_upstream
406-
nginx_upstream=$(grep -oE 'server (api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "")
414+
nginx_upstream=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "")
407415
if [ "$nginx_upstream" = "api-blue" ]; then recovered_slot="blue"
408416
elif [ "$nginx_upstream" = "api-green" ]; then recovered_slot="green"
409417
else
@@ -943,7 +951,8 @@ _ft_log "msg='nginx reloaded' upstream=$INACTIVE_NAME:$APP_PORT"
943951

944952
# Upstream sanity check -- confirm nginx config actually points at the new container.
945953
# Catches template substitution failures before traffic is affected.
946-
_RELOAD_CONTAINER=$(grep -oE 'server (api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "")
954+
# Upstream sanity: live config must contain http://INACTIVE_NAME:3000 (set $api_backend format)
955+
_RELOAD_CONTAINER=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "")
947956
if [ "$_RELOAD_CONTAINER" != "$INACTIVE_NAME" ]; then
948957
_ft_log "level=ERROR msg='nginx upstream sanity check failed after reload' expected=$INACTIVE_NAME actual=${_RELOAD_CONTAINER:-unreadable}"
949958
cp "$NGINX_BACKUP" "$NGINX_CONF"
@@ -1030,8 +1039,8 @@ for _attempt in 1 2 3 4 5; do
10301039
sleep 5
10311040
done
10321041

1033-
# Container alignment check -- live nginx config MUST point at the new container.
1034-
_NGINX_CONTAINER=$(grep -oE 'server (api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "")
1042+
# Container alignment check -- live nginx config MUST contain http://INACTIVE_NAME:3000.
1043+
_NGINX_CONTAINER=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "")
10351044
if [ -n "$_NGINX_CONTAINER" ] && [ "$_NGINX_CONTAINER" != "$INACTIVE_NAME" ]; then
10361045
_ft_log "level=ERROR msg='nginx container mismatch -- slot switch did not take effect' expected=$INACTIVE_NAME actual=$_NGINX_CONTAINER"
10371046
_PUB_PASSED=false
@@ -1193,8 +1202,8 @@ else
11931202
_FT_TRUTH_CHECK_PASSED=false
11941203
fi
11951204

1196-
# (2) Verify nginx upstream container matches target
1197-
_NGINX_CONTAINER=$(grep -oE 'server (api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "")
1205+
# (2) Verify nginx upstream container matches target (set $api_backend format)
1206+
_NGINX_CONTAINER=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "")
11981207
if [ -n "$_NGINX_CONTAINER" ]; then
11991208
if [ "$_NGINX_CONTAINER" != "$INACTIVE_NAME" ]; then
12001209
_ft_log "level=ERROR msg='truth check failed: nginx container mismatch' expected=$INACTIVE_NAME actual=$_NGINX_CONTAINER"

scripts/validate-env.sh

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,10 +61,14 @@ done
6161

6262
# ── Helper: read a value from a KEY=VALUE env file ─────────────────────────────
6363
# Usage: get_val KEY /path/to/file
64+
#
65+
# Uses grep + cut only — avoids sed quote-stripping which corrupts values
66+
# containing special characters (URLs, tokens, passwords).
67+
# head -1 returns the first occurrence when a key is duplicated; the ^ anchor
68+
# and the literal '=' in the pattern ensure KEY_EXTRA= cannot match KEY=.
6469
get_val() {
6570
local key="$1" file="$2"
66-
grep -E "^${key}=" "$file" 2>/dev/null | tail -1 | cut -d'=' -f2- \
67-
| sed "s/^['\"]//; s/['\"]$//"
71+
grep -E "^${key}=" "$file" 2>/dev/null | head -1 | cut -d'=' -f2-
6872
}
6973

7074
DERIVED_HOSTNAME=""
@@ -223,11 +227,12 @@ else
223227
SLACK_WEBHOOK="$(get_val "ALERTMANAGER_SLACK_WEBHOOK" "$MONITORING_ENV_FILE")"
224228
if [[ -z "$SLACK_WEBHOOK" ]]; then
225229
fail "ALERTMANAGER_SLACK_WEBHOOK not set in infra/.env.monitoring"
226-
elif [[ ! "$SLACK_WEBHOOK" =~ ^https://hooks.slack.com/ ]]; then
230+
elif [[ ! "$SLACK_WEBHOOK" =~ ^https://hooks\.slack\.com/ ]]; then
227231
fail "ALERTMANAGER_SLACK_WEBHOOK is not a valid Slack webhook URL"
228232
else
229233
pass "ALERTMANAGER_SLACK_WEBHOOK is valid"
230234
fi
235+
unset SLACK_WEBHOOK
231236

232237
# Cross-check 1: API_HOSTNAME must match the hostname derived from API_BASE_URL
233238
MON_HOSTNAME="$(get_val "API_HOSTNAME" "$MONITORING_ENV_FILE")"

0 commit comments

Comments
 (0)