Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 13 additions & 8 deletions .github/workflows/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -724,12 +724,12 @@ jobs:
ACTIVE_CONTAINER="api-${ACTIVE_SLOT}"
DEPLOY_STATUS="UNKNOWN"

# Health check via docker exec — NO host port binding required.
# api containers live only on api_network; localhost:3000 here means
# the container's own loopback (executed via docker exec).
# Health check via in-network curl container — exercises Docker DNS
# and bridge routing (same path nginx uses). NO host port binding needed.
FT_CURL_IMG="curlimages/curl:8.7.1"
if docker inspect "$ACTIVE_CONTAINER" >/dev/null 2>&1; then
if docker exec "$ACTIVE_CONTAINER" \
curl -sf --max-time 5 "http://localhost:3000/health" >/dev/null 2>&1; then
if docker run --rm --network api_network "$FT_CURL_IMG" \
-sf --max-time 5 "http://$ACTIVE_CONTAINER:3000/health" >/dev/null 2>&1; then
DEPLOY_STATUS="SUCCESS"
else
DEPLOY_STATUS="UNHEALTHY"
Expand Down Expand Up @@ -778,10 +778,15 @@ jobs:
exit 1
}

# Poll /ready via Docker service DNS (no host port binding needed)
# Poll /ready via in-network curl (Docker DNS + bridge routing).
# /ready checks Redis, Supabase, and BullMQ — definitive readiness gate.
# Uses docker run rather than docker exec so the check exercises the
# same network path nginx uses, not the container's own loopback.
FT_CURL_IMG="curlimages/curl:8.7.1"
for i in $(seq 1 15); do
STATUS=$(docker exec "$ACTIVE_CONTAINER" \
curl -s -o /dev/null -w "%{http_code}" "http://localhost:3000/ready" 2>/dev/null || echo "000")
STATUS=$(docker run --rm --network api_network "$FT_CURL_IMG" \
-s -o /dev/null -w "%{http_code}" \
"http://$ACTIVE_CONTAINER:3000/ready" 2>/dev/null || echo "000")
if [ "$STATUS" = "200" ]; then
echo "βœ“ API ready (container $ACTIVE_CONTAINER, attempt $i)"
exit 0
Expand Down
11 changes: 11 additions & 0 deletions infra/nginx/api.conf
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,17 @@ server {
root /var/www/certbot;
}

# Nginx-level liveness probe — answered by nginx directly, no upstream needed.
# Used by: in-network post-switch routing checks, CI health gates, and
# monitoring probes. Returns 200 even when the API container is down so that
# nginx infrastructure health never depends on backend readiness.
# This endpoint intentionally does NOT proxy to the API backend.
location = /health {
    # Liveness endpoint served by nginx itself; nothing is proxied upstream.
    access_log off;
    # Set the MIME type with default_type rather than add_header:
    # add_header appends an extra Content-Type header next to the one nginx
    # already derives from default_type, so clients would receive two
    # conflicting Content-Type headers.
    default_type application/json;
    return 200 '{"status":"ok"}';
}

location / {
return 301 https://$host$request_uri;
}
Expand Down
2 changes: 1 addition & 1 deletion infra/scripts/render-alertmanager.sh
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ case "${ALERTMANAGER_SLACK_WEBHOOK}" in
;;
*)
log_error "ALERTMANAGER_SLACK_WEBHOOK does not start with 'https://hooks.slack.com/'."
log_error "Value prefix: $(printf '%s' "${ALERTMANAGER_SLACK_WEBHOOK}" | cut -c1-30)..."
log_error "Value prefix: ***masked*** (redacted to prevent webhook exposure in logs)"
exit 1
;;
esac
Expand Down
27 changes: 18 additions & 9 deletions scripts/deploy-bluegreen.sh
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ _ft_snapshot() {
{ set +x; } 2>/dev/null
printf '[DEPLOY] -- SYSTEM SNAPSHOT ----------------------------------------\n' >&2
printf '[DEPLOY] slot_file = %s\n' "$(cat "${ACTIVE_SLOT_FILE:-/var/run/api/active-slot}" 2>/dev/null || echo 'MISSING')" >&2
printf '[DEPLOY] nginx_upstream = %s\n' "$(grep -oE 'server (api-blue|api-green):3000' "${NGINX_CONF:-$HOME/api/infra/nginx/live/api.conf}" 2>/dev/null | head -1 || echo 'unreadable')" >&2
printf '[DEPLOY] nginx_upstream = %s\n' "$(grep -oE 'http://(api-blue|api-green):3000' "${NGINX_CONF:-$HOME/api/infra/nginx/live/api.conf}" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo 'unreadable')" >&2
printf '[DEPLOY] containers =\n' >&2
docker ps --format '[DEPLOY] {{.Names}} -> {{.Status}} ({{.Ports}})' 1>&2 2>/dev/null \
|| printf '[DEPLOY] (docker ps unavailable)\n' >&2
Expand Down Expand Up @@ -150,8 +150,16 @@ fi
# ---------------------------------------------------------------------------
# CONSTANTS
# ---------------------------------------------------------------------------
IMAGE="ghcr.io/fieldtrack-tech/api:${1:-latest}"
IMAGE_SHA="${1:-latest}"
# Immutable SHA tags ONLY β€” 'latest' is forbidden in production.
# Reject empty and 'latest' before any Docker operation so failures are
# loud and attributed to the caller rather than appearing as pull errors.
IMAGE_SHA="${1:-}"
if [ -z "$IMAGE_SHA" ] || [ "$IMAGE_SHA" = "latest" ]; then
printf '[DEPLOY] ts=%s state=INIT level=ERROR msg="image SHA required -- latest tag is forbidden in production" sha=%s\n' \
"$(date -u +"%Y-%m-%dT%H:%M:%SZ")" "${IMAGE_SHA:-<empty>}" >&2
exit 2
fi
IMAGE="ghcr.io/fieldtrack-tech/api:$IMAGE_SHA"

BLUE_NAME="api-blue"
GREEN_NAME="api-green"
Expand Down Expand Up @@ -403,7 +411,7 @@ _ft_resolve_slot() {
elif [ "$blue_running" = "true" ] && [ "$green_running" = "true" ]; then
# Both running -- read nginx upstream container as authoritative tiebreaker.
local nginx_upstream
nginx_upstream=$(grep -oE 'server (api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "")
nginx_upstream=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "")
if [ "$nginx_upstream" = "api-blue" ]; then recovered_slot="blue"
elif [ "$nginx_upstream" = "api-green" ]; then recovered_slot="green"
else
Expand Down Expand Up @@ -943,7 +951,8 @@ _ft_log "msg='nginx reloaded' upstream=$INACTIVE_NAME:$APP_PORT"

# Upstream sanity check -- confirm nginx config actually points at the new container.
# Catches template substitution failures before traffic is affected.
_RELOAD_CONTAINER=$(grep -oE 'server (api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "")
# Upstream sanity: live config must contain http://INACTIVE_NAME:3000 (set $api_backend format)
_RELOAD_CONTAINER=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "")
if [ "$_RELOAD_CONTAINER" != "$INACTIVE_NAME" ]; then
_ft_log "level=ERROR msg='nginx upstream sanity check failed after reload' expected=$INACTIVE_NAME actual=${_RELOAD_CONTAINER:-unreadable}"
cp "$NGINX_BACKUP" "$NGINX_CONF"
Expand Down Expand Up @@ -1030,8 +1039,8 @@ for _attempt in 1 2 3 4 5; do
sleep 5
done

# Container alignment check -- live nginx config MUST point at the new container.
_NGINX_CONTAINER=$(grep -oE 'server (api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "")
# Container alignment check -- live nginx config MUST contain http://INACTIVE_NAME:3000.
_NGINX_CONTAINER=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "")
if [ -n "$_NGINX_CONTAINER" ] && [ "$_NGINX_CONTAINER" != "$INACTIVE_NAME" ]; then
_ft_log "level=ERROR msg='nginx container mismatch -- slot switch did not take effect' expected=$INACTIVE_NAME actual=$_NGINX_CONTAINER"
_PUB_PASSED=false
Expand Down Expand Up @@ -1193,8 +1202,8 @@ else
_FT_TRUTH_CHECK_PASSED=false
fi

# (2) Verify nginx upstream container matches target
_NGINX_CONTAINER=$(grep -oE 'server (api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "")
# (2) Verify nginx upstream container matches target (set $api_backend format)
_NGINX_CONTAINER=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "")
if [ -n "$_NGINX_CONTAINER" ]; then
if [ "$_NGINX_CONTAINER" != "$INACTIVE_NAME" ]; then
_ft_log "level=ERROR msg='truth check failed: nginx container mismatch' expected=$INACTIVE_NAME actual=$_NGINX_CONTAINER"
Expand Down
11 changes: 8 additions & 3 deletions scripts/validate-env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,14 @@ done

# ── Helper: read a value from a KEY=VALUE env file ─────────────────────────────
# Usage: get_val KEY /path/to/file
#
# Prints the value of the first KEY=VALUE line in the file, exactly as
# written (surrounding quotes are NOT stripped). Prints nothing when the key
# or the file is missing.
#
# Uses grep + cut only — avoids sed quote-stripping, which corrupts values
# containing special characters (URLs, tokens, passwords).
# head -n 1 prevents duplicate-key ambiguity (first definition wins); the
# ^KEY= pattern anchors on the full key name so KEY_EXTRA= cannot
# accidentally match KEY=.
# KEY is interpolated into an extended regex, so pass plain identifiers only.
get_val() {
  local key="$1" file="$2"
  # cut -f2- keeps everything after the first '=', so values that themselves
  # contain '=' (e.g. URL query strings) are returned intact.
  grep -E -- "^${key}=" "$file" 2>/dev/null | head -n 1 | cut -d'=' -f2-
}

DERIVED_HOSTNAME=""
Expand Down Expand Up @@ -223,11 +227,12 @@ else
SLACK_WEBHOOK="$(get_val "ALERTMANAGER_SLACK_WEBHOOK" "$MONITORING_ENV_FILE")"
if [[ -z "$SLACK_WEBHOOK" ]]; then
fail "ALERTMANAGER_SLACK_WEBHOOK not set in infra/.env.monitoring"
elif [[ ! "$SLACK_WEBHOOK" =~ ^https://hooks.slack.com/ ]]; then
elif [[ ! "$SLACK_WEBHOOK" =~ ^https://hooks\.slack\.com/ ]]; then
fail "ALERTMANAGER_SLACK_WEBHOOK is not a valid Slack webhook URL"
else
pass "ALERTMANAGER_SLACK_WEBHOOK is valid"
fi
unset SLACK_WEBHOOK

# Cross-check 1: API_HOSTNAME must match the hostname derived from API_BASE_URL
MON_HOSTNAME="$(get_val "API_HOSTNAME" "$MONITORING_ENV_FILE")"
Expand Down
Loading