Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 48 additions & 21 deletions .github/workflows/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -865,26 +865,37 @@ jobs:
echo "✓ Nginx reloaded."

# ROUTING VALIDATION — Test actual traffic through Nginx
# Config syntax is valid (nginx -t) but routing may still be broken.
# Test by hitting the /health endpoint via localhost + Host header.
echo "=== Testing Nginx routing (localhost + Host header) ==="
# Phase 1 (source of truth): in-network docker run inside api_network.
# Phase 2 (advisory): HTTPS via localhost + Host header; --insecure handles
# Cloudflare origin cert. status=000 = host→Docker TCP routing issue, not TLS.
echo "=== Testing Nginx routing (in-network primary, HTTPS advisory) ==="
sleep 2 # Give Nginx a moment to fully apply reload

ROUTE_STATUS=$(curl -s -o /dev/null -w "%{http_code}" \
--resolve "$API_HOSTNAME:443:127.0.0.1" \
-H "Host: $API_HOSTNAME" \
"https://127.0.0.1/health" --insecure 2>/dev/null || echo "000")


ROUTE_STATUS=$(docker run --rm --network api_network \
curlimages/curl:8.7.1 -s -o /dev/null -w "%{http_code}" \
--max-time 10 http://nginx/health 2>/dev/null || echo "000")

if [ "$ROUTE_STATUS" = "200" ]; then
echo "✓ Nginx routing verified (HTTP $ROUTE_STATUS)"
echo "✓ Nginx routing verified via in-network check (HTTP $ROUTE_STATUS)"
else
echo "❌ Nginx routing broken (HTTP $ROUTE_STATUS expected 200) — restoring backup..."
echo "❌ Nginx in-network routing broken (HTTP $ROUTE_STATUS expected 200) — restoring backup..."
LATEST_BAK=$(ls -1t "$NGINX_BACKUP_DIR"/api.conf.bak.* 2>/dev/null | head -1 || true)
[ -n "$LATEST_BAK" ] && cp "$LATEST_BAK" "$NGINX_LIVE"
docker exec nginx nginx -t 2>&1 && docker exec nginx nginx -s reload || true
exit 1
fi

# HTTPS advisory check (non-blocking — host→Docker loopback may fail with status=000)
HTTPS_STATUS=$(curl -s -o /dev/null -w "%{http_code}" \
--resolve "$API_HOSTNAME:443:127.0.0.1" \
-H "Host: $API_HOSTNAME" \
"https://127.0.0.1/health" --insecure 2>/dev/null || echo "000")
if [ "$HTTPS_STATUS" = "200" ]; then
echo "✓ HTTPS advisory check passed (HTTP $HTTPS_STATUS)"
else
echo "⚠ HTTPS advisory status=$HTTPS_STATUS (host→Docker TCP routing; in-network check is authoritative)"
fi

echo "✓ Infra sync completed in $(($(date +%s) - T0))s"

# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -979,22 +990,30 @@ jobs:
echo "=== Checking /health via VPS (API_HOSTNAME=$API_HOSTNAME) ==="
for i in $(seq 1 30); do
echo "---- Attempt $i ----"
# Phase 1: in-network (source of truth)
INNET_BODY=$(docker run --rm --network api_network \
curlimages/curl:8.7.1 -s --max-time 5 http://nginx/health 2>/dev/null || echo "")
if echo "$INNET_BODY" | grep -q '"status":"ok"'; then
echo "✓ /health OK via in-network (attempt $i)"
exit 0
fi
# Phase 2: HTTPS advisory (--insecure for Cloudflare origin cert; status=000 = host→Docker TCP issue)
STATUS=$(curl -sS \
--resolve "${API_HOSTNAME}:443:127.0.0.1" \
-o /tmp/resp.txt \
-w "%{http_code}" \
https://${API_HOSTNAME}/health \
--insecure || echo "000")
--insecure 2>/dev/null || echo "000")
BODY=$(cat /tmp/resp.txt 2>/dev/null || echo "")
echo "HTTP: $STATUS"
echo "BODY: $BODY"
echo "HTTP: $STATUS BODY: $BODY"
if [ "$STATUS" = "200" ] && echo "$BODY" | grep -q '"status":"ok"'; then
echo "✓ /health OK (attempt $i)"
echo "✓ /health OK via HTTPS (attempt $i)"
exit 0
fi
[ "$STATUS" = "000" ] && echo "⚠ HTTPS status=000 (host→Docker routing; in-network is authoritative)"
sleep 2
done
echo "❌ /health failed"
echo "❌ /health failed after 30 attempts"
exit 1

- name: Wait for /health endpoint (final public check)
Expand All @@ -1012,22 +1031,30 @@ jobs:
echo "=== Final health check via public endpoint (API_HOSTNAME=$API_HOSTNAME) ==="
for i in $(seq 1 10); do
echo "---- Attempt $i ----"
# Phase 1: in-network (source of truth)
INNET_BODY=$(docker run --rm --network api_network \
curlimages/curl:8.7.1 -s --max-time 5 http://nginx/health 2>/dev/null || echo "")
if echo "$INNET_BODY" | grep -q '"status":"ok"'; then
echo "✓ /health OK via in-network (attempt $i)"
exit 0
fi
# Phase 2: HTTPS advisory (--insecure for Cloudflare origin cert; status=000 = host→Docker TCP issue)
STATUS=$(curl -sS \
--resolve "${API_HOSTNAME}:443:127.0.0.1" \
-o /tmp/resp.txt \
-w "%{http_code}" \
https://${API_HOSTNAME}/health \
--insecure || echo "000")
--insecure 2>/dev/null || echo "000")
BODY=$(cat /tmp/resp.txt 2>/dev/null || echo "")
echo "HTTP: $STATUS"
echo "BODY: $BODY"
echo "HTTP: $STATUS BODY: $BODY"
if [ "$STATUS" = "200" ] && echo "$BODY" | grep -q '"status":"ok"'; then
echo "✓ /health OK (attempt $i)"
echo "✓ /health OK via HTTPS (attempt $i)"
exit 0
fi
[ "$STATUS" = "000" ] && echo "⚠ HTTPS status=000 (host→Docker routing; in-network is authoritative)"
sleep 2
done
echo "❌ /health failed"
echo "❌ /health failed after 10 attempts"
exit 1

- name: Run smoke tests
Expand Down
2 changes: 1 addition & 1 deletion infra/docker-compose.monitoring.yml
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ services:
max-file: "3"

healthcheck:
test: ["CMD", "wget", "--spider", "-q", "http://localhost:80/health"]
test: ["CMD", "wget", "--no-check-certificate", "-qO-", "https://localhost/health"]
interval: 30s
timeout: 5s
retries: 3
Expand Down
111 changes: 87 additions & 24 deletions scripts/deploy-bluegreen.sh
Original file line number Diff line number Diff line change
Expand Up @@ -172,33 +172,27 @@ NETWORK="api_network"
_FT_CURL_IMG="curlimages/curl:8.7.1"
# In-network curl helper.
#
# Runs the probe from a short-lived curl container ($_FT_CURL_IMG) attached to
# $NETWORK, so the request exercises Docker DNS + bridge routing (the same
# path nginx uses). Nothing is exec'd inside the target container, so targets
# that ship no curl binary (e.g. distroless images) can still be probed.
#
# Usage: _ft_net_curl <container_name> <curl-flags...>
#   <container_name>  not used (kept for signature compat).
#   <curl-flags...>   passed verbatim to curl inside the helper container.
# Globals:  NETWORK, _FT_CURL_IMG (read)
# Outputs:  none — stdout and stderr of the probe are discarded.
# Returns:  exit status of the single `docker run` invocation.
_ft_net_curl() {
  shift # drop <container_name>; only the curl flags are forwarded
  # Exactly one probe per call: no exec fallback and no second attempt, so a
  # failure is reported once and callers keep control of retries.
  docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" "$@" >/dev/null 2>&1
}
# Variant of _ft_net_curl that captures what curl writes to stdout (response
# body, or an HTTP status code when the caller passes -w) instead of only
# testing for success. Used where the response text is needed for status
# checks.
#
# Usage: _ft_net_curl_out <container_name> <curl-flags...>
#   <container_name>  not used (kept for signature compat).
#   <curl-flags...>   passed verbatim to curl inside the helper container.
# Globals:  NETWORK, _FT_CURL_IMG (read)
# Outputs:  curl's stdout on success; the empty string if `docker run` fails.
# Returns:  status of the final printf — a failed probe is signalled by empty
#           output, not by the exit code.
_ft_net_curl_out() {
  shift # drop <container_name>; only the curl flags are forwarded
  local _out
  # Single request per call: the captured output is returned as-is rather
  # than being overwritten by a second probe.
  _out=$(docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" "$@" 2>/dev/null) || _out=""
  printf '%s' "$_out"
}

Expand Down Expand Up @@ -277,22 +271,50 @@ _ft_release_lock() {
# Reports whether /health answers with "status":"ok" through nginx.
#
# Phase 1 (source of truth): in-network request to http://nginx/health via
#   _ft_net_curl_out. A match returns success immediately.
# Phase 2 (advisory / TLS diagnostic): up to three attempts from the host at
#   https://$API_HOSTNAME/health pinned to 127.0.0.1:443 with --insecure. When
#   an attempt yields an empty body, the same path is retried over plain HTTP
#   on 127.0.0.1:80. Sleeps <attempt> seconds between attempts.
#
# Globals:  API_HOSTNAME (read)
# Outputs:  diagnostic lines on stdout when phase 2 sees an empty body.
# Returns:  0 as soon as any probe contains "status":"ok"; 1 otherwise.
# Note:     xtrace is switched off on entry and unconditionally switched back
#           on before every return.
_ft_check_external_ready() {
  { set +x; } 2>/dev/null
  local attempt
  local body

  # Phase 1 — in-network routing (source of truth).
  # Hits nginx directly via Docker bridge; validates full nginx→api routing path.
  body=$(_ft_net_curl_out "nginx" -s --max-time 5 "http://nginx/health" 2>/dev/null || echo "")
  if printf '%s\n' "$body" | grep -qF -- '"status":"ok"' 2>/dev/null; then
    set -x
    return 0
  fi

  # Phase 2 — HTTPS via localhost + Host header (advisory / TLS diagnostic).
  # --insecure accepts Cloudflare origin certificate.
  # status=000 means host→Docker TCP routing issue, NOT a TLS problem.
  for attempt in 1 2 3; do
    body=$(curl -sS --max-time 5 \
      --resolve "$API_HOSTNAME:443:127.0.0.1" \
      "https://$API_HOSTNAME/health" \
      --insecure 2>/dev/null || echo "")
    if printf '%s\n' "$body" | grep -qF -- '"status":"ok"' 2>/dev/null; then
      set -x
      return 0
    fi

    if [ -z "$body" ]; then
      # Nothing came back over HTTPS: log it, then try plain HTTP on port 80.
      { printf 'external-ready: HTTPS phase-2 attempt %s — status=000 (host→Docker port routing, not TLS)\n' "$attempt"; } 2>/dev/null
      body=$(curl -sS --max-time 5 \
        --resolve "$API_HOSTNAME:80:127.0.0.1" \
        "http://$API_HOSTNAME/health" 2>/dev/null || echo "")
      if printf '%s\n' "$body" | grep -qF -- '"status":"ok"' 2>/dev/null; then
        { printf 'external-ready: HTTP:80 fallback passed (attempt %s)\n' "$attempt"; } 2>/dev/null
        set -x
        return 0
      fi
    fi

    # Linear back-off between attempts; no sleep after the last one.
    if [ "$attempt" -lt 3 ]; then
      sleep "$attempt"
    fi
  done

  set -x
  return 1
}
Expand Down Expand Up @@ -631,11 +653,10 @@ if ! docker ps -a --format '{{.Names}}' | grep -Eq '^api-(blue|green)$'; then
# /ready can lag the HTTP server bind by ~1–3 s while workers start.
sleep 2

# Bootstrap readiness: use docker exec (only safe choice when no other
# container is guaranteed to be running yet on api_network).
# Bootstrap readiness: use docker run (works with distroless containers).
_BOOT_OK=false
for _bi in $(seq 1 20); do
if docker exec api-blue curl -sf --max-time 4 "http://localhost:${APP_PORT}/ready" >/dev/null 2>&1; then
if docker run --rm --network "$NETWORK" "$_FT_CURL_IMG" -sf --max-time 4 "http://api-blue:${APP_PORT}/ready" >/dev/null 2>&1; then
_ft_log "msg='bootstrap: api-blue ready' attempt=$_bi"
_BOOT_OK=true
break
Expand Down Expand Up @@ -1021,24 +1042,66 @@ sleep 3
_PUB_PASSED=false
_PUB_STATUS="000"

# Phase 1 — in-network routing (source of truth for rollback decision).
# Validates full nginx→api-<slot>:3000 path inside Docker bridge network.
for _attempt in 1 2 3; do
_P1_BODY=$(_ft_net_curl_out "nginx" -s --max-time 10 "http://nginx/ready" 2>/dev/null || echo "")
if echo "$_P1_BODY" | grep -q '"status":"ready"' 2>/dev/null; then
_PUB_PASSED=true
_PUB_STATUS="200-innet"
_ft_log "msg='public health phase-1 (in-network) passed' attempt=$_attempt/3 container=$INACTIVE_NAME"
unset _P1_BODY
break
fi
_ft_log "msg='public health phase-1 (in-network) attempt failed' attempt=$_attempt/3"
unset _P1_BODY
sleep 3
done

# Phase 2 — HTTPS via localhost + Host header (advisory / TLS diagnostic).
# Uses --insecure to accept Cloudflare origin certificate.
# NOTE: status=000 means host→Docker TCP port routing issue, NOT a TLS problem
# (--insecure already handles cert trust). In-network result above is authoritative.
_HTTPS_PASSED=false
_HTTPS_STATUS="000"
for _attempt in 1 2 3 4 5; do
_PUB_BODY=$(mktemp)
_PUB_STATUS=$(curl --max-time 10 -sS -o "$_PUB_BODY" -w "%{http_code}" \
_HTTPS_STATUS=$(curl --max-time 10 -sS -o "$_PUB_BODY" -w "%{http_code}" \
--resolve "$API_HOSTNAME:443:127.0.0.1" \
"https://$API_HOSTNAME/ready" \
--insecure 2>&1 || echo "000")
--insecure 2>/dev/null || echo "000")

if [ "$_PUB_STATUS" = "200" ] && grep -q '"status":"ready"' "$_PUB_BODY" 2>/dev/null; then
_PUB_PASSED=true
if [ "$_HTTPS_STATUS" = "200" ] && grep -q '"status":"ready"' "$_PUB_BODY" 2>/dev/null; then
_HTTPS_PASSED=true
rm -f "$_PUB_BODY"
break
fi

_ft_log "msg='public health attempt failed' attempt=$_attempt/5 status=$_PUB_STATUS host=$API_HOSTNAME"
if [ "$_HTTPS_STATUS" = "000" ]; then
_ft_log "msg='HTTPS phase-2 status=000 — host→Docker port routing unreachable (not a TLS error; in-network is source of truth)' attempt=$_attempt/5"
_HTTP_FALLBACK=$(curl -sS --max-time 5 \
--resolve "$API_HOSTNAME:80:127.0.0.1" \
"http://$API_HOSTNAME/ready" 2>/dev/null || echo "")
if echo "$_HTTP_FALLBACK" | grep -q '"status":"ready"' 2>/dev/null; then
_ft_log "msg='HTTP:80 fallback confirmed backend reachable' attempt=$_attempt"
_HTTPS_PASSED=true
_HTTPS_STATUS="200-http"
fi
unset _HTTP_FALLBACK
fi
[ "$_HTTPS_PASSED" = "true" ] && { rm -f "$_PUB_BODY"; break; }
_ft_log "msg='HTTPS phase-2 attempt failed' attempt=$_attempt/5 status=$_HTTPS_STATUS host=$API_HOSTNAME"
rm -f "$_PUB_BODY"
sleep 5
done

if [ "$_HTTPS_PASSED" = "true" ]; then
_ft_log "msg='HTTPS phase-2 passed' status=$_HTTPS_STATUS container=$INACTIVE_NAME"
else
_ft_log "level=WARN msg='HTTPS phase-2 diagnostic failed (non-blocking)' status=$_HTTPS_STATUS host=$API_HOSTNAME note='host→Docker routing issue; in-network is authoritative'"
fi
unset _HTTPS_PASSED _HTTPS_STATUS _PUB_BODY

# Container alignment check -- live nginx config MUST contain http://INACTIVE_NAME:3000.
_NGINX_CONTAINER=$(grep -oE 'http://(api-blue|api-green):3000' "$NGINX_CONF" 2>/dev/null | grep -oE 'api-blue|api-green' | head -1 || echo "")
if [ -n "$_NGINX_CONTAINER" ] && [ "$_NGINX_CONTAINER" != "$INACTIVE_NAME" ]; then
Expand Down Expand Up @@ -1093,7 +1156,7 @@ if [ "$_PUB_PASSED" != "true" ]; then
fi
fi

unset _PUB_PASSED _attempt _PUB_STATUS _PUB_BODY _NGINX_CONTAINER
unset _PUB_PASSED _attempt _PUB_STATUS _NGINX_CONTAINER
_ft_log "msg='public health check passed' container=$INACTIVE_NAME host=$API_HOSTNAME endpoint=/ready"

# ---------------------------------------------------------------------------
Expand Down
Loading