33# Production Deployment Pipeline
44#
55# Design principles:
6- # 1. Triggers on every push to master (no paths filter — ensures sync-beta always runs)
6+ # 1. Triggered ONLY after CodeQL deep scan completes successfully — no polling, no race.
7+ # Uses workflow_run event: deploy is event-driven, not concurrent with security scan.
78# 2. Runs ALL validation from scratch — no trust built on PR results alone
89# 3. Trivy scan runs BEFORE Docker push — vulnerable images never reach the registry
910# 4. target: production + build-args mirror pr.yml exactly (bit-for-bit parity)
1213# 7. timeout-minutes on every job — hung processes never block CI indefinitely
1314# 8. npm ci retried up to 3x — registry flakiness never kills a valid deploy
1415#
15- # Parallel stages:
16- # validate ββ
17- # test-api βββΊ build-scan-push ββΊ deploy ββΊ api-health-gate ββΊ sync-infra ββΊ sync-monitoring ββΊ health-and-smoke
18- # β β
19- # rollback ββββββββββββββ (on failure)
16+ # Pipeline order:
17+ #   codeql-gate
18+ #     ├─► validate ─┐
19+ #     └─► test-api ─┴─► build-scan-push ─► vps-readiness-check ─► deploy
20+ #                                                                   │
21+ #               api-health-gate ◄───────────────────────────────────┘
22+ #                   │
23+ #               sync-infra ─► sync-monitoring ─► health-and-smoke
24+ #                                                       │
25+ #               rollback ◄──────────────────────────────┘ (on failure)
2026
2127name : Deploy to Production
2228
2329on :
24- push :
30+ # Triggered ONLY when the CodeQL deep scan workflow completes on master.
31+ # This replaces the previous push trigger + polling approach:
32+ # - No race conditions (workflow_run fires AFTER codeql-deep finishes)
33+ # - No API polling loops or timing-dependent checks
34+ # - Deployment is blocked at the event level if CodeQL did not succeed
35+ workflow_run :
36+ workflows : ["CodeQL β Deep Scan (post-merge)"]
37+ types :
38+ - completed
2539 branches :
2640 - master
41+ # Manual dispatch retained for emergency/hotfix deploys.
42+ # The codeql-gate job enforces the conclusion check only for workflow_run.
2743 workflow_dispatch :
2844
2945# Never cancel an in-progress deployment — let it finish or fail cleanly.
@@ -36,6 +52,56 @@ permissions:
3652 contents : read
3753
3854jobs :
# ---------------------------------------------------------------------------
# JOB: codeql-gate
#
# First job in every deploy run. Two responsibilities:
#
# 1. SECURITY GATE (workflow_run only):
#    Reads github.event.workflow_run.conclusion and fails hard if CodeQL
#    did not pass. This makes the event-driven guarantee explicit and
#    visible in the pipeline UI.
#
# 2. SHA RESOLUTION:
#    On workflow_run, github.sha = HEAD of default branch at event time,
#    NOT the commit that triggered CodeQL. We must deploy exactly the SHA
#    that was scanned. Exports deploy_sha = github.event.workflow_run.head_sha
#    so all downstream jobs checkout and tag the correct commit.
#    On workflow_dispatch, deploy_sha = github.sha (HEAD of triggered branch).
#
# All subsequent jobs that do git checkout use ref: needs.codeql-gate.outputs.deploy_sha.
#
# Implementation notes:
#   - GitHub context values are handed to the shell through `env:` rather than
#     being interpolated into the script text, so their content can never be
#     parsed as shell syntax.
#   - The resolved SHA is validated before it is exported; the job fails
#     instead of publishing an empty or malformed deploy_sha.
# ---------------------------------------------------------------------------
codeql-gate:
  name: CodeQL Security Gate
  runs-on: ubuntu-latest
  timeout-minutes: 5
  outputs:
    deploy_sha: ${{ steps.sha.outputs.deploy_sha }}
  steps:
    - name: Resolve deploy SHA
      id: sha
      env:
        EVENT_NAME: ${{ github.event_name }}
        WORKFLOW_RUN_SHA: ${{ github.event.workflow_run.head_sha }}
        DISPATCH_SHA: ${{ github.sha }}
      run: |
        # workflow_run: deploy the commit CodeQL scanned.
        # Anything else (workflow_dispatch): deploy the commit of the triggering ref.
        if [ "$EVENT_NAME" = "workflow_run" ]; then
          DEPLOY_SHA="$WORKFLOW_RUN_SHA"
        else
          DEPLOY_SHA="$DISPATCH_SHA"
        fi

        # Refuse to export anything that is not a full commit hash (SHA-1 or
        # SHA-256). NOTE(review): an empty `ref` is expected to make
        # actions/checkout fall back to its default ref, which would silently
        # deploy a different commit than the one scanned; confirm against the
        # checkout action docs.
        if ! printf '%s' "$DEPLOY_SHA" | grep -Eq '^([0-9a-f]{40}|[0-9a-f]{64})$'; then
          echo "::error::Could not resolve a valid commit SHA to deploy (event=$EVENT_NAME, sha='$DEPLOY_SHA')."
          exit 1
        fi

        echo "deploy_sha=$DEPLOY_SHA" >> "$GITHUB_OUTPUT"

    - name: Verify CodeQL deep scan passed
      # Manual dispatches have no workflow_run payload, so the gate only
      # applies to event-driven runs.
      if: github.event_name == 'workflow_run'
      env:
        CONCLUSION: ${{ github.event.workflow_run.conclusion }}
        SHA: ${{ github.event.workflow_run.head_sha }}
      run: |
        echo "CodeQL deep scan conclusion : $CONCLUSION"
        echo "Scanned commit SHA : $SHA"
        if [ "$CONCLUSION" != "success" ]; then
          echo "::error::CodeQL deep scan did not pass (conclusion=$CONCLUSION)."
          echo " Deployment is blocked. Review findings before retrying:"
          echo " https://github.com/${GITHUB_REPOSITORY}/security/code-scanning"
          exit 1
        fi
        echo "✅ CodeQL gate passed — safe to deploy SHA $SHA"
104+
39105 # ---------------------------------------------------------------------------
40106 # JOB: validate
41107 #
45111 validate :
46112 name : Validate (typecheck + audit)
47113 runs-on : ubuntu-latest
114+ needs : [codeql-gate]
48115 timeout-minutes : 10
49116 steps :
50117 - name : Confirm deployment trigger
@@ -58,27 +125,10 @@ jobs:
58125
59126 - name : Checkout
60127 uses : actions/checkout@v5
61-
62- - name : Setup Node.js 24
63- uses : actions/setup-node@v5
64128 with :
65- node-version : ' 24'
66- cache : npm
67- cache-dependency-path : package-lock.json
68-
69- - name : Install dependencies (with retry)
70- run : |
71- echo "::group::npm ci"
72- for attempt in 1 2 3; do
73- npm ci && break
74- [ $attempt -eq 3 ] && { echo "::error::npm ci failed after 3 attempts"; exit 1; }
75- echo "Attempt $attempt failed β retrying in 15s..."
76- sleep 15
77- done
78- echo "::endgroup::"
129+ ref : ${{ needs.codeql-gate.outputs.deploy_sha }}
79130
80- - name : Dependency vulnerability scan
81- run : npm audit --omit=dev --audit-level=high
131+ - name : Setup Node.js 24
82132
83133 - name : TypeScript check
84134 run : npm run typecheck
@@ -102,6 +152,7 @@ jobs:
102152 test-api :
103153 name : API Tests (unit + integration)
104154 runs-on : ubuntu-latest
155+ needs : [codeql-gate]
105156 timeout-minutes : 15
106157 env :
107158 SUPABASE_URL : ${{ secrets.SUPABASE_URL_TEST }}
@@ -110,6 +161,8 @@ jobs:
110161 steps :
111162 - name : Checkout
112163 uses : actions/checkout@v5
164+ with :
165+ ref : ${{ needs.codeql-gate.outputs.deploy_sha }}
113166
114167 - name : Setup Node.js 24
115168 uses : actions/setup-node@v5
@@ -152,22 +205,29 @@ jobs:
152205 build-scan-push :
153206 name : Build, Scan & Push Docker Image
154207 runs-on : ubuntu-latest
155- needs : [validate, test-api]
208+ needs : [codeql-gate, validate, test-api]
156209 timeout-minutes : 25
157210 permissions :
158211 contents : read
159212 packages : write
160213 security-events : write
161214 outputs :
162- sha_short : ${{ steps.meta.outputs.sha_short }}
163- digest : ${{ steps.digest.outputs.digest }}
215+ sha_short : ${{ steps.meta.outputs.sha_short }}
216+ digest : ${{ steps.digest.outputs.digest }}
217+ deploy_sha : ${{ steps.meta.outputs.deploy_sha }}
164218 steps :
165219 - name : Checkout
166220 uses : actions/checkout@v5
221+ with :
222+ ref : ${{ needs.codeql-gate.outputs.deploy_sha }}
167223
168224 - name : Extract commit SHA
169225 id : meta
170- run : echo "sha_short=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
226+ env :
227+ DEPLOY_SHA : ${{ needs.codeql-gate.outputs.deploy_sha }}
228+ run : |
229+ echo "sha_short=${DEPLOY_SHA::7}" >> "$GITHUB_OUTPUT"
230+ echo "deploy_sha=$DEPLOY_SHA" >> "$GITHUB_OUTPUT"
171231
172232 - name : Set up Docker Buildx
173233 uses : docker/setup-buildx-action@v3
@@ -489,17 +549,59 @@ jobs:
489549 echo "| Registry | ghcr.io/${{ github.repository_owner }}/api |"
490550 } >> "$GITHUB_STEP_SUMMARY"
491551
# ---------------------------------------------------------------------------
# JOB: vps-readiness-check
#
# Validates the VPS is in a deployable state BEFORE running the deploy.
# Runs AFTER build-scan-push and BEFORE deploy: the deploy job lists both
# build-scan-push and vps-readiness-check in `needs`, so a failed readiness
# check blocks the deployment.
#
# Delegates to scripts/vps-readiness-check.sh which checks:
# - Docker daemon running
# - api_network exists (auto-created if missing)
# - Ports 80/443 free from non-nginx processes
# - No API containers with host port bindings
# - Required .env file present
# - Runtime directories present (auto-created if missing)
# - Sufficient disk space (auto-prunes if borderline)
#
# The readiness script is taken from the exact commit being deployed
# (build-scan-push.outputs.deploy_sha), matching the SHA the deploy job
# resets the VPS checkout to, rather than from whatever origin/master
# points at when this job happens to run.
# ---------------------------------------------------------------------------
vps-readiness-check:
  name: VPS Readiness Gate
  runs-on: ubuntu-latest
  needs: [build-scan-push]
  timeout-minutes: 10
  steps:
    - name: Run VPS readiness check via SSH
      uses: appleboy/ssh-action@v1.0.3
      with:
        host: ${{ secrets.DO_HOST }}
        username: ${{ secrets.DO_USER }}
        key: ${{ secrets.DO_SSH_KEY }}
        script: |
          set -euo pipefail
          export DEPLOY_ROOT="${DEPLOY_ROOT:-$HOME/api}"
          DEPLOY_SHA="${{ needs.build-scan-push.outputs.deploy_sha }}"
          READINESS_SCRIPT="scripts/vps-readiness-check.sh"

          [ -d "$DEPLOY_ROOT" ] || { echo "❌ DEPLOY_ROOT not found: $DEPLOY_ROOT — run vps-setup.sh first"; exit 1; }
          cd "$DEPLOY_ROOT"

          # Refresh only the readiness script (no full deploy), pinned to the
          # commit that was built and scanned. Same fetch the deploy job uses.
          git fetch origin

          # Best-effort refresh: if the pinned version cannot be checked out,
          # fall back to the copy already on disk, but say so loudly instead
          # of hiding the failure.
          if ! git checkout "$DEPLOY_SHA" -- "$READINESS_SCRIPT"; then
            echo "WARNING: could not refresh $READINESS_SCRIPT from $DEPLOY_SHA; using the copy already on the VPS"
          fi

          [ -f "$READINESS_SCRIPT" ] || { echo "❌ $READINESS_SCRIPT not found in $DEPLOY_ROOT"; exit 1; }
          chmod +x "$READINESS_SCRIPT"
          "./$READINESS_SCRIPT"
590+
492591 # ---------------------------------------------------------------------------
493592 # JOB: deploy
494593 #
495594 # Blue-Green deployment to VPS via SSH.
496595 # The deploy-bluegreen.sh script manages slot switching and container health.
596+ #
597+ # DEPENDENCY GATES (both must pass): build-scan-push (image built, scanned and pushed) and
598+ # vps-readiness-check (ensures the VPS can accept the deployment)
497599 # ---------------------------------------------------------------------------
498600 deploy :
499601 name : Deploy (Blue-Green SSH)
500602 runs-on : ubuntu-latest
501- needs : [build-scan-push]
502- timeout-minutes : 15
603+ needs : [build-scan-push, vps-readiness-check ]
604+ timeout-minutes : 20
503605 steps :
504606 - name : Validate required deployment secrets
505607 env :
@@ -548,8 +650,10 @@ jobs:
548650 ls -la "$HOME/api"
549651 [ -d "$DEPLOY_ROOT" ] || { echo "β DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; }
550652 cd "$DEPLOY_ROOT"
653+ # Pin repo to the exact SHA that was built and scanned by CodeQL.
654+ # Prevents stale scripts from running if concurrent commits landed.
551655 git fetch origin
552- git reset --hard origin/master
656+ git reset --hard ${{ needs.build-scan-push.outputs.deploy_sha }}
553657 chmod +x scripts/*.sh
554658 echo "=== Pre-deploy environment validation ==="
555659 ./scripts/validate-env.sh --check-monitoring
@@ -573,6 +677,10 @@ jobs:
573677 ls -la "$DEPLOY_ROOT"
574678 [ -d "$DEPLOY_ROOT" ] || { echo "β DEPLOY_ROOT not found: $DEPLOY_ROOT"; exit 1; }
575679 cd "$DEPLOY_ROOT"
680+ # Enforce repo is at the exact SHA being deployed (issue 7 β prevents
681+ # stale deploy scripts if another commit landed during this pipeline run).
682+ git fetch origin
683+ git reset --hard ${{ needs.build-scan-push.outputs.deploy_sha }}
576684 chmod +x scripts/*.sh
577685 # Environment already validated in previous step
578686 ./scripts/deploy-bluegreen.sh "${{ needs.build-scan-push.outputs.sha_short }}"
@@ -587,14 +695,24 @@ jobs:
587695 key : ${{ secrets.DO_SSH_KEY }}
588696 script : |
589697 ACTIVE_SLOT=$(cat /var/run/api/active-slot 2>/dev/null || echo "unknown")
698+ ACTIVE_CONTAINER="api-${ACTIVE_SLOT}"
590699 DEPLOY_STATUS="UNKNOWN"
591-
592- # Check if health endpoint is responding (good sign of successful deploy)
593- if timeout 5 curl -sf http://127.0.0.1:3000/health >/dev/null 2>&1; then
594- DEPLOY_STATUS="SUCCESS"
700+
701+ # Health check via docker exec β NO host port binding required.
702+ # api containers live only on api_network; localhost:3000 here means
703+ # the container's own loopback (executed via docker exec).
704+ if docker inspect "$ACTIVE_CONTAINER" >/dev/null 2>&1; then
705+ if docker exec "$ACTIVE_CONTAINER" \
706+ curl -sf --max-time 5 "http://localhost:3000/health" >/dev/null 2>&1; then
707+ DEPLOY_STATUS="SUCCESS"
708+ else
709+ DEPLOY_STATUS="UNHEALTHY"
710+ fi
711+ else
712+ DEPLOY_STATUS="CONTAINER_MISSING"
595713 fi
596-
597- echo "DEPLOY_STATE=$DEPLOY_STATUS| SLOT=$ACTIVE_SLOT | SHA=${{ github.sha }}"
714+
715+ echo "DEPLOY_STATE=$DEPLOY_STATUS | SLOT=$ACTIVE_SLOT | CONTAINER=$ACTIVE_CONTAINER | SHA=${{ github.sha }}"
598716
599717 # ---------------------------------------------------------------------------
600718 # JOB: api-health-gate (Step E+)
@@ -923,11 +1041,12 @@ jobs:
9231041 rollback :
9241042 name : Rollback Deployment (auto)
9251043 runs-on : ubuntu-latest
926- needs : [deploy, api-health-gate, sync-infra, sync-monitoring, health-and-smoke]
1044+ needs : [vps-readiness-check, deploy, api-health-gate, sync-infra, sync-monitoring, health-and-smoke]
9271045 timeout-minutes : 10
9281046 if : |
9291047 always() &&
9301048 (
1049+ needs.vps-readiness-check.result == 'failure' ||
9311050 needs.deploy.result == 'failure' ||
9321051 needs.api-health-gate.result == 'failure' ||
9331052 needs.sync-infra.result == 'failure' ||
@@ -938,6 +1057,7 @@ jobs:
9381057 - name : Log rollback trigger
9391058 run : |
9401059 echo "ROLLBACK_TRIGGERED=TRUE | FAILED_JOBS:"
1060+ [ "${{ needs.vps-readiness-check.result }}" = "failure" ] && echo " - vps-readiness-check"
9411061 [ "${{ needs.deploy.result }}" = "failure" ] && echo " - deploy"
9421062 [ "${{ needs.api-health-gate.result }}" = "failure" ] && echo " - api-health-gate"
9431063 [ "${{ needs.sync-infra.result }}" = "failure" ] && echo " - sync-infra"
0 commit comments