Skip to content

feat: implement weekly base image digest rotation workflow for securi… #256

feat: implement weekly base image digest rotation workflow for securi…

feat: implement weekly base image digest rotation workflow for securi… #256

Workflow file for this run

# .github/workflows/deploy.yml
#
# Production Deployment Pipeline
#
# Design principles:
#   1. Triggers on every push to master (no paths filter — ensures sync-beta always runs)
#   2. Runs ALL validation from scratch — no trust built on PR results alone
#   3. Trivy scan runs BEFORE Docker push — vulnerable images never reach the registry
#   4. target: production + build-args mirror pr.yml exactly (bit-for-bit parity)
#   5. Image digest verified against PR simulation artifact when available
#   6. Blue-Green deploy with automatic rollback on health or smoke test failure
#   7. timeout-minutes on every job — hung processes never block CI indefinitely
#   8. npm ci retried up to 3x — registry flakiness never kills a valid deploy
#
# Parallel stages:
#   validate ──┐
#   test-api   ├─► build-scan-push ─► deploy ─► sync-infra ─► health-and-smoke
#   build-web ─┘                         │
#                 rollback ◄─────────────┘ (on failure of any downstream job)
name: Deploy to Production

on:
  push:
    branches:
      - master
  workflow_dispatch:

# Never cancel an in-progress deployment — let it finish or fail cleanly.
concurrency:
  group: production-deploy
  cancel-in-progress: false

# Default to read-only. Jobs that need additional access declare it explicitly.
permissions:
  contents: read
jobs:
  # ---------------------------------------------------------------------------
  # JOB: validate
  #
  # Fast pre-flight gate: TypeScript check plus dependency audit.
  # Runs concurrently with test-api and build-web to keep the pipeline short.
  # ---------------------------------------------------------------------------
  validate:
    name: Validate (typecheck + audit)
    runs-on: ubuntu-latest
    timeout-minutes: 10
    steps:
      - name: Confirm deployment trigger
        run: |
          echo "========================================="
          echo "Deployment triggered on master"
          echo " Commit SHA : ${{ github.sha }}"
          echo " Event : ${{ github.event_name }}"
          echo " Ref : ${{ github.ref }}"
          echo "========================================="
      - name: Checkout
        uses: actions/checkout@v5
      - name: Setup Node.js 24
        uses: actions/setup-node@v5
        with:
          node-version: '24'
          cache: npm
          cache-dependency-path: '**/package-lock.json'
      # Retry npm ci up to three times — registry flakiness must not fail the deploy.
      - name: Install workspace dependencies (with retry)
        run: |
          echo "::group::npm ci"
          for attempt in 1 2 3; do
            npm ci && break
            [ $attempt -eq 3 ] && { echo "::error::npm ci failed after 3 attempts"; exit 1; }
            echo "Attempt $attempt failed — retrying in 15s..."
            sleep 15
          done
          echo "::endgroup::"
      - name: Build shared types
        run: npm run build -w packages/types
      - name: Dependency vulnerability scan
        run: npm audit --omit=dev --audit-level=high
      - name: TypeScript check (API)
        working-directory: apps/api
        run: npx tsc --noEmit
# ---------------------------------------------------------------------------
# JOB: test-api
#
# Full backend test suite — unit tests then integration tests.
# Runs in parallel with validate and build-web.
# ---------------------------------------------------------------------------
test-api:
name: API Tests (unit + integration)
runs-on: ubuntu-latest
timeout-minutes: 15
env:
SUPABASE_URL: ${{ secrets.SUPABASE_URL_TEST }}
SUPABASE_ANON_KEY: ${{ secrets.SUPABASE_ANON_KEY_TEST }}
SUPABASE_SERVICE_ROLE_KEY: ${{ secrets.SUPABASE_SERVICE_ROLE_KEY_TEST }}
steps:
- name: Checkout
uses: actions/checkout@v5
- name: Setup Node.js 24
uses: actions/setup-node@v5
with:
node-version: '24'
cache: npm
cache-dependency-path: '**/package-lock.json'
- name: Install workspace dependencies (with retry)
run: |
echo "::group::npm ci"
for attempt in 1 2 3; do
npm ci && break
[ $attempt -eq 3 ] && { echo "::error::npm ci failed after 3 attempts"; exit 1; }
echo "Attempt $attempt failed — retrying in 15s..."
sleep 15
done
echo "::endgroup::"
- name: Build shared types
run: npm run build -w packages/types
- name: Unit tests
working-directory: apps/api
run: npx vitest run tests/unit/
- name: Integration tests
working-directory: apps/api
run: npx vitest run tests/integration/
# ---------------------------------------------------------------------------
# JOB: build-web
#
# Full frontend validation and production build.
# Runs in parallel with validate and test-api.
# ---------------------------------------------------------------------------
build-web:
name: Frontend Build (typecheck + lint + build)
runs-on: ubuntu-latest
timeout-minutes: 15
env:
NEXT_PUBLIC_API_BASE_URL: /api/proxy
NEXT_PUBLIC_SUPABASE_URL: https://ci-placeholder.supabase.co
NEXT_PUBLIC_SUPABASE_ANON_KEY: ci-build-placeholder-anon-key
NEXT_PUBLIC_MAPBOX_TOKEN: pk.ci-build-placeholder
steps:
- name: Verify NEXT_PUBLIC_API_BASE_URL is set
run: |
if [ -z "$NEXT_PUBLIC_API_BASE_URL" ]; then
echo "::error::NEXT_PUBLIC_API_BASE_URL is not set. Add it to the job env block."
exit 1
fi
echo "NEXT_PUBLIC_API_BASE_URL=${NEXT_PUBLIC_API_BASE_URL}"
- name: Checkout
uses: actions/checkout@v5
- name: Setup Node.js 24
uses: actions/setup-node@v5
with:
node-version: '24'
cache: npm
cache-dependency-path: '**/package-lock.json'
- name: Install workspace dependencies (with retry)
run: |
echo "::group::npm ci"
for attempt in 1 2 3; do
npm ci && break
[ $attempt -eq 3 ] && { echo "::error::npm ci failed after 3 attempts"; exit 1; }
echo "Attempt $attempt failed — retrying in 15s..."
sleep 15
done
echo "::endgroup::"
- name: Build shared types
run: npm run build -w packages/types
- name: TypeScript check (web)
run: npm run typecheck -w apps/web
- name: ESLint (web)
run: npm run lint -w apps/web
- name: Next.js production build
run: npm run build -w apps/web
# ---------------------------------------------------------------------------
# JOB: build-scan-push
#
# Three-phase security gate — identical build config to pr.yml:
# Phase 1 — Build locally (target: production, same build-args, same cache)
# Phase 2 — Trivy scan: pinned aquasec/trivy:0.49.1 Docker image, exit-code 1
# on HIGH/CRITICAL (blocks push). NOT trivy-action — supply-chain safe.
# DB pre-pulled, scan runs --network none (air-gapped).
# Phase 3 — Push exact scanned image to GHCR (no rebuild)
#
# Image digest verification:
# After building, the digest is compared against the digest stored by
# pr.yml's production-simulation job. A match confirms bit-for-bit parity
# between what was validated in PR and what is being deployed.
# Comparison is best-effort (continue-on-error) because the merge commit
# SHA may differ from the PR head SHA on squash-merges.
# ---------------------------------------------------------------------------
build-scan-push:
name: Build, Scan & Push Docker Image
runs-on: ubuntu-latest
needs: [validate, test-api, build-web]
timeout-minutes: 25
permissions:
contents: read
packages: write
outputs:
sha_short: ${{ steps.meta.outputs.sha_short }}
digest: ${{ steps.digest.outputs.digest }}
steps:
- name: Checkout
uses: actions/checkout@v5
- name: Extract commit SHA
id: meta
run: echo "sha_short=${GITHUB_SHA::7}" >> "$GITHUB_OUTPUT"
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Pull base images (force fresh manifest, prevent stale GHA cache)
run: |
docker pull node:24.2.0-bookworm-slim
docker pull gcr.io/distroless/nodejs24-debian12:nonroot
# Phase 1: Build into local Docker daemon for scanning.
# EXACT same parameters as pr.yml production-simulation:
# target: production, build-args: NODE_ENV=production, GHA cache.
# CACHE_BUSTER forces rebuild when package-lock.json changes (prevents stale deps).
# Cache scoped to production to prevent cross-branch contamination from PR builds.
- name: Build Docker image (pre-scan, no push)
uses: docker/build-push-action@v6
with:
context: .
file: ./apps/api/Dockerfile
target: production
build-args: |
NODE_ENV=production
CACHE_BUSTER=${{ hashFiles('**/package-lock.json') }}
push: false
load: true
pull: true
tags: |
fieldtrack-backend:${{ steps.meta.outputs.sha_short }}
cache-from: type=gha,scope=production
cache-to: type=gha,mode=max,scope=production
# Verify Node.js runtime — exercises TLS stack, not just compile-time version constant.
# tls.createSecureContext() fails if libssl linkage is broken, proving runtime health.
- name: Verify Node.js runtime (TLS operational check)
run: |
IMAGE_NAME="fieldtrack-backend:${{ steps.meta.outputs.sha_short }}"
echo "Testing image: $IMAGE_NAME"
docker run --rm \
--entrypoint /nodejs/bin/node \
"$IMAGE_NAME" \
-e "
const crypto = require('crypto');
const tls = require('tls');
const ctx = tls.createSecureContext();
if (!ctx) { process.stderr.write('FAIL: TLS context failed\n'); process.exit(1); }
const h = crypto.createHash('sha256').update('smoke').digest('hex');
if (!h) { process.stderr.write('FAIL: hash failed\n'); process.exit(1); }
process.stdout.write('node=' + process.versions.node + ' openssl=' + process.versions.openssl + ' tls=ok\n');
"
# Capture the content-addressable image digest.
# With cache scoping and cache busting, digest should always reproduce correctly.
- name: Capture image digest
id: digest
run: |
IMAGE_NAME="fieldtrack-backend:${{ steps.meta.outputs.sha_short }}"
DIGEST=$(docker inspect "$IMAGE_NAME" --format='{{.Id}}')
echo "digest=$DIGEST" >> "$GITHUB_OUTPUT"
echo "=== Build traceability ==="
echo " Commit SHA : ${{ github.sha }}"
echo " Image tag : $IMAGE_NAME"
echo " Image digest : $DIGEST"
# Compare this digest with the one stored by pr.yml's production-simulation.
# A match = bit-for-bit parity. A mismatch = code/cache divergence (warning).
# continue-on-error: true — squash merges produce a new commit SHA, which
# may cause minor divergence even with identical source code.
- name: Verify image digest parity with PR simulation
continue-on-error: true
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
# Find the PR number associated with this merge commit
PR_NUMBER=$(gh api \
"/repos/${{ github.repository }}/commits/${{ github.sha }}/pulls" \
--header "X-GitHub-Api-Version: 2022-11-28" \
--jq '.[0].number // empty' 2>/dev/null || echo "")
if [ -z "$PR_NUMBER" ]; then
echo "No associated PR found for commit ${{ github.sha }} — skipping digest comparison."
exit 0
fi
echo "Associated PR: #${PR_NUMBER}"
# Find the most recent successful pr.yml run for this PR
RUN_ID=$(gh run list \
--repo "${{ github.repository }}" \
--workflow "pr.yml" \
--json databaseId,conclusion,headSha \
--jq "map(select(.conclusion == \"success\")) | .[0].databaseId // empty" \
2>/dev/null || echo "")
if [ -z "$RUN_ID" ]; then
echo "No successful PR validation run found — skipping digest comparison."
exit 0
fi
# Download the image-digest artifact from that run
gh run download "$RUN_ID" \
--repo "${{ github.repository }}" \
--name "image-digest-pr-${PR_NUMBER}" \
--dir /tmp/pr-digest \
2>/dev/null || true
if [ ! -f /tmp/pr-digest/image-digest.txt ]; then
echo "PR image-digest artifact not found — skipping comparison."
exit 0
fi
PR_DIGEST=$(cat /tmp/pr-digest/image-digest.txt)
DEPLOY_DIGEST="${{ steps.digest.outputs.digest }}"
echo "PR simulation digest: $PR_DIGEST"
echo "Deploy image digest: $DEPLOY_DIGEST"
if [ "$PR_DIGEST" = "$DEPLOY_DIGEST" ]; then
echo "✓ Digest match — bit-for-bit parity confirmed between PR and deploy."
else
echo "⚠ Digest mismatch — builds diverged between PR and deploy."
echo " Expected on squash-merges where the commit SHA changes."
echo " Ensure no source changes occurred between PR approval and deploy trigger."
fi
# Phase 2: Trivy scan — image pinned by immutable digest, NOT trivy-action.
# aquasec/trivy:0.49.1 → sha256:91494b87ddc64f62860d52997532643956c24eeee0d0dda317d563c28c8581bc
# Identical severity gates to pr.yml (HIGH,CRITICAL / exit-code 1).
# Two-phase: DB downloaded first (needs network), then scan runs --network none.
- name: Get date for Trivy DB cache key
id: trivy-date
run: echo "date=$(date +'%Y-%m-%d')" >> "$GITHUB_OUTPUT"
- name: Cache Trivy DB (daily refresh)
uses: actions/cache@v4
with:
path: /tmp/trivy-cache
key: trivy-db-${{ runner.os }}-${{ steps.trivy-date.outputs.date }}
restore-keys: |
trivy-db-${{ runner.os }}-
- name: Pull Trivy vulnerability database
run: |
docker run --rm \
-v /tmp/trivy-cache:/root/.cache \
aquasec/trivy@sha256:91494b87ddc64f62860d52997532643956c24eeee0d0dda317d563c28c8581bc \
image --download-db-only
- name: Scan image with Trivy (HIGH/CRITICAL, ignore-unfixed)
env:
IMAGE_NAME: fieldtrack-backend:${{ steps.meta.outputs.sha_short }}
run: |
SCAN_PASSED=false
for i in 1 2 3; do
if docker run --rm \
--network none \
-v /var/run/docker.sock:/var/run/docker.sock \
-v /tmp/trivy-cache:/root/.cache \
-v "$(pwd)/.trivyignore:/tmp/.trivyignore:ro" \
aquasec/trivy@sha256:91494b87ddc64f62860d52997532643956c24eeee0d0dda317d563c28c8581bc image \
--skip-db-update \
--ignore-unfixed \
--severity HIGH,CRITICAL \
--exit-code 1 \
--ignorefile /tmp/.trivyignore \
"$IMAGE_NAME"; then
SCAN_PASSED=true
break
fi
echo "Trivy attempt $i failed..."
[ "$i" -lt 3 ] && sleep 5
done
if [ "$SCAN_PASSED" != "true" ]; then
echo "::error::Trivy scan failed after 3 attempts — HIGH/CRITICAL vulnerabilities found or scan error."
exit 1
fi
echo "✓ Trivy scan passed (HIGH/CRITICAL, ignore-unfixed)"
- name: Scan for unfixed CRITICAL vulnerabilities (informational)
continue-on-error: true
env:
IMAGE_NAME: fieldtrack-backend:${{ steps.meta.outputs.sha_short }}
run: |
UNFIXED_COUNT=$(docker run --rm \
--network none \
-v /var/run/docker.sock:/var/run/docker.sock \
-v /tmp/trivy-cache:/root/.cache \
aquasec/trivy@sha256:91494b87ddc64f62860d52997532643956c24eeee0d0dda317d563c28c8581bc image \
--skip-db-update \
--severity CRITICAL \
--format json \
"$IMAGE_NAME" | jq '[.Results[]?.Misconfigurations[]? // .Results[]?.Vulnerabilities[]? | select(.FixedVersion == null or .FixedVersion == "")] | length')
if [ "$UNFIXED_COUNT" -gt 0 ]; then
echo "⚠ WARNING: $UNFIXED_COUNT unfixed CRITICAL vulnerabilities found"
echo " (No patches available upstream — waiting for vendor fix)"
docker run --rm \
--network none \
-v /var/run/docker.sock:/var/run/docker.sock \
-v /tmp/trivy-cache:/root/.cache \
aquasec/trivy@sha256:91494b87ddc64f62860d52997532643956c24eeee0d0dda317d563c28c8581bc image \
--skip-db-update \
--severity CRITICAL \
"$IMAGE_NAME" >> /tmp/unfixed-critical.log || true
else
echo "✓ No unfixed CRITICAL vulnerabilities"
fi
# Phase 3: Scan passed — push the exact scanned image (same layer digests).
# Uses docker tag + push rather than rebuilding to guarantee what was scanned
# is exactly what lands in the registry.
- name: Verify image digest unchanged before push
env:
IMAGE_NAME: fieldtrack-backend:${{ steps.meta.outputs.sha_short }}
IMAGE_DIGEST: ${{ steps.digest.outputs.digest }}
run: |
# docker inspect .Id returns the config digest (sha256:...) which is
# stable across tag operations — same value captured in the digest step.
CURRENT=$(docker inspect "$IMAGE_NAME" --format='{{.Id}}')
echo "Expected digest : $IMAGE_DIGEST"
echo "Current digest : $CURRENT"
if [ "$CURRENT" != "$IMAGE_DIGEST" ]; then
echo "ERROR: image digest changed between scan and push — aborting."
exit 1
fi
echo "✓ Digest verified — pushing exactly what was scanned."
- name: Login to GitHub Container Registry
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Push verified image to registry
run: |
OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')
docker tag \
fieldtrack-backend:${{ steps.meta.outputs.sha_short }} \
ghcr.io/${OWNER}/fieldtrack-backend:${{ steps.meta.outputs.sha_short }}
docker push ghcr.io/${OWNER}/fieldtrack-backend:${{ steps.meta.outputs.sha_short }}
echo "✓ Pushed ghcr.io/${OWNER}/fieldtrack-backend:${{ steps.meta.outputs.sha_short }}"
# Use the same pinned Trivy image to generate the SBOM — no additional
# tool dependency, no unpinned action, same supply-chain guarantees.
- name: Generate SBOM (CycloneDX)
env:
IMAGE_NAME: fieldtrack-backend:${{ steps.meta.outputs.sha_short }}
run: |
docker run --rm \
-v /var/run/docker.sock:/var/run/docker.sock \
aquasec/trivy@sha256:91494b87ddc64f62860d52997532643956c24eeee0d0dda317d563c28c8581bc image \
--format cyclonedx \
--output /dev/stdout \
"$IMAGE_NAME" > sbom.json
- name: Upload SBOM artifact
uses: actions/upload-artifact@v4
with:
name: sbom-${{ steps.meta.outputs.sha_short }}
path: sbom.json
retention-days: 90
- name: Save build provenance
env:
IMAGE_NAME: fieldtrack-backend:${{ steps.meta.outputs.sha_short }}
IMAGE_DIGEST: ${{ steps.digest.outputs.digest }}
run: |
echo "commit=${{ github.sha }}" > provenance.txt
echo "ref=${{ github.ref }}" >> provenance.txt
echo "image=${IMAGE_NAME}" >> provenance.txt
echo "digest=${IMAGE_DIGEST}" >> provenance.txt
echo "workflow=${{ github.workflow }}" >> provenance.txt
echo "run_id=${{ github.run_id }}" >> provenance.txt
- name: Upload provenance artifact
uses: actions/upload-artifact@v4
with:
name: provenance-${{ steps.meta.outputs.sha_short }}
path: provenance.txt
retention-days: 90
- name: Build & scan summary
if: always()
env:
IMAGE_DIGEST: ${{ steps.digest.outputs.digest }}
run: |
SBOM_COUNT=$(python3 -c "import json; d=json.load(open('sbom.json')); print(len(d.get('components', [])))" 2>/dev/null || echo 'n/a')
{
echo "## Build · Scan · Push"
echo "| Field | Value |"
echo "|---|---|"
echo "| Commit SHA | \`${{ github.sha }}\` |"
echo "| Image tag | \`fieldtrack-backend:${{ steps.meta.outputs.sha_short }}\` |"
echo "| Image digest | \`${IMAGE_DIGEST}\` |"
echo "| SBOM components | ${SBOM_COUNT} |"
echo "| Trivy gate | HIGH,CRITICAL / exit-code 1 / ignore-unfixed |"
echo "| Registry | ghcr.io/${{ github.repository_owner }}/fieldtrack-backend |"
} >> "$GITHUB_STEP_SUMMARY"
# ---------------------------------------------------------------------------
# JOB: deploy
#
# Blue-Green deployment to VPS via SSH.
# The deploy-bluegreen.sh script manages slot switching and container health.
# ---------------------------------------------------------------------------
deploy:
name: Deploy (Blue-Green SSH)
runs-on: ubuntu-latest
needs: [build-scan-push]
timeout-minutes: 15
steps:
- name: Validate required deployment secrets
env:
API_BASE_URL: ${{ secrets.API_BASE_URL }}
CORS_ORIGIN: ${{ secrets.CORS_ORIGIN }}
run: |
if [ -z "${API_BASE_URL:-}" ]; then
echo "::error::API_BASE_URL secret is not set. Deployment aborted."
exit 1
fi
echo "✓ API_BASE_URL is set"
if [ -z "${CORS_ORIGIN:-}" ]; then
echo "::error::CORS_ORIGIN secret is not set. Deployment aborted."
exit 1
fi
echo "✓ CORS_ORIGIN is set"
- name: Validate environment contract before deploy
uses: appleboy/ssh-action@v1.0.3
with:
host: ${{ secrets.DO_HOST }}
username: ${{ secrets.DO_USER }}
key: ${{ secrets.DO_SSH_KEY }}
script: |
set -euo pipefail
export DEPLOY_ROOT="/home/ashish/FieldTrack-2.0"
cd "$DEPLOY_ROOT"
git fetch origin
git reset --hard origin/master
chmod +x apps/api/scripts/*.sh
echo "=== Pre-deploy environment validation ==="
./apps/api/scripts/validate-env.sh --check-monitoring
echo "✓ Environment contract validated"
- name: Blue-Green deploy via SSH
uses: appleboy/ssh-action@v1.0.3
with:
host: ${{ secrets.DO_HOST }}
username: ${{ secrets.DO_USER }}
key: ${{ secrets.DO_SSH_KEY }}
script: |
set -euo pipefail
T0=$(date +%s)
export DEPLOY_ROOT="/home/ashish/FieldTrack-2.0"
cd "$DEPLOY_ROOT"
chmod +x apps/api/scripts/*.sh
# Environment already validated in previous step
./apps/api/scripts/deploy-bluegreen.sh "${{ needs.build-scan-push.outputs.sha_short }}"
echo "✓ Deploy completed in $(($(date +%s) - T0))s"
- name: Log deployment state (slot + SHA for debugging)
uses: appleboy/ssh-action@v1.0.3
if: always()
with:
host: ${{ secrets.DO_HOST }}
username: ${{ secrets.DO_USER }}
key: ${{ secrets.DO_SSH_KEY }}
script: |
ACTIVE_SLOT=$(cat /var/run/fieldtrack/active-slot 2>/dev/null || echo "unknown")
DEPLOY_STATUS="UNKNOWN"
# Check if health endpoint is responding (good sign of successful deploy)
if timeout 5 curl -sf http://127.0.0.1:3000/health >/dev/null 2>&1; then
DEPLOY_STATUS="SUCCESS"
fi
echo "DEPLOY_STATE=$DEPLOY_STATUS| SLOT=$ACTIVE_SLOT | SHA=${{ github.sha }}"
# ---------------------------------------------------------------------------
# JOB: sync-infra
#
# Syncs Nginx config (with slot-aware port substitution).
# Monitoring restarts are handled exclusively by deploy-bluegreen.sh.
# ---------------------------------------------------------------------------
sync-infra:
name: Sync Infrastructure (nginx)
runs-on: ubuntu-latest
needs: [deploy]
timeout-minutes: 10
steps:
- name: Sync infrastructure configs via SSH
uses: appleboy/ssh-action@v1.0.3
with:
host: ${{ secrets.DO_HOST }}
username: ${{ secrets.DO_USER }}
key: ${{ secrets.DO_SSH_KEY }}
script: |
set -euo pipefail
T0=$(date +%s)
export DEPLOY_ROOT="/home/ashish/FieldTrack-2.0"
INFRA_DIR="$DEPLOY_ROOT/infra"
NGINX_LIVE="/etc/nginx/sites-enabled/fieldtrack.conf"
ACTIVE_SLOT_FILE="/var/run/fieldtrack/active-slot"
ACTIVE_SLOT=$(cat "$ACTIVE_SLOT_FILE" 2>/dev/null || echo "blue")
if [ "$ACTIVE_SLOT" = "green" ]; then BACKEND_PORT=3002; else BACKEND_PORT=3001; fi
# Load env from apps/api/.env — exports DEPLOY_ROOT, API_HOSTNAME, and all
# app variables. DEPLOY_ROOT is already exported above; load-env.sh uses it.
source "$DEPLOY_ROOT/apps/api/scripts/load-env.sh"
echo "✓ API_HOSTNAME: $API_HOSTNAME"
echo "=== Syncing Nginx (slot: $ACTIVE_SLOT, port: $BACKEND_PORT) ==="
sudo cp "$NGINX_LIVE" /tmp/fieldtrack.conf.bak 2>/dev/null || true
NGINX_TMP=$(mktemp /tmp/fieldtrack-nginx.XXXXXX.conf)
sed \
-e "s|__BACKEND_PORT__|$BACKEND_PORT|g" \
-e "s|__API_HOSTNAME__|$API_HOSTNAME|g" \
"$INFRA_DIR/nginx/fieldtrack.conf" > "$NGINX_TMP"
sudo cp "$NGINX_TMP" "$NGINX_LIVE"
rm -f "$NGINX_TMP"
if ! sudo nginx -t 2>&1; then
echo "Nginx test failed — restoring backup..."
sudo cp /tmp/fieldtrack.conf.bak "$NGINX_LIVE"
exit 1
fi
sudo systemctl reload nginx
echo "✓ Nginx reloaded."
echo "✓ Infra sync completed in $(($(date +%s) - T0))s"
# ---------------------------------------------------------------------------
# JOB: health-and-smoke
#
# Step 1: Poll /health and /ready until they return 200 (up to 60 s each).
# Step 2: Run the full smoke test suite (login + core API flows).
# Failure here triggers the rollback job automatically.
# ---------------------------------------------------------------------------
health-and-smoke:
name: Health Checks & Smoke Tests
runs-on: ubuntu-latest
needs: [sync-infra]
timeout-minutes: 15
steps:
- name: Checkout
uses: actions/checkout@v5
- name: Wait for /health endpoint (via VPS)
uses: appleboy/ssh-action@v1.0.3
with:
host: ${{ secrets.DO_HOST }}
username: ${{ secrets.DO_USER }}
key: ${{ secrets.DO_SSH_KEY }}
script: |
set -euo pipefail
export DEPLOY_ROOT="/home/ashish/FieldTrack-2.0"
cd "$DEPLOY_ROOT"
source apps/api/scripts/load-env.sh
echo "=== Checking /health via VPS (API_HOSTNAME=$API_HOSTNAME) ==="
for i in $(seq 1 30); do
echo "---- Attempt $i ----"
STATUS=$(curl -sS \
--resolve "${API_HOSTNAME}:443:127.0.0.1" \
-o /tmp/resp.txt \
-w "%{http_code}" \
https://${API_HOSTNAME}/health \
--insecure || echo "000")
BODY=$(cat /tmp/resp.txt 2>/dev/null || echo "")
echo "HTTP: $STATUS"
echo "BODY: $BODY"
if [ "$STATUS" = "200" ] && echo "$BODY" | grep -q '"status":"ok"'; then
echo "✓ /health OK (attempt $i)"
exit 0
fi
sleep 2
done
echo "❌ /health failed"
exit 1
- name: Wait for /health endpoint (final public check)
uses: appleboy/ssh-action@v1.0.3
with:
host: ${{ secrets.DO_HOST }}
username: ${{ secrets.DO_USER }}
key: ${{ secrets.DO_SSH_KEY }}
script: |
set -euo pipefail
export DEPLOY_ROOT="/home/ashish/FieldTrack-2.0"
cd "$DEPLOY_ROOT"
source apps/api/scripts/load-env.sh
echo "=== Final health check via public endpoint (API_HOSTNAME=$API_HOSTNAME) ==="
for i in $(seq 1 10); do
echo "---- Attempt $i ----"
STATUS=$(curl -sS \
--resolve "${API_HOSTNAME}:443:127.0.0.1" \
-o /tmp/resp.txt \
-w "%{http_code}" \
https://${API_HOSTNAME}/health \
--insecure || echo "000")
BODY=$(cat /tmp/resp.txt 2>/dev/null || echo "")
echo "HTTP: $STATUS"
echo "BODY: $BODY"
if [ "$STATUS" = "200" ] && echo "$BODY" | grep -q '"status":"ok"'; then
echo "✓ /health OK (attempt $i)"
exit 0
fi
sleep 2
done
echo "❌ /health failed"
exit 1
- name: Run smoke tests
env:
API_BASE_URL: ${{ secrets.API_BASE_URL }}
FT_EMP_EMAIL: ${{ secrets.FT_EMP_EMAIL }}
FT_EMP_PASSWORD: ${{ secrets.FT_EMP_PASSWORD }}
FT_ADMIN_EMAIL: ${{ secrets.FT_ADMIN_EMAIL }}
FT_ADMIN_PASSWORD: ${{ secrets.FT_ADMIN_PASSWORD }}
SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
SUPABASE_ANON_KEY: ${{ secrets.SUPABASE_ANON_KEY }}
run: |
chmod +x apps/api/scripts/smoke-test.sh
./apps/api/scripts/smoke-test.sh
- name: Upload smoke test report
if: always()
uses: actions/upload-artifact@v4
with:
name: smoke-test-report-${{ github.sha }}
path: smoke-report.json
retention-days: 30
- name: Deployment summary
run: |
echo "====================================================="
echo " Production Deployment: COMPLETE ✅"
echo "====================================================="
echo " Commit: ${{ github.sha }}"
echo " /health: OK"
echo " /ready: OK"
echo " Smoke: passed"
echo "====================================================="
# ---------------------------------------------------------------------------
# JOB: rollback
#
# Triggered automatically when deploy, sync-infra, OR health-and-smoke fails.
# Restores the previously healthy Blue-Green slot via the rollback script.
# 'if: always()' ensures this job can evaluate even if upstream jobs failed.
# ---------------------------------------------------------------------------
rollback:
name: Rollback Deployment (auto)
runs-on: ubuntu-latest
needs: [deploy, sync-infra, health-and-smoke]
timeout-minutes: 10
if: |
always() &&
(
needs.deploy.result == 'failure' ||
needs.sync-infra.result == 'failure' ||
needs.health-and-smoke.result == 'failure'
)
steps:
- name: Log rollback trigger
run: |
echo "ROLLBACK_TRIGGERED=TRUE | FAILED_JOBS:"
[ "${{ needs.deploy.result }}" = "failure" ] && echo " - deploy"
[ "${{ needs.sync-infra.result }}" = "failure" ] && echo " - sync-infra"
[ "${{ needs.health-and-smoke.result }}" = "failure" ] && echo " - health-and-smoke"
echo "SHA=${{ github.sha }}"
- name: Rollback on VPS
uses: appleboy/ssh-action@v1.0.3
with:
host: ${{ secrets.DO_HOST }}
username: ${{ secrets.DO_USER }}
key: ${{ secrets.DO_SSH_KEY }}
script: |
set -euo pipefail
export DEPLOY_ROOT="/home/ashish/FieldTrack-2.0"
cd "$DEPLOY_ROOT"
chmod +x apps/api/scripts/*.sh
./apps/api/scripts/rollback.sh --auto
# Log final state
ACTIVE_SLOT=$(cat /var/run/fieldtrack/active-slot 2>/dev/null || echo "unknown")
echo "ROLLBACK_COMPLETE | ACTIVE_SLOT=$ACTIVE_SLOT | SHA=${{ github.sha }}"