From e669cdf0e46720e309af2da102f6c058d5674b3d Mon Sep 17 00:00:00 2001 From: Sissi Feng Date: Thu, 26 Mar 2026 11:11:40 -0400 Subject: [PATCH] Add FastAPI health server and align with camera-proxy deploy pattern - Replace Typer CLI entry with FastAPI app + /health endpoint - Run upload scheduler in background thread via lifespan - Switch to HTTP health check (uvicorn :8000) matching ALB expectations - Update Dockerfile: EXPOSE 8000, HTTP-based HEALTHCHECK - Align tfvars: port 8000, health check 30s/5s, Lab PC IP - Use GitHub App token for infrastructure-modules checkout - Add environment gates to deploy workflow jobs --- .github/workflows/deploy.yml | 13 ++++++- app/Dockerfile | 6 ++-- app/entrypoint.sh | 2 +- app/main.py | 66 +++++++++++++++++++++++++++++++++--- app/requirements.txt | 2 ++ platform/vars/dev.tfvars | 10 +++--- platform/vars/prod.tfvars | 10 +++--- platform/vars/test.tfvars | 10 +++--- 8 files changed, 96 insertions(+), 23 deletions(-) diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 6aa1389..14da103 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -95,18 +95,28 @@ jobs: infrastructure: runs-on: ubuntu-latest needs: changes + environment: ${{ needs.changes.outputs.tfvars }} if: needs.changes.outputs.infra == 'true' steps: - name: Checkout uses: actions/checkout@v4 + - name: Generate token from GitHub App + id: app-token + uses: actions/create-github-app-token@v1 + with: + app-id: ${{ secrets.AC_APP_ID }} + private-key: ${{ secrets.AC_APP_PRIVATE_KEY }} + owner: AccelerationConsortium + repositories: infrastructure-modules + - name: Checkout infrastructure-modules uses: actions/checkout@v4 with: repository: AccelerationConsortium/infrastructure-modules ref: main path: .modules/infrastructure-modules - token: ${{ secrets.GH_PAT }} + token: ${{ steps.app-token.outputs.token }} - name: Configure AWS Credentials uses: aws-actions/configure-aws-credentials@v4 @@ -174,6 +184,7 @@ jobs: deploy-app: runs-on: ubuntu-latest needs: [changes, infrastructure] + environment: ${{ needs.changes.outputs.tfvars }} if: always() && !failure() && !cancelled() && needs.changes.outputs.app == 'true' && github.event_name == 'push' steps: - name: Checkout diff --git a/app/Dockerfile b/app/Dockerfile index bac6ded..659f2ea 100644 --- a/app/Dockerfile +++ b/app/Dockerfile @@ -17,7 +17,9 @@ RUN pip install --no-cache-dir -r requirements.txt COPY . . RUN chmod +x entrypoint.sh -HEALTHCHECK --interval=60s --timeout=10s --start-period=60s --retries=3 \ - CMD pgrep -f "main.py" || exit 1 +EXPOSE 8000 + +HEALTHCHECK --interval=30s --timeout=5s --start-period=30s --retries=3 \ + CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8000/health')" || exit 1 ENTRYPOINT ["./entrypoint.sh"] diff --git a/app/entrypoint.sh b/app/entrypoint.sh index 4a9486b..b383f10 100755 --- a/app/entrypoint.sh +++ b/app/entrypoint.sh @@ -29,4 +29,4 @@ else fi echo "Starting uploader agent..." -exec python main.py run --config /app/config.yaml +exec uvicorn main:app --host 0.0.0.0 --port ${PORT:-8000} diff --git a/app/main.py b/app/main.py index d915677..ccf18be 100644 --- a/app/main.py +++ b/app/main.py @@ -1,6 +1,64 @@ -"""Lab Data Uploader Agent — entry point.""" +"""Lab Data Uploader Agent — FastAPI entry point with background scheduler.""" -from agent.cli import app +from __future__ import annotations -if __name__ == "__main__": - app() +import os +import threading +from contextlib import asynccontextmanager +from pathlib import Path + +from fastapi import FastAPI + +from agent.config import load_config +from agent.logging_utils import get_logger, setup_logging +from agent.scheduler import UploadScheduler + +_scheduler: UploadScheduler | None = None +_scheduler_thread: threading.Thread | None = None + + +def _run_scheduler(scheduler: UploadScheduler) -> None: + """Run the upload scheduler loop in a background thread.""" + scheduler.run_loop() + + +@asynccontextmanager +async def lifespan(application: FastAPI): # noqa: ARG001 + """Start scheduler on startup, stop on shutdown.""" + global _scheduler, _scheduler_thread + + config_path = Path(os.environ.get("AGENT_CONFIG", "/app/config.yaml")) + cfg = load_config(config_path) + setup_logging(cfg.storage.log_dir) + logger = get_logger("main") + + roots = [r.path for r in cfg.watch.session_roots] + logger.info( + "agent_startup", + machine_id=cfg.agent.machine_id, + lab_id=cfg.agent.lab_id, + session_roots=roots, + scan_interval=cfg.agent.scan_interval_seconds, + ) + + _scheduler = UploadScheduler(cfg) + _scheduler_thread = threading.Thread(target=_run_scheduler, args=(_scheduler,), daemon=True) + _scheduler_thread.start() + + logger.info("health_server_ready", port=int(os.environ.get("PORT", "8000"))) + yield + + logger.info("agent_shutdown", reason="lifespan_shutdown") + if _scheduler: + _scheduler.stop() + if _scheduler_thread: + _scheduler_thread.join(timeout=10) + + +app = FastAPI(title="Lab Data Uploader Agent", lifespan=lifespan) + + +@app.get("/health") +def health(): + """Health check endpoint for ALB.""" + return {"status": "healthy"} diff --git a/app/requirements.txt b/app/requirements.txt index 48608a1..88882db 100644 --- a/app/requirements.txt +++ b/app/requirements.txt @@ -5,3 +5,5 @@ tenacity>=8.0,<10 structlog>=24.0,<26 typer>=0.12,<1 boto3>=1.34,<2 +fastapi>=0.115,<1 +uvicorn>=0.32,<1 diff --git a/platform/vars/dev.tfvars b/platform/vars/dev.tfvars index d0caed5..5f7daa4 100644 --- a/platform/vars/dev.tfvars +++ b/platform/vars/dev.tfvars @@ -2,17 +2,17 @@ app_name = "lab-data-uploader" environment = "dev" # Container configuration -container_port = 8080 +container_port = 8000 cpu = 512 memory = 1024 desired_count = 1 # Health check health_check_path = "/health" -health_check_interval = 60 -health_check_timeout = 10 +health_check_interval = 30 +health_check_timeout = 5 -# No public access needed +# No public access needed — internal service enable_cloudfront = false allow_cloudfront_access = false enable_waf = false @@ -22,7 +22,7 @@ enable_cognito_auth = false # Environment variables environment_variables = { ENVIRONMENT = "dev" - NFS_MOUNTS = "100.x.x.x:/labdata:/mnt/lab1" + NFS_MOUNTS = "100.115.219.51:/labdata:/mnt/lab1" } # Secrets (stored in AWS Secrets Manager) diff --git a/platform/vars/prod.tfvars b/platform/vars/prod.tfvars index d336ee5..02299af 100644 --- a/platform/vars/prod.tfvars +++ b/platform/vars/prod.tfvars @@ -2,17 +2,17 @@ app_name = "lab-data-uploader" environment = "prod" # Container configuration -container_port = 8080 +container_port = 8000 cpu = 512 memory = 1024 desired_count = 1 # Health check health_check_path = "/health" -health_check_interval = 60 -health_check_timeout = 10 +health_check_interval = 30 +health_check_timeout = 5 -# No public access needed +# No public access needed — internal service enable_cloudfront = false allow_cloudfront_access = false enable_waf = false @@ -22,7 +22,7 @@ enable_cognito_auth = false # Environment variables environment_variables = { ENVIRONMENT = "prod" - NFS_MOUNTS = "100.x.x.x:/labdata:/mnt/lab1" + NFS_MOUNTS = "100.115.219.51:/labdata:/mnt/lab1" } # Secrets (stored in AWS Secrets Manager) diff --git a/platform/vars/test.tfvars b/platform/vars/test.tfvars index b10cbd6..f6a535b 100644 --- a/platform/vars/test.tfvars +++ b/platform/vars/test.tfvars @@ -2,17 +2,17 @@ app_name = "lab-data-uploader" environment = "test" # Container configuration -container_port = 8080 +container_port = 8000 cpu = 512 memory = 1024 desired_count = 1 # Health check health_check_path = "/health" -health_check_interval = 60 -health_check_timeout = 10 +health_check_interval = 30 +health_check_timeout = 5 -# No public access needed +# No public access needed — internal service enable_cloudfront = false allow_cloudfront_access = false enable_waf = false @@ -22,7 +22,7 @@ enable_cognito_auth = false # Environment variables environment_variables = { ENVIRONMENT = "test" - NFS_MOUNTS = "100.x.x.x:/labdata:/mnt/lab1" + NFS_MOUNTS = "100.115.219.51:/labdata:/mnt/lab1" } # Secrets (stored in AWS Secrets Manager)