Skip to content

[design-spec] k8s-airflow-http-health #99

@rw-codebundle-agent

Description

@rw-codebundle-agent

Design Spec: k8s-airflow-http-health

Parent: #97
Target: rw-cli-codecollection

Spec

# --- Identity ---
# Bundle name, the collection it targets, its display name, and its author.
codebundle_name: 'k8s-airflow-http-health'
target_collection: 'rw-cli-codecollection'
display_name: 'Kubernetes Airflow HTTP/API Health'
author: 'rw-codebundle-agent'

# --- Purpose ---
# Summary of what the bundle checks. This is a literal block scalar, so the
# line breaks below are part of the value.
purpose: |
  Exposes Apache Airflow webserver (and optionally scheduler) health through HTTP:
  liveness-style checks, lightweight REST API probes, and kubectl Service/Endpoints
  correlation so failures are triaged as network/RBAC vs application issues.

# --- Tasks ---
# One entry per task. Each entry names the script that implements it, the issue
# severities it may raise, its access level, and the kind of data it produces.
# Descriptions use folded scalars (`>-`): the wrapped lines are joined with
# single spaces and carry no trailing newline.
tasks:
  - name: 'Resolve Airflow Webserver Base URL'
    description: >-
      Derives the HTTP base URL for the Airflow webserver via optional
      PROXY_BASE_URL or kubectl port-forward to the webserver Service; fails
      fast if neither path is reachable.
    script_name: 'resolve-airflow-base-url.sh'
    expected_issue_severity: [2, 3]
    access_level: 'read-only'
    data_type: 'logs-config'

  - name: 'Check Airflow Webserver Health Endpoint'
    description: >-
      GETs the webserver `/health` (or chart-documented equivalent) and
      validates HTTP status and JSON body fields indicating metadata DB and
      scheduler health where present.
    script_name: 'check-airflow-webserver-health.sh'
    expected_issue_severity: [2, 3]
    access_level: 'read-only'
    data_type: 'metrics'

  - name: 'Check Airflow REST API Health or Version'
    description: >-
      Probes a read-only API route such as `/api/v1/health`,
      `/api/v2/monitor/health`, or `/api/v1/version` (version-dependent) to
      confirm the API process responds; supports optional Bearer/basic auth
      from configured secrets.
    script_name: 'check-airflow-api-health.sh'
    expected_issue_severity: [2, 3]
    access_level: 'read-only'
    data_type: 'metrics'

  - name: 'Verify Kubernetes Service and Endpoints for Webserver'
    description: >-
      Uses kubectl to confirm the webserver Service exists, has Endpoints
      backing it, and port numbers align with the HTTP probes to explain
      failures caused by missing endpoints or port mismatches.
    script_name: 'verify-airflow-webserver-service.sh'
    expected_issue_severity: [2, 3]
    access_level: 'read-only'
    data_type: 'logs-config'

  # Only this task declares the [3, 4] severity pair; it runs when one of the
  # optional scheduler/triggerer Service names is set.
  - name: 'Optional Check Scheduler or Triggerer HTTP Health'
    description: >-
      When AIRFLOW_SCHEDULER_SERVICE_NAME or AIRFLOW_TRIGGERER_SERVICE_NAME is
      set, performs a lightweight HTTP GET against chart-exposed health or
      metrics ports (if any) or documents skip when not exposed.
    script_name: 'check-airflow-scheduler-http-health.sh'
    expected_issue_severity: [3, 4]
    access_level: 'read-only'
    data_type: 'metrics'

# --- Scope ---
# The qualifiers are the three env_vars marked `required: true` in the
# Configuration section.
scope:
  level: 'Resource'
  qualifiers:
    - CONTEXT
    - NAMESPACE
    - AIRFLOW_WEBSERVER_SERVICE_NAME
  iteration_pattern: |
    One SLX per discovered Airflow webserver Service in a namespace (generation rule
    matches Service name/label patterns such as `airflow` and common chart ports like 8080),
    or user-provided Service name when run manually.

# --- Resource Discovery ---
# Which resource type is discovered and how candidate Services are selected.
resource_types:
  - 'kubernetes_service'
generation_strategy: |
  Match Kubernetes Services whose name or labels indicate the Airflow webserver and
  expose HTTP (typically port 8080). Exclude unrelated Services in the same namespace
  by requiring `airflow` name substring or standard chart labels and the expected port.

# --- Configuration ---
# Environment variables read by the tasks. The first three are required; the
# rest are optional and carry a default. Defaults are quoted so that the empty
# value and the port number stay strings.
env_vars:
  - name: CONTEXT
    description: 'Kubernetes context for kubectl-backed tasks'
    required: true

  - name: NAMESPACE
    description: 'Namespace where Airflow runs'
    required: true

  - name: AIRFLOW_WEBSERVER_SERVICE_NAME
    description: 'Kubernetes Service name for the Airflow webserver'
    required: true

  # An empty value selects the port-forward path, per the description.
  - name: PROXY_BASE_URL
    description: 'Optional full base URL for HTTP checks; leave empty to auto port-forward to the webserver Service'
    required: false
    default: ''

  - name: AIRFLOW_HTTP_PORT
    description: 'Service port for Airflow web UI/API (default 8080)'
    required: false
    default: '8080'

  - name: AIRFLOW_SCHEDULER_SERVICE_NAME
    description: 'Optional Service name for scheduler-side HTTP checks when exposed'
    required: false
    default: ''

  - name: AIRFLOW_TRIGGERER_SERVICE_NAME
    description: 'Optional Service name for Airflow 2 triggerer HTTP checks when exposed'
    required: false
    default: ''

  - name: KUBERNETES_DISTRIBUTION_BINARY
    description: 'kubectl or oc'
    required: false
    default: 'kubectl'

# Secrets supplied to the bundle.
# NOTE(review): airflow_api_credentials is described as optional, but entries
# here carry no explicit required/optional flag - confirm how the consumer
# treats a missing secret.
secrets:
  - name: kubeconfig
    description: 'Standard kubeconfig for kubectl and optional port-forward'
    format: 'kubeconfig file'

  - name: airflow_api_credentials
    description: 'Optional username/password or token for protected API routes'
    format: 'JSON or key-value per implementation notes'

# --- Platform Context ---
# Target platform, the CLI tools the scripts rely on, and how they authenticate.
# NOTE(review): KUBERNETES_DISTRIBUTION_BINARY also allows `oc`, which is not
# listed under cli_tools - confirm whether it needs to be available.
platform:
  name: 'kubernetes'
  cli_tools:
    - 'kubectl'
    - 'curl'
    - 'jq'
  auth_methods:
    - 'kubeconfig secret'
    - 'Optional Airflow API credentials for authenticated routes'
  api_docs: 'https://airflow.apache.org/docs/apache-airflow/stable/'

# --- Relationships ---
# Bundles that complement this one. Each note states what is left to the other
# bundle. Notes use folded scalars (`>-`): the wrapped lines are joined with
# single spaces and carry no trailing newline.
related_bundles:
  - name: 'k8s-airflow-workload-diagnostics'
    relationship: 'complements'
    notes: >-
      HTTP bundle validates endpoints; workload bundle inspects pods, events,
      and logs for root cause.

  - name: 'k8s-postgres-healthcheck'
    relationship: 'complements'
    notes: >-
      Metadata database issues often surface in Airflow health JSON; dedicated
      Postgres checks remain the source of truth for DB layer.

  - name: 'k8s-redis-healthcheck'
    relationship: 'complements'
    notes: >-
      Celery executor deployments use Redis; broker health is not duplicated
      here.

  - name: 'k8s-litellm-proxy-health'
    relationship: 'complements'
    notes: >-
      Similar kubectl port-forward + HTTP probe pattern; reuse connectivity
      patterns but not LiteLLM-specific routes.

# --- Test Strategy ---
# Scenarios the implementation is validated against, with the number of issues
# each should raise.
test_scenarios:
  - name: 'healthy_airflow_webserver'
    description: 'Webserver Service has Endpoints; /health and API probes return success'
    expected_issues: 0

  # NOTE(review): with no Endpoints the HTTP probe tasks would presumably fail
  # as well - confirm that exactly one issue is the intended count.
  - name: 'no_endpoints'
    description: 'Service exists but Endpoints empty — expect correlation issue'
    expected_issues: 1
    expected_severities: [3]

# --- Notes ---
# Implementation guidance. The version guidance covers the Airflow 3.x route
# that the "Check Airflow REST API Health or Version" task lists next to the
# 2.x `/api/v1/*` routes, so the notes and the task agree.
notes: |
  Airflow major versions differ in exact health JSON and API routes; implementers should
  support Airflow 2.x as primary (`/health`, `/api/v1/health`, `/api/v1/version`), handle
  the Airflow 3.x `/api/v2/monitor/health` route listed in the API probe task, document
  fallbacks for 1.x if needed, and keep probes read-only. Avoid DAG-triggering endpoints.
  Deep metadata checks belong in Postgres/Redis bundles.

Metadata

Metadata

Assignees

No one assigned

    Labels

    completed (Agent work completed); design-spec (Architect has produced a design spec); new-codebundle (Scoped issue for SRE to implement a new CodeBundle)

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions