Skip to content

[design-spec] k8s-airflow-workload-diagnostics #98

@rw-codebundle-agent

Description

@rw-codebundle-agent

Design Spec: k8s-airflow-workload-diagnostics

Parent: #97
Target: rw-cli-codecollection

Spec

# --- Identity ---
# codebundle_name and target_collection match the issue title and its "Target:" line.
codebundle_name: "k8s-airflow-workload-diagnostics"
target_collection: "rw-cli-codecollection"
display_name: "Kubernetes Airflow Workload Diagnostics"
author: "rw-codebundle-agent"

# --- Purpose ---
# Literal block scalar (|): the line breaks below are kept in the parsed value.
purpose: |
  Collects Kubernetes-centric health signals for Apache Airflow installations: workload
  objects (webserver, scheduler, workers, triggerer), pod readiness and restarts,
  events, PVCs for logs/dags, and targeted log excerpts so operators can diagnose
  misconfiguration, resource pressure, and executor failures without duplicating generic
  cluster-only checks.

# --- Tasks ---
# Six tasks, all declared read-only. Descriptions use folded scalars (>-): each
# line break inside a description becomes a single space and no trailing newline
# is kept, so every value is one single-line string.
tasks:
  - name: "List Airflow Workloads in Namespace"
    description: >-
      Discovers Deployments, StatefulSets, and DaemonSets associated with Airflow via
      configurable label selectors and name patterns; prints desired vs ready replicas.
    script_name: "list-airflow-workloads.sh"
    expected_issue_severity: [2, 3]
    access_level: "read-only"
    data_type: "logs-config"

  - name: "Check Airflow Pod Health and Restarts"
    description: >-
      Evaluates Pods matched by the Airflow selector for phase, Ready condition, restart
      counts, and recent container termination reasons (OOMKilled, Error).
    script_name: "check-airflow-pod-health.sh"
    expected_issue_severity: [2, 4]
    access_level: "read-only"
    data_type: "metrics"

  - name: "Fetch Recent Events for Airflow Resources"
    description: >-
      Pulls Warning/Failed events in the lookback window for Airflow workloads to catch
      scheduling, volume mount, and probe failures.
    script_name: "fetch-airflow-events.sh"
    expected_issue_severity: [2, 3]
    access_level: "read-only"
    data_type: "logs-config"

  - name: "Summarize PVC Status for Airflow Data Volumes"
    description: >-
      Lists PVCs tied to Airflow workloads (logs, dags, plugins) and reports Bound/Pending
      and capacity signals to catch storage provisioning issues.
    script_name: "summarize-airflow-pvcs.sh"
    expected_issue_severity: [2, 3]
    access_level: "read-only"
    data_type: "logs-config"

  - name: "Sample Scheduler Logs for DAG Import Errors"
    description: >-
      Tails or fetches recent scheduler pod logs and flags common DAG import/traceback
      patterns without executing DAGs.
    script_name: "sample-airflow-scheduler-logs.sh"
    expected_issue_severity: [3, 4]
    access_level: "read-only"
    data_type: "logs"

  - name: "Check Worker or KubernetesExecutor Pod Saturation"
    description: >-
      When Celery/Kubernetes executor components are present, summarizes pending
      task-related signals from pod status and optional resource requests vs limits
      (best-effort via kubectl describe).
    script_name: "check-airflow-executor-pods.sh"
    expected_issue_severity: [3, 4]
    access_level: "read-only"
    data_type: "metrics"

# --- Scope ---
scope:
  level: "Resource"
  # Each qualifier below is also declared as an entry under env_vars.
  qualifiers:
    - CONTEXT
    - NAMESPACE
    - AIRFLOW_LABEL_SELECTOR
  iteration_pattern: |
    One SLX per namespace (or per discovered Airflow release) where workloads match
    `AIRFLOW_LABEL_SELECTOR` or default chart labels such as app.kubernetes.io/name=airflow.

# --- Resource Discovery ---
resource_types:
  - "kubernetes_namespace"
# NOTE(review): the "allowlist env var" mentioned below is not declared under
# env_vars; name and declare it there, or drop the mention.
generation_strategy: |
  Discover namespaces containing Deployments/StatefulSets whose labels or names match
  Airflow Helm chart conventions; optionally restrict via allowlist env var for large clusters.

# --- Configuration ---
# Environment variables for this bundle. CONTEXT and NAMESPACE are required; the
# remaining entries are optional and carry defaults.
env_vars:
  - name: CONTEXT
    description: "Kubernetes context"
    required: true

  - name: NAMESPACE
    # Marked required, so the description must not promise behavior for an empty
    # value; namespace discovery is covered by generation_strategy instead.
    description: "Namespace containing the Airflow workloads to inspect; must be set (non-empty)"
    required: true

  - name: AIRFLOW_LABEL_SELECTOR
    description: "Label selector for Airflow workloads (e.g. app.kubernetes.io/name=airflow)"
    required: false
    default: "app.kubernetes.io/name=airflow"

  - name: AIRFLOW_DEPLOYMENT_NAME_PREFIX
    description: "Optional name prefix filter when labels are inconsistent across custom charts"
    required: false
    default: "airflow"

  - name: RW_LOOKBACK_WINDOW
    description: "Time window for events and log sampling (e.g. 30m, 2h)"
    required: false
    default: "1h"

  - name: KUBERNETES_DISTRIBUTION_BINARY
    description: "Kubernetes CLI binary to invoke: kubectl or oc"
    required: false
    default: "kubectl"

secrets:
  - name: kubeconfig
    # "describe" and "logs" are kubectl subcommands rather than RBAC verbs, so the
    # required access is spelled out as get/list plus the pods/log subresource.
    description: "Standard kubeconfig whose identity can get/list workloads, pods, events, and PVCs and read pod logs (pods/log)"
    format: "kubeconfig file"

# --- Platform Context ---
platform:
  name: "kubernetes"
  # Only kubectl is listed here, although the env var
  # KUBERNETES_DISTRIBUTION_BINARY is described as "kubectl or oc".
  cli_tools:
    - "kubectl"
    - "jq"
  auth_methods:
    - "kubeconfig secret"
  api_docs: "https://kubernetes.io/docs/reference/kubectl/"

# --- Relationships ---
# Every entry uses the "complements" relationship; each note describes how
# responsibilities are split between that bundle and this one.
related_bundles:
  - name: "k8s-airflow-http-health"
    relationship: "complements"
    notes: "Use HTTP bundle to validate webserver/API; use this bundle for pod-level and storage signals."

  - name: "k8s-cluster-node-health"
    relationship: "complements"
    notes: "Node-level problems may surface as pod scheduling failures; node checks stay in the node bundle."

  - name: "k8s-statefulset-healthcheck"
    relationship: "complements"
    notes: "Generic StatefulSet checks apply; this bundle adds Airflow-specific selectors and log patterns."

  - name: "k8s-postgres-healthcheck"
    relationship: "complements"
    notes: "Database connectivity belongs in Postgres bundle; scheduler logs may still hint at DB timeouts."

# --- Test Strategy ---
# NOTE(review): only a healthy release and a scheduler crash loop are covered; no
# scenario specifically targets the PVC or executor-pod tasks. Consider adding
# one (e.g. a Pending PVC) before implementation.
test_scenarios:
  - name: "healthy_release"
    description: "All expected workloads ready, no critical events, PVCs bound"
    expected_issues: 0

  - name: "scheduler_crashloop"
    description: "Scheduler pod restarting with errors in logs"
    expected_issues: 2
    expected_severities: [3, 4]

# --- Notes ---
# Guidance for implementers; the read-only rule matches the access_level value
# set on every task.
notes: |
  Keep tasks read-only; do not trigger DAG runs or mutate workloads. Prefer label-driven
  discovery over hard-coded Helm release names. For massively multi-tenant clusters,
  require explicit NAMESPACE. Pair with k8s-airflow-http-health for full stack visibility.

Metadata

Metadata

Assignees

No one assigned

    Labels

    completed (Agent work completed); design-spec (Architect has produced a design spec); new-codebundle (Scoped issue for SRE to implement a new CodeBundle)

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions