Design Spec: k8s-airflow-workload-diagnostics
Parent: #97
Target: rw-cli-codecollection
Spec
# --- Identity ---
# Naming and ownership metadata for this codebundle spec.
# Bundle identifier; same value as the "Design Spec" title line of this document.
codebundle_name: "k8s-airflow-workload-diagnostics"
# Collection this spec targets; same value as the "Target" header line.
target_collection: "rw-cli-codecollection"
# Human-readable title for the bundle.
display_name: "Kubernetes Airflow Workload Diagnostics"
# Recorded author of the spec.
author: "rw-codebundle-agent"
# --- Purpose ---
# Free-text summary of what the bundle collects and why.
# Written as a literal block scalar (`|`): the body must be indented beneath the
# key, otherwise the scalar is empty and the prose lines fail to parse.
purpose: |
  Collects Kubernetes-centric health signals for Apache Airflow installations: workload
  objects (webserver, scheduler, workers, triggerer), pod readiness and restarts,
  events, PVCs for logs/dags, and targeted log excerpts so operators can diagnose
  misconfiguration, resource pressure, and executor failures without duplicating generic
  cluster-only checks.
# --- Tasks ---
# One sequence entry per diagnostic task. Every entry carries the same six keys:
# name, description, script_name, expected_issue_severity, access_level, data_type.
# All tasks listed here declare access_level "read-only".
# Descriptions use folded scalars (`>-`) purely to respect line length; the
# resulting string is a single line identical to the original text.
tasks:
  - name: "List Airflow Workloads in Namespace"
    description: >-
      Discovers Deployments, StatefulSets, and DaemonSets associated with Airflow via
      configurable label selectors and name patterns; prints desired vs ready replicas.
    script_name: "list-airflow-workloads.sh"
    expected_issue_severity: [2, 3]
    access_level: "read-only"
    data_type: "logs-config"

  - name: "Check Airflow Pod Health and Restarts"
    description: >-
      Evaluates Pods matched by the Airflow selector for phase, Ready condition, restart
      counts, and recent container termination reasons (OOMKilled, Error).
    script_name: "check-airflow-pod-health.sh"
    expected_issue_severity: [2, 4]
    access_level: "read-only"
    data_type: "metrics"

  - name: "Fetch Recent Events for Airflow Resources"
    description: >-
      Pulls Warning/Failed events in the lookback window for Airflow workloads to catch
      scheduling, volume mount, and probe failures.
    script_name: "fetch-airflow-events.sh"
    expected_issue_severity: [2, 3]
    access_level: "read-only"
    data_type: "logs-config"

  - name: "Summarize PVC Status for Airflow Data Volumes"
    description: >-
      Lists PVCs tied to Airflow workloads (logs, dags, plugins) and reports Bound/Pending
      and capacity signals to catch storage provisioning issues.
    script_name: "summarize-airflow-pvcs.sh"
    expected_issue_severity: [2, 3]
    access_level: "read-only"
    data_type: "logs-config"

  - name: "Sample Scheduler Logs for DAG Import Errors"
    description: >-
      Tails or fetches recent scheduler pod logs and flags common DAG import/traceback
      patterns without executing DAGs.
    script_name: "sample-airflow-scheduler-logs.sh"
    expected_issue_severity: [3, 4]
    access_level: "read-only"
    data_type: "logs"

  - name: "Check Worker or KubernetesExecutor Pod Saturation"
    description: >-
      When Celery/Kubernetes executor components are present, summarizes pending
      task-related signals from pod status and optional resource requests vs limits
      (best-effort via kubectl describe).
    script_name: "check-airflow-executor-pods.sh"
    expected_issue_severity: [3, 4]
    access_level: "read-only"
    data_type: "metrics"
# --- Scope ---
# Declares the granularity of the bundle and the variables that qualify it.
# The three qualifiers are names of entries declared in env_vars.
# NOTE(review): nesting of iteration_pattern under scope is reconstructed from
# the section grouping (indentation was lost) — confirm against the spec schema.
scope:
  level: "Resource"
  qualifiers:
    - CONTEXT
    - NAMESPACE
    - AIRFLOW_LABEL_SELECTOR
  iteration_pattern: |
    One SLX per namespace (or per discovered Airflow release) where workloads match
    `AIRFLOW_LABEL_SELECTOR` or default chart labels such as app.kubernetes.io/name=airflow.
# --- Resource Discovery ---
# resource_types: kinds of resources the bundle is generated against.
# generation_strategy: prose description of how matching namespaces are found;
# a literal block scalar, so its body is indented beneath the key.
resource_types:
  - "kubernetes_namespace"
generation_strategy: |
  Discover namespaces containing Deployments/StatefulSets whose labels or names match
  Airflow Helm chart conventions; optionally restrict via allowlist env var for large clusters.
# --- Configuration ---
# Environment variables consumed by the bundle. Each entry has name,
# description and required; optional variables additionally carry a default.
env_vars:
  - name: CONTEXT
    description: "Kubernetes context"
    required: true

  # NOTE(review): this variable is marked required, yet the description talks
  # about an empty value triggering discovery mode — confirm which is intended.
  - name: NAMESPACE
    description: "Namespace to inspect; empty may trigger discovery mode if generation rule supports it"
    required: true

  - name: AIRFLOW_LABEL_SELECTOR
    description: "Label selector for Airflow workloads (e.g. app.kubernetes.io/name=airflow)"
    required: false
    default: "app.kubernetes.io/name=airflow"

  - name: AIRFLOW_DEPLOYMENT_NAME_PREFIX
    description: "Optional name prefix filter when labels are inconsistent across custom charts"
    required: false
    default: "airflow"

  - name: RW_LOOKBACK_WINDOW
    description: "Time window for events and log sampling (e.g. 30m, 2h)"
    required: false
    default: "1h"

  - name: KUBERNETES_DISTRIBUTION_BINARY
    description: "kubectl or oc"
    required: false
    default: "kubectl"
# Secrets the bundle needs at runtime. Only a reference is declared here
# (name, description, format); no secret material lives in this spec.
secrets:
  - name: kubeconfig
    description: "Standard kubeconfig with list/get/describe/logs permissions on workloads"
    format: "kubeconfig file"
# --- Platform Context ---
# Target platform, the CLI tools the scripts rely on, how they authenticate,
# and the reference documentation URL.
# NOTE(review): KUBERNETES_DISTRIBUTION_BINARY also accepts "oc", which is not
# listed under cli_tools — confirm whether it should be.
platform:
  name: "kubernetes"
  cli_tools:
    - "kubectl"
    - "jq"
  auth_methods:
    - "kubeconfig secret"
  api_docs: "https://kubernetes.io/docs/reference/kubectl/"
# --- Relationships ---
# Other bundles this one is meant to be used alongside. Each entry records the
# bundle name, the relationship type, and a note on how responsibilities split.
related_bundles:
  - name: "k8s-airflow-http-health"
    relationship: "complements"
    notes: "Use HTTP bundle to validate webserver/API; use this bundle for pod-level and storage signals."

  - name: "k8s-cluster-node-health"
    relationship: "complements"
    notes: "Node-level problems may surface as pod scheduling failures; node checks stay in the node bundle."

  - name: "k8s-statefulset-healthcheck"
    relationship: "complements"
    notes: "Generic StatefulSet checks apply; this bundle adds Airflow-specific selectors and log patterns."

  - name: "k8s-postgres-healthcheck"
    relationship: "complements"
    notes: "Database connectivity belongs in Postgres bundle; scheduler logs may still hint at DB timeouts."
# --- Test Strategy ---
# Scenarios the bundle should be validated against, each with the number of
# issues expected. The zero-issue scenario declares no expected_severities.
test_scenarios:
  - name: "healthy_release"
    description: "All expected workloads ready, no critical events, PVCs bound"
    expected_issues: 0

  - name: "scheduler_crashloop"
    description: "Scheduler pod restarting with errors in logs"
    expected_issues: 2
    expected_severities: [3, 4]
# --- Notes ---
# Implementation guidance for whoever builds the bundle. A literal block
# scalar, so the body is indented beneath the key.
notes: |
  Keep tasks read-only; do not trigger DAG runs or mutate workloads. Prefer label-driven
  discovery over hard-coded Helm release names. For massively multi-tenant clusters,
  require explicit NAMESPACE. Pair with k8s-airflow-http-health for full stack visibility.
Design Spec: k8s-airflow-workload-diagnostics
Parent: #97
Target: rw-cli-codecollection
Spec