Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Generation rules for the Airflow workload diagnostics codebundle.
# One SLX (with its SLI and runbook) is emitted per matched namespace.
apiVersion: runwhen.com/v1
kind: GenerationRules
spec:
  generationRules:
    - resourceTypes:
        - namespace
      matchRules:
        # ".+" against the name property matches every non-empty namespace name.
        - type: pattern
          pattern: ".+"
          properties:
            - name
          mode: substring
      slxs:
        - baseName: af-wl-diag
          qualifiers:
            - "namespace"
            - "cluster"
          baseTemplateName: k8s-airflow-workload-diagnostics
          levelOfDetail: basic
          outputItems:
            - type: slx
            - type: sli
            # The runbook output item names its template file explicitly.
            - type: runbook
              templateName: k8s-airflow-workload-diagnostics-taskset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# ServiceLevelIndicator template for the Airflow workload diagnostics SLI.
# This file is rendered by Jinja first and parsed as YAML afterwards, so every
# templated configProvided value is quoted to stay a string after rendering
# (a namespace named 123 or true would otherwise change type).
# NOTE(review): the indentation in front of the include tags assumes the shared
# include files are written for this nesting level - confirm against them.
apiVersion: runwhen.com/v1
kind: ServiceLevelIndicator
metadata:
  name: {{slx_name}}
  labels:
    {% include "common-labels.yaml" %}
  annotations:
    {% include "common-annotations.yaml" %}
spec:
  displayUnitsLong: OK
  displayUnitsShort: ok
  locations:
    - {{default_location}}
  description: Measures Airflow workload health via workload readiness, pod readiness, and Warning event volume in the namespace.
  codeBundle:
    # Fall back to the public code collection and the main branch when the
    # renderer does not supply repo_url / ref.
    {% if repo_url %}
    repoUrl: {{repo_url}}
    {% else %}
    repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
    {% endif %}
    {% if ref %}
    ref: {{ref}}
    {% else %}
    ref: main
    {% endif %}
    pathToRobot: codebundles/k8s-airflow-workload-diagnostics/sli.robot
  intervalStrategy: intermezzo
  intervalSeconds: 300
  configProvided:
    - name: KUBERNETES_DISTRIBUTION_BINARY
      value: "{{custom.kubernetes_distribution_binary | default('kubectl')}}"
    - name: NAMESPACE
      value: "{{match_resource.resource.metadata.name}}"
    - name: CONTEXT
      value: "{{context}}"
    - name: AIRFLOW_LABEL_SELECTOR
      value: "{{custom.airflow_label_selector | default('app.kubernetes.io/name=airflow')}}"
    - name: AIRFLOW_DEPLOYMENT_NAME_PREFIX
      value: "{{custom.airflow_deployment_name_prefix | default('airflow')}}"
    - name: RW_LOOKBACK_WINDOW
      value: "{{custom.rw_lookback_window | default('1h')}}"
    - name: AIRFLOW_SLI_EVENT_THRESHOLD
      value: "{{custom.airflow_sli_event_threshold | default('8')}}"
  secretsProvided:
  # When wb_version is set the shared auth include supplies the secrets;
  # otherwise a single kubeconfig secret is referenced.
  {% if wb_version %}
    {% include "kubernetes-auth.yaml" ignore missing %}
  {% else %}
    - name: kubeconfig
      workspaceKey: {{custom.kubeconfig_secret_name | default("kubeconfig")}}
  {% endif %}
  alertConfig:
    tasks:
      persona: eager-edgar
      sessionTTL: 10m
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# ServiceLevelX template for the Airflow workload diagnostics SLX.
# This file is rendered by Jinja first and parsed as YAML afterwards; the
# NAMESPACE value is quoted so it stays a string for any namespace name.
# NOTE(review): the indentation in front of the include tags assumes the shared
# include files are written for this nesting level - confirm against them.
apiVersion: runwhen.com/v1
kind: ServiceLevelX
metadata:
  name: {{slx_name}}
  labels:
    {% include "common-labels.yaml" %}
  annotations:
    {% include "common-annotations.yaml" %}
spec:
  imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/resources/labeled/deploy.svg
  alias: {{namespace.name}} Airflow Workload Diagnostics
  asMeasuredBy: Aggregate SLI over workload readiness, pod readiness, and Warning event volume for Airflow-labeled resources.
  configProvided:
    - name: NAMESPACE
      value: "{{match_resource.resource.metadata.name}}"
  owners:
    - {{workspace.owner_email}}
  statement: Airflow controllers and pods in this namespace should be ready and free of excessive Warning events.
  additionalContext:
    {% include "kubernetes-hierarchy.yaml" ignore missing %}
    qualified_name: "{{ match_resource.qualified_name }}"
  tags:
    {% include "kubernetes-tags.yaml" ignore missing %}
    # Static tag appended after whatever the shared tags include renders.
    - name: access
      value: read-only
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Runbook template for the Airflow workload diagnostics task set.
# This file is rendered by Jinja first and parsed as YAML afterwards, so every
# templated configProvided value is quoted to stay a string after rendering
# (a namespace named 123 or true would otherwise change type).
# NOTE(review): the indentation in front of the include tags assumes the shared
# include files are written for this nesting level - confirm against them.
apiVersion: runwhen.com/v1
kind: Runbook
metadata:
  name: {{slx_name}}
  labels:
    {% include "common-labels.yaml" %}
  annotations:
    {% include "common-annotations.yaml" %}
spec:
  location: {{default_location}}
  description: Diagnoses Apache Airflow workloads in Kubernetes for replica health, pods, events, PVCs, scheduler logs, and executor saturation.
  codeBundle:
    # Fall back to the public code collection and the main branch when the
    # renderer does not supply repo_url / ref.
    {% if repo_url %}
    repoUrl: {{repo_url}}
    {% else %}
    repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
    {% endif %}
    {% if ref %}
    ref: {{ref}}
    {% else %}
    ref: main
    {% endif %}
    pathToRobot: codebundles/k8s-airflow-workload-diagnostics/runbook.robot
  configProvided:
    - name: KUBERNETES_DISTRIBUTION_BINARY
      value: "{{custom.kubernetes_distribution_binary | default('kubectl')}}"
    - name: NAMESPACE
      value: "{{match_resource.resource.metadata.name}}"
    - name: CONTEXT
      value: "{{context}}"
    - name: AIRFLOW_LABEL_SELECTOR
      value: "{{custom.airflow_label_selector | default('app.kubernetes.io/name=airflow')}}"
    - name: AIRFLOW_DEPLOYMENT_NAME_PREFIX
      value: "{{custom.airflow_deployment_name_prefix | default('airflow')}}"
    - name: RW_LOOKBACK_WINDOW
      value: "{{custom.rw_lookback_window | default('1h')}}"
  secretsProvided:
  # When wb_version is set the shared auth include supplies the secrets;
  # otherwise a single kubeconfig secret is referenced.
  {% if wb_version %}
    {% include "kubernetes-auth.yaml" ignore missing %}
  {% else %}
    - name: kubeconfig
      workspaceKey: {{custom.kubeconfig_secret_name | default("kubeconfig")}}
  {% endif %}
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# Taskfile for the codebundle's test harness: applies/removes the test
# manifest and generates the workspaceInfo.yaml consumed by RunWhen Local.
version: "3"

tasks:
  default:
    desc: "Run/refresh RunWhen Local config"
    cmds:
      - task: generate-rwl-config

  clean:
    desc: "Run cleanup tasks"
    cmds:
      - task: clean-rwl-discovery

  build-infra:
    desc: "Build test infrastructure"
    cmds:
      - task: create-kubernetes-objects

  create-kubernetes-objects:
    desc: "Apply manifests from kubernetes directory using kubectl"
    cmds:
      - kubectl apply -f kubernetes/manifest.yaml
    silent: true

  remove-kubernetes-objects:
    desc: "Delete kubernetes objects"
    cmds:
      # --ignore-not-found keeps teardown idempotent: re-running it after the
      # objects are already gone succeeds instead of failing the task.
      - kubectl delete -f kubernetes/manifest.yaml --ignore-not-found
    silent: true

  generate-rwl-config:
    desc: "Generate RunWhen Local configuration (workspaceInfo.yaml)"
    env:
      RW_WORKSPACE: '{{.RW_WORKSPACE | default "my-workspace"}}'
    cmds:
      # Derives the repo URL and branch from git, the codebundle name from the
      # parent directory, and the namespace from the Namespace document in the
      # test manifest, then writes workspaceInfo.yaml via an unquoted heredoc
      # so the shell variables are expanded.
      - |
        repo_url=$(git config --get remote.origin.url)
        branch_name=$(git rev-parse --abbrev-ref HEAD)
        codebundle=$(basename "$(dirname "$PWD")")
        namespace=$(yq e 'select(.kind == "Namespace") | .metadata.name' kubernetes/manifest.yaml -N)
        cat <<EOF > workspaceInfo.yaml
        workspaceName: "$RW_WORKSPACE"
        workspaceOwnerEmail: authors@runwhen.com
        defaultLocation: location-01
        defaultLOD: none
        cloudConfig:
          kubernetes:
            kubeconfigFile: /shared/kubeconfig
            namespaceLODs:
              $namespace: detailed
            namespaces:
              - $namespace
        codeCollections:
        - repoURL: "$repo_url"
          branch: "$branch_name"
          codeBundles: ["$codebundle"]
        custom:
          kubeconfig_secret_name: "kubeconfig"
          kubernetes_distribution_binary: kubectl
          airflow_label_selector: "app.kubernetes.io/name=airflow"
          airflow_deployment_name_prefix: "airflow"
          rw_lookback_window: "1h"
        EOF
    silent: true

  clean-rwl-discovery:
    desc: "Clean RunWhen Local discovery output"
    cmds:
      - rm -rf output
      - rm -f workspaceInfo.yaml
    silent: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Namespace fixture for exercising the Airflow workload diagnostics codebundle.
# NOTE(review): presumably this is the manifest the test Taskfile applies and
# reads the namespace name from - confirm the file path.
apiVersion: v1
kind: Namespace
metadata:
  name: test-airflow-workload-diagnostics
  labels:
    app.kubernetes.io/name: airflow
    environment: test
65 changes: 65 additions & 0 deletions codebundles/k8s-airflow-workload-diagnostics/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Kubernetes Airflow Workload Diagnostics

This CodeBundle collects Kubernetes-centric health signals for Apache Airflow installations: workload controllers (webserver, scheduler, workers, triggerer), pod readiness and restarts, recent Warning events, PVCs for logs and DAGs, targeted scheduler log excerpts, and executor-style pod status. All tasks are read-only and do not trigger DAG runs or mutate workloads.

## Overview

- **Workload controllers**: Lists Deployments, StatefulSets, and DaemonSets that match the Airflow label selector or name prefix and compares desired versus ready replicas.
- **Pod health**: Checks Airflow-labeled pods for phase, Ready condition, restart counts, and recent termination reasons (for example OOMKilled).
- **Events**: Surfaces Warning events in the lookback window for Airflow-related object names.
- **Storage**: Summarizes PVCs tied to Airflow pods or common volume name patterns and flags non-Bound phases.
- **Scheduler logs**: Samples scheduler pod logs for DAG import errors and database connectivity hints.
- **Executors**: Best-effort summary of worker or executor-related pods that are Pending or have OOM terminations.
- **SLI**: Publishes a 0–1 health score from workload readiness, pod readiness, and Warning event volume (see `sli.robot`).

## Configuration

### Required variables

- `CONTEXT`: Kubernetes context to use.
- `NAMESPACE`: Namespace that contains the Airflow release.

### Optional variables

- `AIRFLOW_LABEL_SELECTOR`: Label selector for Airflow workloads (default: `app.kubernetes.io/name=airflow`).
- `AIRFLOW_DEPLOYMENT_NAME_PREFIX`: Extra name prefix used when labels are inconsistent (default: `airflow`).
- `RW_LOOKBACK_WINDOW`: Time window for events and log sampling, for example `30m` or `1h` (default: `1h`).
- `KUBERNETES_DISTRIBUTION_BINARY`: `kubectl` or `oc` (default: `kubectl`).

### SLI-only optional variables

- `AIRFLOW_SLI_EVENT_THRESHOLD`: Maximum number of Warning events in the lookback window before the events sub-score fails (default: `8`).

### Bash script defaults (not imported in `runbook.robot`)

- `AIRFLOW_RESTART_WARN_THRESHOLD`: Total container restart count above which the pod health task raises a warning (default: `10`).

### Secrets

- `kubeconfig`: Standard kubeconfig whose identity has read-only access in the target namespace: `get` and `list` on workloads, pods, PVCs, and events, plus `get` on `pods/log` for scheduler log sampling.

## Tasks overview

### List Airflow Workloads in Namespace

Discovers Deployments, StatefulSets, and DaemonSets via the label selector and optional name prefix merge; raises issues when ready replicas are below desired counts.

### Check Airflow Pod Health and Restarts in Namespace

Evaluates Airflow-labeled pods for phase, Ready condition, high restart counts, and recent container termination reasons.

### Fetch Recent Events for Airflow Resources in Namespace

Collects Warning events since the lookback cutoff for involved objects related to Airflow naming or workloads.

### Summarize PVC Status for Airflow Data Volumes in Namespace

Lists PVCs referenced by Airflow pods or matching common DAGs, logs, or plugins name patterns; issues when a PVC is not Bound.

### Sample Scheduler Logs for DAG Import Errors in Namespace

Tails recent scheduler logs and flags traceback, import, or database connectivity patterns.

### Check Worker or KubernetesExecutor Pod Saturation in Namespace

Surfaces Pending executor-related pods and OOMKilled containers when Celery or executor components are present.
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/usr/bin/env bash
# Summarizes Celery/Kubernetes executor related Airflow pods.
#
# Raises one issue per executor-related pod that is Pending (details carry the
# pod's condition messages) and one per container whose last termination
# reason was OOMKilled.
#
# Required env:
#   CONTEXT    kube context passed to --context
#   NAMESPACE  namespace to inspect
# Optional env:
#   OUTPUT_FILE                     issues JSON path (default: check_airflow_executor_pods_issues.json)
#   KUBERNETES_DISTRIBUTION_BINARY  CLI binary to invoke (default: kubectl)
#   AIRFLOW_LABEL_SELECTOR          pod label selector (default: app.kubernetes.io/name=airflow)
#
# Output: a JSON array of issues written to OUTPUT_FILE, plus a tab-separated
# name/phase listing of the executor-related pods on stdout. The script exits 0
# even when the pod listing fails; the failure is reported as an issue instead.
set -euo pipefail
set -x

: "${CONTEXT:?}" "${NAMESPACE:?}"

OUTPUT_FILE="${OUTPUT_FILE:-check_airflow_executor_pods_issues.json}"
KUBECTL="${KUBERNETES_DISTRIBUTION_BINARY:-kubectl}"
LABEL_SEL="${AIRFLOW_LABEL_SELECTOR:-app.kubernetes.io/name=airflow}"

# Keep the CLI's stderr in a scratch file so a failed listing can be reported
# with its actual cause instead of a generic message.
kubectl_err=$(mktemp)
trap 'rm -f "$kubectl_err"' EXIT

if ! pods_json=$("${KUBECTL}" get pods -n "${NAMESPACE}" --context "${CONTEXT}" -l "${LABEL_SEL}" -o json 2>"$kubectl_err"); then
  # Build the issue with jq so the captured error text is JSON-escaped safely,
  # and name the binary that was actually invoked (it may not be kubectl).
  jq -n --arg bin "$KUBECTL" --arg err "$(cat "$kubectl_err")" '[{
    "title": "Cannot list Airflow pods",
    "details": ($bin + " get pods failed" + (if $err == "" then "" else ": " + $err end)),
    "severity": 4,
    "next_steps": "Verify RBAC."
  }]' > "$OUTPUT_FILE"
  exit 0
fi

# Narrow the selected pods to executor-style ones: either the pod name or the
# app.kubernetes.io/component label matches (case-insensitively) a worker/executor hint.
executor_json=$(echo "$pods_json" | jq '[.items[]? | select(
    (.metadata.name | test("worker|celery|kubernetes|executor"; "i")) or
    (.metadata.labels["app.kubernetes.io/component"]? // "" | test("worker|celery"; "i"))
)]')

# Per pod, emit a Pending issue (if applicable) followed by one issue for each
# container last terminated with OOMKilled; flatten merges the per-pod arrays.
issues_json=$(echo "$executor_json" | jq --arg ns "$NAMESPACE" '
  [ .[]? |
    .metadata.name as $n |
    (.status.phase // "") as $ph |
    (if $ph == "Pending" then
      [{
        "title": ("Executor-related pod `" + $n + "` Pending in `" + $ns + "`"),
        "details": ((.status.conditions // []) | map(.message // "") | join("; ")),
        "severity": 3,
        "next_steps": "Describe the pod for scheduling and volume mount errors; check cluster capacity."
      }]
    else [] end) +
    ([.status.containerStatuses[]? |
      .name as $c |
      (.lastState.terminated.reason // "") as $reason |
      select($reason == "OOMKilled") |
      {
        "title": ("OOMKilled in executor pod `" + $n + "` container `" + $c + "`"),
        "details": "Last termination: OOMKilled",
        "severity": 4,
        "next_steps": "Raise memory limits or reduce task concurrency for workers."
      }
    ])
  ] | flatten
')

echo "$issues_json" > "$OUTPUT_FILE"

# Human-readable summary; a formatting failure here must not fail the script.
echo "Executor-related pods:"
echo "$executor_json" | jq -r '.[] | [.metadata.name, .status.phase] | @tsv' || true
Loading