From cd96da3101cb0e63f60a08eef9ab18df0ac356a4 Mon Sep 17 00:00:00 2001 From: "rw-codebundle-agent[bot]" Date: Tue, 21 Apr 2026 13:07:56 +0000 Subject: [PATCH] Add k8s-airflow-http-health CodeBundle for Airflow HTTP and API health checks. Implements design spec issue-99: kubectl port-forward or PROXY_BASE_URL, GET /health JSON validation, REST probes with optional credentials, Service/Endpoints verification, optional scheduler/triggerer HTTP attempts, in-repo SLI with aggregated score, generation rules for Services named like Airflow on port 8080, and Kubernetes test manifests. Made-with: Cursor --- .../k8s-airflow-http-health.yaml | 31 ++ .../k8s-airflow-http-health-sli.yaml | 53 ++++ .../k8s-airflow-http-health-slx.yaml | 25 ++ .../k8s-airflow-http-health-taskset.yaml | 49 ++++ .../.test/Taskfile.yaml | 167 +++++++++++ .../.test/kubernetes/manifest.yaml | 77 +++++ codebundles/k8s-airflow-http-health/README.md | 60 ++++ .../_airflow_http_portforward_helper.sh | 106 +++++++ .../check-airflow-api-health.sh | 77 +++++ .../check-airflow-scheduler-http-health.sh | 112 ++++++++ .../check-airflow-webserver-health.sh | 91 ++++++ .../resolve-airflow-base-url.sh | 75 +++++ .../k8s-airflow-http-health/runbook.robot | 269 ++++++++++++++++++ .../sli-airflow-http-score.sh | 56 ++++ codebundles/k8s-airflow-http-health/sli.robot | 106 +++++++ .../verify-airflow-webserver-service.sh | 95 +++++++ 16 files changed, 1449 insertions(+) create mode 100644 codebundles/k8s-airflow-http-health/.runwhen/generation-rules/k8s-airflow-http-health.yaml create mode 100644 codebundles/k8s-airflow-http-health/.runwhen/templates/k8s-airflow-http-health-sli.yaml create mode 100644 codebundles/k8s-airflow-http-health/.runwhen/templates/k8s-airflow-http-health-slx.yaml create mode 100644 codebundles/k8s-airflow-http-health/.runwhen/templates/k8s-airflow-http-health-taskset.yaml create mode 100644 codebundles/k8s-airflow-http-health/.test/Taskfile.yaml create mode 100644 
codebundles/k8s-airflow-http-health/.test/kubernetes/manifest.yaml create mode 100644 codebundles/k8s-airflow-http-health/README.md create mode 100755 codebundles/k8s-airflow-http-health/_airflow_http_portforward_helper.sh create mode 100755 codebundles/k8s-airflow-http-health/check-airflow-api-health.sh create mode 100755 codebundles/k8s-airflow-http-health/check-airflow-scheduler-http-health.sh create mode 100755 codebundles/k8s-airflow-http-health/check-airflow-webserver-health.sh create mode 100755 codebundles/k8s-airflow-http-health/resolve-airflow-base-url.sh create mode 100644 codebundles/k8s-airflow-http-health/runbook.robot create mode 100755 codebundles/k8s-airflow-http-health/sli-airflow-http-score.sh create mode 100644 codebundles/k8s-airflow-http-health/sli.robot create mode 100755 codebundles/k8s-airflow-http-health/verify-airflow-webserver-service.sh diff --git a/codebundles/k8s-airflow-http-health/.runwhen/generation-rules/k8s-airflow-http-health.yaml b/codebundles/k8s-airflow-http-health/.runwhen/generation-rules/k8s-airflow-http-health.yaml new file mode 100644 index 00000000..dd1cdc2d --- /dev/null +++ b/codebundles/k8s-airflow-http-health/.runwhen/generation-rules/k8s-airflow-http-health.yaml @@ -0,0 +1,31 @@ +apiVersion: runwhen.com/v1 +kind: GenerationRules +spec: + platform: kubernetes + generationRules: + - resourceTypes: + - service + matchRules: + # Airflow Helm charts expose the web UI on port 8080. Matching on 8080 + # avoids unrelated Services that include "airflow" in the release name + # but use other ports (Redis, Postgres, etc.). 
+ - type: and + matches: + - type: pattern + pattern: "airflow" + properties: [name] + mode: substring + - type: pattern + pattern: "^8080$" + properties: [spec/ports/port] + mode: exact + slxs: + - baseName: airflow-http-hc + qualifiers: ["resource", "namespace", "cluster"] + baseTemplateName: k8s-airflow-http-health + levelOfDetail: basic + outputItems: + - type: slx + - type: sli + - type: runbook + templateName: k8s-airflow-http-health-taskset.yaml diff --git a/codebundles/k8s-airflow-http-health/.runwhen/templates/k8s-airflow-http-health-sli.yaml b/codebundles/k8s-airflow-http-health/.runwhen/templates/k8s-airflow-http-health-sli.yaml new file mode 100644 index 00000000..64d36432 --- /dev/null +++ b/codebundles/k8s-airflow-http-health/.runwhen/templates/k8s-airflow-http-health-sli.yaml @@ -0,0 +1,53 @@ +apiVersion: runwhen.com/v1 +kind: ServiceLevelIndicator +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} + runwhen.com/sli: "true" +spec: + displayUnitsLong: OK + displayUnitsShort: ok + locations: + - {{default_location}} + description: Aggregates Airflow webserver /health, API reachability, and Kubernetes Service presence for {{match_resource.resource.metadata.name}}. 
+ codeBundle: + {% if repo_url %} + repoUrl: {{repo_url}} + {% else %} + repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git + {% endif %} + {% if ref %} + ref: {{ref}} + {% else %} + ref: main + {% endif %} + pathToRobot: codebundles/k8s-airflow-http-health/sli.robot + intervalStrategy: intermezzo + intervalSeconds: 300 + configProvided: + - name: CONTEXT + value: "{{context}}" + - name: NAMESPACE + value: "{{match_resource.resource.metadata.namespace}}" + - name: AIRFLOW_WEBSERVER_SERVICE_NAME + value: "{{match_resource.resource.metadata.name}}" + - name: PROXY_BASE_URL + value: "" + - name: AIRFLOW_HTTP_PORT + value: "8080" + - name: KUBERNETES_DISTRIBUTION_BINARY + value: "{{custom.kubernetes_distribution_binary | default('kubectl')}}" + secretsProvided: + {% if wb_version %} + {% include "kubernetes-auth.yaml" ignore missing %} + {% else %} + - name: kubeconfig + workspaceKey: {{custom.kubeconfig_secret_name | default("kubeconfig")}} + {% endif %} + alertConfig: + tasks: + persona: eager-edgar + sessionTTL: 10m diff --git a/codebundles/k8s-airflow-http-health/.runwhen/templates/k8s-airflow-http-health-slx.yaml b/codebundles/k8s-airflow-http-health/.runwhen/templates/k8s-airflow-http-health-slx.yaml new file mode 100644 index 00000000..8faad2be --- /dev/null +++ b/codebundles/k8s-airflow-http-health/.runwhen/templates/k8s-airflow-http-health-slx.yaml @@ -0,0 +1,25 @@ +apiVersion: runwhen.com/v1 +kind: ServiceLevelX +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/resources/labeled/svc.svg + alias: Airflow HTTP Health for {{match_resource.resource.metadata.name}} + asMeasuredBy: Airflow /health JSON, REST API reachability, and Kubernetes Service/Endpoints presence. 
+ configProvided: + - name: AIRFLOW_WEBSERVER_SERVICE_NAME + value: {{match_resource.resource.metadata.name}} + owners: + - {{workspace.owner_email}} + statement: The Airflow webserver Service should serve healthy HTTP endpoints and have backing Endpoints. + additionalContext: + {% include "kubernetes-hierarchy.yaml" ignore missing %} + qualified_name: "{{ match_resource.qualified_name }}" + tags: + {% include "kubernetes-tags.yaml" ignore missing %} + - name: access + value: read-only diff --git a/codebundles/k8s-airflow-http-health/.runwhen/templates/k8s-airflow-http-health-taskset.yaml b/codebundles/k8s-airflow-http-health/.runwhen/templates/k8s-airflow-http-health-taskset.yaml new file mode 100644 index 00000000..39c44f3c --- /dev/null +++ b/codebundles/k8s-airflow-http-health/.runwhen/templates/k8s-airflow-http-health-taskset.yaml @@ -0,0 +1,49 @@ +apiVersion: runwhen.com/v1 +kind: Runbook +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + location: {{default_location}} + description: Airflow webserver HTTP/API health and Kubernetes Service correlation for {{match_resource.resource.metadata.name}}. 
+ codeBundle: + {% if repo_url %} + repoUrl: {{repo_url}} + {% else %} + repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git + {% endif %} + {% if ref %} + ref: {{ref}} + {% else %} + ref: main + {% endif %} + pathToRobot: codebundles/k8s-airflow-http-health/runbook.robot + configProvided: + - name: CONTEXT + value: "{{context}}" + - name: NAMESPACE + value: "{{match_resource.resource.metadata.namespace}}" + - name: AIRFLOW_WEBSERVER_SERVICE_NAME + value: "{{match_resource.resource.metadata.name}}" + - name: PROXY_BASE_URL + value: "" + - name: AIRFLOW_HTTP_PORT + value: "8080" + - name: AIRFLOW_SCHEDULER_SERVICE_NAME + value: "" + - name: AIRFLOW_TRIGGERER_SERVICE_NAME + value: "" + - name: KUBERNETES_DISTRIBUTION_BINARY + value: "{{custom.kubernetes_distribution_binary | default('kubectl')}}" + secretsProvided: + {% if wb_version %} + {% include "kubernetes-auth.yaml" ignore missing %} + {% else %} + - name: kubeconfig + workspaceKey: {{custom.kubeconfig_secret_name | default("kubeconfig")}} + - name: airflow_api_credentials + workspaceKey: {{custom.airflow_api_credentials_secret_name | default("airflow_api_credentials")}} + {% endif %} diff --git a/codebundles/k8s-airflow-http-health/.test/Taskfile.yaml b/codebundles/k8s-airflow-http-health/.test/Taskfile.yaml new file mode 100644 index 00000000..90af747b --- /dev/null +++ b/codebundles/k8s-airflow-http-health/.test/Taskfile.yaml @@ -0,0 +1,167 @@ +version: "3" + +tasks: + default: + desc: "Run/refresh config and RunWhen Local discovery" + cmds: + - task: check-unpushed-commits + - task: generate-rwl-config + - task: run-rwl-discovery + + clean: + desc: "Run cleanup tasks" + cmds: + - task: remove-kubernetes-objects + - task: delete-slxs + - task: clean-rwl-discovery + + build-infra: + desc: "Build test infrastructure" + cmds: + - task: create-kubernetes-objects + + create-kubernetes-objects: + desc: "Apply manifests from kubernetes directory using kubectl" + cmds: + - kubectl apply -f 
kubernetes/ + silent: true + + remove-kubernetes-objects: + desc: "Delete kubernetes objects" + cmds: + - kubectl delete -f kubernetes/ --ignore-not-found=true + silent: true + + check-unpushed-commits: + desc: Check if outstanding commits or file updates need to be pushed before testing. + vars: + BASE_DIR: "../" + cmds: + - | + echo "Checking for uncommitted changes in $BASE_DIR and $BASE_DIR.runwhen, excluding '.test'..." + UNCOMMITTED_FILES=$(git diff --name-only HEAD | grep -E "^${BASE_DIR}(\.runwhen|[^/]+)" | grep -v "/\.test/" || true) + if [ -n "$UNCOMMITTED_FILES" ]; then + echo "Uncommitted changes found:" + echo "$UNCOMMITTED_FILES" + exit 1 + fi + - | + git fetch origin + UNPUSHED_FILES=$(git diff --name-only origin/$(git rev-parse --abbrev-ref HEAD) HEAD | grep -E "^${BASE_DIR}(\.runwhen|[^/]+)" | grep -v "/\.test/" || true) + if [ -n "$UNPUSHED_FILES" ]; then + echo "Unpushed commits found:" + echo "$UNPUSHED_FILES" + exit 1 + fi + silent: true + + generate-rwl-config: + desc: "Generate RunWhen Local configuration (workspaceInfo.yaml)" + env: + RW_WORKSPACE: '{{.RW_WORKSPACE | default "my-workspace"}}' + cmds: + - | + repo_url=$(git config --get remote.origin.url) + branch_name=$(git rev-parse --abbrev-ref HEAD) + codebundle=$(basename "$(dirname "$PWD")") + namespace=$(yq e 'select(.kind == "Namespace") | .metadata.name' kubernetes/manifest.yaml -N) + if [ -z "$namespace" ]; then + echo "Could not determine namespace from kubernetes/manifest.yaml" + exit 1 + fi + + cat < workspaceInfo.yaml + workspaceName: "$RW_WORKSPACE" + workspaceOwnerEmail: authors@runwhen.com + defaultLocation: location-01 + defaultLOD: none + cloudConfig: + kubernetes: + kubeconfigFile: /shared/kubeconfig.secret + namespaceLODs: + $namespace: detailed + namespaces: + - $namespace + codeCollections: + - repoURL: "$repo_url" + branch: "$branch_name" + codeBundles: ["$codebundle"] + custom: + kubeconfig_secret_name: "kubeconfig" + kubernetes_distribution_binary: kubectl + 
airflow_api_credentials_secret_name: "airflow_api_credentials" + EOF + echo "Generated workspaceInfo.yaml (namespace=$namespace, codebundle=$codebundle)" + silent: true + + run-rwl-discovery: + desc: "Run RunWhen Local Discovery on test infrastructure" + cmds: + - | + CONTAINER_NAME="RunWhenLocal" + + if [ ! -f "kubeconfig.secret" ]; then + echo "Missing kubeconfig.secret in $(pwd)." + echo "Drop a kubeconfig file named 'kubeconfig.secret' into this directory (pointing at the test cluster) and re-run." + exit 1 + fi + + if docker ps -q --filter "name=$CONTAINER_NAME" | grep -q .; then + echo "Stopping and removing existing container $CONTAINER_NAME..." + docker stop $CONTAINER_NAME && docker rm $CONTAINER_NAME + elif docker ps -a -q --filter "name=$CONTAINER_NAME" | grep -q .; then + echo "Removing existing stopped container $CONTAINER_NAME..." + docker rm $CONTAINER_NAME + else + echo "No existing container named $CONTAINER_NAME found." + fi + + echo "Cleaning up output directory..." + sudo rm -rf output || { echo "Failed to remove output directory"; exit 1; } + mkdir -p output && chmod 777 output || { echo "Failed to set permissions on output directory"; exit 1; } + + echo "Starting new container $CONTAINER_NAME..." + docker run --name $CONTAINER_NAME -p 8081:8081 -v "$(pwd)":/shared -d ghcr.io/runwhen-contrib/runwhen-local:latest || { + echo "Failed to start container"; exit 1; + } + + echo "Running workspace builder script in container..." 
+ docker exec -w /workspace-builder $CONTAINER_NAME ./run.sh $1 --verbose || { + echo "Error executing workspace builder script in container"; exit 1; + } + + echo "Review generated config files under output/workspaces/" + silent: true + + validate-generation-rules: + desc: "Validate YAML files in .runwhen/generation-rules" + cmds: + - | + for cmd in curl yq ajv; do + command -v $cmd >/dev/null || { echo "Missing $cmd"; exit 1; } + done + temp_dir=$(mktemp -d) + curl -s -o "$temp_dir/generation-rule-schema.json" \ + https://raw.githubusercontent.com/runwhen-contrib/runwhen-local/refs/heads/main/src/generation-rule-schema.json + for yaml_file in ../.runwhen/generation-rules/*.yaml; do + json_file="$temp_dir/$(basename "${yaml_file%.*}.json")" + yq -o=json "$yaml_file" > "$json_file" + ajv validate -s "$temp_dir/generation-rule-schema.json" -d "$json_file" --spec=draft2020 --strict=false + done + rm -rf "$temp_dir" + silent: true + + delete-slxs: + desc: "No-op placeholder (optional platform upload flow)" + cmds: + - | + echo "Optional - configure RW_WORKSPACE, RW_API, RW_PAT for platform upload flows." 
+ silent: true + + clean-rwl-discovery: + desc: "Clean RunWhen Local discovery output" + cmds: + - | + sudo rm -rf output || rm -rf output + rm -f workspaceInfo.yaml + silent: true diff --git a/codebundles/k8s-airflow-http-health/.test/kubernetes/manifest.yaml b/codebundles/k8s-airflow-http-health/.test/kubernetes/manifest.yaml new file mode 100644 index 00000000..c9094e77 --- /dev/null +++ b/codebundles/k8s-airflow-http-health/.test/kubernetes/manifest.yaml @@ -0,0 +1,77 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: test-airflow-http-health + +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: fake-airflow-http + namespace: test-airflow-http-health +data: + default.conf: | + server { + listen 8080; + location /health { + default_type application/json; + return 200 '{"metadatabase":{"status":"healthy"},"scheduler":{"status":"healthy"},"triggerer":{"status":null},"dag_processor":{"status":null}}'; + } + location /api/v1/version { + default_type application/json; + return 200 '{"version":"2.8.0","git_version":""}'; + } + location /api/v1/health { + default_type application/json; + return 200 '{"metadatabase":{"status":"healthy"}}'; + } + location /api/v2/monitor/health { + default_type application/json; + return 404 '{}'; + } + } + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: fake-airflow-webserver + namespace: test-airflow-http-health +spec: + replicas: 1 + selector: + matchLabels: + app: fake-airflow-webserver + template: + metadata: + labels: + app: fake-airflow-webserver + spec: + containers: + - name: nginx + image: nginx:1.25-alpine + ports: + - containerPort: 8080 + volumeMounts: + - name: cfg + mountPath: /etc/nginx/conf.d + volumes: + - name: cfg + configMap: + name: fake-airflow-http + +--- +apiVersion: v1 +kind: Service +metadata: + name: airflow-webserver + namespace: test-airflow-http-health + labels: + app.kubernetes.io/name: airflow +spec: + selector: + app: fake-airflow-webserver + ports: + - port: 8080 + name: http + 
protocol: TCP diff --git a/codebundles/k8s-airflow-http-health/README.md b/codebundles/k8s-airflow-http-health/README.md new file mode 100644 index 00000000..e7cd2363 --- /dev/null +++ b/codebundles/k8s-airflow-http-health/README.md @@ -0,0 +1,60 @@ +# Kubernetes Airflow HTTP/API Health + +This CodeBundle checks Apache Airflow webserver availability using HTTP GET probes against the webserver `/health` endpoint and read-only REST routes, then correlates results with Kubernetes Service and Endpoints objects. Optional checks target scheduler or triggerer Services when those names are configured. + +## Overview + +- **Connectivity**: Resolves `PROXY_BASE_URL` or uses `kubectl port-forward` to the webserver Service +- **Webserver health**: Validates `/health` JSON (metadata DB, scheduler, and optional components when reported) +- **REST API**: Probes `/api/v1/health`, `/api/v2/monitor/health`, or `/api/v1/version` with optional credentials +- **Kubernetes context**: Confirms the webserver Service exists, has Endpoints, and port alignment +- **Optional tiers**: When `AIRFLOW_SCHEDULER_SERVICE_NAME` or `AIRFLOW_TRIGGERER_SERVICE_NAME` is set, performs lightweight HTTP attempts (many charts do not expose HTTP here) + +## Configuration + +### Required variables + +These are imported via `RW.Core.Import User Variable` in `runbook.robot`: + +- `CONTEXT`: Kubernetes context name for `kubectl` and port-forward +- `NAMESPACE`: Namespace where Airflow runs +- `AIRFLOW_WEBSERVER_SERVICE_NAME`: Kubernetes Service name for the Airflow webserver + +### Optional variables + +- `PROXY_BASE_URL`: Full HTTP base URL for the web UI (for example `http://airflow-webserver.my-ns.svc.cluster.local:8080`). 
Leave empty to use automatic `kubectl port-forward` to the Service +- `AIRFLOW_HTTP_PORT`: Service port for the web UI/API (default: `8080`) +- `AIRFLOW_SCHEDULER_SERVICE_NAME`: Optional Service name for extra HTTP checks +- `AIRFLOW_TRIGGERER_SERVICE_NAME`: Optional Service name for Airflow 2 triggerer HTTP checks +- `KUBERNETES_DISTRIBUTION_BINARY`: `kubectl` or `oc` (default: `kubectl`) + +### Secrets + +- `kubeconfig`: Standard kubeconfig used for `kubectl` and optional port-forward +- `airflow_api_credentials`: Optional JSON for authenticated REST routes, for example `{"token":"..."}` or `{"username":"admin","password":"..."}` + +## Tasks overview + +### Resolve Airflow Webserver Base URL + +Validates that either `PROXY_BASE_URL` responds on `GET /health` or that port-forward to `AIRFLOW_WEBSERVER_SERVICE_NAME` works. Surfaces connectivity and early misconfiguration issues. + +### Check Airflow Webserver Health Endpoint + +Calls `GET /health` and inspects JSON status fields (for example `metadatabase`, `scheduler`) when present, flagging unhealthy or missing states. + +### Check Airflow REST API Health or Version + +Tries read-only API paths in order (version-dependent across Airflow 2.x and newer). Uses `airflow_api_credentials` when the API rejects anonymous access. + +### Verify Kubernetes Service and Endpoints for Webserver + +Uses `kubectl` to confirm the Service exists, has ready Endpoints, and exposes the expected `AIRFLOW_HTTP_PORT`. + +### Optional Check Scheduler or Triggerer HTTP Health + +If optional Service names are set, checks endpoints and attempts HTTP on the first Service port or documents that no HTTP listener responded (common when charts do not expose HTTP on these tiers). + +## SLI + +`sli.robot` aggregates three binary dimensions (webserver `/health`, API reachability, Service existence) into a single 0–1 score for periodic monitoring. 
diff --git a/codebundles/k8s-airflow-http-health/_airflow_http_portforward_helper.sh b/codebundles/k8s-airflow-http-health/_airflow_http_portforward_helper.sh new file mode 100755 index 00000000..4ae0b2cf --- /dev/null +++ b/codebundles/k8s-airflow-http-health/_airflow_http_portforward_helper.sh @@ -0,0 +1,106 @@ +#!/usr/bin/env bash +# ----------------------------------------------------------------------------- +# Shared helper: ensure PROXY_BASE_URL is reachable for Airflow HTTP checks. +# +# If PROXY_BASE_URL is non-empty, use it as-is. +# Otherwise start kubectl port-forward to svc/${AIRFLOW_WEBSERVER_SERVICE_NAME} +# and export PROXY_BASE_URL=http://127.0.0.1:. +# +# Required for port-forward: +# CONTEXT, NAMESPACE, AIRFLOW_WEBSERVER_SERVICE_NAME +# Optional: +# AIRFLOW_HTTP_PORT (default: 8080) +# AIRFLOW_LOCAL_PORT (ephemeral if unset) +# KUBERNETES_DISTRIBUTION_BINARY (default: kubectl) +# AIRFLOW_PF_WAIT_SECS (default: 15) +# +# Exports: +# PROXY_BASE_URL +# AIRFLOW_PF_PID +# ----------------------------------------------------------------------------- + +_airflow_pick_free_port() { + python3 - <<'PY' 2>/dev/null || echo "" +import socket +s = socket.socket() +s.bind(("127.0.0.1", 0)) +print(s.getsockname()[1]) +s.close() +PY +} + +_airflow_wait_for_port() { + local host="$1" port="$2" max="${3:-15}" + local i=0 + while (( i < max )); do + if (exec 3<>"/dev/tcp/${host}/${port}") 2>/dev/null; then + exec 3>&- 3<&- 2>/dev/null || true + return 0 + fi + sleep 1 + i=$((i + 1)) + done + return 1 +} + +ensure_airflow_proxy_base_url() { + if [[ -n "${PROXY_BASE_URL:-}" ]]; then + export PROXY_BASE_URL + return 0 + fi + + : "${CONTEXT:?PROXY_BASE_URL empty and CONTEXT not set — cannot start port-forward}" + : "${NAMESPACE:?PROXY_BASE_URL empty and NAMESPACE not set — cannot start port-forward}" + : "${AIRFLOW_WEBSERVER_SERVICE_NAME:?PROXY_BASE_URL empty and AIRFLOW_WEBSERVER_SERVICE_NAME not set — cannot start port-forward}" + + local 
kbin="${KUBERNETES_DISTRIBUTION_BINARY:-kubectl}" + local remote_port="${AIRFLOW_HTTP_PORT:-8080}" + local local_port="${AIRFLOW_LOCAL_PORT:-}" + local wait_secs="${AIRFLOW_PF_WAIT_SECS:-15}" + + if ! command -v "$kbin" >/dev/null 2>&1; then + echo "ERROR: PROXY_BASE_URL not set and ${kbin} not found on PATH; cannot establish port-forward." >&2 + return 1 + fi + + if [[ -z "$local_port" ]]; then + local_port="$(_airflow_pick_free_port)" + if [[ -z "$local_port" ]]; then + local_port="$remote_port" + fi + fi + + echo "PROXY_BASE_URL not provided; starting ${kbin} port-forward to svc/${AIRFLOW_WEBSERVER_SERVICE_NAME} ${local_port}:${remote_port} in ns ${NAMESPACE} (context ${CONTEXT})." + + local pf_log + pf_log="$(mktemp)" + "$kbin" --context "$CONTEXT" -n "$NAMESPACE" port-forward "svc/${AIRFLOW_WEBSERVER_SERVICE_NAME}" "${local_port}:${remote_port}" \ + >"$pf_log" 2>&1 & + AIRFLOW_PF_PID=$! + export AIRFLOW_PF_PID + + trap '_airflow_cleanup_portforward' EXIT INT TERM + + if ! _airflow_wait_for_port "127.0.0.1" "$local_port" "$wait_secs"; then + echo "ERROR: kubectl port-forward did not become ready within ${wait_secs}s." 
>&2 + echo "port-forward log (truncated):" >&2 + head -c 2000 "$pf_log" >&2 || true + rm -f "$pf_log" || true + _airflow_cleanup_portforward + return 1 + fi + + rm -f "$pf_log" || true + + export PROXY_BASE_URL="http://127.0.0.1:${local_port}" + echo "PROXY_BASE_URL=${PROXY_BASE_URL} (via port-forward pid ${AIRFLOW_PF_PID})" + return 0 +} + +_airflow_cleanup_portforward() { + if [[ -n "${AIRFLOW_PF_PID:-}" ]] && kill -0 "$AIRFLOW_PF_PID" 2>/dev/null; then + kill "$AIRFLOW_PF_PID" 2>/dev/null || true + wait "$AIRFLOW_PF_PID" 2>/dev/null || true + fi + AIRFLOW_PF_PID="" +} diff --git a/codebundles/k8s-airflow-http-health/check-airflow-api-health.sh b/codebundles/k8s-airflow-http-health/check-airflow-api-health.sh new file mode 100755 index 00000000..112737dd --- /dev/null +++ b/codebundles/k8s-airflow-http-health/check-airflow-api-health.sh @@ -0,0 +1,77 @@ +#!/usr/bin/env bash +set -euo pipefail +# ----------------------------------------------------------------------------- +# Probes read-only REST routes: /api/v1/health, /api/v2/monitor/health, /api/v1/version +# Optional AIRFLOW_API_CREDENTIALS JSON for Bearer or basic auth. +# ----------------------------------------------------------------------------- +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck disable=SC1091 +source "${SCRIPT_DIR}/_airflow_http_portforward_helper.sh" +ensure_airflow_proxy_base_url + +OUTPUT_FILE="${OUTPUT_FILE:-check_airflow_api_health_issues.json}" +issues_json='[]' +BASE_URL="${PROXY_BASE_URL%/}" +MAX_TIME="${CURL_MAX_TIME:-25}" + +curl_auth=() +if [[ -n "${AIRFLOW_API_CREDENTIALS:-}" ]] && echo "${AIRFLOW_API_CREDENTIALS}" | jq -e . 
>/dev/null 2>&1; then + token=$(echo "${AIRFLOW_API_CREDENTIALS}" | jq -r '.token // .bearer_token // empty') + user=$(echo "${AIRFLOW_API_CREDENTIALS}" | jq -r '.username // .user // empty') + password=$(echo "${AIRFLOW_API_CREDENTIALS}" | jq -r '.password // empty') + if [[ -n "$token" && "$token" != "null" ]]; then + curl_auth=(-H "Authorization: Bearer ${token}") + elif [[ -n "$user" && "$user" != "null" && -n "$password" && "$password" != "null" ]]; then + curl_auth=(-u "${user}:${password}") + fi +fi + +try_paths=( + "/api/v1/health" + "/api/v2/monitor/health" + "/api/v1/version" +) + +ok=0 +last_code="000" +last_path="" +for path in "${try_paths[@]}"; do + tmpf=$(mktemp) + c=$(curl -sS --max-time "$MAX_TIME" "${curl_auth[@]}" -o "$tmpf" -w "%{http_code}" "${BASE_URL}${path}" 2>/dev/null || echo "000") + body=$(cat "$tmpf" || true) + rm -f "$tmpf" + last_code="$c" + last_path="$path" + echo "GET ${BASE_URL}${path} -> HTTP ${c}" + preview=$(echo "$body" | head -c 200 | tr '\n' ' ') + [[ -n "$preview" ]] && echo " body: ${preview}" + + if [[ "$c" == "200" ]]; then + ok=1 + break + fi + # Unauthenticated caller: 401/403 indicates the API router is responding + if [[ "$c" == "401" || "$c" == "403" ]] && [[ ${#curl_auth[@]} -eq 0 ]]; then + echo "API responded with ${c} (auth required). Configure airflow_api_credentials to probe authenticated routes." + ok=1 + break + fi +done + +if [[ "$ok" -eq 0 ]]; then + issues_json=$(echo "$issues_json" | jq \ + --arg title "Airflow REST API probe did not succeed" \ + --arg details "Tried ${try_paths[*]} on ${BASE_URL}. Last: ${last_path} HTTP ${last_code}. Supply AIRFLOW_API_CREDENTIALS JSON if RBAC blocks anonymous access." \ + --argjson severity 2 \ + --arg next_steps "Verify Airflow version and API path, enable REST API, and add token or basic-auth credentials if required." \ + '. += [{ + "title": $title, + "details": $details, + "severity": $severity, + "next_steps": $next_steps + }]') +fi + +echo "$issues_json" | jq . 
>"$OUTPUT_FILE" +echo "API health check complete. Issues written to $OUTPUT_FILE" +cat "$OUTPUT_FILE" diff --git a/codebundles/k8s-airflow-http-health/check-airflow-scheduler-http-health.sh b/codebundles/k8s-airflow-http-health/check-airflow-scheduler-http-health.sh new file mode 100755 index 00000000..d563ff40 --- /dev/null +++ b/codebundles/k8s-airflow-http-health/check-airflow-scheduler-http-health.sh @@ -0,0 +1,112 @@ +#!/usr/bin/env bash +set -euo pipefail +# ----------------------------------------------------------------------------- +# Optional HTTP GETs for scheduler/triggerer Services when names are set. +# Uses the first TCP port on each Service. If no route returns HTTP 200, +# logs an informational note (many charts do not expose HTTP on these tiers). +# ----------------------------------------------------------------------------- +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck disable=SC1091 +source "${SCRIPT_DIR}/_airflow_http_portforward_helper.sh" + +: "${CONTEXT:?Must set CONTEXT}" +: "${NAMESPACE:?Must set NAMESPACE}" + +OUTPUT_FILE="${OUTPUT_FILE:-check_airflow_scheduler_http_issues.json}" +issues_json='[]' +KBIN="${KUBERNETES_DISTRIBUTION_BINARY:-kubectl}" +MAX_TIME="${CURL_MAX_TIME:-12}" + +check_optional_service() { + local svc_name="$1" + local role_label="$2" + if [[ -z "$svc_name" ]]; then + echo "${role_label}: service name not set — skipping." + return 0 + fi + + echo "--- Optional ${role_label}: svc/${svc_name} ---" + + if ! svc_json=$("$KBIN" get svc "$svc_name" -n "$NAMESPACE" --context "$CONTEXT" -o json 2>/dev/null); then + issues_json=$(echo "$issues_json" | jq \ + --arg title "Optional ${role_label} Service \`${svc_name}\` not found" \ + --arg details "Service name was set but kubectl get svc failed." \ + --argjson severity 3 \ + --arg next_steps "Fix the Service name or clear AIRFLOW_SCHEDULER_SERVICE_NAME / AIRFLOW_TRIGGERER_SERVICE_NAME if this tier is not in use." \ + '. 
+= [{ + "title": $title, + "details": $details, + "severity": $severity, + "next_steps": $next_steps + }]') + return 0 + fi + + ep_addrs=$("$KBIN" get endpoints "$svc_name" -n "$NAMESPACE" --context "$CONTEXT" -o json 2>/dev/null \ + | jq '[.subsets[]?.addresses[]?] | length' 2>/dev/null || echo 0) + echo "Endpoints ready addresses: ${ep_addrs}" + if [[ "${ep_addrs:-0}" -eq 0 ]]; then + issues_json=$(echo "$issues_json" | jq \ + --arg title "Optional ${role_label} Service \`${svc_name}\` has no ready endpoints" \ + --arg details "No backing addresses; HTTP checks against this tier will not work until Pods register." \ + --argjson severity 3 \ + --arg next_steps "Inspect workloads, selectors, and readiness for ${svc_name}." \ + '. += [{ + "title": $title, + "details": $details, + "severity": $severity, + "next_steps": $next_steps + }]') + return 0 + fi + + remote_port=$(echo "$svc_json" | jq -r '.spec.ports[0].port // empty') + if [[ -z "$remote_port" || "$remote_port" == "null" ]]; then + echo "No Service ports; skipping HTTP probe." + return 0 + fi + + export AIRFLOW_WEBSERVER_SERVICE_NAME="$svc_name" + export AIRFLOW_HTTP_PORT="$remote_port" + unset PROXY_BASE_URL + + if ! ensure_airflow_proxy_base_url; then + issues_json=$(echo "$issues_json" | jq \ + --arg title "Cannot port-forward to optional ${role_label} Service \`${svc_name}\`" \ + --arg details "kubectl port-forward failed for ${svc_name}:${remote_port}." \ + --argjson severity 4 \ + --arg next_steps "Verify RBAC for port-forward to this Service." \ + '. 
+= [{ + "title": $title, + "details": $details, + "severity": $severity, + "next_steps": $next_steps + }]') + _airflow_cleanup_portforward 2>/dev/null || true + return 0 + fi + + BASE_URL="${PROXY_BASE_URL%/}" + local found=0 + for path in "/health" "/metrics" "/"; do + code=$(curl -sS --max-time "$MAX_TIME" -o /dev/null -w "%{http_code}" "${BASE_URL}${path}" 2>/dev/null || echo "000") + echo "GET ${BASE_URL}${path} -> HTTP ${code}" + if [[ "$code" == "200" ]]; then + found=1 + break + fi + done + + if [[ "$found" -eq 0 ]]; then + echo "NOTE: No HTTP 200 from /health, /metrics, or / on port ${remote_port}. Many Helm charts do not expose an HTTP listener here; this is informational." + fi + + _airflow_cleanup_portforward 2>/dev/null || true +} + +check_optional_service "${AIRFLOW_SCHEDULER_SERVICE_NAME:-}" "scheduler" +check_optional_service "${AIRFLOW_TRIGGERER_SERVICE_NAME:-}" "triggerer" + +echo "$issues_json" | jq . >"$OUTPUT_FILE" +echo "Optional scheduler/triggerer HTTP check complete. Issues written to $OUTPUT_FILE" +cat "$OUTPUT_FILE" diff --git a/codebundles/k8s-airflow-http-health/check-airflow-webserver-health.sh b/codebundles/k8s-airflow-http-health/check-airflow-webserver-health.sh new file mode 100755 index 00000000..bd66dcf3 --- /dev/null +++ b/codebundles/k8s-airflow-http-health/check-airflow-webserver-health.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash +set -euo pipefail +# ----------------------------------------------------------------------------- +# GET /health and evaluate Airflow 2.x-style JSON (metadatabase, scheduler, ...). 
+# -----------------------------------------------------------------------------
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck disable=SC1091
+source "${SCRIPT_DIR}/_airflow_http_portforward_helper.sh"
+# Resolves PROXY_BASE_URL, starting a kubectl port-forward when it is unset.
+ensure_airflow_proxy_base_url
+
+OUTPUT_FILE="${OUTPUT_FILE:-check_airflow_webserver_health_issues.json}"
+issues_json='[]'
+# Trim any trailing slash so "${BASE_URL}/health" never doubles the separator.
+BASE_URL="${PROXY_BASE_URL%/}"
+MAX_TIME="${CURL_MAX_TIME:-25}"
+
+# GET /health; "000" is the sentinel for a curl transport-level failure.
+tmpf=$(mktemp)
+code=$(curl -sS --max-time "$MAX_TIME" -o "$tmpf" -w "%{http_code}" "${BASE_URL}/health" 2>/dev/null || echo "000")
+raw=$(cat "$tmpf" || true)
+rm -f "$tmpf"
+
+echo "GET ${BASE_URL}/health -> HTTP ${code}"
+
+# Non-200: record one issue and exit 0 -- findings travel through OUTPUT_FILE,
+# not the exit status, so the calling Robot task can always parse the JSON.
+if [[ "$code" != "200" ]]; then
+  issues_json=$(echo "$issues_json" | jq \
+    --arg title "Airflow webserver /health HTTP failure" \
+    --arg details "Expected HTTP 200 from ${BASE_URL}/health. Got HTTP ${code}." \
+    --argjson severity 3 \
+    --arg next_steps "Check webserver Pods, readiness probes, and whether PROXY_BASE_URL or port-forward targets the correct Service port." \
+    '. += [{
+      "title": $title,
+      "details": $details,
+      "severity": $severity,
+      "next_steps": $next_steps
+    }]')
+  echo "$issues_json" | jq . >"$OUTPUT_FILE"
+  cat "$OUTPUT_FILE"
+  exit 0
+fi
+
+# HTTP 200 but not JSON: likely an ingress/proxy error page in front of Airflow.
+if ! echo "$raw" | jq -e . >/dev/null 2>&1; then
+  issues_json=$(echo "$issues_json" | jq \
+    --arg title "Airflow /health body is not valid JSON" \
+    --arg details "Response preview: $(echo "$raw" | head -c 500 | tr '\n' ' ')" \
+    --argjson severity 3 \
+    --arg next_steps "Confirm you are hitting the Airflow webserver and not a proxy error page." \
+    '. += [{
+      "title": $title,
+      "details": $details,
+      "severity": $severity,
+      "next_steps": $next_steps
+    }]')
+  echo "$issues_json" | jq . >"$OUTPUT_FILE"
+  cat "$OUTPUT_FILE"
+  exit 0
+fi
+
+# Critical subsystems: metadatabase must be healthy when present; scheduler when status is non-null
+# Appends to the global issues_json when the named component's .status in the
+# /health JSON is present but not "healthy"; absent components are treated as
+# optional (not deployed) and only logged.
+_check_component() {
+  local key="$1"
+  local label="$2"
+  local status
+  status=$(echo "$raw" | jq -r --arg k "$key" '.[$k].status // empty')
+  if [[ -z "$status" || "$status" == "null" ]]; then
+    echo "Component ${label}: not reported (optional or not deployed)"
+    return 0
+  fi
+  if [[ "$status" != "healthy" ]]; then
+    issues_json=$(echo "$issues_json" | jq \
+      --arg title "Airflow ${label} reports unhealthy in /health JSON" \
+      --arg details "${label} status=${status} (from ${BASE_URL}/health)" \
+      --argjson severity 3 \
+      --arg next_steps "Inspect ${label} Pods/logs and related dependencies (for example metadata DB for metadatabase, scheduler Pods for scheduler)." \
+      '. += [{
+        "title": $title,
+        "details": $details,
+        "severity": $severity,
+        "next_steps": $next_steps
+      }]')
+  else
+    echo "Component ${label}: healthy"
+  fi
+}
+
+_check_component "metadatabase" "metadatabase"
+_check_component "scheduler" "scheduler"
+_check_component "triggerer" "triggerer"
+_check_component "dag_processor" "DAG processor"
+
+echo "$issues_json" | jq . >"$OUTPUT_FILE"
+echo "Webserver health check complete. Issues written to $OUTPUT_FILE"
+cat "$OUTPUT_FILE"
diff --git a/codebundles/k8s-airflow-http-health/resolve-airflow-base-url.sh b/codebundles/k8s-airflow-http-health/resolve-airflow-base-url.sh
new file mode 100755
index 00000000..8e501c0a
--- /dev/null
+++ b/codebundles/k8s-airflow-http-health/resolve-airflow-base-url.sh
@@ -0,0 +1,75 @@
+#!/usr/bin/env bash
+set -euo pipefail
+# -----------------------------------------------------------------------------
+# Validates PROXY_BASE_URL or establishes kubectl port-forward, then verifies
+# the webserver responds on GET /health (read-only smoke check).
+# Writes issues JSON to OUTPUT_FILE (default: resolve_airflow_base_url_issues.json).
+# -----------------------------------------------------------------------------
+: "${CONTEXT:?Must set CONTEXT}"
+: "${NAMESPACE:?Must set NAMESPACE}"
+: "${AIRFLOW_WEBSERVER_SERVICE_NAME:?Must set AIRFLOW_WEBSERVER_SERVICE_NAME}"
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck disable=SC1091
+source "${SCRIPT_DIR}/_airflow_http_portforward_helper.sh"
+
+OUTPUT_FILE="${OUTPUT_FILE:-resolve_airflow_base_url_issues.json}"
+issues_json='[]'
+MAX_TIME="${CURL_MAX_TIME:-20}"
+
+# Helper returns non-zero when PROXY_BASE_URL is unset and the port-forward
+# could not be established; report that as an issue and exit 0 (findings are
+# conveyed via OUTPUT_FILE, not the exit status).
+if ! ensure_airflow_proxy_base_url; then
+  issues_json=$(echo "$issues_json" | jq \
+    --arg title "Cannot reach Airflow webserver base URL for \`${AIRFLOW_WEBSERVER_SERVICE_NAME}\`" \
+    --arg details "PROXY_BASE_URL was unset and kubectl port-forward to the Service failed or kubectl is unavailable." \
+    --argjson severity 3 \
+    --arg next_steps "Verify kubeconfig RBAC for port-forward, Service name and namespace, and that the webserver Pod is running." \
+    '. += [{
+      "title": $title,
+      "details": $details,
+      "severity": $severity,
+      "next_steps": $next_steps
+    }]')
+  echo "$issues_json" | jq . >"$OUTPUT_FILE"
+  cat "$OUTPUT_FILE"
+  exit 0
+fi
+
+BASE_URL="${PROXY_BASE_URL%/}"
+tmpf=$(mktemp)
+code=$(curl -sS --max-time "$MAX_TIME" -o "$tmpf" -w "%{http_code}" "${BASE_URL}/health" 2>/dev/null || echo "000")
+# NOTE(review): body is truncated to 400 bytes for the preview, and the same
+# truncated string feeds the JSON-validity check below -- a valid /health
+# payload longer than 400 bytes would be flagged as non-JSON; confirm intended.
+body=$(head -c 400 "$tmpf" || true)
+rm -f "$tmpf"
+
+echo "GET ${BASE_URL}/health -> HTTP ${code}"
+
+if [[ "$code" != "200" ]]; then
+  issues_json=$(echo "$issues_json" | jq \
+    --arg title "Airflow webserver /health not reachable at resolved base URL" \
+    --arg details "Expected HTTP 200 from ${BASE_URL}/health after resolving PROXY_BASE_URL. Got HTTP ${code}. Body preview: ${body}" \
+    --argjson severity 3 \
+    --arg next_steps "Confirm the Airflow web UI is up, Ingress or Service port matches AIRFLOW_HTTP_PORT, and network policy allows the runner to reach the Service." \
+    '. += [{
+      "title": $title,
+      "details": $details,
+      "severity": $severity,
+      "next_steps": $next_steps
+    }]')
+fi
+
+# 200 but non-JSON body is a softer (severity 2) signal: probably a proxy or
+# ingress interposing rather than the webserver itself.
+if [[ "$code" == "200" ]] && ! jq -e . >/dev/null 2>&1 <<<"$body"; then
+  issues_json=$(echo "$issues_json" | jq \
+    --arg title "Airflow /health returned non-JSON response" \
+    --arg details "Expected JSON from ${BASE_URL}/health. Preview: ${body}" \
+    --argjson severity 2 \
+    --arg next_steps "Verify this is the Airflow webserver and not an ingress error page." \
+    '. += [{
+      "title": $title,
+      "details": $details,
+      "severity": $severity,
+      "next_steps": $next_steps
+    }]')
+fi
+
+echo "$issues_json" | jq . >"$OUTPUT_FILE"
+echo "Base URL resolution complete. Issues written to $OUTPUT_FILE"
+cat "$OUTPUT_FILE"
diff --git a/codebundles/k8s-airflow-http-health/runbook.robot b/codebundles/k8s-airflow-http-health/runbook.robot
new file mode 100644
index 00000000..bad69013
--- /dev/null
+++ b/codebundles/k8s-airflow-http-health/runbook.robot
@@ -0,0 +1,269 @@
+*** Settings ***
+Documentation       Exposes Apache Airflow webserver (and optionally scheduler/triggerer) health through HTTP probes, REST checks, and kubectl Service/Endpoints correlation.
+Metadata            Author    rw-codebundle-agent
+Metadata            Display Name    Kubernetes Airflow HTTP/API Health
+Metadata            Supports    Kubernetes AKS EKS GKE OpenShift Airflow HTTP
+
+Force Tags          Kubernetes    Airflow    HTTP    webserver    health
+
+Library             String
+Library             BuiltIn
+Library             RW.Core
+Library             RW.CLI
+Library             RW.platform
+
+Suite Setup         Suite Initialization
+
+
+*** Tasks ***
+Resolve Airflow Webserver Base URL for `${AIRFLOW_WEBSERVER_SERVICE_NAME}`
+    [Documentation]    Confirms PROXY_BASE_URL or kubectl port-forward can reach the webserver /health endpoint before deeper checks run.
+    [Tags]    Kubernetes    Airflow    connectivity    access:read-only    data:logs-config
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=resolve-airflow-base-url.sh
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
+    ...    timeout_seconds=180
+    ...
include_in_history=false
+    ...    show_in_rwl_cheatsheet=true
+    ...    cmd_override=./resolve-airflow-base-url.sh
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat resolve_airflow_base_url_issues.json
+    ...    timeout_seconds=30
+    # Parse the issues file written by the script; degrade to an empty list on
+    # parse failure so the task still reports instead of erroring out.
+    TRY
+        ${issue_list}=    Evaluate    json.loads(r'''${issues.stdout}''')    json
+    EXCEPT
+        Log    Failed to parse JSON for resolve base URL task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+    # NOTE(review): `len(@{issue_list})` splats list items into the expression;
+    # the conventional Robot Framework form is `len($issue_list)` -- confirm
+    # behavior on an empty list. The same pattern repeats in the tasks below.
+    IF    len(@{issue_list}) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=Airflow webserver should be reachable at PROXY_BASE_URL or via port-forward
+            ...    actual=${issue['details']}
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+    RW.Core.Add Pre To Report    Resolve Airflow base URL results:\n${result.stdout}
+
+Check Airflow Webserver Health Endpoint for `${AIRFLOW_WEBSERVER_SERVICE_NAME}`
+    [Documentation]    GETs /health and validates JSON status fields for metadatabase, scheduler, and optional components where reported.
+    [Tags]    Kubernetes    Airflow    webserver    access:read-only    data:metrics
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=check-airflow-webserver-health.sh
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    show_in_rwl_cheatsheet=true
+    ...    cmd_override=./check-airflow-webserver-health.sh
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat check_airflow_webserver_health_issues.json
+    ...    timeout_seconds=30
+    TRY
+        ${issue_list}=    Evaluate    json.loads(r'''${issues.stdout}''')    json
+    EXCEPT
+        Log    Failed to parse JSON for webserver health task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+    IF    len(@{issue_list}) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=Airflow /health JSON should report healthy state for active subsystems
+            ...    actual=${issue['details']}
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+    RW.Core.Add Pre To Report    Webserver /health results:\n${result.stdout}
+
+Check Airflow REST API Health or Version for `${AIRFLOW_WEBSERVER_SERVICE_NAME}`
+    [Documentation]    Probes read-only API routes such as /api/v1/health or /api/v1/version; optional airflow_api_credentials for authenticated clusters.
+    [Tags]    Kubernetes    Airflow    api    access:read-only    data:metrics
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=check-airflow-api-health.sh
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    show_in_rwl_cheatsheet=true
+    ...    cmd_override=./check-airflow-api-health.sh
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat check_airflow_api_health_issues.json
+    ...    timeout_seconds=30
+    TRY
+        ${issue_list}=    Evaluate    json.loads(r'''${issues.stdout}''')    json
+    EXCEPT
+        Log    Failed to parse JSON for API health task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+    IF    len(@{issue_list}) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=At least one read-only API route should respond when the webserver API is enabled
+            ...    actual=${issue['details']}
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+    RW.Core.Add Pre To Report    Airflow REST API probe results:\n${result.stdout}
+
+Verify Kubernetes Service and Endpoints for Webserver `${AIRFLOW_WEBSERVER_SERVICE_NAME}`
+    [Documentation]    Uses kubectl to confirm the Service exists, Endpoints back it, and ports align with AIRFLOW_HTTP_PORT for triage of networking vs application faults.
+    [Tags]    Kubernetes    Airflow    service    access:read-only    data:logs-config
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=verify-airflow-webserver-service.sh
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    show_in_rwl_cheatsheet=true
+    ...    cmd_override=./verify-airflow-webserver-service.sh
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat verify_airflow_webserver_service_issues.json
+    ...    timeout_seconds=30
+    TRY
+        ${issue_list}=    Evaluate    json.loads(r'''${issues.stdout}''')    json
+    EXCEPT
+        Log    Failed to parse JSON for Service verification task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+    IF    len(@{issue_list}) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=Service should exist with Endpoints and an HTTP port matching probes
+            ...    actual=${issue['details']}
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+    RW.Core.Add Pre To Report    Kubernetes Service verification:\n${result.stdout}
+
+Optional Check Scheduler or Triggerer HTTP Health Related to `${AIRFLOW_WEBSERVER_SERVICE_NAME}`
+    [Documentation]    When AIRFLOW_SCHEDULER_SERVICE_NAME or AIRFLOW_TRIGGERER_SERVICE_NAME is set, attempts lightweight HTTP port-forward probes; documents skip when charts expose no HTTP listener.
+    [Tags]    Kubernetes    Airflow    scheduler    access:read-only    data:metrics
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=check-airflow-scheduler-http-health.sh
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    show_in_rwl_cheatsheet=false
+    ...    cmd_override=./check-airflow-scheduler-http-health.sh
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat check_airflow_scheduler_http_issues.json
+    ...    timeout_seconds=30
+    TRY
+        ${issue_list}=    Evaluate    json.loads(r'''${issues.stdout}''')    json
+    EXCEPT
+        Log    Failed to parse JSON for optional scheduler HTTP task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+    IF    len(@{issue_list}) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=Optional scheduler/triggerer Service should exist with endpoints when those tiers are enabled
+            ...    actual=${issue['details']}
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+    RW.Core.Add Pre To Report    Optional scheduler/triggerer HTTP results:\n${result.stdout}
+
+
+*** Keywords ***
+Suite Initialization
+    # kubeconfig is required; airflow_api_credentials is optional and falls
+    # back to ${EMPTY} when the secret is not configured in the workspace.
+    ${kubeconfig}=    RW.Core.Import Secret
+    ...    kubeconfig
+    ...    type=string
+    ...    description=The kubernetes kubeconfig yaml containing connection configuration used to connect to cluster(s).
+    ...    pattern=\w*
+    TRY
+        ${AIRFLOW_API_CREDENTIALS}=    RW.Core.Import Secret
+        ...    airflow_api_credentials
+        ...    type=string
+        ...    description=Optional JSON with token or username/password for Airflow REST API (for example {\"username\":\"...\",\"password\":\"...\"}).
+        ...    pattern=.*
+        Set Suite Variable    ${AIRFLOW_API_CREDENTIALS}    ${AIRFLOW_API_CREDENTIALS}
+    EXCEPT
+        Log    airflow_api_credentials secret not provided; API probes run without authenticated access where allowed.    INFO
+        Set Suite Variable    ${AIRFLOW_API_CREDENTIALS}    ${EMPTY}
+    END
+    ${CONTEXT}=    RW.Core.Import User Variable    CONTEXT
+    ...    type=string
+    ...    description=Kubernetes context for kubectl and port-forward.
+    ...    pattern=\w*
+    ${NAMESPACE}=    RW.Core.Import User Variable    NAMESPACE
+    ...    type=string
+    ...    description=Namespace where Airflow runs.
+    ...    pattern=\w*
+    ${AIRFLOW_WEBSERVER_SERVICE_NAME}=    RW.Core.Import User Variable    AIRFLOW_WEBSERVER_SERVICE_NAME
+    ...    type=string
+    ...    description=Kubernetes Service name for the Airflow webserver.
+    ...    pattern=.*
+    ${PROXY_BASE_URL}=    RW.Core.Import User Variable    PROXY_BASE_URL
+    ...    type=string
+    ...    description=Optional full base URL for HTTP checks; leave empty to port-forward to the webserver Service.
+    ...    pattern=.*
+    ...    default=
+    ${AIRFLOW_HTTP_PORT}=    RW.Core.Import User Variable    AIRFLOW_HTTP_PORT
+    ...    type=string
+    ...    description=Service port for the Airflow web UI/API.
+    ...    pattern=^\d+$
+    ...    default=8080
+    ${AIRFLOW_SCHEDULER_SERVICE_NAME}=    RW.Core.Import User Variable    AIRFLOW_SCHEDULER_SERVICE_NAME
+    ...    type=string
+    ...    description=Optional Service name for scheduler-side HTTP checks when exposed.
+    ...    pattern=.*
+    ...    default=
+    ${AIRFLOW_TRIGGERER_SERVICE_NAME}=    RW.Core.Import User Variable    AIRFLOW_TRIGGERER_SERVICE_NAME
+    ...    type=string
+    ...    description=Optional Service name for Airflow 2 triggerer HTTP checks when exposed.
+    ...    pattern=.*
+    ...    default=
+    ${KUBERNETES_DISTRIBUTION_BINARY}=    RW.Core.Import User Variable    KUBERNETES_DISTRIBUTION_BINARY
+    ...    type=string
+    ...    enum=[kubectl,oc]
+    ...    default=kubectl
+    Set Suite Variable    ${kubeconfig}    ${kubeconfig}
+    Set Suite Variable    ${CONTEXT}    ${CONTEXT}
+    Set Suite Variable    ${NAMESPACE}    ${NAMESPACE}
+    Set Suite Variable    ${AIRFLOW_WEBSERVER_SERVICE_NAME}    ${AIRFLOW_WEBSERVER_SERVICE_NAME}
+    Set Suite Variable    ${PROXY_BASE_URL}    ${PROXY_BASE_URL}
+    Set Suite Variable    ${AIRFLOW_HTTP_PORT}    ${AIRFLOW_HTTP_PORT}
+    Set Suite Variable    ${AIRFLOW_SCHEDULER_SERVICE_NAME}    ${AIRFLOW_SCHEDULER_SERVICE_NAME}
+    Set Suite Variable    ${AIRFLOW_TRIGGERER_SERVICE_NAME}    ${AIRFLOW_TRIGGERER_SERVICE_NAME}
+    Set Suite Variable    ${KUBERNETES_DISTRIBUTION_BINARY}    ${KUBERNETES_DISTRIBUTION_BINARY}
+    # Environment handed to every bash task; KUBECONFIG points at the secret
+    # file materialized by the runner.
+    ${env}=    Create Dictionary
+    ...    CONTEXT=${CONTEXT}
+    ...    NAMESPACE=${NAMESPACE}
+    ...    AIRFLOW_WEBSERVER_SERVICE_NAME=${AIRFLOW_WEBSERVER_SERVICE_NAME}
+    ...    PROXY_BASE_URL=${PROXY_BASE_URL}
+    ...    AIRFLOW_HTTP_PORT=${AIRFLOW_HTTP_PORT}
+    ...    AIRFLOW_SCHEDULER_SERVICE_NAME=${AIRFLOW_SCHEDULER_SERVICE_NAME}
+    ...    AIRFLOW_TRIGGERER_SERVICE_NAME=${AIRFLOW_TRIGGERER_SERVICE_NAME}
+    ...
KUBERNETES_DISTRIBUTION_BINARY=${KUBERNETES_DISTRIBUTION_BINARY}
+    ...    AIRFLOW_API_CREDENTIALS=${AIRFLOW_API_CREDENTIALS}
+    ...    KUBECONFIG=./${kubeconfig.key}
+    Set Suite Variable    ${env}    ${env}
diff --git a/codebundles/k8s-airflow-http-health/sli-airflow-http-score.sh b/codebundles/k8s-airflow-http-health/sli-airflow-http-score.sh
new file mode 100755
index 00000000..7039c8f4
--- /dev/null
+++ b/codebundles/k8s-airflow-http-health/sli-airflow-http-score.sh
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+set -uo pipefail
+# -----------------------------------------------------------------------------
+# Single-line JSON for sli.robot: webserver_health, api_reachability, k8s_service.
+# Diagnostics to stderr; port-forward banner redirected to stderr.
+# -----------------------------------------------------------------------------
+: "${CONTEXT:?Must set CONTEXT}"
+: "${NAMESPACE:?Must set NAMESPACE}"
+: "${AIRFLOW_WEBSERVER_SERVICE_NAME:?Must set AIRFLOW_WEBSERVER_SERVICE_NAME}"
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck disable=SC1091
+source "${SCRIPT_DIR}/_airflow_http_portforward_helper.sh"
+# Helper output goes to stderr so stdout stays a single JSON line for the SLI.
+# NOTE(review): with `set -u`, if the helper fails (swallowed by `|| true`)
+# without setting PROXY_BASE_URL, the expansion below aborts the script --
+# confirm the helper always defines PROXY_BASE_URL even on failure.
+ensure_airflow_proxy_base_url 1>&2 || true
+
+BASE_URL="${PROXY_BASE_URL%/}"
+MAX_TIME="${CURL_MAX_TIME:-15}"
+KBIN="${KUBERNETES_DISTRIBUTION_BINARY:-kubectl}"
+
+# --- webserver /health ---
+# Score 1 when /health returns 200 with JSON whose metadatabase status is
+# either absent (not reported) or "healthy".
+ws_score=0
+tmpf=$(mktemp)
+rc=$(curl -sS --max-time "$MAX_TIME" -o "$tmpf" -w "%{http_code}" "${BASE_URL}/health" 2>/dev/null || echo "000")
+body=$(cat "$tmpf" || true)
+rm -f "$tmpf"
+if [[ "$rc" == "200" ]] && echo "$body" | jq -e . >/dev/null 2>&1; then
+  mb=$(echo "$body" | jq -r '.metadatabase.status // empty')
+  if [[ -z "$mb" || "$mb" == "null" ]]; then
+    ws_score=1
+  elif [[ "$mb" == "healthy" ]]; then
+    ws_score=1
+  else
+    ws_score=0
+  fi
+fi
+
+# --- API quick probe (no auth in SLI) ---
+# 401/403 still count as reachable: the API answered, it just wants credentials.
+api_score=0
+for path in "/api/v1/health" "/api/v1/version"; do
+  c=$(curl -sS --max-time "$MAX_TIME" -o /dev/null -w "%{http_code}" "${BASE_URL}${path}" 2>/dev/null || echo "000")
+  if [[ "$c" == "200" || "$c" == "401" || "$c" == "403" ]]; then
+    api_score=1
+    break
+  fi
+done
+
+# --- Service exists ---
+k8s_score=0
+if command -v "$KBIN" &>/dev/null; then
+  if "$KBIN" get svc "$AIRFLOW_WEBSERVER_SERVICE_NAME" -n "$NAMESPACE" --context "$CONTEXT" --request-timeout=8s &>/dev/null; then
+    k8s_score=1
+  fi
+fi
+
+# Emit the compact one-line JSON consumed by sli.robot (last stdout line).
+jq -cn --argjson w "$ws_score" --argjson a "$api_score" --argjson k "$k8s_score" \
+  '{webserver_health:$w, api_reachability:$a, kubernetes_service:$k}'
diff --git a/codebundles/k8s-airflow-http-health/sli.robot b/codebundles/k8s-airflow-http-health/sli.robot
new file mode 100644
index 00000000..c50b4d53
--- /dev/null
+++ b/codebundles/k8s-airflow-http-health/sli.robot
@@ -0,0 +1,106 @@
+*** Settings ***
+Documentation       Measures Airflow webserver availability using GET /health, a lightweight REST API probe, and Kubernetes Service presence. Produces a value between 0 (failing) and 1 (healthy).
+Metadata            Author    rw-codebundle-agent
+Metadata            Display Name    Kubernetes Airflow HTTP/API Health SLI
+Metadata            Supports    Kubernetes AKS EKS GKE OpenShift Airflow
+
+Suite Setup         Suite Initialization
+Library             BuiltIn
+Library             RW.Core
+Library             RW.CLI
+Library             RW.platform
+Library             Collections
+
+
+*** Keywords ***
+Suite Initialization
+    ${kubeconfig}=    RW.Core.Import Secret    kubeconfig
+    ...    type=string
+    ...    description=The kubernetes kubeconfig yaml containing connection configuration.
+    ...    pattern=\w*
+    ${CONTEXT}=    RW.Core.Import User Variable    CONTEXT
+    ...    type=string
+    ...
description=Kubernetes context for kubectl-backed checks in the SLI.
+    ...    pattern=\w*
+    ${NAMESPACE}=    RW.Core.Import User Variable    NAMESPACE
+    ...    type=string
+    ...    description=Namespace where Airflow runs.
+    ...    pattern=\w*
+    ${PROXY_BASE_URL}=    RW.Core.Import User Variable    PROXY_BASE_URL
+    ...    type=string
+    ...    description=Optional base URL for Airflow HTTP. Leave empty to port-forward to the webserver Service.
+    ...    pattern=.*
+    ...    default=
+    ${AIRFLOW_WEBSERVER_SERVICE_NAME}=    RW.Core.Import User Variable    AIRFLOW_WEBSERVER_SERVICE_NAME
+    ...    type=string
+    ...    description=Kubernetes Service name for the Airflow webserver.
+    ...    pattern=.*
+    ${AIRFLOW_HTTP_PORT}=    RW.Core.Import User Variable    AIRFLOW_HTTP_PORT
+    ...    type=string
+    ...    description=Service port for the Airflow web UI/API.
+    ...    pattern=^\d+$
+    ...    default=8080
+    ${KUBERNETES_DISTRIBUTION_BINARY}=    RW.Core.Import User Variable    KUBERNETES_DISTRIBUTION_BINARY
+    ...    type=string
+    ...    enum=[kubectl,oc]
+    ...    default=kubectl
+    Set Suite Variable    ${kubeconfig}    ${kubeconfig}
+    Set Suite Variable    ${CONTEXT}    ${CONTEXT}
+    Set Suite Variable    ${NAMESPACE}    ${NAMESPACE}
+    Set Suite Variable    ${PROXY_BASE_URL}    ${PROXY_BASE_URL}
+    Set Suite Variable    ${AIRFLOW_WEBSERVER_SERVICE_NAME}    ${AIRFLOW_WEBSERVER_SERVICE_NAME}
+    Set Suite Variable    ${AIRFLOW_HTTP_PORT}    ${AIRFLOW_HTTP_PORT}
+    Set Suite Variable    ${KUBERNETES_DISTRIBUTION_BINARY}    ${KUBERNETES_DISTRIBUTION_BINARY}
+    # Environment handed to the scoring script; KUBECONFIG points at the
+    # secret file materialized by the runner.
+    ${env}=    Create Dictionary
+    ...    CONTEXT=${CONTEXT}
+    ...    NAMESPACE=${NAMESPACE}
+    ...    PROXY_BASE_URL=${PROXY_BASE_URL}
+    ...    AIRFLOW_WEBSERVER_SERVICE_NAME=${AIRFLOW_WEBSERVER_SERVICE_NAME}
+    ...    AIRFLOW_HTTP_PORT=${AIRFLOW_HTTP_PORT}
+    ...    KUBERNETES_DISTRIBUTION_BINARY=${KUBERNETES_DISTRIBUTION_BINARY}
+    ...    KUBECONFIG=./${kubeconfig.key}
+    Set Suite Variable    ${env}    ${env}
+
+
+*** Tasks ***
+Collect Airflow HTTP Sub-Scores for Service `${AIRFLOW_WEBSERVER_SERVICE_NAME}`
+    [Documentation]    Fetches webserver /health, API reachability, and Kubernetes Service scores as binary 0/1 values.
+    [Tags]    access:read-only    data:metrics
+    ${raw}=    RW.CLI.Run Bash File
+    ...    bash_file=sli-airflow-http-score.sh
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
+    ...    timeout_seconds=120
+    ...    include_in_history=false
+    ...    show_in_rwl_cheatsheet=true
+    ...    cmd_override=./sli-airflow-http-score.sh
+    # Take the LAST JSON-looking line of stdout; earlier lines may be
+    # port-forward chatter that escaped the script's stderr redirect.
+    TRY
+        ${scores}=    Evaluate
+        ...    next((json.loads(l) for l in reversed((r'''${raw.stdout}''').splitlines()) if l.strip().startswith('{') and l.strip().endswith('}')))
+        ...    json
+    EXCEPT
+        Log    SLI score JSON parse failed; scoring all dimensions as 0.    WARN
+        Log    ${raw.stdout}    WARN
+        ${scores}=    Create Dictionary    webserver_health=0    api_reachability=0    kubernetes_service=0
+    END
+    ${wv}=    Get From Dictionary    ${scores}    webserver_health
+    ${av}=    Get From Dictionary    ${scores}    api_reachability
+    ${kv}=    Get From Dictionary    ${scores}    kubernetes_service
+    ${wv}=    Convert To Number    ${wv}
+    ${av}=    Convert To Number    ${av}
+    ${kv}=    Convert To Number    ${kv}
+    Set Suite Variable    ${webserver_health_score}    ${wv}
+    Set Suite Variable    ${api_score}    ${av}
+    Set Suite Variable    ${kubernetes_service_score}    ${kv}
+    RW.Core.Push Metric    ${wv}    sub_name=webserver_health
+    RW.Core.Push Metric    ${av}    sub_name=api_reachability
+    RW.Core.Push Metric    ${kv}    sub_name=kubernetes_service
+
+Generate Aggregate Airflow HTTP Health Score for Service `${AIRFLOW_WEBSERVER_SERVICE_NAME}`
+    [Documentation]    Averages sub-scores into the final 0-1 health metric used for alerting.
+    [Tags]    access:read-only    data:metrics
+    # Equal-weight average of the three binary sub-scores, rounded to 2 places.
+    ${health_score}=    Evaluate    (${webserver_health_score} + ${api_score} + ${kubernetes_service_score}) / 3
+    ${health_score}=    Convert To Number    ${health_score}    2
+    ${report_msg}=    Set Variable    Airflow HTTP health score: ${health_score} (webserver_health=${webserver_health_score}, api_reachability=${api_score}, kubernetes_service=${kubernetes_service_score})
+    RW.Core.Add To Report    ${report_msg}
+    RW.Core.Push Metric    ${health_score}
diff --git a/codebundles/k8s-airflow-http-health/verify-airflow-webserver-service.sh b/codebundles/k8s-airflow-http-health/verify-airflow-webserver-service.sh
new file mode 100755
index 00000000..1d1f2727
--- /dev/null
+++ b/codebundles/k8s-airflow-http-health/verify-airflow-webserver-service.sh
@@ -0,0 +1,95 @@
+#!/usr/bin/env bash
+set -euo pipefail
+# -----------------------------------------------------------------------------
+# kubectl: Service and Endpoints for the Airflow webserver (no HTTP calls).
+# -----------------------------------------------------------------------------
+: "${CONTEXT:?Must set CONTEXT}"
+: "${NAMESPACE:?Must set NAMESPACE}"
+: "${AIRFLOW_WEBSERVER_SERVICE_NAME:?Must set AIRFLOW_WEBSERVER_SERVICE_NAME}"
+
+OUTPUT_FILE="${OUTPUT_FILE:-verify_airflow_webserver_service_issues.json}"
+issues_json='[]'
+KBIN="${KUBERNETES_DISTRIBUTION_BINARY:-kubectl}"
+PORT="${AIRFLOW_HTTP_PORT:-8080}"
+
+echo "Verifying svc/${AIRFLOW_WEBSERVER_SERVICE_NAME} in ns ${NAMESPACE} (context ${CONTEXT}), expected port ${PORT}."
+
+# Missing CLI: report and exit 0 (issues travel via OUTPUT_FILE, not exit code).
+if ! command -v "$KBIN" &>/dev/null; then
+  issues_json=$(echo "$issues_json" | jq \
+    --arg title "Kubernetes CLI not found for Airflow Service verification" \
+    --arg details "Expected ${KBIN} on PATH." \
+    --argjson severity 3 \
+    --arg next_steps "Install kubectl or set KUBERNETES_DISTRIBUTION_BINARY to oc for OpenShift." \
+    '. += [{
+      "title": $title,
+      "details": $details,
+      "severity": $severity,
+      "next_steps": $next_steps
+    }]')
+  echo "$issues_json" | jq . >"$OUTPUT_FILE"
+  cat "$OUTPUT_FILE"
+  exit 0
+fi
+
+if ! svc_json=$("$KBIN" get svc "$AIRFLOW_WEBSERVER_SERVICE_NAME" -n "$NAMESPACE" --context "$CONTEXT" -o json 2>/dev/null); then
+  issues_json=$(echo "$issues_json" | jq \
+    --arg title "Kubernetes Service \`${AIRFLOW_WEBSERVER_SERVICE_NAME}\` not found in namespace \`${NAMESPACE}\`" \
+    --arg details "kubectl get svc failed for context ${CONTEXT}." \
+    --argjson severity 3 \
+    --arg next_steps "Verify AIRFLOW_WEBSERVER_SERVICE_NAME, namespace, and kubeconfig context." \
+    '. += [{
+      "title": $title,
+      "details": $details,
+      "severity": $severity,
+      "next_steps": $next_steps
+    }]')
+  echo "$issues_json" | jq . >"$OUTPUT_FILE"
+  cat "$OUTPUT_FILE"
+  exit 0
+fi
+
+# Summarize the Service for the report (type, ClusterIP, ports, selector).
+svc_type=$(echo "$svc_json" | jq -r '.spec.type // ""')
+svc_cluster_ip=$(echo "$svc_json" | jq -r '.spec.clusterIP // ""')
+svc_ports=$(echo "$svc_json" | jq -c '[.spec.ports[]? | {port, targetPort, protocol, name}]' 2>/dev/null || echo "[]")
+svc_selector=$(echo "$svc_json" | jq -c '.spec.selector // {}' 2>/dev/null || echo "{}")
+echo "Service: type=${svc_type} clusterIP=${svc_cluster_ip} selector=${svc_selector}"
+echo "  ports=${svc_ports}"
+
+# NOTE(review): this reads the legacy Endpoints API; it is still served, but
+# EndpointSlice is the forward-looking source of readiness data -- confirm.
+ep_addrs=$("$KBIN" get endpoints "$AIRFLOW_WEBSERVER_SERVICE_NAME" -n "$NAMESPACE" --context "$CONTEXT" -o json 2>/dev/null \
+  | jq '[.subsets[]?.addresses[]?] | length' 2>/dev/null || echo 0)
+echo "Endpoints: ready_addresses=${ep_addrs}"
+
+if [[ "${ep_addrs:-0}" -eq 0 ]]; then
+  issues_json=$(echo "$issues_json" | jq \
+    --arg title "No endpoints ready for Service \`${AIRFLOW_WEBSERVER_SERVICE_NAME}\` in \`${NAMESPACE}\`" \
+    --arg details "Endpoints show zero addresses; HTTP checks may fail due to missing backing Pods." \
+    --argjson severity 3 \
+    --arg next_steps "Check Deployment/StatefulSet Pods, selectors, and readiness probes for the Airflow webserver workload." \
+    '. += [{
+      "title": $title,
+      "details": $details,
+      "severity": $severity,
+      "next_steps": $next_steps
+    }]')
+fi
+
+# Compare declared Service ports against the expected HTTP port (severity 2:
+# the probes may still work via PROXY_BASE_URL even when this mismatches).
+port_match=$(echo "$svc_json" | jq --arg p "$PORT" '[.spec.ports[]? | select(.port == ($p|tonumber))] | length' 2>/dev/null || echo 0)
+echo "Port check: expected=${PORT} match_count=${port_match}"
+if [[ "${port_match:-0}" -eq 0 ]]; then
+  ports=$(echo "$svc_json" | jq -c '[.spec.ports[]?.port]' 2>/dev/null || echo "[]")
+  issues_json=$(echo "$issues_json" | jq \
+    --arg title "Service port may not match AIRFLOW_HTTP_PORT for \`${AIRFLOW_WEBSERVER_SERVICE_NAME}\`" \
+    --arg details "Expected port ${PORT} on Service. Found ports: ${ports}" \
+    --argjson severity 2 \
+    --arg next_steps "Align AIRFLOW_HTTP_PORT and PROXY_BASE_URL with spec.ports." \
+    '. += [{
+      "title": $title,
+      "details": $details,
+      "severity": $severity,
+      "next_steps": $next_steps
+    }]')
+fi
+
+echo "$issues_json" | jq . >"$OUTPUT_FILE"
+echo "Kubernetes service verification complete. Issues written to $OUTPUT_FILE"
+cat "$OUTPUT_FILE"