diff --git a/codebundles/k8s-victoriametrics-healthcheck/.runwhen/generation-rules/k8s-victoriametrics-healthcheck.yaml b/codebundles/k8s-victoriametrics-healthcheck/.runwhen/generation-rules/k8s-victoriametrics-healthcheck.yaml new file mode 100644 index 00000000..d06ca613 --- /dev/null +++ b/codebundles/k8s-victoriametrics-healthcheck/.runwhen/generation-rules/k8s-victoriametrics-healthcheck.yaml @@ -0,0 +1,22 @@ +apiVersion: runwhen.com/v1 +kind: GenerationRules +spec: + generationRules: + - resourceTypes: + - namespace + matchRules: + - type: pattern + pattern: ".+" + properties: [name] + mode: substring + slxs: + - baseName: k8s-vm-hc + shortenedBaseName: k8s-vm-hc + qualifiers: ["namespace", "cluster"] + baseTemplateName: k8s-victoriametrics-healthcheck + levelOfDetail: basic + outputItems: + - type: slx + - type: sli + - type: runbook + templateName: k8s-victoriametrics-healthcheck-taskset.yaml diff --git a/codebundles/k8s-victoriametrics-healthcheck/.runwhen/templates/k8s-victoriametrics-healthcheck-sli.yaml b/codebundles/k8s-victoriametrics-healthcheck/.runwhen/templates/k8s-victoriametrics-healthcheck-sli.yaml new file mode 100644 index 00000000..59017ee0 --- /dev/null +++ b/codebundles/k8s-victoriametrics-healthcheck/.runwhen/templates/k8s-victoriametrics-healthcheck-sli.yaml @@ -0,0 +1,48 @@ +apiVersion: runwhen.com/v1 +kind: ServiceLevelIndicator +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + displayUnitsLong: OK + displayUnitsShort: ok + locations: + - {{default_location}} + description: Scores VictoriaMetrics workload readiness and PVC binding for the namespace (0–1); use the runbook for deep triage. 
+ codeBundle: + {% if repo_url %} + repoUrl: {{repo_url}} + {% else %} + repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git + {% endif %} + {% if ref %} + ref: {{ref}} + {% else %} + ref: main + {% endif %} + pathToRobot: codebundles/k8s-victoriametrics-healthcheck/sli.robot + intervalStrategy: intermezzo + intervalSeconds: 180 + configProvided: + - name: KUBERNETES_DISTRIBUTION_BINARY + value: {{custom.kubernetes_distribution_binary | default("kubectl")}} + - name: NAMESPACE + value: "{{match_resource.resource.metadata.name}}" + - name: CONTEXT + value: "{{context}}" + - name: VM_LABEL_SELECTOR + value: "{{custom.vm_label_selector | default('')}}" + secretsProvided: + {% if wb_version %} + {% include "kubernetes-auth.yaml" ignore missing %} + {% else %} + - name: kubeconfig + workspaceKey: {{custom.kubeconfig_secret_name | default("kubeconfig")}} + {% endif %} + alertConfig: + tasks: + persona: eager-edgar + sessionTTL: 10m diff --git a/codebundles/k8s-victoriametrics-healthcheck/.runwhen/templates/k8s-victoriametrics-healthcheck-slx.yaml b/codebundles/k8s-victoriametrics-healthcheck/.runwhen/templates/k8s-victoriametrics-healthcheck-slx.yaml new file mode 100644 index 00000000..02350a6c --- /dev/null +++ b/codebundles/k8s-victoriametrics-healthcheck/.runwhen/templates/k8s-victoriametrics-healthcheck-slx.yaml @@ -0,0 +1,25 @@ +apiVersion: runwhen.com/v1 +kind: ServiceLevelX +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/resources/labeled/sts.svg + alias: {{namespace.name}} VictoriaMetrics Health Check + asMeasuredBy: Aggregate readiness of VictoriaMetrics pods and storage PVCs plus HTTP/cluster checks in the runbook. 
+ configProvided: + - name: NAMESPACE + value: {{match_resource.resource.metadata.name}} + owners: + - {{workspace.owner_email}} + statement: VictoriaMetrics workloads in this namespace should be ready, storage bound, and serving healthy HTTP endpoints. + additionalContext: + {% include "kubernetes-hierarchy.yaml" ignore missing %} + qualified_name: "{{ match_resource.qualified_name }}" + tags: + {% include "kubernetes-tags.yaml" ignore missing %} + - name: access + value: read-only diff --git a/codebundles/k8s-victoriametrics-healthcheck/.runwhen/templates/k8s-victoriametrics-healthcheck-taskset.yaml b/codebundles/k8s-victoriametrics-healthcheck/.runwhen/templates/k8s-victoriametrics-healthcheck-taskset.yaml new file mode 100644 index 00000000..8fcf2659 --- /dev/null +++ b/codebundles/k8s-victoriametrics-healthcheck/.runwhen/templates/k8s-victoriametrics-healthcheck-taskset.yaml @@ -0,0 +1,41 @@ +apiVersion: runwhen.com/v1 +kind: Runbook +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + location: {{default_location}} + description: Validates VictoriaMetrics pods, PVCs, HTTP health, cluster status, and recent error logs in the namespace. 
+ codeBundle: + {% if repo_url %} + repoUrl: {{repo_url}} + {% else %} + repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git + {% endif %} + {% if ref %} + ref: {{ref}} + {% else %} + ref: main + {% endif %} + pathToRobot: codebundles/k8s-victoriametrics-healthcheck/runbook.robot + configProvided: + - name: KUBERNETES_DISTRIBUTION_BINARY + value: {{custom.kubernetes_distribution_binary | default("kubectl")}} + - name: NAMESPACE + value: "{{match_resource.resource.metadata.name}}" + - name: CONTEXT + value: "{{context}}" + - name: VM_LABEL_SELECTOR + value: "{{custom.vm_label_selector | default('')}}" + - name: VM_DEPLOYMENT_MODE + value: "{{custom.vm_deployment_mode | default('auto')}}" + secretsProvided: + {% if wb_version %} + {% include "kubernetes-auth.yaml" ignore missing %} + {% else %} + - name: kubeconfig + workspaceKey: {{custom.kubeconfig_secret_name | default("kubeconfig")}} + {% endif %} diff --git a/codebundles/k8s-victoriametrics-healthcheck/.test/Taskfile.yaml b/codebundles/k8s-victoriametrics-healthcheck/.test/Taskfile.yaml new file mode 100644 index 00000000..83d77e80 --- /dev/null +++ b/codebundles/k8s-victoriametrics-healthcheck/.test/Taskfile.yaml @@ -0,0 +1,146 @@ +version: "3" + +tasks: + default: + desc: "Run/refresh config" + cmds: + - task: check-unpushed-commits + - task: generate-rwl-config + - task: run-rwl-discovery + + clean: + desc: "Run cleanup tasks" + cmds: + - task: remove-kubernetes-objects + - task: delete-slxs + - task: clean-rwl-discovery + + build-infra: + desc: "Build test infrastructure" + cmds: + - task: create-kubernetes-objects + + create-kubernetes-objects: + desc: "Apply manifests from kubernetes directory using kubectl" + cmds: + - kubectl apply -f kubernetes/* + silent: true + + remove-kubernetes-objects: + desc: "Delete kubernetes objects" + cmds: + - kubectl delete -f kubernetes/* || true + silent: true + + check-unpushed-commits: + desc: Check if outstanding commits or file updates need to be 
pushed before testing. + vars: + BASE_DIR: "../" + cmds: + - | + echo "Checking for uncommitted changes in $BASE_DIR and $BASE_DIR.runwhen, excluding '.test'..." + UNCOMMITTED_FILES=$(git diff --name-only HEAD | grep -E "^${BASE_DIR}(\.runwhen|[^/]+)" | grep -v "/\.test/" || true) + if [ -n "$UNCOMMITTED_FILES" ]; then + echo "Uncommitted changes found:" + echo "$UNCOMMITTED_FILES" + exit 1 + fi + - | + git fetch origin 2>/dev/null || true + UNPUSHED_FILES=$(git diff --name-only origin/$(git rev-parse --abbrev-ref HEAD 2>/dev/null) HEAD 2>/dev/null | grep -E "^${BASE_DIR}(\.runwhen|[^/]+)" | grep -v "/\.test/" || true) + if [ -n "$UNPUSHED_FILES" ]; then + echo "Unpushed commits found:" + echo "$UNPUSHED_FILES" + exit 1 + fi + silent: true + + generate-rwl-config: + desc: "Generate RunWhen Local configuration (workspaceInfo.yaml)" + env: + RW_WORKSPACE: '{{.RW_WORKSPACE | default "my-workspace"}}' + cmds: + - | + repo_url=$(git config --get remote.origin.url) + branch_name=$(git rev-parse --abbrev-ref HEAD) + codebundle=$(basename "$(dirname "$PWD")") + namespace=$(yq e 'select(.kind == "Namespace") | .metadata.name' kubernetes/manifest.yaml -N) + cat <<EOF > workspaceInfo.yaml + workspaceName: "$RW_WORKSPACE" + workspaceOwnerEmail: authors@runwhen.com + defaultLocation: location-01 + defaultLOD: none + cloudConfig: + kubernetes: + kubeconfigFile: /shared/kubeconfig + namespaceLODs: + $namespace: detailed + namespaces: + - $namespace + codeCollections: + - repoURL: "$repo_url" + branch: "$branch_name" + codeBundles: ["$codebundle"] + custom: + kubeconfig_secret_name: "kubeconfig" + kubernetes_distribution_binary: kubectl + EOF + silent: true + + run-rwl-discovery: + desc: "Run RunWhen Local Discovery on test infrastructure" + cmds: + - | + CONTAINER_NAME="RunWhenLocal" + if docker ps -q --filter "name=$CONTAINER_NAME" | grep -q .; then + docker stop $CONTAINER_NAME && docker rm $CONTAINER_NAME + elif docker ps -a -q --filter "name=$CONTAINER_NAME" | grep -q .; then + 
docker rm $CONTAINER_NAME + fi + rm -rf output 2>/dev/null || true + mkdir -p output && chmod 777 output 2>/dev/null || true + kubeconfig=$(echo "$RW_FROM_FILE" | jq -r .kubeconfig) + docker run --name $CONTAINER_NAME -p 8081:8081 \ + -v "$(pwd)":/shared \ + -v "$kubeconfig":/shared/kubeconfig \ + -d ghcr.io/runwhen-contrib/runwhen-local:latest || { echo "Failed to start container"; exit 1; } + docker exec -w /workspace-builder $CONTAINER_NAME ./run.sh $1 --verbose || { echo "Discovery script failed"; exit 1; } + echo "Review generated config under output/workspaces/" + silent: true + + validate-generation-rules: + desc: "Validate YAML files in .runwhen/generation-rules" + cmds: + - | + for cmd in curl yq ajv; do + command -v $cmd >/dev/null || { echo "Error: $cmd is required."; exit 1; } + done + temp_dir=$(mktemp -d) + curl -s -o "$temp_dir/generation-rule-schema.json" \ + https://raw.githubusercontent.com/runwhen-contrib/runwhen-local/refs/heads/main/src/generation-rule-schema.json + for yaml_file in ../.runwhen/generation-rules/*.yaml; do + echo "Validating $yaml_file" + json_file="$temp_dir/$(basename "${yaml_file%.*}.json")" + yq -o=json "$yaml_file" > "$json_file" + ajv validate -s "$temp_dir/generation-rule-schema.json" -d "$json_file" \ + --spec=draft2020 --strict=false && echo "$yaml_file is valid." || echo "$yaml_file is invalid." 
+ done + rm -rf "$temp_dir" + silent: true + + delete-slxs: + desc: "Delete SLX objects from RunWhen Platform (optional)" + env: + RW_WORKSPACE: '{{.RW_WORKSPACE | default "my-workspace"}}' + RW_API_URL: "{{.RW_API}}" + RW_PAT: "{{.RW_PAT}}" + cmds: + - echo "Optional: set RW_WORKSPACE RW_API RW_PAT to use platform delete" + silent: true + + clean-rwl-discovery: + desc: "Clean RunWhen Local discovery output" + cmds: + - rm -rf output + - rm -f workspaceInfo.yaml + silent: true diff --git a/codebundles/k8s-victoriametrics-healthcheck/.test/kubernetes/manifest.yaml b/codebundles/k8s-victoriametrics-healthcheck/.test/kubernetes/manifest.yaml new file mode 100644 index 00000000..8f63ff02 --- /dev/null +++ b/codebundles/k8s-victoriametrics-healthcheck/.test/kubernetes/manifest.yaml @@ -0,0 +1,28 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: test-vm-health + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: fake-victoria-metrics-single + namespace: test-vm-health + labels: + app.kubernetes.io/name: victoria-metrics-single + app.kubernetes.io/component: single-binary +spec: + replicas: 1 + selector: + matchLabels: + app: fake-vm + template: + metadata: + labels: + app: fake-vm + app.kubernetes.io/name: victoria-metrics-single + spec: + containers: + - name: pause + image: registry.k8s.io/pause:3.9 diff --git a/codebundles/k8s-victoriametrics-healthcheck/README.md b/codebundles/k8s-victoriametrics-healthcheck/README.md new file mode 100644 index 00000000..b3721cb2 --- /dev/null +++ b/codebundles/k8s-victoriametrics-healthcheck/README.md @@ -0,0 +1,62 @@ +# Kubernetes VictoriaMetrics Health Check + +This CodeBundle validates [VictoriaMetrics](https://docs.victoriametrics.com/) workloads on Kubernetes: operator-style pod readiness, vmstorage PVC health, in-pod HTTP `/health` probes, optional vmselect cluster status JSON, and recent container log signatures for errors. Use it per namespace where VictoriaMetrics components run. 
+ +## Overview + +- **Workload readiness**: Discovers Deployments, StatefulSets, DaemonSets, and pods that match common VictoriaMetrics labels (or an optional label selector) and reports CrashLoopBackOff, image pull failures, Pending pods, and rollout conditions that are not healthy. +- **Storage**: Flags VM-related PVCs that are not `Bound` or show binding or resize problems. +- **HTTP health**: Runs `kubectl exec` to hit `http://127.0.0.1:<port>/health` inside each running component pod using per-component default ports (vmselect 8481, vminsert 8480, vmstorage 8482, vmagent 8429; the single-node port is chosen by `guess_port` in `check-vm-http-health.sh`). +- **Cluster status**: When `VM_DEPLOYMENT_MODE` is `cluster` or `auto` and a vmselect pod exists, fetches cluster status JSON from vmselect and surfaces degraded signals when the response suggests unhealthy storage or nodes. +- **Logs**: Greps recent logs for ERROR, panic, or fatal patterns on VictoriaMetrics-labeled pods. +- **SLI**: A lightweight `sli.robot` scores namespace health from VM pod readiness and VM-related PVC binding (0–1). + +## Configuration + +### Required variables + +- `CONTEXT`: Kubernetes context name to use. +- `NAMESPACE`: Namespace where VictoriaMetrics workloads are deployed. + +### Optional variables + +- `KUBERNETES_DISTRIBUTION_BINARY`: `kubectl`-compatible CLI (default: `kubectl`). +- `VM_LABEL_SELECTOR`: Optional Kubernetes label selector string (e.g. `app.kubernetes.io/instance=my-vm`) to narrow which pods and workloads are considered. If empty, the scripts use built-in VictoriaMetrics label and name heuristics. +- `VM_DEPLOYMENT_MODE`: `single`, `cluster`, or `auto` (default: `auto`). Controls whether the vmselect cluster status task runs (`single` skips it; `auto` runs it when a vmselect pod is found). + +### Optional environment (scripts only) + +These are read by bash scripts when set in the environment; they are not Robot imports: + +- `VM_LOG_TAIL_LINES`: Tail length for log scan (default: `120`). 
+- `VM_LOG_SINCE`: `kubectl logs --since` window (default: `15m`). + +### Secrets + +- `kubeconfig`: Standard kubeconfig file for cluster access (same as other Kubernetes CodeBundles). + +## Tasks overview + +### Verify VictoriaMetrics workload pod readiness + +Correlates VictoriaMetrics-tagged controllers and pods with Ready status, waiting reasons, and workload conditions. + +### Check VictoriaMetrics storage PVCs + +Evaluates PVCs likely tied to VictoriaMetrics (name patterns, labels, and StatefulSet volume claim templates) for phases other than `Bound` and for failing conditions. + +### Probe VictoriaMetrics HTTP health endpoints + +Uses `kubectl exec` and `wget`/`curl` inside each running pod to call `/health` on the documented default port for that component. + +### Check VictoriaMetrics cluster status API (vmselect) + +When applicable, queries vmselect for JSON cluster status (tries `/api/v1/status/cluster` on port 8481) and raises issues when the API is unreachable or the payload suggests degraded storage. + +### Scan VictoriaMetrics recent logs for errors + +Collects recent container logs and matches error/panic/fatal signatures to surface runtime failures. + +### SLI (`sli.robot`) + +Computes a 0–1 score from VM workload readiness and VM-related PVC binding; sub-metrics `vm_readiness` and `vm_pvc` are published for drill-down. diff --git a/codebundles/k8s-victoriametrics-healthcheck/check-vm-cluster-status.sh b/codebundles/k8s-victoriametrics-healthcheck/check-vm-cluster-status.sh new file mode 100755 index 00000000..0dbaa05b --- /dev/null +++ b/codebundles/k8s-victoriametrics-healthcheck/check-vm-cluster-status.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# ----------------------------------------------------------------------------- +# When cluster mode applies, queries vmselect cluster status JSON and flags problems. 
+# Respects VM_DEPLOYMENT_MODE: single | cluster | auto +# ----------------------------------------------------------------------------- + +: "${CONTEXT:?Must set CONTEXT}" +: "${NAMESPACE:?Must set NAMESPACE}" + +KUBECTL="${KUBERNETES_DISTRIBUTION_BINARY:-kubectl}" +OUTPUT_FILE="${OUTPUT_FILE:-vm_cluster_status_issues.json}" +MODE="${VM_DEPLOYMENT_MODE:-auto}" +issues_json='[]' + +LABEL_ARGS=() +if [[ -n "${VM_LABEL_SELECTOR:-}" ]]; then + LABEL_ARGS=(-l "${VM_LABEL_SELECTOR}") +fi + +append_issue() { + local title="$1" + local details="$2" + local severity="$3" + local next_steps="$4" + issues_json=$(echo "$issues_json" | jq \ + --arg title "$title" \ + --arg details "$details" \ + --argjson severity "$severity" \ + --arg next_steps "$next_steps" \ + '. += [{title: $title, details: $details, severity: $severity, next_steps: $next_steps}]') +} + +if [[ "$MODE" == "single" ]]; then + echo "$issues_json" | jq '.' >"$OUTPUT_FILE" + echo "Skipped cluster status (VM_DEPLOYMENT_MODE=single). Wrote $OUTPUT_FILE" + exit 0 +fi + +if ! pods_json=$("$KUBECTL" get pods -n "$NAMESPACE" --context "$CONTEXT" "${LABEL_ARGS[@]}" -o json 2>/dev/null); then + append_issue "Cannot list pods for cluster status in \`${NAMESPACE}\`" "kubectl get pods failed." 4 "Verify kube access." + echo "$issues_json" | jq '.' >"$OUTPUT_FILE" + exit 0 +fi + +vmselect_pod=$(echo "$pods_json" | jq -r ' + [.items[] | + select(.status.phase=="Running") | + select( + ((.metadata.labels["app.kubernetes.io/component"] // "") == "vmselect") + or ((.metadata.name // "") | test("vmselect"; "i")) + ) | + .metadata.name + ] | first // empty') + +if [[ -z "$vmselect_pod" ]]; then + if [[ "$MODE" == "cluster" ]]; then + append_issue "No vmselect pod found in \`${NAMESPACE}\`" "VM_DEPLOYMENT_MODE=cluster but no running vmselect pod matched." 3 "Verify VictoriaMetrics cluster install and labels." + fi + echo "$issues_json" | jq '.' >"$OUTPUT_FILE" + echo "No vmselect pod for cluster status. 
Wrote $OUTPUT_FILE" + exit 0 +fi + +STATUS_URLS=( + "http://127.0.0.1:8481/api/v1/status/cluster" + "http://127.0.0.1:8481/prometheus/api/v1/status/cluster" +) + +raw="" +for url in "${STATUS_URLS[@]}"; do + raw="" + raw=$("$KUBECTL" exec -n "$NAMESPACE" --context "$CONTEXT" "$vmselect_pod" -- \ + sh -c "(wget -qO- --timeout=5 \"$url\" 2>/dev/null) || (curl -sS --max-time 5 \"$url\" 2>/dev/null)" 2>/dev/null || true) + if [[ -n "$raw" ]] && echo "$raw" | jq -e . >/dev/null 2>&1; then + break + fi + raw="" +done + +if [[ -z "$raw" ]] || ! echo "$raw" | jq -e . >/dev/null 2>&1; then + append_issue "Cluster status API unreachable from \`${vmselect_pod}\`" "Could not fetch valid JSON from vmselect (tried /api/v1/status/cluster on port 8481). Path may differ by VictoriaMetrics version." 3 "Confirm version-specific cluster status URL in https://docs.victoriametrics.com/" + echo "$issues_json" | jq '.' >"$OUTPUT_FILE" + exit 0 +fi + +compact=$(echo "$raw" | jq -c . 2>/dev/null | head -c 4000) + +if echo "$raw" | jq -r '.. | strings? | .' 2>/dev/null | grep -qiE 'unhealthy|dead|offline'; then + append_issue "vmselect cluster status may report degraded storage or nodes" "${compact}" 3 "Review vmstorage pods, network paths from vmselect to vmstorage, and VictoriaMetrics cluster troubleshooting docs." +fi + +echo "$issues_json" | jq '.' >"$OUTPUT_FILE" +echo "Cluster status check completed. Results saved to $OUTPUT_FILE" diff --git a/codebundles/k8s-victoriametrics-healthcheck/check-vm-http-health.sh b/codebundles/k8s-victoriametrics-healthcheck/check-vm-http-health.sh new file mode 100755 index 00000000..7a10cb2c --- /dev/null +++ b/codebundles/k8s-victoriametrics-healthcheck/check-vm-http-health.sh @@ -0,0 +1,100 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# ----------------------------------------------------------------------------- +# Probes /health on VictoriaMetrics component pods via kubectl exec (localhost). 
+# ----------------------------------------------------------------------------- + +: "${CONTEXT:?Must set CONTEXT}" +: "${NAMESPACE:?Must set NAMESPACE}" + +KUBECTL="${KUBERNETES_DISTRIBUTION_BINARY:-kubectl}" +OUTPUT_FILE="${OUTPUT_FILE:-vm_http_health_issues.json}" +issues_json='[]' + +LABEL_ARGS=() +if [[ -n "${VM_LABEL_SELECTOR:-}" ]]; then + LABEL_ARGS=(-l "${VM_LABEL_SELECTOR}") +fi + +append_issue() { + local title="$1" + local details="$2" + local severity="$3" + local next_steps="$4" + issues_json=$(echo "$issues_json" | jq \ + --arg title "$title" \ + --arg details "$details" \ + --argjson severity "$severity" \ + --arg next_steps "$next_steps" \ + '. += [{title: $title, details: $details, severity: $severity, next_steps: $next_steps}]') +} + +guess_port() { + local name="$1" + local comp="${2:-}" + local lc="${name}${comp}" + case "$lc" in + *vmselect*) echo 8481 ;; + *vm-insert*|*vminsert*) echo 8480 ;; + *vmstorage*|*vm-storage*) echo 8482 ;; + *vmagent*|*victoria-metrics-agent*) echo 8429 ;; + *victoria-metrics*|*vmsingle*|*single*) + echo 8428 + ;; + *) echo 8429 ;; + esac +} + +http_get() { + local pod="$1" + local port="$2" + if ! out=$("$KUBECTL" exec -n "$NAMESPACE" --context "$CONTEXT" "$pod" -- \ + sh -c "(command -v wget >/dev/null && wget -qO- --timeout=4 http://127.0.0.1:${port}/health) || (command -v curl >/dev/null && curl -sS --max-time 4 http://127.0.0.1:${port}/health) || echo __EXEC_FAIL__" 2>/dev/null); then + echo "__EXEC_FAIL__" + return + fi + echo "$out" +} + +if ! pods_json=$("$KUBECTL" get pods -n "$NAMESPACE" --context "$CONTEXT" "${LABEL_ARGS[@]}" -o json 2>/dev/null); then + append_issue "Cannot list pods for HTTP health in \`${NAMESPACE}\`" "kubectl get pods failed." 4 "Verify kube access." + echo "$issues_json" | jq '.' 
>"$OUTPUT_FILE" + exit 0 +fi + +mapfile -t all_pods < <(echo "$pods_json" | jq -c '.items[]') +for row in "${all_pods[@]:-}"; do + [[ -z "${row:-}" ]] && continue + echo "$row" | jq -e ' + select( + ((.metadata.labels["app.kubernetes.io/name"] // "") | test("victoria-metrics|vmselect|vminsert|vmstorage|vmagent"; "i")) + or ((.metadata.labels["app.kubernetes.io/component"] // "") | test("vmselect|vminsert|vmstorage|vmagent|single-binary"; "i")) + or ((.metadata.name // "") | test("vmselect|vminsert|vmstorage|vmagent|victoria-metrics"; "i")) + ) + ' >/dev/null 2>&1 || continue + + pname=$(echo "$row" | jq -r '.metadata.name') + phase=$(echo "$row" | jq -r '.status.phase // ""') + [[ "$phase" != "Running" ]] && continue + + comp=$(echo "$row" | jq -r '.metadata.labels["app.kubernetes.io/component"] // ""') + port=$(guess_port "$pname" "$comp") + + body=$(http_get "$pname" "$port" || echo "__EXEC_FAIL__") + if [[ "$body" == "__EXEC_FAIL__" ]] || [[ -z "$body" ]]; then + append_issue "HTTP /health probe failed for pod \`${pname}\` (port ${port})" "kubectl exec did not return a response from http://127.0.0.1:${port}/health." 3 "Confirm the container image includes wget/curl and the process listens on port ${port}." + continue + fi + if echo "$body" | grep -qiE 'ok|healthy'; then + continue + fi + # Some builds return plain body; treat non-empty short response as OK if no error + if [[ ${#body} -lt 400 ]] && ! echo "$body" | grep -qiE 'error|fail'; then + continue + fi + append_issue "Unexpected /health body from pod \`${pname}\`" "Port ${port}. Body (truncated): $(echo "$body" | head -c 300)" 2 "Review application logs for ${pname} and VictoriaMetrics version-specific health output." +done + +echo "$issues_json" | jq '.' >"$OUTPUT_FILE" +echo "HTTP health check completed. 
Results saved to $OUTPUT_FILE" diff --git a/codebundles/k8s-victoriametrics-healthcheck/check-vm-recent-error-logs.sh b/codebundles/k8s-victoriametrics-healthcheck/check-vm-recent-error-logs.sh new file mode 100755 index 00000000..bd198417 --- /dev/null +++ b/codebundles/k8s-victoriametrics-healthcheck/check-vm-recent-error-logs.sh @@ -0,0 +1,71 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# ----------------------------------------------------------------------------- +# Scans recent container logs on VictoriaMetrics pods for error/panic/fatal lines. +# ----------------------------------------------------------------------------- + +: "${CONTEXT:?Must set CONTEXT}" +: "${NAMESPACE:?Must set NAMESPACE}" + +KUBECTL="${KUBERNETES_DISTRIBUTION_BINARY:-kubectl}" +OUTPUT_FILE="${OUTPUT_FILE:-vm_recent_error_logs_issues.json}" +LOG_TAIL="${VM_LOG_TAIL_LINES:-120}" +LOG_SINCE="${VM_LOG_SINCE:-15m}" +issues_json='[]' + +LABEL_ARGS=() +if [[ -n "${VM_LABEL_SELECTOR:-}" ]]; then + LABEL_ARGS=(-l "${VM_LABEL_SELECTOR}") +fi + +append_issue() { + local title="$1" + local details="$2" + local severity="$3" + local next_steps="$4" + issues_json=$(echo "$issues_json" | jq \ + --arg title "$title" \ + --arg details "$details" \ + --argjson severity "$severity" \ + --arg next_steps "$next_steps" \ + '. += [{title: $title, details: $details, severity: $severity, next_steps: $next_steps}]') +} + +if ! pods_json=$("$KUBECTL" get pods -n "$NAMESPACE" --context "$CONTEXT" "${LABEL_ARGS[@]}" -o json 2>/dev/null); then + append_issue "Cannot list pods for log scan in \`${NAMESPACE}\`" "kubectl get pods failed." 4 "Verify kube access." + echo "$issues_json" | jq '.' 
>"$OUTPUT_FILE" + exit 0 +fi + +mapfile -t all_pods < <(echo "$pods_json" | jq -c '.items[]') +for row in "${all_pods[@]:-}"; do + [[ -z "${row:-}" ]] && continue + echo "$row" | jq -e ' + select( + ((.metadata.labels["app.kubernetes.io/name"] // "") | test("victoria-metrics|vmselect|vminsert|vmstorage|vmagent"; "i")) + or ((.metadata.labels["app.kubernetes.io/component"] // "") | test("vmselect|vminsert|vmstorage|vmagent|single-binary"; "i")) + or ((.metadata.name // "") | test("vmselect|vminsert|vmstorage|vmagent|victoria-metrics"; "i")) + ) + ' >/dev/null 2>&1 || continue + + pname=$(echo "$row" | jq -r '.metadata.name') + phase=$(echo "$row" | jq -r '.status.phase // ""') + [[ "$phase" != "Running" ]] && continue + + containers=$(echo "$row" | jq -r '.spec.containers[].name') + while IFS= read -r cname; do + [[ -z "$cname" ]] && continue + if ! logs=$("$KUBECTL" logs -n "$NAMESPACE" --context "$CONTEXT" "$pname" -c "$cname" --tail="$LOG_TAIL" --since="$LOG_SINCE" 2>/dev/null); then + continue + fi + matches=$(echo "$logs" | grep -iE '(^|[^a-z])(ERROR|panic|fatal|FATAL|PANIC)' | grep -viE 'level=info' | head -15 || true) + if [[ -n "$matches" ]]; then + mtrunc="${matches:0:1800}" + append_issue "Error signatures in logs for \`${pname}\` container \`${cname}\`" "${mtrunc}" 2 "kubectl logs ${pname} -n ${NAMESPACE} -c ${cname} --context ${CONTEXT} --tail=200; correlate with ingestion/query failures." + fi + done <<<"$containers" +done + +echo "$issues_json" | jq '.' >"$OUTPUT_FILE" +echo "Log scan completed. 
Results saved to $OUTPUT_FILE" diff --git a/codebundles/k8s-victoriametrics-healthcheck/check-vm-storage-pvcs.sh b/codebundles/k8s-victoriametrics-healthcheck/check-vm-storage-pvcs.sh new file mode 100755 index 00000000..f97be93c --- /dev/null +++ b/codebundles/k8s-victoriametrics-healthcheck/check-vm-storage-pvcs.sh @@ -0,0 +1,86 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# ----------------------------------------------------------------------------- +# Lists PVCs used by VictoriaMetrics storage workloads; flags bad phases and failing conditions. +# ----------------------------------------------------------------------------- + +: "${CONTEXT:?Must set CONTEXT}" +: "${NAMESPACE:?Must set NAMESPACE}" + +KUBECTL="${KUBERNETES_DISTRIBUTION_BINARY:-kubectl}" +OUTPUT_FILE="${OUTPUT_FILE:-vm_storage_pvc_issues.json}" +issues_json='[]' + +LABEL_ARGS=() +if [[ -n "${VM_LABEL_SELECTOR:-}" ]]; then + LABEL_ARGS=(-l "${VM_LABEL_SELECTOR}") +fi + +append_issue() { + local title="$1" + local details="$2" + local severity="$3" + local next_steps="$4" + issues_json=$(echo "$issues_json" | jq \ + --arg title "$title" \ + --arg details "$details" \ + --argjson severity "$severity" \ + --arg next_steps "$next_steps" \ + '. += [{title: $title, details: $details, severity: $severity, next_steps: $next_steps}]') +} + +if ! pvc_json=$("$KUBECTL" get pvc -n "$NAMESPACE" --context "$CONTEXT" -o json 2>/dev/null); then + append_issue "Cannot list PVCs in \`${NAMESPACE}\`" "kubectl get pvc failed." 4 "Verify RBAC and namespace." + echo "$issues_json" | jq '.' 
>"$OUTPUT_FILE" + exit 0 +fi + +# Resolve PVC name prefixes (<claimTemplate>-<statefulset>) for VM StatefulSets; their PVCs are named <prefix>-<ordinal> +sts_json=$("$KUBECTL" get statefulset -n "$NAMESPACE" --context "$CONTEXT" "${LABEL_ARGS[@]}" -o json 2>/dev/null || echo '{"items":[]}') +vm_claims=$(echo "$sts_json" | jq -r ' + [.items[] | + select( + ((.metadata.labels["app.kubernetes.io/name"] // "") | test("victoria-metrics|vmstorage|vmselect|vminsert|vmagent"; "i")) + or ((.metadata.name // "") | test("vmstorage|vmselect|vminsert"; "i")) + ) | + . as $sts | .spec.volumeClaimTemplates[]? | "\(.metadata.name)-\($sts.metadata.name)" + ] | unique | .[]' 2>/dev/null || true) + +mapfile -t pvc_items < <(echo "$pvc_json" | jq -c '.items[]') +for pvc in "${pvc_items[@]:-}"; do + [[ -z "${pvc:-}" ]] && continue + pname=$(echo "$pvc" | jq -r '.metadata.name') + phase=$(echo "$pvc" | jq -r '.status.phase // "Unknown"') + # Match VM-related PVCs by label, by name pattern, or by StatefulSet claim prefix (PVC name minus its trailing -<ordinal>) + match=false + if echo "$pvc" | jq -e '.metadata.labels["app.kubernetes.io/name"]? | test("victoria-metrics|vmstorage|vmselect|vminsert|vmagent"; "i")' >/dev/null 2>&1; then + match=true + fi + if [[ "$pname" =~ (vmstorage|vm-select|vm-insert|victoria-metrics|vm-) ]]; then + match=true + fi + if [[ -n "$vm_claims" ]] && grep -qxF -- "${pname%-*}" <<<"$vm_claims" 2>/dev/null; then + match=true + fi + [[ "$match" == "false" ]] && continue + + if [[ "$phase" != "Bound" ]]; then + cap=$(echo "$pvc" | jq -r '.status.capacity.storage // "unknown"') + append_issue "PVC \`${pname}\` phase ${phase} in \`${NAMESPACE}\`" "Capacity (if bound): ${cap}. Phase indicates binding or provisioning problem." 3 "kubectl describe pvc ${pname} -n ${NAMESPACE} --context ${CONTEXT}; check StorageClass and provisioner events." 
+  fi
+
+  # Volume binding failures from conditions
+  while IFS= read -r cond; do
+    [[ -z "$cond" ]] && continue
+    ctype=$(echo "$cond" | jq -r '.type')
+    cstat=$(echo "$cond" | jq -r '.status')
+    cmsg=$(echo "$cond" | jq -r '.message // ""')
+    if [[ "$cstat" == "False" ]] && [[ "$ctype" =~ (Resizing|FileSystemResize|VolumeBinding) ]]; then
+      append_issue "PVC \`${pname}\` condition ${ctype} is False" "${cmsg}" 3 "Review storage class, quota, and events for ${pname}."
+    fi
+  done < <(echo "$pvc" | jq -c '.status.conditions[]? // empty')
+done
+
+echo "$issues_json" | jq '.' >"$OUTPUT_FILE"
+echo "PVC analysis completed. Results saved to $OUTPUT_FILE"
diff --git a/codebundles/k8s-victoriametrics-healthcheck/check-vm-workload-readiness.sh b/codebundles/k8s-victoriametrics-healthcheck/check-vm-workload-readiness.sh
new file mode 100755
index 00000000..c72d1e78
--- /dev/null
+++ b/codebundles/k8s-victoriametrics-healthcheck/check-vm-workload-readiness.sh
@@ -0,0 +1,132 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+# -----------------------------------------------------------------------------
+# Discovers VictoriaMetrics-labeled workloads and reports unhealthy pods or
+# rollout conditions. Writes JSON array issues to OUTPUT_FILE.
+# Env: CONTEXT, NAMESPACE, KUBERNETES_DISTRIBUTION_BINARY, VM_LABEL_SELECTOR (optional)
+# A failed pod listing is recorded as a severity-4 issue and the script still
+# exits 0, so OUTPUT_FILE holds a JSON array on that path as well.
+# -----------------------------------------------------------------------------
+
+: "${CONTEXT:?Must set CONTEXT}"
+: "${NAMESPACE:?Must set NAMESPACE}"
+
+KUBECTL="${KUBERNETES_DISTRIBUTION_BINARY:-kubectl}"
+OUTPUT_FILE="${OUTPUT_FILE:-vm_workload_readiness_issues.json}"
+issues_json='[]'
+
+# Optional label selector, held in an array so it is passed as one argument.
+LABEL_ARGS=()
+if [[ -n "${VM_LABEL_SELECTOR:-}" ]]; then
+  LABEL_ARGS=(-l "${VM_LABEL_SELECTOR}")
+fi
+
+# Single jq predicate used for both pods and workloads so the two scans agree
+# on what counts as a VictoriaMetrics object: well-known labels first, then
+# the object name as a fallback.
+VM_MATCH_JQ='
+  ((.metadata.labels["app.kubernetes.io/name"] // "") | test("victoria-metrics|vmselect|vminsert|vmstorage|vmagent"; "i"))
+  or ((.metadata.labels["app.kubernetes.io/name"] // "") | test("^vm-"; "i"))
+  or ((.metadata.labels["app.kubernetes.io/component"] // "") | test("vmselect|vminsert|vmstorage|vmagent|single-binary"; "i"))
+  or ((.metadata.name // "") | test("vmselect|vminsert|vmstorage|vmagent|victoria"; "i"))
+'
+
+# Appends one issue object to the in-memory issues_json array.
+#   $1 title   $2 details   $3 severity (JSON number)   $4 next steps
+append_issue() {
+  local title="$1"
+  local details="$2"
+  local severity="$3"
+  local next_steps="$4"
+  issues_json=$(echo "$issues_json" | jq \
+    --arg title "$title" \
+    --arg details "$details" \
+    --argjson severity "$severity" \
+    --arg next_steps "$next_steps" \
+    '. += [{title: $title, details: $details, severity: $severity, next_steps: $next_steps}]')
+}
+
+# Prints `$KUBECTL get <kind> -o json` for the target namespace and context.
+# The ${arr[@]+...} form keeps an empty LABEL_ARGS from aborting under
+# `set -u` on bash releases older than 4.4.
+kube_get_json() {
+  "$KUBECTL" get "$1" -n "$NAMESPACE" --context "$CONTEXT" \
+    ${LABEL_ARGS[@]+"${LABEL_ARGS[@]}"} -o json 2>/dev/null
+}
+
+# Succeeds when the JSON object in $1 matches VM_MATCH_JQ.
+is_vm_pod() {
+  echo "$1" | jq -e "select(${VM_MATCH_JQ})" >/dev/null 2>&1
+}
+
+if ! pods_json=$(kube_get_json pods); then
+  append_issue "Cannot list pods in namespace \`${NAMESPACE}\`" "kubectl get pods failed; verify context, kubeconfig, and RBAC." 4 "Confirm kubeconfig secret, context name, and namespace exist."
+  echo "$issues_json" | jq '.' >"$OUTPUT_FILE"
+  echo "Wrote $OUTPUT_FILE"
+  exit 0
+fi
+
+mapfile -t all_pods < <(echo "$pods_json" | jq -c '.items[]')
+for row in "${all_pods[@]:-}"; do
+  # The :- default yields one empty element when no pods were returned.
+  [[ -z "$row" ]] && continue
+  is_vm_pod "$row" || continue
+  pname=$(echo "$row" | jq -r '.metadata.name')
+  phase=$(echo "$row" | jq -r '.status.phase // "Unknown"')
+  if [[ "$phase" == "Pending" ]]; then
+    append_issue "Pod \`${pname}\` stuck Pending in \`${NAMESPACE}\`" "VictoriaMetrics workload pod not scheduled." 3 "kubectl describe pod ${pname} -n ${NAMESPACE} --context ${CONTEXT}"
+    continue
+  fi
+  # Waiting/terminated reasons that indicate a broken container.
+  while IFS= read -r cs; do
+    [[ -z "$cs" ]] && continue
+    wr=$(echo "$cs" | jq -r '.state.waiting.reason // empty')
+    tr=$(echo "$cs" | jq -r '.state.terminated.reason // empty')
+    cname=$(echo "$cs" | jq -r '.name // "container"')
+    if [[ "$wr" =~ ^(CrashLoopBackOff|ImagePullBackOff|ErrImagePull|CreateContainerConfigError)$ ]]; then
+      append_issue "Container \`${cname}\` unhealthy on \`${pname}\` (${wr})" "Namespace \`${NAMESPACE}\`, phase=${phase}." 3 "kubectl logs ${pname} -n ${NAMESPACE} --context ${CONTEXT} -c ${cname}"
+    elif [[ "$tr" =~ ^(OOMKilled|Error)$ ]]; then
+      append_issue "Container \`${cname}\` terminated (${tr}) on \`${pname}\`" "Namespace \`${NAMESPACE}\`." 3 "Review logs and resource limits for ${pname}."
+    fi
+  done < <(echo "$row" | jq -c '.status.containerStatuses[]? // empty')
+  # Running while Ready is False is reported separately from container states.
+  if echo "$row" | jq -e '.status.conditions[]? | select(.type=="Ready" and .status=="False")' >/dev/null 2>&1; then
+    if [[ "$phase" == "Running" ]]; then
+      nr=$(echo "$row" | jq -r '.status.conditions[]? | select(.type=="Ready") | .reason // "NotReady"')
+      append_issue "Pod \`${pname}\` not Ready in \`${NAMESPACE}\`" "Ready condition: ${nr}" 3 "Check readiness probes and dependencies for ${pname}."
+    fi
+  fi
+done
+
+# Rollout conditions (Available / Progressing) on matching workloads.
+# NOTE(review): StatefulSet and DaemonSet status usually carries no such
+# conditions, so this mostly covers Deployments - confirm if a replica check is wanted.
+for kind in deployment statefulset daemonset; do
+  if ! res_json=$(kube_get_json "$kind"); then
+    # Best effort: a kind that cannot be listed is skipped, not reported.
+    continue
+  fi
+  mapfile -t witems < <(echo "$res_json" | jq -c ".items[] | select(${VM_MATCH_JQ})")
+  for item in "${witems[@]:-}"; do
+    [[ -z "${item:-}" ]] && continue
+    iname=$(echo "$item" | jq -r '.metadata.name')
+    while IFS= read -r cond; do
+      [[ -z "$cond" ]] && continue
+      ctype=$(echo "$cond" | jq -r '.type')
+      cstat=$(echo "$cond" | jq -r '.status')
+      creason=$(echo "$cond" | jq -r '.reason // ""')
+      cmsg=$(echo "$cond" | jq -r '.message // ""')
+      if [[ "$cstat" == "False" ]]; then
+        if [[ "$ctype" == "Available" ]]; then
+          append_issue "${kind}/${iname} is not Available (${creason})" "${cmsg}" 3 "kubectl describe ${kind} ${iname} -n ${NAMESPACE} --context ${CONTEXT}"
+        elif [[ "$ctype" == "Progressing" ]] && echo "${creason}${cmsg}" | grep -qiE 'ProgressDeadlineExceeded|ReplicaFailure|Failed|error'; then
+          append_issue "${kind}/${iname} rollout stalled (${creason})" "${cmsg}" 3 "kubectl describe ${kind} ${iname} -n ${NAMESPACE} --context ${CONTEXT}"
+        fi
+      fi
+    done < <(echo "$item" | jq -c '.status.conditions[]? // empty')
+  done
+done
+
+echo "$issues_json" | jq '.' >"$OUTPUT_FILE"
+echo "Analysis completed. Results saved to $OUTPUT_FILE"
diff --git a/codebundles/k8s-victoriametrics-healthcheck/runbook.robot b/codebundles/k8s-victoriametrics-healthcheck/runbook.robot
new file mode 100644
index 00000000..bbb74d8d
--- /dev/null
+++ b/codebundles/k8s-victoriametrics-healthcheck/runbook.robot
@@ -0,0 +1,263 @@
+*** Settings ***
+Documentation       Validates VictoriaMetrics workloads on Kubernetes: pod readiness, PVC health, HTTP /health probes, vmselect cluster status when applicable, and recent error logs.
+Metadata            Author    rw-codebundle-agent
+Metadata            Display Name    Kubernetes VictoriaMetrics Health Check
+Metadata            Supports    Kubernetes AKS EKS GKE OpenShift VictoriaMetrics
+
+Force Tags          Kubernetes    VictoriaMetrics    Health    Namespace
+
+Library             String
+Library             BuiltIn
+Library             RW.Core
+Library             RW.CLI
+Library             RW.platform
+Library             RW.K8sHelper
+
+Suite Setup         Suite Initialization
+
+
+*** Tasks ***
+Verify VictoriaMetrics Workload Pod Readiness for Namespace `${NAMESPACE}`
+    [Documentation]    Lists Deployments, StatefulSets, and DaemonSets scoped to VictoriaMetrics labels and reports pods not Ready, CrashLoopBackOff, ImagePullBackOff, or failed rollout conditions.
+    [Tags]    Kubernetes    VictoriaMetrics    access:read-only    data:logs-config
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=check-vm-workload-readiness.sh
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    show_in_rwl_cheatsheet=true
+    ...    cmd_override=./check-vm-workload-readiness.sh
+
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat vm_workload_readiness_issues.json
+    ...    env=${env}
+    ...    include_in_history=false
+
+    TRY
+        ${issue_list}=    Evaluate    json.loads(r'''${issues.stdout}''')    json
+    EXCEPT
+        Log    Failed to parse JSON for workload readiness task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len(@{issue_list}) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...
severity=${issue['severity']}
+            ...    expected=VictoriaMetrics workloads in `${NAMESPACE}` should have healthy pods and rollouts
+            ...    actual=Unhealthy workload or pod state detected for VictoriaMetrics components
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    VictoriaMetrics workload readiness analysis:\n${result.stdout}
+
+Check VictoriaMetrics Storage PVCs in Namespace `${NAMESPACE}`
+    [Documentation]    Lists PVCs associated with VictoriaMetrics storage (especially vmstorage) and flags Pending, Failed, Lost, or binding problems.
+    [Tags]    Kubernetes    VictoriaMetrics    storage    access:read-only    data:logs-config
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=check-vm-storage-pvcs.sh
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    show_in_rwl_cheatsheet=false
+    ...    cmd_override=./check-vm-storage-pvcs.sh
+
+    ${issues}=    RW.CLI.Run Cli    # reads back the issue file the script is expected to write
+    ...    cmd=cat vm_storage_pvc_issues.json
+    ...    env=${env}
+    ...    include_in_history=false
+
+    TRY    # stdout that is not valid JSON falls back to an empty list
+        ${issue_list}=    Evaluate    json.loads(r'''${issues.stdout}''')    json
+    EXCEPT
+        Log    Failed to parse JSON for PVC task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len(@{issue_list}) > 0
+        FOR    ${issue}    IN    @{issue_list}    # one platform issue per reported entry
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=VictoriaMetrics PVCs in `${NAMESPACE}` should be Bound and healthy
+            ...    actual=Storage claim issue detected for VictoriaMetrics workloads
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    VictoriaMetrics PVC analysis:\n${result.stdout}
+
+Probe VictoriaMetrics HTTP Health Endpoints in Namespace `${NAMESPACE}`
+    [Documentation]    For each running VictoriaMetrics component pod, curls localhost /health via kubectl exec using default ports per component (single, vmselect, vminsert, vmstorage, vmagent).
+    [Tags]    Kubernetes    VictoriaMetrics    access:read-only    data:metrics
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=check-vm-http-health.sh
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
+    ...    timeout_seconds=240
+    ...    include_in_history=false
+    ...    show_in_rwl_cheatsheet=false
+    ...    cmd_override=./check-vm-http-health.sh
+
+    ${issues}=    RW.CLI.Run Cli    # reads back the issue file the script is expected to write
+    ...    cmd=cat vm_http_health_issues.json
+    ...    env=${env}
+    ...    include_in_history=false
+
+    TRY    # stdout that is not valid JSON falls back to an empty list
+        ${issue_list}=    Evaluate    json.loads(r'''${issues.stdout}''')    json
+    EXCEPT
+        Log    Failed to parse JSON for HTTP health task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len(@{issue_list}) > 0
+        FOR    ${issue}    IN    @{issue_list}    # one platform issue per reported entry
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=VictoriaMetrics `/health` endpoints should respond successfully inside each component pod
+            ...    actual=HTTP health check failed or returned unexpected body for a VictoriaMetrics pod
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    VictoriaMetrics HTTP health probes:\n${result.stdout}
+
+Check VictoriaMetrics Cluster Status API for vmselect in Namespace `${NAMESPACE}`
+    [Documentation]    When cluster mode is active, requests vmselect cluster status JSON and flags degraded storage or unreachable status APIs.
+    [Tags]    Kubernetes    VictoriaMetrics    cluster    access:read-only    data:metrics
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=check-vm-cluster-status.sh
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    show_in_rwl_cheatsheet=false
+    ...    cmd_override=./check-vm-cluster-status.sh
+
+    ${issues}=    RW.CLI.Run Cli    # reads back the issue file the script is expected to write
+    ...    cmd=cat vm_cluster_status_issues.json
+    ...    env=${env}
+    ...    include_in_history=false
+
+    TRY    # stdout that is not valid JSON falls back to an empty list
+        ${issue_list}=    Evaluate    json.loads(r'''${issues.stdout}''')    json
+    EXCEPT
+        Log    Failed to parse JSON for cluster status task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len(@{issue_list}) > 0
+        FOR    ${issue}    IN    @{issue_list}    # one platform issue per reported entry
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=vmselect cluster status should report healthy storage connectivity
+            ...    actual=Cluster status check reported a problem or could not query the API
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    VictoriaMetrics cluster status:\n${result.stdout}
+
+Scan VictoriaMetrics Recent Logs for Errors in Namespace `${NAMESPACE}`
+    [Documentation]    Greps recent container logs on VictoriaMetrics pods for ERROR, panic, or fatal patterns to catch runtime failures not visible from phase alone.
+    [Tags]    Kubernetes    VictoriaMetrics    logs    access:read-only    data:logs-regexp
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=check-vm-recent-error-logs.sh
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
+    ...    timeout_seconds=300
+    ...    include_in_history=false
+    ...    show_in_rwl_cheatsheet=false
+    ...    cmd_override=./check-vm-recent-error-logs.sh
+
+    ${issues}=    RW.CLI.Run Cli    # reads back the issue file the script is expected to write
+    ...    cmd=cat vm_recent_error_logs_issues.json
+    ...    env=${env}
+    ...    include_in_history=false
+
+    TRY    # stdout that is not valid JSON falls back to an empty list
+        ${issue_list}=    Evaluate    json.loads(r'''${issues.stdout}''')    json
+    EXCEPT
+        Log    Failed to parse JSON for log scan task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len(@{issue_list}) > 0
+        FOR    ${issue}    IN    @{issue_list}    # one platform issue per reported entry
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=VictoriaMetrics logs should be free of repeated ERROR, panic, or fatal lines under normal operation
+            ...    actual=Error signature lines found in recent VictoriaMetrics container logs
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    VictoriaMetrics log scan:\n${result.stdout}
+
+
+*** Keywords ***
+Suite Initialization    # imports the secret and user variables shared by every task
+    ${kubeconfig}=    RW.Core.Import Secret
+    ...    kubeconfig
+    ...    type=string
+    ...    description=The kubernetes kubeconfig yaml containing connection configuration used to connect to cluster(s).
+    ...    pattern=\w*
+    ${KUBERNETES_DISTRIBUTION_BINARY}=    RW.Core.Import User Variable    KUBERNETES_DISTRIBUTION_BINARY
+    ...    type=string
+    ...    description=Which binary to use for Kubernetes CLI commands.
+    ...    enum=[kubectl,oc]
+    ...    default=kubectl
+    ${CONTEXT}=    RW.Core.Import User Variable    CONTEXT
+    ...    type=string
+    ...    description=Kubernetes context to use.
+    ...    pattern=\w*
+    ${NAMESPACE}=    RW.Core.Import User Variable    NAMESPACE
+    ...    type=string
+    ...    description=Namespace where VictoriaMetrics workloads run.
+    ...    pattern=\w*
+    ${VM_LABEL_SELECTOR}=    RW.Core.Import User Variable    VM_LABEL_SELECTOR    # optional; defaults to empty
+    ...    type=string
+    ...    description=Optional label selector to scope pods (e.g. app.kubernetes.io/instance=my-vm).
+    ...    pattern=.*
+    ...    default=${EMPTY}
+    ${VM_DEPLOYMENT_MODE}=    RW.Core.Import User Variable    VM_DEPLOYMENT_MODE
+    ...    type=string
+    ...    description=single, cluster, or auto (detect vmselect vs single-node).
+    ...    pattern=\w*
+    ...
default=auto
+    Set Suite Variable    ${kubeconfig}    ${kubeconfig}
+    Set Suite Variable    ${KUBERNETES_DISTRIBUTION_BINARY}    ${KUBERNETES_DISTRIBUTION_BINARY}
+    Set Suite Variable    ${CONTEXT}    ${CONTEXT}
+    Set Suite Variable    ${NAMESPACE}    ${NAMESPACE}
+    Set Suite Variable    ${VM_LABEL_SELECTOR}    ${VM_LABEL_SELECTOR}
+    Set Suite Variable    ${VM_DEPLOYMENT_MODE}    ${VM_DEPLOYMENT_MODE}
+    Set Suite Variable    ${env}    {"KUBECONFIG":"./${kubeconfig.key}", "KUBERNETES_DISTRIBUTION_BINARY":"${KUBERNETES_DISTRIBUTION_BINARY}", "CONTEXT":"${CONTEXT}", "NAMESPACE":"${NAMESPACE}", "VM_LABEL_SELECTOR":"${VM_LABEL_SELECTOR}", "VM_DEPLOYMENT_MODE":"${VM_DEPLOYMENT_MODE}"}
+
+    RW.K8sHelper.Verify Cluster Connectivity
+    ...    binary=${KUBERNETES_DISTRIBUTION_BINARY}
+    ...    context=${CONTEXT}
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
diff --git a/codebundles/k8s-victoriametrics-healthcheck/sli-vm-metrics.sh b/codebundles/k8s-victoriametrics-healthcheck/sli-vm-metrics.sh
new file mode 100755
index 00000000..4b6379d2
--- /dev/null
+++ b/codebundles/k8s-victoriametrics-healthcheck/sli-vm-metrics.sh
@@ -0,0 +1,60 @@
+#!/usr/bin/env bash
+set -euo pipefail
+set -x
+# -----------------------------------------------------------------------------
+# Lightweight JSON metrics for sli.robot: readiness_score and pvc_score (0 or 1).
+# Env: CONTEXT, NAMESPACE, KUBERNETES_DISTRIBUTION_BINARY, VM_LABEL_SELECTOR (optional)
+# A sub-score stays 0 when its kubectl query fails, so an unreachable cluster
+# is not reported as healthy.
+# -----------------------------------------------------------------------------
+
+: "${CONTEXT:?Must set CONTEXT}"
+: "${NAMESPACE:?Must set NAMESPACE}"
+
+KUBECTL="${KUBERNETES_DISTRIBUTION_BINARY:-kubectl}"
+LABEL_ARGS=()
+if [[ -n "${VM_LABEL_SELECTOR:-}" ]]; then
+  LABEL_ARGS=(-l "${VM_LABEL_SELECTOR}")
+fi
+
+readiness_score=0
+pvc_score=0
+
+# The ${arr[@]+...} form keeps an empty LABEL_ARGS from aborting under
+# `set -u` on bash releases older than 4.4.
+if pods_json=$("$KUBECTL" get pods -n "$NAMESPACE" --context "$CONTEXT" ${LABEL_ARGS[@]+"${LABEL_ARGS[@]}"} -o json 2>/dev/null); then
+  # VictoriaMetrics pods that are Running while their Ready condition is False.
+  unready=$(
+    echo "$pods_json" | jq '[.items[] |
+      select(
+        ((.metadata.labels["app.kubernetes.io/name"] // "") | test("victoria-metrics|vmselect|vminsert|vmstorage|vmagent"; "i"))
+        or ((.metadata.labels["app.kubernetes.io/component"] // "") | test("vmselect|vminsert|vmstorage|vmagent|single-binary"; "i"))
+        or ((.metadata.name // "") | test("vmselect|vminsert|vmstorage|vmagent|victoria-metrics"; "i"))
+      ) |
+      select(.status.phase=="Running") |
+      select(.status.conditions[]? | select(.type=="Ready" and .status=="False"))
+    ] | length'
+  )
+  if [[ "$unready" -eq 0 ]]; then
+    readiness_score=1
+  fi
+fi
+
+# PVCs are matched by name or label only; VM_LABEL_SELECTOR is not applied here.
+if pvc_json=$("$KUBECTL" get pvc -n "$NAMESPACE" --context "$CONTEXT" -o json 2>/dev/null); then
+  bad_pvc=$(
+    echo "$pvc_json" | jq '[.items[] |
+      select(
+        ((.metadata.name // "") | test("vmstorage|vmselect|vminsert|victoria-metrics|vm-"; "i"))
+        or ((.metadata.labels["app.kubernetes.io/name"] // "") | test("victoria-metrics|vmstorage"; "i"))
+      ) |
+      select(.status.phase != "Bound")
+    ] | length'
+  )
+  if [[ "$bad_pvc" -eq 0 ]]; then
+    pvc_score=1
+  fi
+fi
+
+jq -n --argjson readiness_score "$readiness_score" --argjson pvc_score "$pvc_score" \
+  '{readiness_score: $readiness_score, pvc_score: $pvc_score}'
diff --git a/codebundles/k8s-victoriametrics-healthcheck/sli.robot b/codebundles/k8s-victoriametrics-healthcheck/sli.robot
new file mode 100644
index 00000000..ed2fa250
--- /dev/null
+++ b/codebundles/k8s-victoriametrics-healthcheck/sli.robot
@@ -0,0 +1,78 @@
+*** Settings ***
+Documentation       Measures VictoriaMetrics namespace health using pod readiness and PVC binding for discovered workloads. Produces a value between 0 (failing) and 1 (healthy).
+Metadata            Author    rw-codebundle-agent
+Metadata            Display Name    Kubernetes VictoriaMetrics Health Check
+Metadata            Supports    Kubernetes AKS EKS GKE OpenShift VictoriaMetrics
+
+Suite Setup         Suite Initialization
+Library             BuiltIn
+Library             RW.Core
+Library             RW.CLI
+Library             RW.platform
+Library             OperatingSystem
+Library             Collections
+
+
+*** Keywords ***
+Suite Initialization
+    ${kubeconfig}=    RW.Core.Import Secret
+    ...    kubeconfig
+    ...    type=string
+    ...    description=The kubernetes kubeconfig yaml containing connection configuration used to connect to cluster(s).
+    ...    pattern=\w*
+    ${NAMESPACE}=    RW.Core.Import User Variable    NAMESPACE
+    ...    type=string
+    ...    description=The Kubernetes namespace where VictoriaMetrics runs.
+    ...    pattern=\w*
+    ${CONTEXT}=    RW.Core.Import User Variable    CONTEXT
+    ...    type=string
+    ...    description=Which Kubernetes context to operate within.
+    ...    pattern=\w*
+    ${KUBERNETES_DISTRIBUTION_BINARY}=    RW.Core.Import User Variable    KUBERNETES_DISTRIBUTION_BINARY
+    ...    type=string
+    ...    description=Which binary to use for Kubernetes CLI commands.
+    ...    enum=[kubectl,oc]
+    ...    default=kubectl
+    ${VM_LABEL_SELECTOR}=    RW.Core.Import User Variable    VM_LABEL_SELECTOR
+    ...    type=string
+    ...    description=Optional label selector to scope VictoriaMetrics pods.
+    ...    pattern=.*
+    ...    default=${EMPTY}
+    Set Suite Variable    ${kubeconfig}    ${kubeconfig}
+    Set Suite Variable    ${NAMESPACE}    ${NAMESPACE}
+    Set Suite Variable    ${CONTEXT}    ${CONTEXT}
+    Set Suite Variable    ${KUBERNETES_DISTRIBUTION_BINARY}    ${KUBERNETES_DISTRIBUTION_BINARY}
+    Set Suite Variable    ${VM_LABEL_SELECTOR}    ${VM_LABEL_SELECTOR}
+    Set Suite Variable    ${env}    {"KUBECONFIG":"./${kubeconfig.key}", "KUBERNETES_DISTRIBUTION_BINARY":"${KUBERNETES_DISTRIBUTION_BINARY}", "CONTEXT":"${CONTEXT}", "NAMESPACE":"${NAMESPACE}", "VM_LABEL_SELECTOR":"${VM_LABEL_SELECTOR}"}
+
+
+*** Tasks ***
+Collect VictoriaMetrics SLI Metrics in Namespace `${NAMESPACE}`
+    [Documentation]    Runs a lightweight kubectl+jq check for unready VM pods and unbound VM-related PVCs; emits binary sub-scores.
+    [Tags]    access:read-only    data:metrics
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=sli-vm-metrics.sh
+    ...    env=${env}
+    ...    secret_file__kubeconfig=${kubeconfig}
+    ...    timeout_seconds=30
+    ...    include_in_history=false
+    ...    cmd_override=./sli-vm-metrics.sh
+
+    TRY
+        ${metrics}=    Evaluate    json.loads(r'''${result.stdout}''')    json
+        ${rs}=    Evaluate    int($metrics['readiness_score'])
+        ${ps}=    Evaluate    int($metrics['pvc_score'])
+    EXCEPT
+        Log    SLI JSON parse failed; scoring degraded to 0.    WARN
+        ${rs}=    Set Variable    0
+        ${ps}=    Set Variable    0
+    END
+
+    RW.Core.Push Metric    ${rs}    sub_name=vm_readiness
+    RW.Core.Push Metric    ${ps}    sub_name=vm_pvc
+
+    ${health_score}=    Evaluate    (float(${rs}) + float(${ps})) / 2
+    ${health_score}=    Convert to Number    ${health_score}    2
+    RW.Core.Add to Report    VictoriaMetrics health score: ${health_score} (readiness=${rs}, pvc=${ps})
+    RW.Core.Push Metric    ${health_score}