diff --git a/codebundles/k8s-pgbouncer-prometheus-health/.runwhen/generation-rules/k8s-pgbouncer-prometheus-health.yaml b/codebundles/k8s-pgbouncer-prometheus-health/.runwhen/generation-rules/k8s-pgbouncer-prometheus-health.yaml new file mode 100644 index 00000000..6f073de7 --- /dev/null +++ b/codebundles/k8s-pgbouncer-prometheus-health/.runwhen/generation-rules/k8s-pgbouncer-prometheus-health.yaml @@ -0,0 +1,21 @@ +apiVersion: runwhen.com/v1 +kind: GenerationRules +spec: + generationRules: + - resourceTypes: + - service + matchRules: + - type: pattern + pattern: "pgbouncer" + properties: ["name"] + mode: substring + slxs: + - baseName: pgb-pgb-hc + shortenedBaseName: pgb-pgb-hc + qualifiers: ["resource", "namespace", "cluster"] + baseTemplateName: k8s-pgbouncer-prometheus-health + levelOfDetail: basic + outputItems: + - type: slx + - type: runbook + templateName: k8s-pgbouncer-prometheus-health-taskset.yaml diff --git a/codebundles/k8s-pgbouncer-prometheus-health/.runwhen/templates/k8s-pgbouncer-prometheus-health-slx.yaml b/codebundles/k8s-pgbouncer-prometheus-health/.runwhen/templates/k8s-pgbouncer-prometheus-health-slx.yaml new file mode 100644 index 00000000..c45472e8 --- /dev/null +++ b/codebundles/k8s-pgbouncer-prometheus-health/.runwhen/templates/k8s-pgbouncer-prometheus-health-slx.yaml @@ -0,0 +1,31 @@ +apiVersion: runwhen.com/v1 +kind: ServiceLevelX +metadata: + name: {{ slx_name }} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/kubernetes/resources/labeled/svc.svg + alias: {{ match_resource.resource.metadata.name }} Kubernetes PgBouncer Prometheus Health + asMeasuredBy: PgBouncer pool saturation, wait queues, exporter availability, and capacity signals from Prometheus. 
+ configProvided: + - name: SLX_PLACEHOLDER + value: SLX_PLACEHOLDER + owners: + - {{ workspace.owner_email }} + statement: Monitor PgBouncer connection pool health using Prometheus metrics from the pgbouncer exporter for this service scope. + additionalContext: + {% include "kubernetes-hierarchy.yaml" ignore missing %} + qualified_name: "{{ match_resource.qualified_name }}" + tags: + {% include "kubernetes-tags.yaml" ignore missing %} + - name: platform + value: kubernetes + - name: resource + value: service + - name: scope + value: namespace + - name: access + value: read-only diff --git a/codebundles/k8s-pgbouncer-prometheus-health/.runwhen/templates/k8s-pgbouncer-prometheus-health-taskset.yaml b/codebundles/k8s-pgbouncer-prometheus-health/.runwhen/templates/k8s-pgbouncer-prometheus-health-taskset.yaml new file mode 100644 index 00000000..20991eb1 --- /dev/null +++ b/codebundles/k8s-pgbouncer-prometheus-health/.runwhen/templates/k8s-pgbouncer-prometheus-health-taskset.yaml @@ -0,0 +1,53 @@ +apiVersion: runwhen.com/v1 +kind: Runbook +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + location: {{default_location}} + description: Evaluates PgBouncer pool health via Prometheus metrics for service {{ match_resource.resource.metadata.name }} in namespace {{ match_resource.resource.metadata.namespace }}. 
+ codeBundle: + {% if repo_url %} + repoUrl: {{repo_url}} + {% else %} + repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git + {% endif %} + {% if ref %} + ref: {{ref}} + {% else %} + ref: main + {% endif %} + pathToRobot: codebundles/k8s-pgbouncer-prometheus-health/runbook.robot + configProvided: + - name: PROMETHEUS_URL + value: "{{ custom.prometheus_url | default('https://prometheus.example/api/v1') }}" + - name: PGBOUNCER_JOB_LABEL + value: "{{ custom.pgbouncer_job_label | default('job=\"pgbouncer-exporter\"') }}" + - name: METRIC_NAMESPACE_FILTER + value: 'kubernetes_namespace="{{ match_resource.resource.metadata.namespace }}"' + - name: EXPECTED_POOL_MODE + value: "{{ custom.pgbouncer_expected_pool_mode | default('transaction') }}" + - name: CONTEXT + value: "{{ context }}" + - name: KUBERNETES_DISTRIBUTION_BINARY + value: "{{ custom.kubernetes_distribution_binary | default('kubectl') }}" + - name: PGBOUNCER_NAMESPACE + value: "{{ match_resource.resource.metadata.namespace }}" + - name: PGBOUNCER_POD_LABEL_SELECTOR + value: "{{ custom.pgbouncer_pod_label_selector | default('app.kubernetes.io/name=pgbouncer-exporter') }}" + - name: CLIENT_SATURATION_PERCENT_THRESHOLD + value: "{{ custom.pgbouncer_client_saturation_percent | default('80') }}" + - name: MAX_WAIT_SECONDS_THRESHOLD + value: "{{ custom.pgbouncer_max_wait_seconds | default('1') }}" + secretsProvided: + {% if wb_version %} + {% include "kubernetes-auth.yaml" ignore missing %} + {% else %} + - name: kubeconfig + workspaceKey: {{ custom.kubeconfig_secret_name | default("kubeconfig") }} + {% endif %} + - name: prometheus_bearer_token + workspaceKey: {{ custom.prometheus_bearer_token_secret | default("AUTH DETAILS NOT FOUND") }} diff --git a/codebundles/k8s-pgbouncer-prometheus-health/.test/Taskfile.yaml b/codebundles/k8s-pgbouncer-prometheus-health/.test/Taskfile.yaml new file mode 100644 index 00000000..5b6c1b8b --- /dev/null +++ 
b/codebundles/k8s-pgbouncer-prometheus-health/.test/Taskfile.yaml @@ -0,0 +1,95 @@ +version: "3" + +tasks: + default: + desc: "Run/refresh config" + cmds: + - task: check-unpushed-commits + - task: generate-rwl-config + - task: run-rwl-discovery + + clean: + desc: "Run cleanup tasks" + cmds: + - task: remove-kubernetes-objects + - task: clean-rwl-discovery + + build-infra: + desc: "Build test infrastructure" + cmds: + - task: create-kubernetes-objects + + create-kubernetes-objects: + desc: "Apply manifests from kubernetes directory using kubectl" + cmds: + - kubectl apply -f kubernetes/ + silent: true + + remove-kubernetes-objects: + desc: "Delete kubernetes objects" + cmds: + - kubectl delete -f kubernetes/ || true + silent: true + + check-unpushed-commits: + desc: Check if outstanding commits or file updates need to be pushed before testing. + vars: + BASE_DIR: "../" + cmds: + - | + echo "Checking for uncommitted changes in $BASE_DIR and $BASE_DIR.runwhen, excluding '.test'..." + UNCOMMITTED_FILES=$(git diff --name-only HEAD | grep -E "^${BASE_DIR}(\.runwhen|[^/]+)" | grep -v "/\.test/" || true) + if [ -n "$UNCOMMITTED_FILES" ]; then + echo "Uncommitted changes found:" + echo "$UNCOMMITTED_FILES" + exit 1 + fi + echo "No uncommitted changes in specified directories." 
+ silent: true + + generate-rwl-config: + desc: "Generate RunWhen Local configuration (workspaceInfo.yaml)" + env: + RW_WORKSPACE: '{{.RW_WORKSPACE | default "my-workspace"}}' + cmds: + - | + repo_url=$(git config --get remote.origin.url) + branch_name=$(git rev-parse --abbrev-ref HEAD) + codebundle=$(basename "$(dirname "$PWD")") + namespace=$(yq e 'select(.kind == "Namespace") | .metadata.name' kubernetes/manifest.yaml -N) + cat <<EOF > workspaceInfo.yaml + workspaceName: "$RW_WORKSPACE" + workspaceOwnerEmail: authors@runwhen.com + defaultLocation: location-01 + defaultLOD: none + cloudConfig: + kubernetes: + kubeconfigFile: /shared/kubeconfig + namespaceLODs: + $namespace: detailed + namespaces: + - $namespace + codeCollections: + - repoURL: "$repo_url" + branch: "$branch_name" + codeBundles: ["$codebundle"] + custom: + kubeconfig_secret_name: "kubeconfig" + kubernetes_distribution_binary: kubectl + EOF + silent: true + + run-rwl-discovery: + desc: "Run RunWhen Local Discovery on test infrastructure" + cmds: + - | + echo "Discovery requires docker and a valid kubeconfig; see README in parent CodeBundle." 
+ silent: true + + clean-rwl-discovery: + desc: "Check and clean up RunWhen Local discovery output" + cmds: + - | + rm -rf output + rm -f workspaceInfo.yaml + silent: true diff --git a/codebundles/k8s-pgbouncer-prometheus-health/.test/kubernetes/manifest.yaml b/codebundles/k8s-pgbouncer-prometheus-health/.test/kubernetes/manifest.yaml new file mode 100644 index 00000000..7448f801 --- /dev/null +++ b/codebundles/k8s-pgbouncer-prometheus-health/.test/kubernetes/manifest.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: test-pgbouncer-hc +--- +apiVersion: v1 +kind: Service +metadata: + name: pgbouncer-proxy + namespace: test-pgbouncer-hc + labels: + app.kubernetes.io/name: pgbouncer +spec: + ports: + - port: 5432 + targetPort: 5432 + selector: + app.kubernetes.io/name: pgbouncer diff --git a/codebundles/k8s-pgbouncer-prometheus-health/README.md b/codebundles/k8s-pgbouncer-prometheus-health/README.md new file mode 100644 index 00000000..eafa10f4 --- /dev/null +++ b/codebundles/k8s-pgbouncer-prometheus-health/README.md @@ -0,0 +1,85 @@ +# Kubernetes PgBouncer Prometheus Health + +This CodeBundle evaluates PgBouncer connection pool health using Prometheus metrics from the [prometheus-community/pgbouncer_exporter](https://github.com/prometheus-community/pgbouncer_exporter) (or compatible scrapes), with optional kubectl validation of `pool_mode` when a kubeconfig and namespace are supplied. + +## Overview + +- **Exporter and process availability**: detects `pgbouncer_up` failures per scrape target. +- **Saturation and waits**: compares active and waiting clients to `pgbouncer_config_max_client_connections`, flags wait queues, max wait time, and server-side balance patterns. +- **Configuration drift**: optional pool mode validation via metric labels or `pgbouncer.ini` read through kubectl. +- **Distribution and outliers**: ranks per-database load, highlights pod-level skew, and estimates capacity when app and pooler replica inputs are provided. 
+- **Growth**: uses Prometheus `rate()` over a configurable window to spot sustained connection growth. + +## Configuration + +### Required variables + +- `PROMETHEUS_URL`: Base URL for the Prometheus or Thanos querier API (for example `https://prometheus.example/api/v1` or `https://thanos.example/api/v1`). +- `PGBOUNCER_JOB_LABEL`: Prometheus label matchers inside `{...}` for the exporter scrape, for example `job="pgbouncer-exporter"`. +- `EXPECTED_POOL_MODE`: Expected pool mode string: `transaction`, `session`, or `statement`. + +### Optional variables + +- `CONTEXT`: Kubernetes context for kubectl when kubeconfig is configured. +- `METRIC_NAMESPACE_FILTER`: Extra label matchers (comma-separated) such as `kubernetes_namespace="my-namespace"` to narrow series. +- `CLIENT_SATURATION_PERCENT_THRESHOLD`: Percent of `max_client_conn` above which saturation is raised (default `80`). +- `MAX_WAIT_SECONDS_THRESHOLD`: Maximum acceptable `pgbouncer_pools_client_maxwait_seconds` (default `1`). +- `CLIENT_WAITING_THRESHOLD`: Raise when the sum of waiting connections is greater than this value (default `0`). +- `DATABASE_HOTSPOT_PERCENT_THRESHOLD`: Flag databases whose share of connections exceeds this percent of the total (default `50`). +- `POD_OUTLIER_RATIO`: Flag pods whose per-pod client active sum exceeds the fleet mean times this ratio (default `2.0`). +- `GROWTH_RATE_WINDOW_MINUTES`: Lookback for Prometheus range queries used in growth detection (default `15`). +- `CONNECTION_GROWTH_RATE_THRESHOLD`: Average `rate()` of client active connections (per second) that triggers growth issues (default `0.1`). +- `KUBERNETES_DISTRIBUTION_BINARY`: CLI binary for kubectl (default `kubectl`). +- `PGBOUNCER_NAMESPACE`: Namespace used to locate a pod for optional pool mode inspection (often the same as the PgBouncer workload namespace). 
+- `PGBOUNCER_POD_LABEL_SELECTOR`: Label selector for the pod that mounts `pgbouncer.ini` (default `app.kubernetes.io/name=pgbouncer-exporter`; change to your PgBouncer pod labels if the exporter runs as a sidecar elsewhere). +- `PGBOUNCER_PGBOUNCER_CONTAINER`: Optional container name for `kubectl exec` when the pod is multi-container. +- `APP_REPLICAS`: Application replica count for the capacity SLI (optional). +- `APP_DB_POOL_SIZE`: Per-replica application DB pool size for the capacity SLI (optional). +- `PGBOUNCER_REPLICAS`: PgBouncer replica count for the capacity SLI (optional). + +### Secrets + +- `prometheus_bearer_token`: Bearer token for authenticated Prometheus read APIs when required (plain text or OAuth token). +- `kubeconfig`: Standard kubeconfig used for optional kubectl-based pool mode checks. + +## Tasks overview + +### Check PgBouncer Exporter and Process Availability + +Fails when `pgbouncer_up` is `0` for any filtered target or when no series are returned. + +### Check Client Connection Saturation vs max_client_conn + +Compares `(sum(client_active) + sum(client_waiting)) / max(max_client_conn)` to the percent threshold. + +### Check Client Wait Queue Buildup + +Raises when the sum of `pgbouncer_pools_client_waiting_connections` is above `CLIENT_WAITING_THRESHOLD`. + +### Check Max Client Wait Time Spikes + +Compares `max(pgbouncer_pools_client_maxwait_seconds)` to `MAX_WAIT_SECONDS_THRESHOLD`. + +### Check Server Pool Balance vs Client Waits + +Flags clients waiting while server idle connections exist, and clients waiting alongside elevated `server_used` counts. + +### Validate Pool Mode from Metrics or Config + +Prefers a `pool_mode` label on metrics if present; otherwise attempts to read `pool_mode` from common `pgbouncer.ini` paths via kubectl when `PGBOUNCER_NAMESPACE` and kubeconfig are set. 
+ +### Analyze Per-Database Connection Distribution + +Uses `pgbouncer_databases_current_connections` when available, otherwise `pgbouncer_pools_client_active_connections` by `database` label, to find hotspots. + +### Aggregate Health Across PgBouncer Pods and Flag Outliers + +Compares per-pod `sum(client_active)` against the fleet mean using `POD_OUTLIER_RATIO`. + +### Detect Abnormal Client Connection Growth Rate + +Runs a range query on `rate(pgbouncer_pools_client_active_connections[5m])` and compares average rates to `CONNECTION_GROWTH_RATE_THRESHOLD`. + +### Compute Capacity Planning SLI (App Demand vs PgBouncer Capacity) + +When `APP_REPLICAS`, `APP_DB_POOL_SIZE`, and `PGBOUNCER_REPLICAS` are all set, compares `APP_REPLICAS * APP_DB_POOL_SIZE` to `max(pgbouncer_config_max_client_connections) * PGBOUNCER_REPLICAS`. diff --git a/codebundles/k8s-pgbouncer-prometheus-health/check-capacity-sli.sh b/codebundles/k8s-pgbouncer-prometheus-health/check-capacity-sli.sh new file mode 100755 index 00000000..b3149222 --- /dev/null +++ b/codebundles/k8s-pgbouncer-prometheus-health/check-capacity-sli.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib/prometheus-common.sh +source "${SCRIPT_DIR}/lib/prometheus-common.sh" + +: "${PROMETHEUS_URL:?Must set PROMETHEUS_URL}" +: "${PGBOUNCER_JOB_LABEL:?Must set PGBOUNCER_JOB_LABEL}" + +OUTPUT_FILE="check_capacity_sli_output.json" +issues_json='[]' + +if [ -z "${APP_REPLICAS:-}" ] || [ -z "${APP_DB_POOL_SIZE:-}" ] || [ -z "${PGBOUNCER_REPLICAS:-}" ]; then + echo "$issues_json" > "$OUTPUT_FILE" + echo "Capacity SLI skipped (set APP_REPLICAS, APP_DB_POOL_SIZE, PGBOUNCER_REPLICAS)." + jq '.' "$OUTPUT_FILE" + exit 0 +fi + +wm=$(wrap_metric pgbouncer_config_max_client_connections) +q="max(${wm})" +echo "Instant query: $q" + +raw=$(prometheus_instant_query "$q" || true) +if ! 
prometheus_query_status_ok "${raw:-}" 2>/dev/null; then + echo '[]' | jq \ + --arg title "Prometheus Error for Capacity SLI" \ + --arg details "Could not read max client connections from metrics." \ + --arg severity "2" \ + --arg next_steps "Confirm pgbouncer_config_max_client_connections is scraped." \ + '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]' > "$OUTPUT_FILE" + exit 0 +fi + +maxc=$(echo "$raw" | jq -r '.data.result[0].value[1] // "0"') +demand=$(awk -v r="$APP_REPLICAS" -v p="$APP_DB_POOL_SIZE" 'BEGIN {printf "%.0f", r * p}') +cap=$(awk -v m="$maxc" -v pr="$PGBOUNCER_REPLICAS" 'BEGIN {printf "%.0f", m * pr}') + +if [ "${cap:-0}" -eq 0 ]; then + echo "$issues_json" > "$OUTPUT_FILE" + exit 0 +fi + +ratio=$(awk -v d="$demand" -v c="$cap" 'BEGIN {printf "%.4f", d / c}') + +awk -v r="$ratio" 'BEGIN {exit !((r + 0) >= 1.0)}' && { + issues_json=$(echo "$issues_json" | jq \ + --arg title "Capacity SLI: App Demand Meets or Exceeds PgBouncer Capacity" \ + --arg details "Estimated demand is ${demand} (APP_REPLICAS * APP_DB_POOL_SIZE) vs capacity ${cap} (max_client_conn * PGBOUNCER_REPLICAS). Ratio ${ratio}." \ + --arg severity "2" \ + --arg next_steps "Increase PgBouncer replicas or max_client_conn, reduce per-app pool size or app replicas, or add pooler shards." \ + '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]') +} + +awk -v r="$ratio" 'BEGIN {exit !((r + 0) >= 0.85 && (r + 0) < 1.0)}' && { + issues_json=$(echo "$issues_json" | jq \ + --arg title "Capacity SLI: Approaching PgBouncer Limit" \ + --arg details "Demand/capacity ratio is ${ratio} (warning band >= 0.85). Demand ${demand}, capacity ${cap}." \ + --arg severity "1" \ + --arg next_steps "Plan capacity increases before saturation causes client waits and errors." \ + '. 
+= [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]') +} + +echo "$issues_json" > "$OUTPUT_FILE" +jq '.' "$OUTPUT_FILE" diff --git a/codebundles/k8s-pgbouncer-prometheus-health/check-client-saturation.sh b/codebundles/k8s-pgbouncer-prometheus-health/check-client-saturation.sh new file mode 100755 index 00000000..a835bde8 --- /dev/null +++ b/codebundles/k8s-pgbouncer-prometheus-health/check-client-saturation.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib/prometheus-common.sh +source "${SCRIPT_DIR}/lib/prometheus-common.sh" + +: "${PROMETHEUS_URL:?Must set PROMETHEUS_URL}" +: "${PGBOUNCER_JOB_LABEL:?Must set PGBOUNCER_JOB_LABEL}" + +OUTPUT_FILE="check_client_saturation_output.json" +issues_json='[]' +THRESHOLD="${CLIENT_SATURATION_PERCENT_THRESHOLD:-80}" + +wm_a=$(wrap_metric pgbouncer_pools_client_active_connections) +wm_w=$(wrap_metric pgbouncer_pools_client_waiting_connections) +wm_m=$(wrap_metric pgbouncer_config_max_client_connections) +# Active + waiting vs configured max client connections +q="(sum(${wm_a}) + sum(${wm_w})) / clamp_min(max(${wm_m}), 1) * 100" +filt="$(metric_label_filter)" + +echo "Instant query: $q" + +if ! raw=$(prometheus_instant_query "$q"); then + issues_json=$(echo "$issues_json" | jq \ + --arg title "Prometheus Query Failed for Client Saturation" \ + --arg details "curl to Prometheus failed while evaluating saturation ratio." \ + --arg severity "3" \ + --arg next_steps "Verify PROMETHEUS_URL and network access to Prometheus." \ + '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]') + echo "$issues_json" > "$OUTPUT_FILE" + exit 0 +fi + +if ! 
prometheus_query_status_ok "$raw"; then + issues_json=$(echo "$issues_json" | jq \ + --arg title "Prometheus Error Evaluating Client Saturation" \ + --arg details "$(echo "$raw" | jq -c .)" \ + --arg severity "3" \ + --arg next_steps "Confirm metrics pgbouncer_pools_* and pgbouncer_config_max_client_connections exist for filter: {$filt}" \ + '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]') + echo "$issues_json" > "$OUTPUT_FILE" + exit 0 +fi + +pct=$(echo "$raw" | jq -r '.data.result[0].value[1] // empty') +if [ -z "$pct" ] || [ "$pct" = "null" ]; then + issues_json=$(echo "$issues_json" | jq \ + --arg title "No Data for Client Saturation Ratio" \ + --arg details "Prometheus returned no vector value for the saturation expression." \ + --arg severity "2" \ + --arg next_steps "Check that the pgbouncer exporter is scraped and label filters match your deployment." \ + '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]') +else + cmp=$(awk -v p="$pct" -v t="$THRESHOLD" 'BEGIN {if (p+0 > t+0) exit 0; else exit 1}') + if [ "$cmp" -eq 0 ]; then + issues_json=$(echo "$issues_json" | jq \ + --arg title "PgBouncer Client Saturation Above ${THRESHOLD}%" \ + --arg details "Estimated saturation is ${pct}% (active+waiting vs max_client_conn). Threshold: ${THRESHOLD}%." \ + --arg severity "3" \ + --arg next_steps "Increase max_client_conn, scale PgBouncer replicas, reduce app pool sizes, or investigate connection leaks." \ + '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]') + fi +fi + +echo "$issues_json" > "$OUTPUT_FILE" +jq '.' 
"$OUTPUT_FILE" diff --git a/codebundles/k8s-pgbouncer-prometheus-health/check-client-waiting.sh b/codebundles/k8s-pgbouncer-prometheus-health/check-client-waiting.sh new file mode 100755 index 00000000..1708876b --- /dev/null +++ b/codebundles/k8s-pgbouncer-prometheus-health/check-client-waiting.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib/prometheus-common.sh +source "${SCRIPT_DIR}/lib/prometheus-common.sh" + +: "${PROMETHEUS_URL:?Must set PROMETHEUS_URL}" +: "${PGBOUNCER_JOB_LABEL:?Must set PGBOUNCER_JOB_LABEL}" + +OUTPUT_FILE="check_client_waiting_output.json" +issues_json='[]' +WAIT_THRESHOLD="${CLIENT_WAITING_THRESHOLD:-0}" + +q="sum($(wrap_metric pgbouncer_pools_client_waiting_connections))" +echo "Instant query: $q" + +if ! raw=$(prometheus_instant_query "$q"); then + issues_json=$(echo "$issues_json" | jq \ + --arg title "Prometheus Query Failed for Client Waiting Connections" \ + --arg details "curl to Prometheus failed." \ + --arg severity "3" \ + --arg next_steps "Verify PROMETHEUS_URL and credentials." \ + '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]') + echo "$issues_json" > "$OUTPUT_FILE" + exit 0 +fi + +if ! prometheus_query_status_ok "$raw"; then + issues_json=$(echo "$issues_json" | jq \ + --arg title "Prometheus Error for Waiting Connections" \ + --arg details "$(echo "$raw" | jq -c .)" \ + --arg severity "3" \ + --arg next_steps "Confirm pgbouncer_pools_client_waiting_connections is exported for your filters." \ + '. 
+= [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]') + echo "$issues_json" > "$OUTPUT_FILE" + exit 0 +fi + +waiters=$(echo "$raw" | jq -r '.data.result[0].value[1] // "0"') +awk -v w="$waiters" -v t="$WAIT_THRESHOLD" 'BEGIN {exit !(w > t)}' && { + issues_json=$(echo "$issues_json" | jq \ + --arg title "PgBouncer Client Wait Queue Elevated" \ + --arg details "sum(pgbouncer_pools_client_waiting_connections) is ${waiters} (threshold > ${WAIT_THRESHOLD}). Clients are waiting for server connections." \ + --arg severity "3" \ + --arg next_steps "Increase pool capacity (default_pool_size, max_db_connections), add replicas, or reduce app-side connection demand." \ + '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]') +} + +echo "$issues_json" > "$OUTPUT_FILE" +jq '.' "$OUTPUT_FILE" diff --git a/codebundles/k8s-pgbouncer-prometheus-health/check-connection-growth-rate.sh b/codebundles/k8s-pgbouncer-prometheus-health/check-connection-growth-rate.sh new file mode 100755 index 00000000..d21d14a1 --- /dev/null +++ b/codebundles/k8s-pgbouncer-prometheus-health/check-connection-growth-rate.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib/prometheus-common.sh +source "${SCRIPT_DIR}/lib/prometheus-common.sh" + +: "${PROMETHEUS_URL:?Must set PROMETHEUS_URL}" +: "${PGBOUNCER_JOB_LABEL:?Must set PGBOUNCER_JOB_LABEL}" + +OUTPUT_FILE="check_connection_growth_rate_output.json" +WIN_M="${GROWTH_RATE_WINDOW_MINUTES:-15}" +RATE_THR="${CONNECTION_GROWTH_RATE_THRESHOLD:-0.1}" + +wm=$(wrap_metric pgbouncer_pools_client_active_connections) +end=$(date +%s) +start=$((end - WIN_M * 60)) +step="30s" + +q="rate(${wm}[5m])" +echo "Range query: $q from $start to $end" + +raw=$(prometheus_range_query "$q" "$start" "$end" "$step" || true) + +if ! 
prometheus_query_status_ok "${raw:-}" 2>/dev/null; then + echo '[]' | jq \ + --arg title "Prometheus Range Query Failed for Connection Growth" \ + --arg details "Could not evaluate rate() over the lookback window." \ + --arg severity "2" \ + --arg next_steps "Confirm Prometheus supports range queries and that a 5m window has sufficient samples." \ + '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]' > "$OUTPUT_FILE" + exit 0 +fi + +issues_json=$(echo "$raw" | jq -c --argjson thr "$RATE_THR" ' + .data.result as $r | + if ($r | length) == 0 then [] + else + $r | map( + . as $series | + ($series.values | map(.[1] | tonumber) | add / length) as $avg | + select($avg > $thr) | + ($series.metric.pod // $series.metric.kubernetes_pod_name // "unknown") as $pod | + { + title: ("Sustained Client Connection Growth for Pod `" + $pod + "`"), + details: ("Average rate of client_active_connections over the window is approximately " + ($avg|tostring) + " conn/s (threshold " + ($thr|tostring) + ")."), + severity: 3, + next_steps: "Check for connection leaks in apps, pooler misconfiguration, or traffic shifts; compare with deployment rollouts." + } + ) + end +') + +echo "$issues_json" > "$OUTPUT_FILE" +jq '.' 
"$OUTPUT_FILE" diff --git a/codebundles/k8s-pgbouncer-prometheus-health/check-exporter-up.sh b/codebundles/k8s-pgbouncer-prometheus-health/check-exporter-up.sh new file mode 100755 index 00000000..58fbfa3f --- /dev/null +++ b/codebundles/k8s-pgbouncer-prometheus-health/check-exporter-up.sh @@ -0,0 +1,69 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib/prometheus-common.sh +source "${SCRIPT_DIR}/lib/prometheus-common.sh" + +: "${PROMETHEUS_URL:?Must set PROMETHEUS_URL}" +: "${PGBOUNCER_JOB_LABEL:?Must set PGBOUNCER_JOB_LABEL}" + +OUTPUT_FILE="check_exporter_up_output.json" +issues_json='[]' + +q="$(wrap_metric pgbouncer_up)" +echo "Instant query: $q" + +if ! raw=$(prometheus_instant_query "$q"); then + issues_json=$(echo "$issues_json" | jq \ + --arg title "Prometheus Query Failed for \`pgbouncer_up\`" \ + --arg details "curl to Prometheus instant query API failed. Check PROMETHEUS_URL and network." \ + --arg severity "4" \ + --arg next_steps "Verify PROMETHEUS_URL, bearer token, and that Prometheus is reachable from the runner." \ + '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]') + echo "$issues_json" > "$OUTPUT_FILE" + echo "Prometheus request failed." + exit 0 +fi + +if ! prometheus_query_status_ok "$raw"; then + err=$(echo "$raw" | jq -r '.error // .data // .') + issues_json=$(echo "$issues_json" | jq \ + --arg title "Prometheus Returned Error for \`pgbouncer_up\`" \ + --arg details "Response: $err" \ + --arg severity "4" \ + --arg next_steps "Fix the PromQL query or Prometheus availability; confirm pgbouncer_up exists for your scrape config." \ + '. 
+= [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]') + echo "$issues_json" > "$OUTPUT_FILE" + exit 0 +fi + +count=$(echo "$raw" | jq '.data.result | length') +if [ "${count:-0}" -eq 0 ]; then + issues_json=$(echo "$issues_json" | jq \ + --arg title "No \`pgbouncer_up\` Series Found for Label Filter" \ + --arg details "Prometheus returned zero time series for pgbouncer_up with the configured job/namespace filters." \ + --arg severity "3" \ + --arg next_steps "Verify ServiceMonitor/PodMonitor targets, PGBOUNCER_JOB_LABEL, and METRIC_NAMESPACE_FILTER against live /metrics or Prometheus targets UI." \ + '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]') +else + down=$(echo "$raw" | jq -c '[.data.result[]? | select((.value[1] | tonumber) < 1)]') + dcount=$(echo "$down" | jq 'length') + if [ "$dcount" -gt 0 ]; then + for i in $(seq 0 $((dcount - 1))); do + pod=$(echo "$down" | jq -r ".[$i].metric.pod // .[$i].metric.kubernetes_pod_name // .[$i].metric.instance // \"unknown\"") + val=$(echo "$down" | jq -r ".[$i].value[1]") + issues_json=$(echo "$issues_json" | jq \ + --arg title "PgBouncer Exporter Unhealthy (\`pgbouncer_up=0\`) for \`$pod\`" \ + --arg details "pgbouncer_up reports $val for this target. The exporter cannot reach PgBouncer or the process is down." \ + --arg severity "4" \ + --arg next_steps "Check PgBouncer and exporter pods, logs, and network between exporter admin port and PgBouncer. Confirm scrape job matches ${PGBOUNCER_JOB_LABEL}." \ + '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]') + done + fi +fi + +echo "$issues_json" > "$OUTPUT_FILE" +echo "Wrote $OUTPUT_FILE" +jq '.' 
"$OUTPUT_FILE" diff --git a/codebundles/k8s-pgbouncer-prometheus-health/check-max-wait-time.sh b/codebundles/k8s-pgbouncer-prometheus-health/check-max-wait-time.sh new file mode 100755 index 00000000..8810ab38 --- /dev/null +++ b/codebundles/k8s-pgbouncer-prometheus-health/check-max-wait-time.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib/prometheus-common.sh +source "${SCRIPT_DIR}/lib/prometheus-common.sh" + +: "${PROMETHEUS_URL:?Must set PROMETHEUS_URL}" +: "${PGBOUNCER_JOB_LABEL:?Must set PGBOUNCER_JOB_LABEL}" + +OUTPUT_FILE="check_max_wait_time_output.json" +issues_json='[]' +MAXW="${MAX_WAIT_SECONDS_THRESHOLD:-1}" + +q="max($(wrap_metric pgbouncer_pools_client_maxwait_seconds))" +echo "Instant query: $q" + +if ! raw=$(prometheus_instant_query "$q"); then + issues_json=$(echo "$issues_json" | jq \ + --arg title "Prometheus Query Failed for Max Client Wait" \ + --arg details "curl to Prometheus failed." \ + --arg severity "3" \ + --arg next_steps "Verify PROMETHEUS_URL and credentials." \ + '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]') + echo "$issues_json" > "$OUTPUT_FILE" + exit 0 +fi + +if ! prometheus_query_status_ok "$raw"; then + issues_json=$(echo "$issues_json" | jq \ + --arg title "Prometheus Error for Max Client Wait" \ + --arg details "$(echo "$raw" | jq -c .)" \ + --arg severity "3" \ + --arg next_steps "Confirm pgbouncer_pools_client_maxwait_seconds exists." \ + '. 
+= [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]') + echo "$issues_json" > "$OUTPUT_FILE" + exit 0 +fi + +mx=$(echo "$raw" | jq -r '.data.result[0].value[1] // "0"') +awk -v w="$mx" -v t="$MAXW" 'BEGIN {exit !(w > t)}' && { + issues_json=$(echo "$issues_json" | jq \ + --arg title "PgBouncer Max Client Wait Exceeds ${MAXW}s" \ + --arg details "max(pgbouncer_pools_client_maxwait_seconds) is ${mx}s across filtered series. SLO threshold: ${MAXW}s." \ + --arg severity "3" \ + --arg next_steps "Investigate pool exhaustion, slow upstream queries, or mis-sized pools; correlate with waiting connections and per-database load." \ + '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]') +} + +echo "$issues_json" > "$OUTPUT_FILE" +jq '.' "$OUTPUT_FILE" diff --git a/codebundles/k8s-pgbouncer-prometheus-health/check-per-database-distribution.sh b/codebundles/k8s-pgbouncer-prometheus-health/check-per-database-distribution.sh new file mode 100755 index 00000000..cd0197e6 --- /dev/null +++ b/codebundles/k8s-pgbouncer-prometheus-health/check-per-database-distribution.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib/prometheus-common.sh +source "${SCRIPT_DIR}/lib/prometheus-common.sh" + +: "${PROMETHEUS_URL:?Must set PROMETHEUS_URL}" +: "${PGBOUNCER_JOB_LABEL:?Must set PGBOUNCER_JOB_LABEL}" + +OUTPUT_FILE="check_per_database_distribution_output.json" +HOTSPOT="${DATABASE_HOTSPOT_PERCENT_THRESHOLD:-50}" + +wm_db=$(wrap_metric pgbouncer_databases_current_connections) +wm_pool=$(wrap_metric pgbouncer_pools_client_active_connections) + +q="sum by (database) (${wm_db})" +echo "Trying: $q" +raw=$(prometheus_instant_query "$q" || true) + +if ! 
prometheus_query_status_ok "${raw:-}" 2>/dev/null || [ "$(echo "${raw:-"{}"}" | jq '.data.result | length' 2>/dev/null || echo 0)" -eq 0 ]; then
+  q="sum by (database) (${wm_pool})"
+  echo "Fallback: $q"
+  raw=$(prometheus_instant_query "$q" || true)
+fi
+
+if ! prometheus_query_status_ok "${raw:-}" 2>/dev/null; then
+  echo '[]' | jq \
+    --arg title "Prometheus Error for Per-Database Distribution" \
+    --arg details "Could not query per-database connection metrics." \
+    --arg severity "2" \
+    --arg next_steps "Confirm pgbouncer_databases_current_connections or pgbouncer_pools_client_active_connections with database label is exported." \
+    '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]' > "$OUTPUT_FILE"
+  exit 0
+fi
+
+total=$(echo "$raw" | jq '[.data.result[]? | (.value[1] | tonumber)] | add // 0')
+
+issues_json=$(echo "$raw" | jq -c \
+  --argjson thr "$HOTSPOT" \
+  --argjson total "$total" \
+  '
+  if ($total == 0) then []
+  else
+    [.data.result[] |
+      (.value[1] | tonumber) as $v |
+      (($v / $total) * 100) as $pct |
+      select($pct > $thr) |
+      {
+        title: ("Database Hotspot: `" + (.metric.database // "unknown") + "`"),
+        details: ("Approximately " + (($pct * 10 | floor) / 10 | tostring) + "% of connections (" + ($v|tostring) + " of " + ($total|tostring) + "). Threshold: " + ($thr|tostring) + "%."),
+        severity: 2,
+        next_steps: "Investigate heavy consumers of this database entry; consider separate pools or sharding."
+      }
+    ]
+  end
+  ')
+
+echo "$issues_json" > "$OUTPUT_FILE"
+jq '.'
"$OUTPUT_FILE" diff --git a/codebundles/k8s-pgbouncer-prometheus-health/check-pod-outliers.sh b/codebundles/k8s-pgbouncer-prometheus-health/check-pod-outliers.sh new file mode 100755 index 00000000..e886a0e1 --- /dev/null +++ b/codebundles/k8s-pgbouncer-prometheus-health/check-pod-outliers.sh @@ -0,0 +1,54 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib/prometheus-common.sh +source "${SCRIPT_DIR}/lib/prometheus-common.sh" + +: "${PROMETHEUS_URL:?Must set PROMETHEUS_URL}" +: "${PGBOUNCER_JOB_LABEL:?Must set PGBOUNCER_JOB_LABEL}" + +OUTPUT_FILE="check_pod_outliers_output.json" +RATIO="${POD_OUTLIER_RATIO:-2.0}" + +wm=$(wrap_metric pgbouncer_pools_client_active_connections) +q="sum by (pod) (${wm})" +echo "Instant query: $q" + +raw=$(prometheus_instant_query "$q" || true) +if ! prometheus_query_status_ok "${raw:-}" 2>/dev/null; then + echo '[]' | jq \ + --arg title "Prometheus Error for Pod Outlier Detection" \ + --arg details "Could not query per-pod client active connections." \ + --arg severity "3" \ + --arg next_steps "Verify Prometheus and that pod label exists on pool metrics." \ + '. 
+= [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]' > "$OUTPUT_FILE" + exit 0 +fi + +issues_json=$(echo "$raw" | jq -c --argjson ratio "$RATIO" ' + .data.result as $r | + if ($r | length) == 0 then [] + else + ($r | map(.value[1] | tonumber)) as $vals | + (($vals | add) / ($vals | length)) as $mean | + if ($mean == 0) then [] + else + $r | map( + (.value[1] | tonumber) as $v | + (.metric.pod // .metric.kubernetes_pod_name // "unknown") as $pod | + select($v > ($mean * $ratio)) | + { + title: ("PgBouncer Pod Outlier: `" + $pod + "`"), + details: ("Pod has client_active sum " + ($v|tostring) + " vs fleet mean " + ($mean|tostring) + " (ratio threshold " + ($ratio|tostring) + "x)."), + severity: 3, + next_steps: "Investigate this replica for skewed traffic, local saturation, or failing readiness; verify Service sessionAffinity and endpoints." + } + ) + end + end +') + +echo "$issues_json" > "$OUTPUT_FILE" +jq '.' "$OUTPUT_FILE" diff --git a/codebundles/k8s-pgbouncer-prometheus-health/check-pool-mode.sh b/codebundles/k8s-pgbouncer-prometheus-health/check-pool-mode.sh new file mode 100755 index 00000000..1c385aa4 --- /dev/null +++ b/codebundles/k8s-pgbouncer-prometheus-health/check-pool-mode.sh @@ -0,0 +1,62 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib/prometheus-common.sh +source "${SCRIPT_DIR}/lib/prometheus-common.sh" + +: "${PROMETHEUS_URL:?Must set PROMETHEUS_URL}" +: "${PGBOUNCER_JOB_LABEL:?Must set PGBOUNCER_JOB_LABEL}" +: "${EXPECTED_POOL_MODE:?Must set EXPECTED_POOL_MODE}" + +OUTPUT_FILE="check_pool_mode_output.json" +issues_json='[]' + +exp=$(echo "$EXPECTED_POOL_MODE" | tr '[:upper:]' '[:lower:]') + +# Try label on pgbouncer_up series (some environments add static labels) +q="$(wrap_metric pgbouncer_up)" +raw=$(prometheus_instant_query "$q" || true) +observed="" + +if [ -n "${raw:-}" ] && 
prometheus_query_status_ok "$raw"; then
+  observed=$(echo "$raw" | jq -r '[.data.result[]?.metric | to_entries[] | select(.key|test("pool_?mode";"i")) | .value] | first // empty' | tr '[:upper:]' '[:lower:]')
+fi
+
+if [ -z "$observed" ] && [ -n "${PGBOUNCER_NAMESPACE:-}" ] && [ -n "${KUBECONFIG:-}" ]; then
+  kb=$(kubectl_bin)
+  ctx=$(kubectl_context_args)
+  sel="${PGBOUNCER_POD_LABEL_SELECTOR:-app.kubernetes.io/name=pgbouncer-exporter}"
+  pod=$($kb $ctx get pods -n "$PGBOUNCER_NAMESPACE" -l "$sel" -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true)
+  if [ -n "$pod" ]; then
+    ctr="${PGBOUNCER_PGBOUNCER_CONTAINER:-}"
+    cargs=()
+    if [ -n "$ctr" ]; then
+      cargs=(-c "$ctr")
+    fi
+    ini=$($kb $ctx exec -n "$PGBOUNCER_NAMESPACE" ${cargs[@]+"${cargs[@]}"} "$pod" -- sh -c 'for f in /etc/pgbouncer/pgbouncer.ini /etc/pgbouncer.ini /opt/bitnami/pgbouncer/conf/pgbouncer.ini; do [ -f "$f" ] && cat "$f" && break; done' 2>/dev/null || true)
+    observed=$(echo "$ini" | awk -F= '/^[[:space:]]*pool_mode/ {gsub(/^[[:space:]]+|[[:space:]]+$/, "", $2); print tolower($2); exit}')
+  fi
+fi
+
+if [ -n "$observed" ]; then
+  if [ "$observed" != "$exp" ]; then
+    issues_json=$(echo "$issues_json" | jq \
+      --arg title "PgBouncer Pool Mode Drift" \
+      --arg details "Observed pool_mode is '${observed}' but EXPECTED_POOL_MODE is '${exp}'." \
+      --arg severity "3" \
+      --arg next_steps "Align pgbouncer.ini pool_mode with application transaction patterns; redeploy if intentional change, or fix misconfiguration." \
+      '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]')
+  fi
+else
+  issues_json=$(echo "$issues_json" | jq \
+    --arg title "Pool Mode Could Not Be Verified" \
+    --arg details "No pool_mode label found on metrics and kubectl could not read pgbouncer.ini (set PGBOUNCER_NAMESPACE, kubeconfig, and PGBOUNCER_POD_LABEL_SELECTOR to enable exec-based checks)."
\ + --arg severity "2" \ + --arg next_steps "Add a static pool_mode label in scrape config, or provide namespace/pod selector for kubectl access to the PgBouncer configuration." \ + '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]') +fi + +echo "$issues_json" > "$OUTPUT_FILE" +jq '.' "$OUTPUT_FILE" diff --git a/codebundles/k8s-pgbouncer-prometheus-health/check-server-pool-balance.sh b/codebundles/k8s-pgbouncer-prometheus-health/check-server-pool-balance.sh new file mode 100755 index 00000000..cbe1c278 --- /dev/null +++ b/codebundles/k8s-pgbouncer-prometheus-health/check-server-pool-balance.sh @@ -0,0 +1,79 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=lib/prometheus-common.sh +source "${SCRIPT_DIR}/lib/prometheus-common.sh" + +: "${PROMETHEUS_URL:?Must set PROMETHEUS_URL}" +: "${PGBOUNCER_JOB_LABEL:?Must set PGBOUNCER_JOB_LABEL}" + +OUTPUT_FILE="check_server_pool_balance_output.json" +issues_json='[]' + +wm_wait=$(wrap_metric pgbouncer_pools_client_waiting_connections) +wm_idle=$(wrap_metric pgbouncer_pools_server_idle_connections) +wm_used=$(wrap_metric pgbouncer_pools_server_used_connections) + +q1="sum(${wm_wait})" +q2="sum(${wm_idle})" +q3="sum(${wm_used})" + +echo "Queries: $q1 ; $q2 ; $q3" + +fetch_scalar() { + local query="$1" + local raw + if ! raw=$(prometheus_instant_query "$query"); then + echo "" + return 1 + fi + if ! prometheus_query_status_ok "$raw"; then + echo "" + return 1 + fi + echo "$raw" | jq -r '.data.result[0].value[1] // "0"' +} + +w=$(fetch_scalar "$q1") || w="" +if [ -z "$w" ]; then + issues_json=$(echo "$issues_json" | jq \ + --arg title "Prometheus Query Failed for Pool Balance" \ + --arg details "Could not evaluate waiting connections: $q1" \ + --arg severity "3" \ + --arg next_steps "Verify Prometheus connectivity and that pool metrics exist." \ + '. 
+= [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]') + echo "$issues_json" > "$OUTPUT_FILE" + exit 0 +fi + +i=$(fetch_scalar "$q2") || i="" +u=$(fetch_scalar "$q3") || u="" + +awk -v w="${w:-0}" -v id="${i:-0}" 'BEGIN { + if (w > 0 && id > 0) exit 0 + exit 1 +}' && { + issues_json=$(echo "$issues_json" | jq \ + --arg title "Possible Pool Misconfiguration: Clients Waiting With Idle Servers" \ + --arg details "Clients are waiting (sum=${w}) while server_idle connections exist (sum=${i}). This pattern can indicate wrong pool_mode, routing, or auth/database mismatch." \ + --arg severity "3" \ + --arg next_steps "Verify pool_mode vs workload, check per-database pool routing, and review pgbouncer.ini auth and database definitions." \ + '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]') +} + +awk -v w="${w:-0}" -v us="${u:-0}" 'BEGIN { + if (w > 0 && us > 0) exit 0 + exit 1 +}' && { + issues_json=$(echo "$issues_json" | jq \ + --arg title "Server Pool Pressure With Concurrent Client Waits" \ + --arg details "Clients are waiting (sum=${w}) while server_used is non-zero (sum=${u}). Investigate pool sizing and upstream PostgreSQL capacity." \ + --arg severity "2" \ + --arg next_steps "Review default_pool_size and max_db_connections vs PostgreSQL max_connections; check for slow queries holding server slots." \ + '. += [{"title": $title, "details": $details, "severity": ($severity | tonumber), "next_steps": $next_steps}]') +} + +echo "$issues_json" > "$OUTPUT_FILE" +jq '.' 
"$OUTPUT_FILE" diff --git a/codebundles/k8s-pgbouncer-prometheus-health/lib/prometheus-common.sh b/codebundles/k8s-pgbouncer-prometheus-health/lib/prometheus-common.sh new file mode 100644 index 00000000..ff187cc5 --- /dev/null +++ b/codebundles/k8s-pgbouncer-prometheus-health/lib/prometheus-common.sh @@ -0,0 +1,85 @@ +#!/usr/bin/env bash +# Shared helpers for Prometheus HTTP API queries (instant + range). +# shellcheck disable=SC2034 + +prometheus_api_base() { + local u="${PROMETHEUS_URL:?Must set PROMETHEUS_URL}" + u="${u%/}" + if [[ "$u" != *"/api/v1" ]]; then + u="${u}/api/v1" + fi + printf '%s' "$u" +} + +metric_label_filter() { + local j="${PGBOUNCER_JOB_LABEL:?Must set PGBOUNCER_JOB_LABEL}" + local ns="${METRIC_NAMESPACE_FILTER:-}" + if [ -n "$ns" ]; then + printf '%s,%s' "$j" "$ns" + else + printf '%s' "$j" + fi +} + +wrap_metric() { + local metric="$1" + printf '%s{%s}' "$metric" "$(metric_label_filter)" +} + +prometheus_instant_query() { + local query="$1" + local base url + base="$(prometheus_api_base)" + url="${base}/query" + if [ -n "${PROMETHEUS_BEARER_TOKEN:-}" ]; then + curl -sS -G --data-urlencode "query=${query}" -H "Authorization: Bearer ${PROMETHEUS_BEARER_TOKEN}" "$url" + else + curl -sS -G --data-urlencode "query=${query}" "$url" + fi +} + +prometheus_range_query() { + local query="$1" + local start="$2" + local end="$3" + local step="${4:-30s}" + local base url + base="$(prometheus_api_base)" + url="${base}/query_range" + if [ -n "${PROMETHEUS_BEARER_TOKEN:-}" ]; then + curl -sS -G \ + --data-urlencode "query=${query}" \ + --data-urlencode "start=${start}" \ + --data-urlencode "end=${end}" \ + --data-urlencode "step=${step}" \ + -H "Authorization: Bearer ${PROMETHEUS_BEARER_TOKEN}" \ + "$url" + else + curl -sS -G \ + --data-urlencode "query=${query}" \ + --data-urlencode "start=${start}" \ + --data-urlencode "end=${end}" \ + --data-urlencode "step=${step}" \ + "$url" + fi +} + +prometheus_query_status_ok() { + local json="$1" + local st + 
st=$(echo "$json" | jq -r '.status // "error"') + if [ "$st" = "success" ]; then + return 0 + fi + return 1 +} + +kubectl_bin() { + printf '%s' "${KUBERNETES_DISTRIBUTION_BINARY:-kubectl}" +} + +kubectl_context_args() { + if [ -n "${CONTEXT:-}" ]; then + printf '%s %s' "--context" "${CONTEXT}" + fi +} diff --git a/codebundles/k8s-pgbouncer-prometheus-health/runbook.robot b/codebundles/k8s-pgbouncer-prometheus-health/runbook.robot new file mode 100644 index 00000000..e4eceda5 --- /dev/null +++ b/codebundles/k8s-pgbouncer-prometheus-health/runbook.robot @@ -0,0 +1,560 @@ +*** Settings *** +Documentation Evaluates PgBouncer connection pool health using Prometheus metrics from the community pgbouncer exporter, with optional kubectl validation of pool mode. +Metadata Author rw-codebundle-agent +Metadata Display Name Kubernetes PgBouncer Prometheus Health +Metadata Supports Kubernetes PgBouncer Prometheus PostgreSQL Connection Pool +Force Tags Kubernetes PgBouncer Prometheus Health Metrics + +Library BuiltIn +Library String +Library RW.Core +Library RW.CLI +Library RW.platform + +Suite Setup Suite Initialization + + +*** Tasks *** +Check PgBouncer Exporter and Process Availability for Scope `${PGBOUNCER_JOB_LABEL}` + [Documentation] Fails when pgbouncer_up is zero for any scraped target, indicating exporter or PgBouncer process failure. + [Tags] kubernetes pgbouncer exporter availability access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-exporter-up.sh + ... env=${env} + ... timeout_seconds=180 + ... include_in_history=false + ... show_in_rwl_cheatsheet=true + ... cmd_override=PROMETHEUS_URL=${PROMETHEUS_URL} ./check-exporter-up.sh + + ${issues}= RW.CLI.Run Cli + ... cmd=cat check_exporter_up_output.json + + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse JSON for exporter check, defaulting to empty list. 
WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=pgbouncer_up should be 1 for every scraped PgBouncer exporter target + ... actual=Exporter or PgBouncer process reported unhealthy for at least one target + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report PgBouncer exporter availability: + RW.Core.Add Pre To Report ${result.stdout} + +Check Client Connection Saturation vs max_client_conn for Scope `${PGBOUNCER_JOB_LABEL}` + [Documentation] Compares active and waiting client connections to pgbouncer_config_max_client_connections and flags sustained utilization above the configured percentage threshold. + [Tags] kubernetes pgbouncer saturation capacity access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-client-saturation.sh + ... env=${env} + ... timeout_seconds=180 + ... include_in_history=false + ... show_in_rwl_cheatsheet=true + ... cmd_override=PROMETHEUS_URL=${PROMETHEUS_URL} ./check-client-saturation.sh + + ${issues}= RW.CLI.Run Cli + ... cmd=cat check_client_saturation_output.json + + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse JSON for saturation check, defaulting to empty list. WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=Client active plus waiting connections should remain below the saturation threshold relative to max_client_conn + ... actual=Saturation ratio exceeded the configured threshold for the filtered PgBouncer targets + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... 
next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report Client saturation analysis: + RW.Core.Add Pre To Report ${result.stdout} + +Check Client Wait Queue Buildup for Scope `${PGBOUNCER_JOB_LABEL}` + [Documentation] Alerts when pooled client waiting connections exceed the configured near-zero threshold, indicating pool exhaustion. + [Tags] kubernetes pgbouncer waiting queue access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-client-waiting.sh + ... env=${env} + ... timeout_seconds=180 + ... include_in_history=false + ... show_in_rwl_cheatsheet=true + ... cmd_override=PROMETHEUS_URL=${PROMETHEUS_URL} ./check-client-waiting.sh + + ${issues}= RW.CLI.Run Cli + ... cmd=cat check_client_waiting_output.json + + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse JSON for waiting check, defaulting to empty list. WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=No sustained client wait queue beyond the configured threshold + ... actual=Client waiting connections reported above the threshold + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report Client wait queue: + RW.Core.Add Pre To Report ${result.stdout} + +Check Max Client Wait Time Spikes for Scope `${PGBOUNCER_JOB_LABEL}` + [Documentation] Evaluates pgbouncer_pools_client_maxwait_seconds against the maximum acceptable wait SLO and flags breaches. + [Tags] kubernetes pgbouncer latency slo access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-max-wait-time.sh + ... env=${env} + ... timeout_seconds=180 + ... include_in_history=false + ... show_in_rwl_cheatsheet=true + ... 
cmd_override=PROMETHEUS_URL=${PROMETHEUS_URL} ./check-max-wait-time.sh + + ${issues}= RW.CLI.Run Cli + ... cmd=cat check_max_wait_time_output.json + + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse JSON for max wait check, defaulting to empty list. WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=pgbouncer_pools_client_maxwait_seconds should remain below the configured SLO + ... actual=Maximum client wait time exceeded the configured threshold + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report Max client wait time: + RW.Core.Add Pre To Report ${result.stdout} + +Check Server Pool Balance vs Client Waits for Scope `${PGBOUNCER_JOB_LABEL}` + [Documentation] Detects imbalance where clients wait while server-side idle capacity exists, or server pressure coincides with client waits. + [Tags] kubernetes pgbouncer balance pool access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-server-pool-balance.sh + ... env=${env} + ... timeout_seconds=180 + ... include_in_history=false + ... show_in_rwl_cheatsheet=true + ... cmd_override=PROMETHEUS_URL=${PROMETHEUS_URL} ./check-server-pool-balance.sh + + ${issues}= RW.CLI.Run Cli + ... cmd=cat check_server_pool_balance_output.json + + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse JSON for pool balance check, defaulting to empty list. WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=Server pools should balance with client demand without persistent idle capacity while clients wait + ... 
actual=Potential imbalance or pressure pattern detected from pool metrics + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report Server pool balance: + RW.Core.Add Pre To Report ${result.stdout} + +Validate Pool Mode from Metrics or Config for Scope `${PGBOUNCER_JOB_LABEL}` + [Documentation] Confirms pool mode matches EXPECTED_POOL_MODE using metric labels when present, otherwise optional kubectl access to pgbouncer.ini in the target namespace. + [Tags] kubernetes pgbouncer pool_mode config access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-pool-mode.sh + ... env=${env} + ... timeout_seconds=180 + ... include_in_history=false + ... show_in_rwl_cheatsheet=true + ... cmd_override=PROMETHEUS_URL=${PROMETHEUS_URL} ./check-pool-mode.sh + + ${issues}= RW.CLI.Run Cli + ... cmd=cat check_pool_mode_output.json + + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse JSON for pool mode check, defaulting to empty list. WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=Observed pool_mode should match EXPECTED_POOL_MODE for the workload + ... actual=Pool mode drift or verification gap detected + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report Pool mode validation: + RW.Core.Add Pre To Report ${result.stdout} + +Analyze Per-Database Connection Distribution for Scope `${PGBOUNCER_JOB_LABEL}` + [Documentation] Ranks databases by connection share to surface hotspots consuming a disproportionate fraction of the pool. 
+ [Tags] kubernetes pgbouncer database distribution access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-per-database-distribution.sh + ... env=${env} + ... timeout_seconds=180 + ... include_in_history=false + ... show_in_rwl_cheatsheet=true + ... cmd_override=PROMETHEUS_URL=${PROMETHEUS_URL} ./check-per-database-distribution.sh + + ${issues}= RW.CLI.Run Cli + ... cmd=cat check_per_database_distribution_output.json + + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse JSON for per-database check, defaulting to empty list. WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=Per-database connection share should remain relatively balanced for the workload + ... actual=One or more databases exceed the hotspot percentage threshold + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report Per-database distribution: + RW.Core.Add Pre To Report ${result.stdout} + +Aggregate Health Across PgBouncer Pods and Flag Outliers for Scope `${PGBOUNCER_JOB_LABEL}` + [Documentation] Summarizes per-pod client load and flags replicas that deviate from the fleet mean beyond the configured ratio. + [Tags] kubernetes pgbouncer outliers pods access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-pod-outliers.sh + ... env=${env} + ... timeout_seconds=180 + ... include_in_history=false + ... show_in_rwl_cheatsheet=true + ... cmd_override=PROMETHEUS_URL=${PROMETHEUS_URL} ./check-pod-outliers.sh + + ${issues}= RW.CLI.Run Cli + ... cmd=cat check_pod_outliers_output.json + + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse JSON for pod outlier check, defaulting to empty list. 
WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=PgBouncer replicas should receive similar client load behind the Kubernetes Service + ... actual=One or more pods deviate from the fleet mean beyond the configured ratio + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report Pod outlier analysis: + RW.Core.Add Pre To Report ${result.stdout} + +Detect Abnormal Client Connection Growth Rate for Scope `${PGBOUNCER_JOB_LABEL}` + [Documentation] Uses a Prometheus range query over rate() to flag sustained growth in client active connections that may indicate leaks or abnormal load shifts. + [Tags] kubernetes pgbouncer growth rate access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-connection-growth-rate.sh + ... env=${env} + ... timeout_seconds=180 + ... include_in_history=false + ... show_in_rwl_cheatsheet=true + ... cmd_override=PROMETHEUS_URL=${PROMETHEUS_URL} ./check-connection-growth-rate.sh + + ${issues}= RW.CLI.Run Cli + ... cmd=cat check_connection_growth_rate_output.json + + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse JSON for growth rate check, defaulting to empty list. WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=Client active connections should be stable relative to the rate threshold over the lookback window + ... actual=Sustained positive rate of client connection growth detected for one or more pods + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... 
next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report Connection growth rate: + RW.Core.Add Pre To Report ${result.stdout} + +Compute Capacity Planning SLI (App Demand vs PgBouncer Capacity) for Scope `${PGBOUNCER_JOB_LABEL}` + [Documentation] When optional replica and pool sizes are provided, estimates demand relative to aggregate PgBouncer max_client_conn capacity and warns when approaching saturation. + [Tags] kubernetes pgbouncer capacity planning access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-capacity-sli.sh + ... env=${env} + ... timeout_seconds=180 + ... include_in_history=false + ... show_in_rwl_cheatsheet=true + ... cmd_override=PROMETHEUS_URL=${PROMETHEUS_URL} ./check-capacity-sli.sh + + ${issues}= RW.CLI.Run Cli + ... cmd=cat check_capacity_sli_output.json + + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse JSON for capacity SLI, defaulting to empty list. WARN + ${issue_list}= Create List + END + + IF len(@{issue_list}) > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=Planned application pool demand should remain below aggregate PgBouncer capacity with headroom + ... actual=Estimated demand ratio crossed a warning or critical threshold + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report Capacity SLI: + RW.Core.Add Pre To Report ${result.stdout} + + +*** Keywords *** +Suite Initialization + TRY + ${prometheus_bearer_token}= RW.Core.Import Secret + ... prometheus_bearer_token + ... type=string + ... description=Bearer token for Prometheus read API when authentication is required. + ... pattern=\w* + ${prometheus_token_value}= Set Variable ${prometheus_bearer_token} + EXCEPT + Log prometheus_bearer_token secret not present; continuing without bearer auth. 
WARN + ${prometheus_token_value}= Set Variable ${EMPTY} + END + + TRY + ${kubeconfig}= RW.Core.Import Secret + ... kubeconfig + ... type=string + ... description=Kubeconfig for optional kubectl-based pool mode confirmation. + ... pattern=\w* + ${kubeconfig_path}= Set Variable ${kubeconfig.key} + EXCEPT + Log kubeconfig secret not present; kubectl-based checks may be skipped. WARN + ${kubeconfig_path}= Set Variable ${EMPTY} + END + + ${PROMETHEUS_URL}= RW.Core.Import User Variable PROMETHEUS_URL + ... type=string + ... description=Base URL for Prometheus or Thanos querier API (e.g. https://prometheus.example/api/v1/). + ... pattern=https?://.* + ${PGBOUNCER_JOB_LABEL}= RW.Core.Import User Variable PGBOUNCER_JOB_LABEL + ... type=string + ... description=Prometheus label matcher for the PgBouncer exporter job, e.g. job="pgbouncer-exporter". + ... pattern=.* + ${EXPECTED_POOL_MODE}= RW.Core.Import User Variable EXPECTED_POOL_MODE + ... type=string + ... description=Expected pool mode (transaction, session, or statement). + ... pattern=\w* + ${CONTEXT}= RW.Core.Import User Variable CONTEXT + ... type=string + ... description=Kubernetes context name for kubectl commands when kubeconfig is provided. + ... default= + ... pattern=.* + ${METRIC_NAMESPACE_FILTER}= RW.Core.Import User Variable METRIC_NAMESPACE_FILTER + ... type=string + ... description=Additional Prometheus label matcher for namespace or kubernetes_namespace. + ... default= + ... pattern=.* + ${CLIENT_SATURATION_PERCENT_THRESHOLD}= RW.Core.Import User Variable CLIENT_SATURATION_PERCENT_THRESHOLD + ... type=string + ... description=Alert when active plus waiting connections exceed this percent of max_client_conn. + ... default=80 + ... pattern=\w* + ${MAX_WAIT_SECONDS_THRESHOLD}= RW.Core.Import User Variable MAX_WAIT_SECONDS_THRESHOLD + ... type=string + ... description=Maximum acceptable pgbouncer_pools_client_maxwait_seconds. + ... default=1 + ... 
pattern=\w* + ${CLIENT_WAITING_THRESHOLD}= RW.Core.Import User Variable CLIENT_WAITING_THRESHOLD + ... type=string + ... description=Alert when sum of waiting connections is greater than this value. + ... default=0 + ... pattern=\w* + ${DATABASE_HOTSPOT_PERCENT_THRESHOLD}= RW.Core.Import User Variable DATABASE_HOTSPOT_PERCENT_THRESHOLD + ... type=string + ... description=Flag databases whose share of connections exceeds this percent of the total. + ... default=50 + ... pattern=\w* + ${POD_OUTLIER_RATIO}= RW.Core.Import User Variable POD_OUTLIER_RATIO + ... type=string + ... description=Flag pods whose per-pod client active sum exceeds the fleet mean times this ratio. + ... default=2.0 + ... pattern=[0-9.]+ + ${GROWTH_RATE_WINDOW_MINUTES}= RW.Core.Import User Variable GROWTH_RATE_WINDOW_MINUTES + ... type=string + ... description=Lookback window in minutes for Prometheus range queries on connection growth. + ... default=15 + ... pattern=\w* + ${CONNECTION_GROWTH_RATE_THRESHOLD}= RW.Core.Import User Variable CONNECTION_GROWTH_RATE_THRESHOLD + ... type=string + ... description=Average rate of client connections (per second) above which a growth issue is raised. + ... default=0.1 + ... pattern=[0-9.]+ + ${KUBERNETES_DISTRIBUTION_BINARY}= RW.Core.Import User Variable KUBERNETES_DISTRIBUTION_BINARY + ... type=string + ... description=Kubernetes CLI binary to use for optional kubectl checks. + ... default=kubectl + ... pattern=\w* + ${PGBOUNCER_NAMESPACE}= RW.Core.Import User Variable PGBOUNCER_NAMESPACE + ... type=string + ... description=Namespace containing a PgBouncer pod for optional pool_mode.ini inspection via kubectl. + ... default= + ... pattern=.* + ${PGBOUNCER_POD_LABEL_SELECTOR}= RW.Core.Import User Variable PGBOUNCER_POD_LABEL_SELECTOR + ... type=string + ... description=Label selector used to locate a PgBouncer pod for optional pool mode verification. + ... default=app.kubernetes.io/name=pgbouncer-exporter + ... 
pattern=.* + ${PGBOUNCER_PGBOUNCER_CONTAINER}= RW.Core.Import User Variable PGBOUNCER_PGBOUNCER_CONTAINER + ... type=string + ... description=Optional container name for kubectl exec when the pod has multiple containers. + ... default= + ... pattern=.* + ${APP_REPLICAS}= RW.Core.Import User Variable APP_REPLICAS + ... type=string + ... description=Application replica count for capacity SLI (optional). + ... default= + ... pattern=.* + ${APP_DB_POOL_SIZE}= RW.Core.Import User Variable APP_DB_POOL_SIZE + ... type=string + ... description=Per-app SQL pool size for capacity SLI (optional). + ... default= + ... pattern=.* + ${PGBOUNCER_REPLICAS}= RW.Core.Import User Variable PGBOUNCER_REPLICAS + ... type=string + ... description=PgBouncer deployment replica count for capacity SLI (optional). + ... default= + ... pattern=.* + + Set Suite Variable ${PROMETHEUS_URL} ${PROMETHEUS_URL} + Set Suite Variable ${PGBOUNCER_JOB_LABEL} ${PGBOUNCER_JOB_LABEL} + Set Suite Variable ${EXPECTED_POOL_MODE} ${EXPECTED_POOL_MODE} + Set Suite Variable ${CONTEXT} ${CONTEXT} + Set Suite Variable ${METRIC_NAMESPACE_FILTER} ${METRIC_NAMESPACE_FILTER} + Set Suite Variable ${CLIENT_SATURATION_PERCENT_THRESHOLD} ${CLIENT_SATURATION_PERCENT_THRESHOLD} + Set Suite Variable ${MAX_WAIT_SECONDS_THRESHOLD} ${MAX_WAIT_SECONDS_THRESHOLD} + Set Suite Variable ${CLIENT_WAITING_THRESHOLD} ${CLIENT_WAITING_THRESHOLD} + Set Suite Variable ${DATABASE_HOTSPOT_PERCENT_THRESHOLD} ${DATABASE_HOTSPOT_PERCENT_THRESHOLD} + Set Suite Variable ${POD_OUTLIER_RATIO} ${POD_OUTLIER_RATIO} + Set Suite Variable ${GROWTH_RATE_WINDOW_MINUTES} ${GROWTH_RATE_WINDOW_MINUTES} + Set Suite Variable ${CONNECTION_GROWTH_RATE_THRESHOLD} ${CONNECTION_GROWTH_RATE_THRESHOLD} + Set Suite Variable ${KUBERNETES_DISTRIBUTION_BINARY} ${KUBERNETES_DISTRIBUTION_BINARY} + Set Suite Variable ${PGBOUNCER_NAMESPACE} ${PGBOUNCER_NAMESPACE} + Set Suite Variable ${PGBOUNCER_POD_LABEL_SELECTOR} ${PGBOUNCER_POD_LABEL_SELECTOR} + Set Suite Variable 
${PGBOUNCER_PGBOUNCER_CONTAINER} ${PGBOUNCER_PGBOUNCER_CONTAINER} + Set Suite Variable ${APP_REPLICAS} ${APP_REPLICAS} + Set Suite Variable ${APP_DB_POOL_SIZE} ${APP_DB_POOL_SIZE} + Set Suite Variable ${PGBOUNCER_REPLICAS} ${PGBOUNCER_REPLICAS} + + ${env}= Create Dictionary + ... PROMETHEUS_URL=${PROMETHEUS_URL} + ... PGBOUNCER_JOB_LABEL=${PGBOUNCER_JOB_LABEL} + ... EXPECTED_POOL_MODE=${EXPECTED_POOL_MODE} + ... CONTEXT=${CONTEXT} + ... METRIC_NAMESPACE_FILTER=${METRIC_NAMESPACE_FILTER} + ... CLIENT_SATURATION_PERCENT_THRESHOLD=${CLIENT_SATURATION_PERCENT_THRESHOLD} + ... MAX_WAIT_SECONDS_THRESHOLD=${MAX_WAIT_SECONDS_THRESHOLD} + ... CLIENT_WAITING_THRESHOLD=${CLIENT_WAITING_THRESHOLD} + ... DATABASE_HOTSPOT_PERCENT_THRESHOLD=${DATABASE_HOTSPOT_PERCENT_THRESHOLD} + ... POD_OUTLIER_RATIO=${POD_OUTLIER_RATIO} + ... GROWTH_RATE_WINDOW_MINUTES=${GROWTH_RATE_WINDOW_MINUTES} + ... CONNECTION_GROWTH_RATE_THRESHOLD=${CONNECTION_GROWTH_RATE_THRESHOLD} + ... KUBERNETES_DISTRIBUTION_BINARY=${KUBERNETES_DISTRIBUTION_BINARY} + ... PGBOUNCER_NAMESPACE=${PGBOUNCER_NAMESPACE} + ... PGBOUNCER_POD_LABEL_SELECTOR=${PGBOUNCER_POD_LABEL_SELECTOR} + ... PGBOUNCER_PGBOUNCER_CONTAINER=${PGBOUNCER_PGBOUNCER_CONTAINER} + ... APP_REPLICAS=${APP_REPLICAS} + ... APP_DB_POOL_SIZE=${APP_DB_POOL_SIZE} + ... PGBOUNCER_REPLICAS=${PGBOUNCER_REPLICAS} + ... PROMETHEUS_BEARER_TOKEN=${prometheus_token_value} + ... KUBECONFIG=${kubeconfig_path} + Set Suite Variable ${env} ${env}