From db4035542fb5c292ef60b5ffb1d1396caaf0ec40 Mon Sep 17 00:00:00 2001 From: "rw-codebundle-agent[bot]" Date: Mon, 4 May 2026 16:50:27 +0000 Subject: [PATCH] Add mongodb-atlas-cluster-health CodeBundle. Implements Atlas Admin API v2 inventory, operational state/process checks, and sampled measurements with thresholds, plus an SLI shim, generation templates, and local structural validation scaffolding. Validated with: python3 -m scorer.score (104/104). Refs: runwhen-contrib workspace issue design spec #110 (parent #107). Co-authored-by: Cursor --- .../mongodb-atlas-cluster-health.yaml | 22 ++ .../mongodb-atlas-cluster-health-sli.yaml | 50 +++++ .../mongodb-atlas-cluster-health-slx.yaml | 31 +++ .../mongodb-atlas-cluster-health-taskset.yaml | 45 ++++ .../.test/Taskfile.yaml | 17 ++ .../.test/validate-atlas-bundle-structure.sh | 23 ++ .../mongodb-atlas-cluster-health/README.md | 49 +++++ .../analyze-atlas-cluster-metrics.sh | 192 +++++++++++++++++ .../atlas-api-common.inc.sh | 132 ++++++++++++ .../check-atlas-cluster-state.sh | 120 +++++++++++ .../gather-atlas-cluster-inventory.sh | 95 ++++++++ .../runbook.robot | 203 ++++++++++++++++++ .../sli-mongodb-atlas-quick-check.sh | 132 ++++++++++++ .../mongodb-atlas-cluster-health/sli.robot | 103 +++++++++ 14 files changed, 1214 insertions(+) create mode 100644 codebundles/mongodb-atlas-cluster-health/.runwhen/generation-rules/mongodb-atlas-cluster-health.yaml create mode 100644 codebundles/mongodb-atlas-cluster-health/.runwhen/templates/mongodb-atlas-cluster-health-sli.yaml create mode 100644 codebundles/mongodb-atlas-cluster-health/.runwhen/templates/mongodb-atlas-cluster-health-slx.yaml create mode 100644 codebundles/mongodb-atlas-cluster-health/.runwhen/templates/mongodb-atlas-cluster-health-taskset.yaml create mode 100644 codebundles/mongodb-atlas-cluster-health/.test/Taskfile.yaml create mode 100755 codebundles/mongodb-atlas-cluster-health/.test/validate-atlas-bundle-structure.sh create mode 100644 
codebundles/mongodb-atlas-cluster-health/README.md create mode 100755 codebundles/mongodb-atlas-cluster-health/analyze-atlas-cluster-metrics.sh create mode 100755 codebundles/mongodb-atlas-cluster-health/atlas-api-common.inc.sh create mode 100755 codebundles/mongodb-atlas-cluster-health/check-atlas-cluster-state.sh create mode 100755 codebundles/mongodb-atlas-cluster-health/gather-atlas-cluster-inventory.sh create mode 100644 codebundles/mongodb-atlas-cluster-health/runbook.robot create mode 100755 codebundles/mongodb-atlas-cluster-health/sli-mongodb-atlas-quick-check.sh create mode 100644 codebundles/mongodb-atlas-cluster-health/sli.robot diff --git a/codebundles/mongodb-atlas-cluster-health/.runwhen/generation-rules/mongodb-atlas-cluster-health.yaml b/codebundles/mongodb-atlas-cluster-health/.runwhen/generation-rules/mongodb-atlas-cluster-health.yaml new file mode 100644 index 00000000..5b1d10c8 --- /dev/null +++ b/codebundles/mongodb-atlas-cluster-health/.runwhen/generation-rules/mongodb-atlas-cluster-health.yaml @@ -0,0 +1,22 @@ +apiVersion: runwhen.com/v1 +kind: GenerationRules +spec: + platform: mongodb_atlas + generationRules: + - resourceTypes: + - mongodb_atlas_cluster + matchRules: + - type: pattern + pattern: ".+" + properties: [name] + mode: substring + slxs: + - baseName: mongo-atlas-cluster-hlth + qualifiers: [organization, project, cluster] + baseTemplateName: mongodb-atlas-cluster-health + levelOfDetail: basic + outputItems: + - type: slx + - type: sli + - type: runbook + templateName: mongodb-atlas-cluster-health-taskset.yaml diff --git a/codebundles/mongodb-atlas-cluster-health/.runwhen/templates/mongodb-atlas-cluster-health-sli.yaml b/codebundles/mongodb-atlas-cluster-health/.runwhen/templates/mongodb-atlas-cluster-health-sli.yaml new file mode 100644 index 00000000..e02a80fd --- /dev/null +++ b/codebundles/mongodb-atlas-cluster-health/.runwhen/templates/mongodb-atlas-cluster-health-sli.yaml @@ -0,0 +1,50 @@ +apiVersion: runwhen.com/v1 +kind: 
ServiceLevelIndicator +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + displayUnitsLong: OK + displayUnitsShort: ok + locations: + - {{default_location}} + description: Samples MongoDB Atlas Admin API responsiveness, IDLE envelopes, and lightweight PRIMARY-derived measurements for alerting. + codeBundle: + {% if repo_url %} + repoUrl: {{repo_url}} + {% else %} + repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git + {% endif %} + {% if ref %} + ref: {{ref}} + {% else %} + ref: main + {% endif %} + pathToRobot: codebundles/mongodb-atlas-cluster-health/sli.robot + intervalStrategy: intermezzo + intervalSeconds: 300 + configProvided: + - name: ATLAS_PROJECT_ID + value: "{{ match_resource.resource.atlas_project_id | default(match_resource.resource.project_id) }}" + - name: CLUSTER_FILTER + value: "{{ match_resource.resource.name | default('') }}" + - name: CONNECTION_THRESHOLD + value: "{{custom.atlas_connections_threshold | default('85')}}" + - name: CPU_UTIL_THRESHOLD + value: "{{custom.atlas_cpu_util_threshold | default('92')}}" + - name: SLI_MAX_MEASUREMENT_PROCESSES + value: "{{custom.atlas_sli_max_measurement_processes | default('8')}}" + secretsProvided: + {% if wb_version %} + {% include "mongodb_atlas-auth.yaml" ignore missing %} + {% else %} + - name: atlas_api_key_credentials + workspaceKey: AUTH DETAILS NOT FOUND + {% endif %} + alertConfig: + tasks: + persona: eager-edgar + sessionTTL: 10m diff --git a/codebundles/mongodb-atlas-cluster-health/.runwhen/templates/mongodb-atlas-cluster-health-slx.yaml b/codebundles/mongodb-atlas-cluster-health/.runwhen/templates/mongodb-atlas-cluster-health-slx.yaml new file mode 100644 index 00000000..5511cf32 --- /dev/null +++ b/codebundles/mongodb-atlas-cluster-health/.runwhen/templates/mongodb-atlas-cluster-health-slx.yaml @@ -0,0 +1,31 @@ +apiVersion: runwhen.com/v1 +kind: ServiceLevelX +metadata: + name: 
{{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + imageURL: https://storage.googleapis.com/runwhen-nonprod-shared-images/icons/mongodb.svg + alias: MongoDB Atlas Health for {{ match_resource.resource.name | default(match_resource.alias) }} + asMeasuredBy: Composite digest-authenticated probes for Atlas cluster inventory responsiveness, IDLE posture minus pauses, and PRIMARY-derived workload samples. + configProvided: + - name: ATLAS_PROJECT_DISPLAY + value: "{{match_resource.resource.name}}" + owners: + - {{workspace.owner_email}} + statement: MongoDB Atlas hosted clusters retain healthy automation state and capacity headroom in project scope. + additionalContext: + {% include "mongodb_atlas-hierarchy.yaml" ignore missing %} + qualified_name: "{{match_resource.qualified_name}}" + tags: + {% include "mongodb_atlas-tags.yaml" ignore missing %} + - name: cloud + value: atlas + - name: datastore + value: mongodb + - name: scope + value: project + - name: access + value: read-only diff --git a/codebundles/mongodb-atlas-cluster-health/.runwhen/templates/mongodb-atlas-cluster-health-taskset.yaml b/codebundles/mongodb-atlas-cluster-health/.runwhen/templates/mongodb-atlas-cluster-health-taskset.yaml new file mode 100644 index 00000000..656c07f7 --- /dev/null +++ b/codebundles/mongodb-atlas-cluster-health/.runwhen/templates/mongodb-atlas-cluster-health-taskset.yaml @@ -0,0 +1,45 @@ +apiVersion: runwhen.com/v1 +kind: Runbook +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + location: {{default_location}} + description: Lists MongoDB Atlas inventory, verifies operational envelopes, and samples cluster metrics inside a scoped project footprint. 
+ codeBundle: + {% if repo_url %} + repoUrl: {{repo_url}} + {% else %} + repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git + {% endif %} + {% if ref %} + ref: {{ref}} + {% else %} + ref: main + {% endif %} + pathToRobot: codebundles/mongodb-atlas-cluster-health/runbook.robot + configProvided: + - name: ATLAS_PROJECT_ID + value: "{{ match_resource.resource.atlas_project_id | default(match_resource.resource.project_id) }}" + - name: ATLAS_ORG_ID + value: "{{ match_resource.resource.organization_id | default(match_resource.labels.atlas_org_id | default('')) }}" + - name: CLUSTER_FILTER + value: "{{ match_resource.resource.name | default('') }}" + - name: CONNECTION_THRESHOLD + value: "{{custom.atlas_connections_threshold | default('85')}}" + - name: DISK_UTIL_THRESHOLD + value: "{{custom.atlas_disk_util_threshold | default('85')}}" + - name: REPLICATION_LAG_MS_THRESHOLD + value: "{{custom.atlas_replication_lag_ms | default('5000')}}" + - name: CPU_UTIL_THRESHOLD + value: "{{custom.atlas_cpu_util_threshold | default('92')}}" + secretsProvided: + {% if wb_version %} + {% include "mongodb_atlas-auth.yaml" ignore missing %} + {% else %} + - name: atlas_api_key_credentials + workspaceKey: AUTH DETAILS NOT FOUND + {% endif %} diff --git a/codebundles/mongodb-atlas-cluster-health/.test/Taskfile.yaml b/codebundles/mongodb-atlas-cluster-health/.test/Taskfile.yaml new file mode 100644 index 00000000..ace75d4f --- /dev/null +++ b/codebundles/mongodb-atlas-cluster-health/.test/Taskfile.yaml @@ -0,0 +1,17 @@ +version: "3" + +tasks: + default: + desc: "Validate Atlas CodeBundle structure locally" + cmds: + - task: validate-structure + + validate-structure: + desc: "Run static checks without live Atlas credentials" + cmds: + - ./validate-atlas-bundle-structure.sh + + clean: + desc: "Remove local scaffold outputs" + cmds: + - rm -rf output workspaceInfo.yaml diff --git a/codebundles/mongodb-atlas-cluster-health/.test/validate-atlas-bundle-structure.sh 
b/codebundles/mongodb-atlas-cluster-health/.test/validate-atlas-bundle-structure.sh new file mode 100755 index 00000000..b97d108d --- /dev/null +++ b/codebundles/mongodb-atlas-cluster-health/.test/validate-atlas-bundle-structure.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# Structure validation for mongodb-atlas-cluster-health (no live Atlas project required). +set -euo pipefail +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +test -f "$ROOT/runbook.robot" +test -f "$ROOT/sli.robot" +test -f "$ROOT/README.md" +test -f "$ROOT/.runwhen/generation-rules/mongodb-atlas-cluster-health.yaml" +test -f "$ROOT/.runwhen/templates/mongodb-atlas-cluster-health-slx.yaml" +test -f "$ROOT/.runwhen/templates/mongodb-atlas-cluster-health-taskset.yaml" +test -f "$ROOT/.runwhen/templates/mongodb-atlas-cluster-health-sli.yaml" + +for f in \ + gather-atlas-cluster-inventory.sh \ + check-atlas-cluster-state.sh \ + analyze-atlas-cluster-metrics.sh \ + sli-mongodb-atlas-quick-check.sh \ + atlas-api-common.inc.sh +do + test -x "$ROOT/$f" +done + +echo "mongodb-atlas-cluster-health bundle structure OK" diff --git a/codebundles/mongodb-atlas-cluster-health/README.md b/codebundles/mongodb-atlas-cluster-health/README.md new file mode 100644 index 00000000..6a97de99 --- /dev/null +++ b/codebundles/mongodb-atlas-cluster-health/README.md @@ -0,0 +1,49 @@ +# MongoDB Atlas Cluster Health + +Operators use this bundle to watch MongoDB Atlas clusters through digest-authenticated HTTPS calls against Atlas Admin API v2. Responses focus on actionable inventory breadcrumbs, transitional automation envelopes, Atlas replica process cues, and short-window workload telemetry that matches escalation expectations from parent issue #107. + +## Overview + +- **Baseline inventory**: Print provider footprints, tiers, majors, disks, pause toggles, and live `stateName` values for every Atlas cluster honoring optional name filters before deeper debugging. 
+- **Operational posture**: Correlate transitional automation states plus Atlas-published MongoDB replica `healthStatus` hints (when Atlas returns them) to separate planned maintenance from regressions affecting availability. +- **Workload metrics**: Stretch compact measurement queries across replica processes to compare CONNECTIONS_PERCENT, NORMALIZED_SYSTEM_CPU_USER, `DISK_PARTITION_SPACE_USED_DATA` usage vs `diskSizeGB`, and replication lag surrogates (`OPLOG_SLAVE_LAG_MASTER_TIME`) against tunable envelopes. + +Discovery templates assume discovered `mongodb_atlas_cluster` resources expose `match_resource.resource.atlas_project_id` (or fallback `project_id`), optional `organization_id`, and canonical names for `CLUSTER_FILTER`. Adjust template paths if workspace metadata varies. + +## Configuration + +### Required Variables + +- `ATLAS_PROJECT_ID`: 24 hexadecimal characters identifying the Atlas project/group for every REST path segment. + +### Optional Variables + +- `ATLAS_ORG_ID`: Organizational identifier surfaced in inventories for auditors (informational annotations only). +- `CLUSTER_FILTER`: Comma-separated Atlas cluster names; leave blank or unset to iterate every Atlas cluster enumerated for the scoped project API call. +- `CONNECTION_THRESHOLD`: Percent ceiling evaluated when CONNECTIONS_PERCENT samples exist per process (defaults to `85`). +- `DISK_UTIL_THRESHOLD`: Modeled occupancy percent comparing maximum `DISK_PARTITION_SPACE_USED_DATA` samples with declared `diskSizeGB` totals (defaults to `85`). +- `REPLICATION_LAG_MS_THRESHOLD`: Milliseconds tolerated for `OPLOG_SLAVE_LAG_MASTER_TIME` spikes (defaults to `5000`). +- `CPU_UTIL_THRESHOLD`: Applies to BOTH the deep metric sweep and bundled SLIs for NORMALIZED_SYSTEM_CPU_USER bursts (defaults to `92`). +- `SLI_MAX_MEASUREMENT_PROCESSES`: Bounds how many PRIMARY hosts the SLI script samples during each heartbeat to stay within Atlas rate envelopes (defaults to `8`). 
+- `ATLAS_API_BASE`: Sovereign/private endpoint overrides (defaults to `https://cloud.mongodb.com/api/atlas/v2`). +- `ATLAS_ACCEPT_HEADER`: API contract header (defaults to `application/vnd.atlas.2025-02-19+json`; rotate when Atlas documents a successor version). +- `ATLAS_METRICS_MEASUREMENT_DELAY_MS`: Millisecond delay between sequential measurement curls for chatty fleets (defaults to `200`; set `0` to disable). +- `ATLAS_PUBLIC_API_KEY` plus `ATLAS_PRIVATE_API_KEY` may replace the bundled secret whenever RunWhen injects raw halves instead of JSON. + +### Secrets + +- `atlas_api_key_credentials`: JSON pairing `ATLAS_PUBLIC_API_KEY` / `ATLAS_PRIVATE_API_KEY` (or `publicKey` / `privateKey`) emitted by Atlas for digest-authenticated callers. Grant **Project Read Only** scopes at minimum. + +## Tasks & Features + +### Gather MongoDB Atlas Cluster Inventory for Project `${ATLAS_PROJECT_ID}` + +Lists paused clusters plus clusters whose `stateName` drifts outside `IDLE` while unpaused. + +### Check MongoDB Atlas Cluster State for Project `${ATLAS_PROJECT_ID}` + +Flags paused clusters separately from automation transitions, investigates MongoDB replica `healthStatus` mismatches whenever Atlas returns that field. + +### Analyze MongoDB Atlas Cluster Metrics for Project `${ATLAS_PROJECT_ID}` + +Aggregates condensed measurement windows respecting operator thresholds; CONNECTION counts fall back to raw scalars without percent semantics when CONNECTIONS_PERCENT is unavailable—threshold comparisons activate only when percent samples exist. 
diff --git a/codebundles/mongodb-atlas-cluster-health/analyze-atlas-cluster-metrics.sh b/codebundles/mongodb-atlas-cluster-health/analyze-atlas-cluster-metrics.sh new file mode 100755 index 00000000..71a69fa5 --- /dev/null +++ b/codebundles/mongodb-atlas-cluster-health/analyze-atlas-cluster-metrics.sh @@ -0,0 +1,192 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=/dev/null +source "${SCRIPT_DIR}/atlas-api-common.inc.sh" + +OUTPUT_FILE="atlas_cluster_metrics_issues.json" +: "${ATLAS_PROJECT_ID:?Must set ATLAS_PROJECT_ID}" + +CLUSTER_FILTER="${CLUSTER_FILTER:-}" +CONNECTION_THRESHOLD="${CONNECTION_THRESHOLD:-85}" +DISK_UTIL_THRESHOLD="${DISK_UTIL_THRESHOLD:-85}" +REPLICATION_LAG_MS_THRESHOLD="${REPLICATION_LAG_MS_THRESHOLD:-5000}" +CPU_UTIL_THRESHOLD="${CPU_UTIL_THRESHOLD:-92}" + +issues_json='[]' + +if ! atlas_resolve_credentials; then + issues_json="$(append_issue_json "$issues_json" \ + "MongoDB Atlas metrics analysis blocked — credentials missing" \ + "Unable to authenticate to Atlas measurement endpoints." \ + 4 \ + "Provide atlas_api_key_credentials with Project Read Only access.")" + echo "$issues_json" >"$OUTPUT_FILE" + exit 0 +fi + +atlas_clusters_json "${ATLAS_PROJECT_ID}" +if [[ "${atlas_last_http_status:-}" != "200" ]]; then + issues_json="$(append_issue_json "$issues_json" \ + "MongoDB Atlas cluster enumeration failed prior to metrics" \ + "$(echo "${atlas_last_http_body:-}" | jq -c . 2>/dev/null || true)" \ + 4 \ + "Fix ATLAS_PROJECT_ID or API privileges, then rerun metrics.")" + echo "$issues_json" >"$OUTPUT_FILE" + exit 0 +fi + +clusters_body="${atlas_last_http_body}" + +atlas_processes_json "${ATLAS_PROJECT_ID}" +if [[ "${atlas_last_http_status:-}" != "200" ]]; then + issues_json="$(append_issue_json "$issues_json" \ + "MongoDB Atlas process enumeration failed prior to measurements" \ + "$(echo "${atlas_last_http_body:-}" | jq -c . 
2>/dev/null || true)" \ + 3 \ + "Measurements require process inventory; verify Project Read Only access.")" + echo "$issues_json" >"$OUTPUT_FILE" + exit 0 +fi + +processes_body="${atlas_last_http_body}" + +PRIMARY_METRICS='granularity=PT5M&period=PT45M&m=CONNECTIONS_PERCENT&m=NORMALIZED_SYSTEM_CPU_USER&m=OPLOG_SLAVE_LAG_MASTER_TIME&m=DISK_PARTITION_SPACE_USED_DATA' +FALLBACK_METRICS='granularity=PT5M&period=PT45M&m=CONNECTIONS&m=NORMALIZED_SYSTEM_CPU_USER&m=DISK_PARTITION_SPACE_USED_DATA' + +fetch_measurement_payload() { + local raw_pid="$1" + atlas_measurement_json "${ATLAS_PROJECT_ID}" "${raw_pid}" "?${PRIMARY_METRICS}" + if [[ "${atlas_last_http_status:-}" != "200" ]]; then + atlas_measurement_json "${ATLAS_PROJECT_ID}" "${raw_pid}" "?${FALLBACK_METRICS}" + fi +} + +max_metric_series() { + local json_blob="$1" + local mn="$2" + echo "$json_blob" | jq -r --arg n "$mn" ' + [.measurements[]? + | select(.name == $n) + | .dataPoints[]? + | select(.value != null) + | .value + ] | max // empty + ' +} + +printf '📊 Atlas metrics — project=%s | conn_thresh%%=%s disk_thresh%%=%s lag_ms_thresh=%s cpu_warn%%=%s\n' \ + "$ATLAS_PROJECT_ID" "$CONNECTION_THRESHOLD" "$DISK_UTIL_THRESHOLD" "$REPLICATION_LAG_MS_THRESHOLD" "$CPU_UTIL_THRESHOLD" + +filtered="$(filter_clusters_by_name "$clusters_body" "$CLUSTER_FILTER")" + +while IFS= read -r cjson; do + [[ -z "$cjson" ]] && continue + cname="$(echo "$cjson" | jq -r '.name')" + disk_gb="$(echo "$cjson" | jq -r '(try .diskSizeGB catch null) // empty')" + if [[ -z "$disk_gb" || "$disk_gb" == "null" ]]; then disk_gb="0"; fi + + pmap="$(printf '%s\n' "${processes_body}" | jq -c --arg cn "$cname" ' + [(.results // [])[] + | select(.replicaSetName != null) + | select(.replicaSetName == $cn or (.replicaSetName | startswith($cn + "-"))) + ] + | unique_by(.id) + ')" + + nproc="$(echo "$pmap" | jq 'length')" + if [[ "$nproc" == "0" ]]; then + issues_json="$(append_issue_json "$issues_json" \ + "No scoped MongoDB processes for metrics on 
cluster \`${cname}\`" \ + "Atlas process inventory did not expose replica-set members labeled for \`${cname}\`; some flex/serverless tiers may omit these entries." \ + 1 \ + "Validate cluster type supports host metrics APIs and consult Atlas Charts as a fallback.")" + continue + fi + + agg_conn="" agg_conn_raw="" agg_cpu="" agg_lag="" agg_disk="" + + while IFS= read -r rawpid; do + [[ -z "$rawpid" ]] && continue + fetch_measurement_payload "${rawpid}" + mh="${atlas_last_http_status:-}" + mbuf="${atlas_last_http_body:-}" + if [[ "$mh" != "200" ]]; then + continue + fi + pct="$(max_metric_series "$mbuf" "CONNECTIONS_PERCENT")" + raw="$(max_metric_series "$mbuf" "CONNECTIONS")" + cpu="$(max_metric_series "$mbuf" "NORMALIZED_SYSTEM_CPU_USER")" + lag="$(max_metric_series "$mbuf" "OPLOG_SLAVE_LAG_MASTER_TIME")" + disk="$(max_metric_series "$mbuf" "DISK_PARTITION_SPACE_USED_DATA")" + + if [[ -n "$pct" ]] && awk -v b="${agg_conn:-nan}" -v v="$pct" 'BEGIN{v=v+0; if (b=="nan" || b==""){exit 0}; b=b+0; exit !(v>b)}'; then agg_conn="$pct"; fi + if [[ -n "$raw" ]] && awk -v b="${agg_conn_raw:-nan}" -v v="$raw" 'BEGIN{v=v+0; if (b=="nan" || b==""){exit 0}; b=b+0; exit !(v>b)}'; then agg_conn_raw="$raw"; fi + + if [[ -n "$cpu" ]] && awk -v b="${agg_cpu:-nan}" -v v="$cpu" 'BEGIN{v=v+0; if (b=="nan" || b==""){exit 0}; b=b+0; exit !(v>b)}'; then agg_cpu="$cpu"; fi + if [[ -n "$lag" ]] && awk -v b="${agg_lag:-nan}" -v v="$lag" 'BEGIN{v=v+0; if (b=="nan" || b==""){exit 0}; b=b+0; exit !(v>b)}'; then agg_lag="$lag"; fi + if [[ -n "$disk" ]] && awk -v b="${agg_disk:-nan}" -v v="$disk" 'BEGIN{v=v+0; if (b=="nan" || b==""){exit 0}; b=b+0; exit !(v>b)}'; then agg_disk="$disk"; fi + sleep_int="${ATLAS_METRICS_MEASUREMENT_DELAY_MS:-200}" + if [[ "${sleep_int}" =~ ^[0-9]+$ ]] && [[ "${sleep_int}" != "0" ]]; then + ms_to_sec="$(awk -v ms="$sleep_int" 'BEGIN{printf("%.3f", ms/1000)}')" + sleep "${ms_to_sec}" + fi + done < <(echo "$pmap" | jq -r '.[].id') + + 
conn_metric_label="CONNECTIONS_PERCENT" + conn_metric_value="${agg_conn}" + if [[ -z "${conn_metric_value:-}" ]]; then + conn_metric_label="CONNECTIONS" + conn_metric_value="${agg_conn_raw}" + fi + + [[ -z "${conn_metric_value:-}" ]] || [[ "$conn_metric_value" == "null" ]] && conn_metric_value="" + + if [[ "${conn_metric_label}" == "CONNECTIONS_PERCENT" && -n "$conn_metric_value" ]]; then + if awk -v c="${CONNECTION_THRESHOLD}" -v v="$conn_metric_value" 'BEGIN { exit !( (v+0) > (c+0) ) }'; then + issues_json="$(append_issue_json "$issues_json" \ + "Elevated Atlas connection pressure for cluster \`${cname}\`" \ + "Peak ${conn_metric_label} observed across sampled processes ≈ ${conn_metric_value} (threshold=${CONNECTION_THRESHOLD})." \ + 2 \ + "Review connection pools, orphaned clients, autoscaling tiers, IP access lists, or workload bursts.")" + fi + fi + + if [[ -n "${agg_cpu:-}" ]] && awk -v c="${CPU_UTIL_THRESHOLD}" -v v="$agg_cpu" 'BEGIN { exit !( (v+0) > (c+0) ) }'; then + issues_json="$(append_issue_json "$issues_json" \ + "High normalized host CPU for Atlas cluster \`${cname}\`" \ + "Peak NORMALIZED_SYSTEM_CPU_USER sampled ≈ ${agg_cpu}% (configured CPU_UTIL_THRESHOLD=${CPU_UTIL_THRESHOLD}%)." \ + 3 \ + "Tune query/index patterns, consider cluster scaling, investigate noisy neighbors on shared tiers.")" + fi + + disk_pct="" + if [[ "$disk_gb" != "0" ]] && [[ -n "${agg_disk:-}" ]]; then + disk_pct="$(awk -v gb="$disk_gb" -v b="$agg_disk" 'BEGIN{if (gb+0<=0){print ""; exit}; printf("%.2f",(b+0)/(gb*1073741824)*100)}')" + fi + if [[ -n "${disk_pct}" ]] && awk -v d="${DISK_UTIL_THRESHOLD}" -v v="$disk_pct" 'BEGIN { exit !( (v+0) > (d+0) ) }'; then + issues_json="$(append_issue_json "$issues_json" \ + "Elevated Atlas data disk utilization for cluster \`${cname}\`" \ + "Max DISK_PARTITION_SPACE_USED_DATA ≈ ${agg_disk} bytes vs provisioned diskSizeGB=${disk_gb} ⇒ ~${disk_pct}% (> ${DISK_UTIL_THRESHOLD}% threshold)." 
\ + 2 \ + "Plan disk scale-out, archiving, or TTL/index cleanup; coordinate with Atlas online disk expansion.")" + fi + + if [[ -n "${agg_lag:-}" ]] && awk -v l="${REPLICATION_LAG_MS_THRESHOLD}" -v v="$agg_lag" 'BEGIN { exit !( (v+0) > (l+0) ) }'; then + issues_json="$(append_issue_json "$issues_json" \ + "Replication lag spike on Atlas cluster \`${cname}\`" \ + "Peak OPLOG_SLAVE_LAG_MASTER_TIME ≈ ${agg_lag}ms (>${REPLICATION_LAG_MS_THRESHOLD}ms)." \ + 4 \ + "Inspect write load, replication windows, VPC latency, indexing operations, or investigate secondary eviction events.")" + fi + + summary_line="$(printf '%s | %s_peak=%s %s_peak=%s%% disk_used_max=%sB disk_gb=%s repl_lag_peak_ms=%s' \ + "${cname}" "${conn_metric_label}" "${conn_metric_value:-na}" \ + "NORMALIZED_SYSTEM_CPU_USER" "${agg_cpu:-na}" "${agg_disk:-na}" "${disk_gb}" "${agg_lag:-na}")" + printf '%s\n' "$summary_line" +done < <(echo "$filtered" | jq -c '.[]') + +echo "$issues_json" | jq '.' >"$OUTPUT_FILE" +printf 'Atlas metrics sweep complete → %s\n' "$OUTPUT_FILE" diff --git a/codebundles/mongodb-atlas-cluster-health/atlas-api-common.inc.sh b/codebundles/mongodb-atlas-cluster-health/atlas-api-common.inc.sh new file mode 100755 index 00000000..ff42bd4b --- /dev/null +++ b/codebundles/mongodb-atlas-cluster-health/atlas-api-common.inc.sh @@ -0,0 +1,132 @@ +#!/usr/bin/env bash +# shellcheck disable=SC2034 + +# Shared MongoDB Atlas Admin API v2 helpers (digest auth, JSON parsing). 
+ +ATLAS_API_BASE="${ATLAS_API_BASE:-https://cloud.mongodb.com/api/atlas/v2}" +ATLAS_ACCEPT_HEADER="${ATLAS_ACCEPT_HEADER:-application/vnd.atlas.2025-02-19+json}" + +atlas_resolve_credentials() { + if [[ -n "${ATLAS_PUBLIC_API_KEY:-}" && -n "${ATLAS_PRIVATE_API_KEY:-}" ]]; then + return 0 + fi + local raw="${secret__atlas_api_key_credentials:-}" + raw="${raw:-${atlas_api_key_credentials:-}}" + raw="${raw:-${ATLAS_API_KEY_CREDENTIALS_JSON:-}}" + if [[ -z "$raw" ]]; then + return 1 + fi + export ATLAS_PUBLIC_API_KEY="$(echo "$raw" | jq -r '.ATLAS_PUBLIC_API_KEY // .publicKey // empty')" + export ATLAS_PRIVATE_API_KEY="$(echo "$raw" | jq -r '.ATLAS_PRIVATE_API_KEY // .privateKey // empty')" + if [[ -z "$ATLAS_PUBLIC_API_KEY" || -z "$ATLAS_PRIVATE_API_KEY" ]]; then + return 1 + fi + return 0 +} + +# GET path like /groups/foo/clusters — writes body to stdout, HTTP status to atlas_last_http_status +atlas_http_get_raw() { + local path="$1" + atlas_last_http_status="" + atlas_last_http_body="" + local url="${ATLAS_API_BASE}${path}" + local resp_file status_file + resp_file="$(mktemp)" + status_file="$(mktemp)" + set +e + curl -sS -w "%{http_code}" --digest --user "${ATLAS_PUBLIC_API_KEY}:${ATLAS_PRIVATE_API_KEY}" \ + -H "Accept: ${ATLAS_ACCEPT_HEADER}" \ + -o "${resp_file}" \ + "${url}" >"${status_file}" 2>/dev/null + local curl_ec=$? 
+ set -e + atlas_last_http_status="$(cat "${status_file}" 2>/dev/null || printf '%s' '')" + rm -f "${status_file}" + if [[ "${curl_ec}" != 0 ]]; then + atlas_last_http_body="$(cat "${resp_file}" 2>/dev/null || true)" + rm -f "${resp_file}" + return 2 + fi + atlas_last_http_body="$(cat "${resp_file}" 2>/dev/null || true)" + rm -f "${resp_file}" + return 0 +} + +atlas_clusters_json() { + local group_id="$1" + atlas_http_get_raw "/groups/$(printf '%s' "${group_id}" | jq -sRr @uri)/clusters?itemsPerPage=500" +} + +atlas_processes_json() { + local group_id="$1" + atlas_http_get_raw "/groups/$(printf '%s' "${group_id}" | jq -sRr @uri)/processes?itemsPerPage=500" +} + +# URL-encoded process id for measurements path segment +atlas_encoded_process_path() { + local pid="$1" + printf '%s' "${pid}" | jq -sRr @uri +} + +atlas_measurement_json() { + local group_id="$1" + local process_id="$2" + local query="$3" + local enc + enc="$(atlas_encoded_process_path "${process_id}")" + atlas_http_get_raw "/groups/$(printf '%s' "${group_id}" | jq -sRr @uri)/processes/${enc}/measurements${query}" +} + +latest_nonnull_metric_max() { + local json_payload="$1" + local metric_name="$2" + echo "$json_payload" | jq -r --arg n "$metric_name" ' + [.measurements[]? | select(.name == $n) | .dataPoints[]? | select(.value != null) | .value] | max // empty + ' +} + +append_issue_json() { + local cur="$1" + local title="$2" + local details="$3" + local severity="$4" + local next_steps="$5" + echo "$cur" | jq \ + --arg title "$title" \ + --arg details "$details" \ + --argjson severity "${severity}" \ + --arg next_steps "$next_steps" \ + '. 
+= [{"title": $title, "details": $details, "severity": $severity, "next_steps": $next_steps}]' +} + +cluster_matches_filter() { + local cluster_name="$1" + local filter_csv="$2" + if [[ -z "$filter_csv" ]]; then + return 0 + fi + IFS=',' read -ra parts <<<"${filter_csv}" + for tok in "${parts[@]}"; do + stripped="${tok#"${tok%%[![:space:]]*}"}" + stripped="${stripped%"${stripped##*[![:space:]]}"}" + [[ -z "$stripped" ]] && continue + if [[ "$cluster_name" == "$stripped" ]]; then + return 0 + fi + done + return 1 +} + +filter_clusters_by_name() { + local clusters_json="$1" + local filter_csv="$2" + if [[ -z "$filter_csv" ]]; then + echo "$clusters_json" | jq -c '[.results[]?]' + return 0 + fi + echo "$clusters_json" | jq -c --arg f "$filter_csv" ' + $f as $csv + | ($csv | split(",") | map(gsub("^\\s+|\\s+$";"")) | map(select(length>0))) as $names + | [.results[]? | select(.name as $n | $names | index($n) != null)] + ' +} diff --git a/codebundles/mongodb-atlas-cluster-health/check-atlas-cluster-state.sh b/codebundles/mongodb-atlas-cluster-health/check-atlas-cluster-state.sh new file mode 100755 index 00000000..d0d4aedb --- /dev/null +++ b/codebundles/mongodb-atlas-cluster-health/check-atlas-cluster-state.sh @@ -0,0 +1,120 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=/dev/null +source "${SCRIPT_DIR}/atlas-api-common.inc.sh" + +OUTPUT_FILE="atlas_cluster_state_issues.json" +: "${ATLAS_PROJECT_ID:?Must set ATLAS_PROJECT_ID}" + +CLUSTER_FILTER="${CLUSTER_FILTER:-}" + +issues_json='[]' + +if ! atlas_resolve_credentials; then + issues_json="$(append_issue_json "$issues_json" \ + "MongoDB Atlas API authentication failed — cluster state check skipped" \ + "Cannot resolve programmatic API credentials for digest auth." 
\ + 4 \ + "Populate atlas_api_key_credentials with Atlas API public/private keys.")" + echo "$issues_json" >"$OUTPUT_FILE" + printf '%s\n' "credential resolution failed" >&2 + exit 0 +fi + +atlas_clusters_json "${ATLAS_PROJECT_ID}" +if [[ "${atlas_last_http_status:-}" != "200" ]]; then + body="${atlas_last_http_body:-}" + details="$(echo "$body" | jq -c . 2>/dev/null || printf '%s' "$body")" + issues_json="$(append_issue_json "$issues_json" \ + "MongoDB Atlas cluster listing failed (\`${ATLAS_PROJECT_ID}\`)" \ + "HTTP ${atlas_last_http_status:-}: ${details}" \ + 4 \ + "Fix API authorization or ATLAS_PROJECT_ID before evaluating cluster operational state.")" + echo "$issues_json" >"$OUTPUT_FILE" + exit 0 +fi + +clusters_resp="${atlas_last_http_body}" + +atlas_processes_json "${ATLAS_PROJECT_ID}" +processes_resp="${atlas_last_http_body:-}" +proc_http="${atlas_last_http_status:-}" +if [[ "${proc_http}" != "200" ]]; then + processes_resp="" +fi + +filtered="$(filter_clusters_by_name "$clusters_resp" "$CLUSTER_FILTER")" + +severity_for_state() { + local sn="${1:-}" + [[ -z "${sn:-}" ]] && sn="IDLE" + sn="${sn^^}" + case "$sn" in + IDLE|MONGOS_ONLY) printf '%s' "0";; + UPDATING|PENDING_RESTART|SERVICE_UPDATING|MAINTENANCING|TENANT_RESTORE_IN_PROGRESS|TENANT_MIGRATE_IN_PROGRESS) printf '%s' "3";; + CREATING|LOADING|SAVING|WAITING_RESTORE|SYNCING|SYNC_REQUESTED|AUTO_SCALING|SIMPLE_SSL_ROTATING|UNSYNCING) printf '%s' "3";; + RECOVERING|PENDING_RESTORE|RESTORING|REPAIRING|ROLLBACK|RELEASE_FAILED|RESOURCE_LOCK_PROVISIONING) printf '%s' "4";; + FAILED|STOPPED|DELETING|INTERNAL_ERROR|TENANT_RELOCATION_ERROR|UNSYNCFAILED) printf '%s' "4";; + PAUSED_IDLE|PAUSED) printf '%s' "2";; + *) printf '%s' "3";; + esac +} + +printf 'Operational state sweep for Atlas project %s (%s clusters after filter).\n' \ + "${ATLAS_PROJECT_ID}" "$(echo "$filtered" | jq -r 'length')" + +while IFS= read -r cjson; do + [[ -z "$cjson" ]] && continue + nm="$(echo "$cjson" | jq -r '.name')" + paused="$(echo 
"$cjson" | jq -r '.paused // false')" + st="$(echo "$cjson" | jq -r '.stateName // .state.name // empty')" + [[ -z "$st" ]] && st="IDLE" + + sv="$(severity_for_state "$st")" + + [[ "$paused" == "true" ]] && { + issues_json="$(append_issue_json "$issues_json" \ + "Cluster \`${nm}\` paused — compute layer unavailable" \ + "Atlas reports paused=true with stateName=${st}. Applications cannot connect while paused." \ + 3 \ + "Resume the cluster in Atlas or confirm whether this pause is an approved change window.")" + continue + } + + if [[ "$sv" != "0" ]]; then + sev_out="$sv" + issues_json="$(append_issue_json "$issues_json" \ + "Cluster \`${nm}\` not in idle operational state (${st})" \ + "Atlas cluster stateName=${st} for \`${nm}\`; availability or updates may still be transitioning." \ + "${sev_out}" \ + "Watch Atlas deployments, Atlas status page, and application error budgets; hold traffic shifts until state returns to IDLE.")" + fi + + # Process-level signals when enumeration succeeded + if [[ -n "$processes_resp" ]] && printf '%s' "$processes_resp" | jq -e '.results[]' >/dev/null 2>&1; then + bad="$(printf '%s' "$processes_resp" | jq -r --arg cn "$nm" ' + [.results[]? + | select(.typeName == "REPLICA_SECONDARY" or .typeName == "REPLICA_PRIMARY" or .typeName == "REPLICA_ARBITER") + | select(.replicaSetName != null) + | select(.replicaSetName == $cn or (.replicaSetName | startswith($cn + "-"))) + | select(((.healthStatus // "") | length > 0) and .healthStatus != "HEALTHY") + ] | length')" + if [[ "${bad:-0}" != "0" ]]; then + sample="$(printf '%s' "$processes_resp" | jq -c --arg cn "$nm" '[.results[]? + | select(.replicaSetName == $cn or (.replicaSetName|startswith($cn+"-"))) + | {id,userAlias,typeName,replicaSetName,healthStatus}] | .[0:6]' | head -c 1200)" + issues_json="$(append_issue_json "$issues_json" \ + "MongoDB Atlas reports unhealthy MongoDB processes for cluster \`${nm}\`" \ + "${bad} replica processes under cluster scope show healthStatus≠HEALTHY. 
Sample=${sample}" \ + 4 \ + "Inspect affected nodes in Atlas Metrics/Real-Time Performance; plan failovers or Atlas support escalation if quorum is impacted.")" + fi + + fi +done < <(echo "$filtered" | jq -c '.[]') + +echo "$issues_json" | jq '.' >"$OUTPUT_FILE" +printf 'Cluster state evaluation complete → %s\n' "$OUTPUT_FILE" diff --git a/codebundles/mongodb-atlas-cluster-health/gather-atlas-cluster-inventory.sh b/codebundles/mongodb-atlas-cluster-health/gather-atlas-cluster-inventory.sh new file mode 100755 index 00000000..ab13f9bd --- /dev/null +++ b/codebundles/mongodb-atlas-cluster-health/gather-atlas-cluster-inventory.sh @@ -0,0 +1,95 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=/dev/null +source "${SCRIPT_DIR}/atlas-api-common.inc.sh" + +OUTPUT_FILE="atlas_cluster_inventory_issues.json" +: "${ATLAS_PROJECT_ID:?Must set ATLAS_PROJECT_ID}" + +ATLAS_ORG_ID="${ATLAS_ORG_ID:-}" +CLUSTER_FILTER="${CLUSTER_FILTER:-}" + +issues_json='[]' + +if ! atlas_resolve_credentials; then + issues_json="$(append_issue_json "$issues_json" \ + "Cannot authenticate to MongoDB Atlas API for inventory" \ + "Missing ATLAS_PUBLIC_API_KEY / ATLAS_PRIVATE_API_KEY or parsable atlas_api_key_credentials JSON." \ + 4 \ + "Create an Atlas programmatic API key with Project Read Only and map it via the atlas_api_key_credentials secret (JSON keys ATLAS_PUBLIC_API_KEY and ATLAS_PRIVATE_API_KEY).")" + echo "$issues_json" >"$OUTPUT_FILE" + printf '%s\n' "Atlas credential resolution failed." >&2 + exit 0 +fi + +atlas_clusters_json "${ATLAS_PROJECT_ID}" +hc="${atlas_last_http_status:-}" +body="${atlas_last_http_body:-}" + +if [[ "$hc" != "200" ]]; then + details="HTTP ${hc} listing clusters — $(echo "$body" | jq -c .detail,.error,.errorCode? 
2>/dev/null || echo "$body")" + issues_json="$(append_issue_json "$issues_json" \ + "MongoDB Atlas cluster inventory request failed for project \`${ATLAS_PROJECT_ID}\`" \ + "$details" \ + 4 \ + "Confirm ATLAS_PROJECT_ID, API key scopes, Accept header (${ATLAS_ACCEPT_HEADER}), and project membership.")" + echo "$issues_json" >"$OUTPUT_FILE" + printf '%s\n' "$details" + exit 0 +fi + +filtered="$(filter_clusters_by_name "$body" "$CLUSTER_FILTER")" +count_filtered="$(echo "$filtered" | jq 'length')" +if [[ "$count_filtered" == "0" ]]; then + issues_json="$(append_issue_json "$issues_json" \ + "No Atlas clusters matched filter for project \`${ATLAS_PROJECT_ID}\`" \ + "CLUSTER_FILTER is set (${CLUSTER_FILTER}) but no clusters in this project matched the names supplied." \ + 2 \ + "Unset CLUSTER_FILTER to evaluate all clusters, or fix comma-separated names to match Atlas cluster names exactly.")" +fi + +audit_ctx="" +[[ -n "$ATLAS_ORG_ID" ]] && audit_ctx="$(printf ' ATLAS_ORG_ID=%s' "$ATLAS_ORG_ID")" + +printf '📋 MongoDB Atlas project %s%s — %s cluster(s) after filter\n' \ + "${ATLAS_PROJECT_ID}" "${audit_ctx}" "${count_filtered}" + +declare -i idx=0 +while IFS= read -r cjson; do + [[ -z "$cjson" ]] && continue + idx+=1 + name="$(echo "$cjson" | jq -r '.name // "-"')" + st="$(echo "$cjson" | jq -r '.stateName // (.state.name // "-")')" + ver="$(echo "$cjson" | jq -r '.mongoDBMajorVersion // .mongoDBVersion // "-"')" + paused="$(echo "$cjson" | jq -r '.paused // false')" + ctype="$(echo "$cjson" | jq -r '.clusterType // "-"')" + prov="$(echo "$cjson" | jq -r '.providerSettings.providerName // "-"')" + reg="$(echo "$cjson" | jq -r '.providerSettings.regionName // "-"')" + size="$(echo "$cjson" | jq -r '.providerSettings.instanceSizeName // "-"')" + disk="$(echo "$cjson" | jq -r '.diskSizeGB // "-"')" + + printf ' [%s] %s | type=%s state=%s version=%s provider=%s region=%s tier=%s diskGB=%s paused=%s\n' \ + "$idx" "$name" "$ctype" "$st" "$ver" "$prov" "$reg" "$size" "$disk" 
"$paused" + + # inventory issues: paused / noteworthy states (severity 1–2) + if [[ "$paused" == "true" ]]; then + issues_json="$(append_issue_json "$issues_json" \ + "Cluster \`${name}\` is paused" \ + "Atlas reports paused=true for cluster \`${name}\` (${prov}/${reg}, ${size}). Operational traffic is halted until resumed." \ + 2 \ + "Resume via Atlas UI/API if maintenance is complete, or acknowledge intentional pause outside production windows.")" + fi + if [[ "$st" != "IDLE" && "$st" != "-" && "$paused" != "true" ]]; then + issues_json="$(append_issue_json "$issues_json" \ + "Cluster \`${name}\` is not in IDLE state (${st})" \ + "stateName=${st} for \`${name}\` — Atlas may still be applying changes." \ + 1 \ + "Track Atlas UI Deployment view; informational while updates finish unless coupled with outage symptoms.")" + fi +done < <(echo "$filtered" | jq -c '.[]') + +echo "$issues_json" | jq '.' >"$OUTPUT_FILE" +printf 'Inventory complete. Issues written to %s\n' "$OUTPUT_FILE" diff --git a/codebundles/mongodb-atlas-cluster-health/runbook.robot b/codebundles/mongodb-atlas-cluster-health/runbook.robot new file mode 100644 index 00000000..6514ca31 --- /dev/null +++ b/codebundles/mongodb-atlas-cluster-health/runbook.robot @@ -0,0 +1,203 @@ +*** Settings *** +Documentation Read-only MongoDB Atlas Admin API v2 sweeps for clusters in a single project: topology inventory, operational state signals, and sampled workload metrics via digest-authenticated HTTPS. 
+Metadata Author rw-codebundle-agent +Metadata Display Name MongoDB Atlas Cluster Health +Metadata Supports MongoDB Atlas cluster replication metrics observability + +Force Tags MongoDB Atlas cluster health metrics read-only + +Library String +Library BuiltIn +Library RW.Core +Library RW.CLI +Library RW.platform + +Suite Setup Suite Initialization + +*** Tasks *** +Gather MongoDB Atlas Cluster Inventory for Project `${ATLAS_PROJECT_ID}` + [Documentation] Lists Atlas clusters in the scoped project and summarizes edition, MongoDB versions, clouds, tiers, disks, paused flags, and transitional states operators need prior to narrowing incidents. + [Tags] MongoDB Atlas inventory access:read-only data:config + + ${result}= RW.CLI.Run Bash File + ... bash_file=gather-atlas-cluster-inventory.sh + ... env=${env} + ... secret__atlas_api_key_credentials=${atlas_api_key_credentials} + ... timeout_seconds=180 + ... include_in_history=false + ... show_in_rwl_cheatsheet=true + ... cmd_override=CLUSTER_FILTER="${CLUSTER_FILTER}" ./gather-atlas-cluster-inventory.sh + + ${issues}= RW.CLI.Run Cli + ... cmd=cat atlas_cluster_inventory_issues.json + ... timeout_seconds=30 + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse JSON for Atlas inventory issues, defaulting to empty list. WARN + ${issue_list}= Create List + END + + ${inv_count}= Get Length ${issue_list} + + IF ${inv_count} > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=Atlas inventory snapshots should expose healthy IDLE clusters aligned with Atlas UI realities. + ... actual=${issue['details']} + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... 
next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report Inventory summary: + RW.Core.Add Pre To Report ${result.stdout} + +Check MongoDB Atlas Cluster State for Project `${ATLAS_PROJECT_ID}` + [Documentation] Evaluates paused clusters, transitional Atlas state enums, MongoDB replica process scopes, and healthStatus markers to pinpoint degradations before SLA breaches. + [Tags] MongoDB Atlas availability replication access:read-only data:config + + ${result}= RW.CLI.Run Bash File + ... bash_file=check-atlas-cluster-state.sh + ... env=${env} + ... secret__atlas_api_key_credentials=${atlas_api_key_credentials} + ... timeout_seconds=240 + ... include_in_history=false + ... cmd_override=CLUSTER_FILTER="${CLUSTER_FILTER}" ./check-atlas-cluster-state.sh + + ${issues}= RW.CLI.Run Cli + ... cmd=cat atlas_cluster_state_issues.json + ... timeout_seconds=30 + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse Atlas cluster-state JSON payload, defaulting to empty issue list. WARN + ${issue_list}= Create List + END + + ${state_count}= Get Length ${issue_list} + + IF ${state_count} > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=Clusters remain IDLE/MONGOS_ONLY with healthy replica topology or explicit Atlas maintenance windows without surprise downtime. + ... actual=${issue['details']} + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report Operational state findings: + RW.Core.Add Pre To Report ${result.stdout} + +Analyze MongoDB Atlas Cluster Metrics for Project `${ATLAS_PROJECT_ID}` + [Documentation] Pulls last~45 minute PT5 granular measurements for CONNECTIVITY_PERCENT, NORMALIZED_SYSTEM_CPU_USER, OPLOG-derived replication lag, and DISK PARTITION usage to compare against thresholds for noisy-neighbor workloads. 
+ [Tags] MongoDB Atlas metrics observability access:read-only data:metrics + + ${result}= RW.CLI.Run Bash File + ... bash_file=analyze-atlas-cluster-metrics.sh + ... env=${env} + ... secret__atlas_api_key_credentials=${atlas_api_key_credentials} + ... timeout_seconds=280 + ... include_in_history=false + ... cmd_override=CLUSTER_FILTER="${CLUSTER_FILTER}" ./analyze-atlas-cluster-metrics.sh + + ${issues}= RW.CLI.Run Cli + ... cmd=cat atlas_cluster_metrics_issues.json + ... timeout_seconds=30 + TRY + ${issue_list}= Evaluate json.loads(r'''${issues.stdout}''') json + EXCEPT + Log Failed to parse Atlas metrics JSON payload. WARN + ${issue_list}= Create List + END + + ${metrics_count}= Get Length ${issue_list} + + IF ${metrics_count} > 0 + FOR ${issue} IN @{issue_list} + RW.Core.Add Issue + ... severity=${issue['severity']} + ... expected=Workload metrics remain under configured envelopes for connections, CPU, disks, and replica lag snapshots. + ... actual=${issue['details']} + ... title=${issue['title']} + ... reproduce_hint=${result.cmd} + ... details=${issue['details']} + ... next_steps=${issue['next_steps']} + END + END + + RW.Core.Add Pre To Report Metrics sweep output: + RW.Core.Add Pre To Report ${result.stdout} + +*** Keywords *** +Suite Initialization + TRY + ${atlas_api_key_credentials}= RW.Core.Import Secret atlas_api_key_credentials + ... type=string + ... description=MongoDB Atlas programmatic API digest key serialized as JSON + ... pattern=\w* + Set Suite Variable ${atlas_api_key_credentials} ${atlas_api_key_credentials} + EXCEPT + Log atlas_api_key_credentials unavailable; Atlas API calls fail fast with structured issues. WARN + Set Suite Variable ${atlas_api_key_credentials} ${EMPTY} + END + + ${ATLAS_PROJECT_ID}= RW.Core.Import User Variable ATLAS_PROJECT_ID + ... type=string + ... description=MongoDB Atlas project / group identifier (hex) + ... pattern=^[a-f0-9]{24}$ + ${ATLAS_ORG_ID}= RW.Core.Import User Variable ATLAS_ORG_ID + ... type=string + ... 
description=Optional Atlas organization identifier for audit annotations + ... pattern=^[a-fA-F0-9]{0,24}$ + ... default= + ${CLUSTER_FILTER}= RW.Core.Import User Variable CLUSTER_FILTER + ... type=string + ... description=Comma-separated Atlas cluster names; blank checks every cluster discovered in-project + ... pattern=^[\w\-, ]*$ + ... default= + ${CONNECTION_THRESHOLD}= RW.Core.Import User Variable CONNECTION_THRESHOLD + ... type=string + ... description=Issues when sampled CONNECTIONS_PERCENT exceeds this utilization percent + ... pattern=^\d+$ + ... default=85 + ${DISK_UTIL_THRESHOLD}= RW.Core.Import User Variable DISK_UTIL_THRESHOLD + ... type=string + ... description=Issues when modeled disk occupancy exceeds this percent versus diskSizeGB + ... pattern=^\d+$ + ... default=85 + ${REPLICATION_LAG_MS_THRESHOLD}= RW.Core.Import User Variable REPLICATION_LAG_MS_THRESHOLD + ... type=string + ... description=Issues when OPLOG_SLAVE_LAG_MASTER_TIME exceeds milliseconds value + ... pattern=^\d+$ + ... default=5000 + ${CPU_UTIL_THRESHOLD}= RW.Core.Import User Variable CPU_UTIL_THRESHOLD + ... type=string + ... description=Issues when NORMALIZED_SYSTEM_CPU_USER maximum samples exceed percentage + ... pattern=^\d{1,3}$ + ... default=92 + + Set Suite Variable ${ATLAS_PROJECT_ID} ${ATLAS_PROJECT_ID} + Set Suite Variable ${ATLAS_ORG_ID} ${ATLAS_ORG_ID} + Set Suite Variable ${CLUSTER_FILTER} ${CLUSTER_FILTER} + Set Suite Variable ${CONNECTION_THRESHOLD} ${CONNECTION_THRESHOLD} + Set Suite Variable ${DISK_UTIL_THRESHOLD} ${DISK_UTIL_THRESHOLD} + Set Suite Variable ${REPLICATION_LAG_MS_THRESHOLD} ${REPLICATION_LAG_MS_THRESHOLD} + Set Suite Variable ${CPU_UTIL_THRESHOLD} ${CPU_UTIL_THRESHOLD} + + ${env}= Create Dictionary + ... ATLAS_PROJECT_ID=${ATLAS_PROJECT_ID} + ... ATLAS_ORG_ID=${ATLAS_ORG_ID} + ... CLUSTER_FILTER=${CLUSTER_FILTER} + ... CONNECTION_THRESHOLD=${CONNECTION_THRESHOLD} + ... DISK_UTIL_THRESHOLD=${DISK_UTIL_THRESHOLD} + ... 
REPLICATION_LAG_MS_THRESHOLD=${REPLICATION_LAG_MS_THRESHOLD} + ... CPU_UTIL_THRESHOLD=${CPU_UTIL_THRESHOLD} + Set Suite Variable ${env} ${env} diff --git a/codebundles/mongodb-atlas-cluster-health/sli-mongodb-atlas-quick-check.sh b/codebundles/mongodb-atlas-cluster-health/sli-mongodb-atlas-quick-check.sh new file mode 100755 index 00000000..44f0cb58 --- /dev/null +++ b/codebundles/mongodb-atlas-cluster-health/sli-mongodb-atlas-quick-check.sh @@ -0,0 +1,132 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=/dev/null +source "${SCRIPT_DIR}/atlas-api-common.inc.sh" + +OUTPUT_FILE="sli_mongodb_atlas_scores.json" + +: "${ATLAS_PROJECT_ID:?Must set ATLAS_PROJECT_ID}" + +CLUSTER_FILTER="${CLUSTER_FILTER:-}" +SLI_MAX_MEASUREMENT_PROCESSES="${SLI_MAX_MEASUREMENT_PROCESSES:-8}" +METRIC_WINDOW='granularity=PT5M&period=PT45M&m=CONNECTIONS_PERCENT&m=NORMALIZED_SYSTEM_CPU_USER' + +api_ok=0 +idle_ok=0 +metrics_ok=0 + +payload_out() { + jq -nc \ + --argjson api "$api_ok" \ + --argjson idle "$idle_ok" \ + --argjson met "$metrics_ok" \ + '{api_ok:$api, clusters_stable:$idle, metrics_snapshot_ok:$met}' +} + +if ! 
atlas_resolve_credentials; then + api_ok=0 + idle_ok=0 + metrics_ok=0 + payload_out | tee "$OUTPUT_FILE" + exit 0 +fi + +atlas_clusters_json "${ATLAS_PROJECT_ID}" +if [[ "${atlas_last_http_status:-}" != "200" ]]; then + api_ok=0 +else + api_ok=1 +fi + +if [[ "${api_ok}" != "1" ]]; then + idle_ok=0 + metrics_ok=0 + payload_out | tee "$OUTPUT_FILE" + exit 0 +fi + +clusters_payload="${atlas_last_http_body}" +filtered="$(filter_clusters_by_name "${clusters_payload}" "${CLUSTER_FILTER}")" +count="$(echo "$filtered" | jq 'length')" +if [[ "$count" == "0" ]]; then + idle_ok=1 + metrics_ok=1 + payload_out | tee "$OUTPUT_FILE" + exit 0 +fi + +bad_idle=0 +while IFS= read -r cjson; do + [[ -z "$cjson" ]] && continue + paused="$(echo "$cjson" | jq -r '.paused // false')" + state="$(echo "$cjson" | jq -r '.stateName // (.state.name // empty)')" + [[ -z "$state" ]] && state="IDLE" + sidle="${state^^}" + if [[ "$paused" == "true" ]] || [[ "$sidle" != "IDLE" && "$sidle" != "MONGOS_ONLY" ]]; then + bad_idle=1 + fi +done < <(echo "$filtered" | jq -c '.[]') + +if [[ "$bad_idle" == "0" ]]; then + idle_ok=1 +fi + +metrics_ok=1 + +atlas_processes_json "${ATLAS_PROJECT_ID}" +if [[ "${atlas_last_http_status:-}" != "200" ]]; then + metrics_ok=1 +else + proc_blob="${atlas_last_http_body}" + measured=0 + while IFS= read -r cjson; do + [[ "${measured}" -ge "${SLI_MAX_MEASUREMENT_PROCESSES}" ]] && break + [[ -z "$cjson" ]] && continue + cname="$(echo "$cjson" | jq -r '.name')" + pid="$(echo "$proc_blob" | jq -r --arg cn "$cname" ' + first( + (.results // [])[] + | select(.typeName == "REPLICA_PRIMARY") + | select(.replicaSetName != null) + | select(.replicaSetName == $cn or (.replicaSetName | startswith($cn + "-"))) + | .id + ) // empty + ')" + [[ -z "$pid" ]] && continue + atlas_measurement_json "${ATLAS_PROJECT_ID}" "${pid}" "?${METRIC_WINDOW}" + measured=$((measured + 1)) + if [[ "${atlas_last_http_status:-}" != "200" ]]; then + continue + fi + mbuf="${atlas_last_http_body}" + conn="$(echo 
"$mbuf" | jq -r ' + [.measurements[]? + | select(.name=="CONNECTIONS_PERCENT") + | .dataPoints[]? + | (.value | select(. != null)) + ] | max // empty + ')" + cpu="$(echo "$mbuf" | jq -r ' + [.measurements[]? + | select(.name=="NORMALIZED_SYSTEM_CPU_USER") + | .dataPoints[]? + | (.value | select(. != null)) + ] | max // empty + ')" + ct="${CONNECTION_THRESHOLD:-85}" + cpu_lim="${CPU_UTIL_THRESHOLD:-92}" + if [[ -n "$conn" ]] && awk -v c="$ct" -v v="$conn" 'BEGIN { exit !( (v+0) > (c+0) ) }'; then + metrics_ok=0 + break + fi + if [[ -n "$cpu" ]] && awk -v c="$cpu_lim" -v v="$cpu" 'BEGIN { exit !( (v+0) > (c+0) ) }'; then + metrics_ok=0 + break + fi + done < <(echo "$filtered" | jq -c '.[]') +fi + +payload_out | tee "$OUTPUT_FILE" diff --git a/codebundles/mongodb-atlas-cluster-health/sli.robot b/codebundles/mongodb-atlas-cluster-health/sli.robot new file mode 100644 index 00000000..c87cda6d --- /dev/null +++ b/codebundles/mongodb-atlas-cluster-health/sli.robot @@ -0,0 +1,103 @@ +*** Settings *** +Documentation Measures MongoDB Atlas project reachability plus cluster IDLE posture and capped PRIMARY replicas for CONNECTIVITY_PERCENT / NORMALIZED CPU windows, collapsing them into one 0-1 mean health score sourced from atlas Admin API digest calls. +Metadata Author rw-codebundle-agent +Metadata Display Name MongoDB Atlas Cluster Health SLI +Metadata Supports MongoDB Atlas cluster replication metrics + +Library BuiltIn +Library RW.Core +Library RW.CLI +Library RW.platform + +Suite Setup Suite Initialization + +*** Keywords *** +Suite Initialization + TRY + ${atlas_api_key_credentials}= RW.Core.Import Secret atlas_api_key_credentials + ... type=string + ... description=MongoDB Atlas API digest credential JSON blob + ... pattern=\w* + Set Suite Variable ${atlas_api_key_credentials} ${atlas_api_key_credentials} + EXCEPT + Log atlas_api_key_credentials unavailable; Atlas SLI scores zero everywhere. 
WARN + Set Suite Variable ${atlas_api_key_credentials} ${EMPTY} + END + + ${ATLAS_PROJECT_ID}= RW.Core.Import User Variable ATLAS_PROJECT_ID + ... type=string + ... description=Atlas group/project hex id + ... pattern=^[a-f0-9]{24}$ + ${CLUSTER_FILTER}= RW.Core.Import User Variable CLUSTER_FILTER + ... type=string + ... description=Comma-separated Atlas cluster filters for SLI scope + ... pattern=^[\w\-, ]*$ + ... default= + ${CONNECTION_THRESHOLD}= RW.Core.Import User Variable CONNECTION_THRESHOLD + ... type=string + ... description=SLI rejects samples above CONNECTIONS_PERCENT + ... pattern=^\d+$ + ... default=85 + ${CPU_UTIL_THRESHOLD}= RW.Core.Import User Variable CPU_UTIL_THRESHOLD + ... type=string + ... description=SLI rejects samples above NORMALIZED_SYSTEM_CPU_USER percent + ... pattern=^\d{1,3}$ + ... default=92 + ${SLI_MAX_MEASUREMENT_PROCESSES}= RW.Core.Import User Variable SLI_MAX_MEASUREMENT_PROCESSES + ... type=string + ... description=Maximum PRIMARY measurements per SLI run to mitigate API throttling + ... pattern=^\d+$ + ... default=8 + + Set Suite Variable ${ATLAS_PROJECT_ID} ${ATLAS_PROJECT_ID} + Set Suite Variable ${CLUSTER_FILTER} ${CLUSTER_FILTER} + Set Suite Variable ${CONNECTION_THRESHOLD} ${CONNECTION_THRESHOLD} + Set Suite Variable ${CPU_UTIL_THRESHOLD} ${CPU_UTIL_THRESHOLD} + Set Suite Variable ${SLI_MAX_MEASUREMENT_PROCESSES} ${SLI_MAX_MEASUREMENT_PROCESSES} + + ${env}= Create Dictionary + ... ATLAS_PROJECT_ID=${ATLAS_PROJECT_ID} + ... CLUSTER_FILTER=${CLUSTER_FILTER} + ... CONNECTION_THRESHOLD=${CONNECTION_THRESHOLD} + ... CPU_UTIL_THRESHOLD=${CPU_UTIL_THRESHOLD} + ... 
SLI_MAX_MEASUREMENT_PROCESSES=${SLI_MAX_MEASUREMENT_PROCESSES} + Set Suite Variable ${env} ${env} + +*** Tasks *** +Gather MongoDB Atlas SLI Signals and Emit Composite Score for Project `${ATLAS_PROJECT_ID}` + [Documentation] Runs capped digest-authenticated curls for cluster inventory and PRIMARY metric samples before averaging binary sub-metrics into the SLI heartbeat published for alerts. + [Tags] MongoDB Atlas sli access:read-only data:metrics + + ${runner}= RW.CLI.Run Bash File + ... bash_file=sli-mongodb-atlas-quick-check.sh + ... env=${env} + ... secret__atlas_api_key_credentials=${atlas_api_key_credentials} + ... timeout_seconds=120 + ... include_in_history=false + ... cmd_override=./sli-mongodb-atlas-quick-check.sh + + TRY + ${scores_raw}= RW.CLI.Run Cli + ... cmd=cat sli_mongodb_atlas_scores.json + ... timeout_seconds=15 + ${scores}= Evaluate json.loads(r'''${scores_raw.stdout}''') json + EXCEPT + Log Failed to decode SLI shim JSON output. WARN + ${scores}= Create Dictionary + ... api_ok=0 + ... clusters_stable=0 + ... metrics_snapshot_ok=0 + END + + ${api_ok}= Evaluate int(scores['api_ok']) scores=${scores} + ${stable_ok}= Evaluate int(scores['clusters_stable']) scores=${scores} + ${metric_ok}= Evaluate int(scores['metrics_snapshot_ok']) scores=${scores} + + RW.Core.Push Metric ${api_ok} sub_name=atlas_api_ok + RW.Core.Push Metric ${stable_ok} sub_name=clusters_idle_ok + RW.Core.Push Metric ${metric_ok} sub_name=metrics_sample_ok + + ${health}= Evaluate (${api_ok}+${stable_ok}+${metric_ok})/3.0 + ${health}= Convert To Number ${health} 2 + RW.Core.Add To Report MongoDB Atlas quick health (${ATLAS_PROJECT_ID}): composite=${health}, api=${api_ok}, idle=${stable_ok}, metrics_sample=${metric_ok}. + RW.Core.Push Metric ${health}