diff --git a/codebundles/mailgun-platform-status-health/.gitignore b/codebundles/mailgun-platform-status-health/.gitignore new file mode 100644 index 00000000..250700a1 --- /dev/null +++ b/codebundles/mailgun-platform-status-health/.gitignore @@ -0,0 +1,5 @@ +# Generated by task scripts at runtime (JSON issue payloads for the runbook) +status_incidents_output.json +incident_feed_output.json +api_us_reachability_output.json +api_eu_reachability_output.json diff --git a/codebundles/mailgun-platform-status-health/.runwhen/generation-rules/mailgun-platform-status-health.yaml b/codebundles/mailgun-platform-status-health/.runwhen/generation-rules/mailgun-platform-status-health.yaml new file mode 100644 index 00000000..6eaecd1b --- /dev/null +++ b/codebundles/mailgun-platform-status-health/.runwhen/generation-rules/mailgun-platform-status-health.yaml @@ -0,0 +1,22 @@ +apiVersion: runwhen.com/v1 +kind: GenerationRules +spec: + platform: mailgun + generationRules: + - resourceTypes: + - network_service + matchRules: + - type: pattern + pattern: ".+" + properties: ["name"] + mode: substring + slxs: + - baseName: mailgun-platform-status + qualifiers: ["workspace"] + baseTemplateName: mailgun-platform-status-health + levelOfDetail: basic + outputItems: + - type: slx + - type: sli + - type: runbook + templateName: mailgun-platform-status-health-taskset.yaml diff --git a/codebundles/mailgun-platform-status-health/.runwhen/templates/mailgun-platform-status-health-sli.yaml b/codebundles/mailgun-platform-status-health/.runwhen/templates/mailgun-platform-status-health-sli.yaml new file mode 100644 index 00000000..f7a0270c --- /dev/null +++ b/codebundles/mailgun-platform-status-health/.runwhen/templates/mailgun-platform-status-health-sli.yaml @@ -0,0 +1,39 @@ +apiVersion: runwhen.com/v1 +kind: ServiceLevelIndicator +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} + runwhen.com/sli: "true" +spec: + 
displayUnitsLong: Health Score + displayUnitsShort: score + locations: + - {{default_location}} + description: Composite 0–1 score from Mailgun Statuspage health and unauthenticated US/EU API probes. + codeBundle: + {% if repo_url %} + repoUrl: {{repo_url}} + {% else %} + repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git + {% endif %} + {% if ref %} + ref: {{ref}} + {% else %} + ref: main + {% endif %} + pathToRobot: codebundles/mailgun-platform-status-health/sli.robot + intervalStrategy: intermezzo + intervalSeconds: 180 + configProvided: + - name: MAILGUN_STATUS_REGION_FOCUS + value: "{{ custom.mailgun_status_region_focus | default('both') }}" + - name: MAILGUN_STATUS_LOOKBACK_HOURS + value: "{{ custom.mailgun_status_lookback_hours | default('24') }}" + secretsProvided: [] + alertConfig: + tasks: + persona: eager-edgar + sessionTTL: 10m diff --git a/codebundles/mailgun-platform-status-health/.runwhen/templates/mailgun-platform-status-health-slx.yaml b/codebundles/mailgun-platform-status-health/.runwhen/templates/mailgun-platform-status-health-slx.yaml new file mode 100644 index 00000000..c1e89f9b --- /dev/null +++ b/codebundles/mailgun-platform-status-health/.runwhen/templates/mailgun-platform-status-health-slx.yaml @@ -0,0 +1,29 @@ +apiVersion: runwhen.com/v1 +kind: ServiceLevelX +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + icon: Email + alias: Mailgun platform status {{match_resource.resource.name}} + asMeasuredBy: Public Statuspage data and unauthenticated regional API probes (401 + JSON). + configProvided: + - name: SLX_PLACEHOLDER + value: SLX_PLACEHOLDER + owners: + - {{workspace.owner_email}} + statement: Mailgun’s public status and regional API bases should be healthy before domain-level checks. 
+ additionalContext: + qualified_name: "{{ match_resource.qualified_name }}" + tags: + - name: platform + value: mailgun + - name: service + value: mailgun + - name: scope + value: account + - name: access + value: read-only diff --git a/codebundles/mailgun-platform-status-health/.runwhen/templates/mailgun-platform-status-health-taskset.yaml b/codebundles/mailgun-platform-status-health/.runwhen/templates/mailgun-platform-status-health-taskset.yaml new file mode 100644 index 00000000..e137beb5 --- /dev/null +++ b/codebundles/mailgun-platform-status-health/.runwhen/templates/mailgun-platform-status-health-taskset.yaml @@ -0,0 +1,29 @@ +apiVersion: runwhen.com/v1 +kind: Runbook +metadata: + name: {{slx_name}} + labels: + {% include "common-labels.yaml" %} + annotations: + {% include "common-annotations.yaml" %} +spec: + location: {{default_location}} + description: Monitors Mailgun platform status and regional API reachability using public endpoints only. + codeBundle: + {% if repo_url %} + repoUrl: {{repo_url}} + {% else %} + repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git + {% endif %} + {% if ref %} + ref: {{ref}} + {% else %} + ref: main + {% endif %} + pathToRobot: codebundles/mailgun-platform-status-health/runbook.robot + configProvided: + - name: MAILGUN_STATUS_REGION_FOCUS + value: "{{ custom.mailgun_status_region_focus | default('both') }}" + - name: MAILGUN_STATUS_LOOKBACK_HOURS + value: "{{ custom.mailgun_status_lookback_hours | default('24') }}" + secretsProvided: [] diff --git a/codebundles/mailgun-platform-status-health/.test/Taskfile.yaml b/codebundles/mailgun-platform-status-health/.test/Taskfile.yaml new file mode 100644 index 00000000..75d98a7c --- /dev/null +++ b/codebundles/mailgun-platform-status-health/.test/Taskfile.yaml @@ -0,0 +1,47 @@ +version: "3" + +tasks: + default: + desc: "Validate bundle structure (public endpoints; no test cloud resources)" + cmds: + - ./validate-bundle-structure.sh + + clean: + desc: "Cleanup 
local test artifacts"
+    cmds:
+      - rm -f workspaceInfo.yaml
+      - rm -rf output
+
+  build-infra:
+    desc: "No provisioned infrastructure — bundle probes public Mailgun URLs only"
+    cmds:
+      - ./validate-bundle-structure.sh
+
+  check-unpushed-commits:
+    desc: Check for uncommitted changes outside .test before integration testing
+    vars:
+      BASE_DIR: "../"
+    cmds:
+      - |
+        # git prints repo-root-relative names, so scope with pathspecs (relative to the working directory) instead of grepping for BASE_DIR.
+        UNCOMMITTED=$(git diff --name-only HEAD -- "{{.BASE_DIR}}" ":(exclude){{.BASE_DIR}}.test" 2>/dev/null || true)
+        if [ -n "$UNCOMMITTED" ]; then
+          printf '%s\n' "Uncommitted changes in bundle (excluding .test):" "$UNCOMMITTED"
+          exit 1
+        fi
+    silent: true
+
+  generate-rwl-config:
+    desc: "Placeholder — workspace wiring is environment-specific for non-Kubernetes bundles"
+    cmds:
+      - echo "See RunWhen workspace docs to attach this CodeBundle."
+
+  run-rwl-discovery:
+    desc: "Placeholder — discovery is optional for static network_service bundles"
+    cmds:
+      - ./validate-bundle-structure.sh
+
+  clean-rwl-discovery:
+    desc: "Remove discovery output"
+    cmds:
+      - rm -rf output
diff --git a/codebundles/mailgun-platform-status-health/.test/validate-bundle-structure.sh b/codebundles/mailgun-platform-status-health/.test/validate-bundle-structure.sh
new file mode 100755
index 00000000..ec0754ab
--- /dev/null
+++ b/codebundles/mailgun-platform-status-health/.test/validate-bundle-structure.sh
@@ -0,0 +1,11 @@
+#!/usr/bin/env bash
+# Sanity check for Mailgun platform bundle layout (no cloud resources required).
+set -euo pipefail
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.."
&& pwd)" +test -f "$ROOT/runbook.robot" +test -f "$ROOT/sli.robot" +test -f "$ROOT/.runwhen/generation-rules/mailgun-platform-status-health.yaml" +test -f "$ROOT/.runwhen/templates/mailgun-platform-status-health-slx.yaml" +test -f "$ROOT/.runwhen/templates/mailgun-platform-status-health-taskset.yaml" +test -f "$ROOT/.runwhen/templates/mailgun-platform-status-health-sli.yaml" +echo "mailgun-platform-status-health bundle structure OK" diff --git a/codebundles/mailgun-platform-status-health/README.md b/codebundles/mailgun-platform-status-health/README.md new file mode 100644 index 00000000..f0ffda81 --- /dev/null +++ b/codebundles/mailgun-platform-status-health/README.md @@ -0,0 +1,48 @@ +# Mailgun Platform Status & Reachability + +This CodeBundle detects Mailgun-wide service disruptions and loss of regional API reachability before authenticated domain checks run. It uses only Mailgun’s public Statuspage JSON endpoints and unauthenticated HTTPS probes (no API keys), so it works in environments where secrets are restricted. + +## Overview + +- **Live status page**: Reads `status.json`, `summary.json`, unresolved incidents, and active scheduled maintenance from `https://status.mailgun.com` to flag non-green indicators, degraded components, active incidents, and maintenance windows. +- **Incident history**: Scans the public incidents feed for major or critical incidents that **resolved** within the configured lookback window (context after an outage even when the banner is green again). +- **US API reachability**: GET `https://api.mailgun.net/v3/domains` without credentials; expects HTTP **401** and JSON (confirms TLS, routing, and API edge behavior). +- **EU API reachability**: Same check against `https://api.eu.mailgun.net/v3/domains` when EU routing is in scope. +- **SLI**: Produces a 0–1 health score from page status, unresolved incidents, and regional probes (see `sli.robot`). + +## Configuration + +### Required Variables + +None. 
The bundle uses only public endpoints and optional configuration below. + +### Optional Variables + +- `MAILGUN_STATUS_REGION_FOCUS`: Which regional API reachability checks to run: `us`, `eu`, or `both` (default: `both`). When set to `us` only, the EU probe is skipped (and vice versa). +- `MAILGUN_STATUS_LOOKBACK_HOURS`: Hours of history to consider “recent” for the incident-feed task (default: `24`). Major/critical incidents resolved within this window are surfaced as lower-severity informational issues. + +### Secrets + +None. + +## Tasks Overview + +### Check Mailgun Status Page for Published Incidents + +Uses Statuspage APIs for overall indicator, per-component status from `summary.json`, any unresolved incidents, and active scheduled maintenance. Raises issues when the page is not all-green, components are degraded, incidents are open, or maintenance is active. + +### Check Mailgun Public Incident Feed for Recent Critical Events + +Uses the incidents JSON feed to list major or critical incidents that reached **resolved** status inside the lookback window (recent blast-radius context). + +### Verify Mailgun US API Endpoint Reachability + +Probes the US API base; expects HTTP 401 with JSON without an API key. Skipped when `MAILGUN_STATUS_REGION_FOCUS` is `eu` only. + +### Verify Mailgun EU API Endpoint Reachability + +Same probe for the EU regional base. Skipped when `MAILGUN_STATUS_REGION_FOCUS` is `us` only. + +### SLI (sli.robot) + +Aggregates binary checks into a single health score for alerting; see `.runwhen/templates/mailgun-platform-status-health-sli.yaml`. 
diff --git a/codebundles/mailgun-platform-status-health/check-mailgun-api-eu-reachability.sh b/codebundles/mailgun-platform-status-health/check-mailgun-api-eu-reachability.sh new file mode 100755 index 00000000..5a489d85 --- /dev/null +++ b/codebundles/mailgun-platform-status-health/check-mailgun-api-eu-reachability.sh @@ -0,0 +1,87 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# ----------------------------------------------------------------------------- +# Unauthenticated probe of the EU Mailgun API base (api.eu.mailgun.net). +# Expects HTTP 401 with JSON when no API key is supplied. +# Env: MAILGUN_STATUS_REGION_FOCUS — skip when set to "us" only. +# ----------------------------------------------------------------------------- + +OUTPUT_FILE="${OUTPUT_FILE:-api_eu_reachability_output.json}" +MAILGUN_STATUS_REGION_FOCUS="${MAILGUN_STATUS_REGION_FOCUS:-both}" +API_URL="https://api.eu.mailgun.net/v3/domains" + +issues_json='[]' + +append_issue() { + local title="$1" + local expected="$2" + local actual="$3" + local details="$4" + local severity="$5" + local next_steps="$6" + issues_json=$(echo "$issues_json" | jq \ + --arg title "$title" \ + --arg expected "$expected" \ + --arg actual "$actual" \ + --arg details "$details" \ + --argjson severity "$severity" \ + --arg next_steps "$next_steps" \ + '. += [{title: $title, expected: $expected, actual: $actual, details: $details, severity: $severity, next_steps: $next_steps}]') +} + +if [[ "$MAILGUN_STATUS_REGION_FOCUS" == "us" ]]; then + echo '[]' | jq '.' >"$OUTPUT_FILE" + echo "Skipped EU API check (MAILGUN_STATUS_REGION_FOCUS=${MAILGUN_STATUS_REGION_FOCUS})" + exit 0 +fi + +tmp_body=$(mktemp) +http_code="000" +set +e +http_code=$(curl -sS -o "$tmp_body" -w '%{http_code}' --connect-timeout 10 --max-time 60 \ + -H 'Accept: application/json' "$API_URL") +curl_rc=$? 
+set -e
+
+body_head=$(head -c 400 "$tmp_body" | tr -d '\r' || true)  # display-only prefix used in issue text
+trap 'rm -f "$tmp_body"' EXIT  # keep the full body for JSON validation below; remove it on every exit path
+
+if [[ "$curl_rc" -ne 0 ]]; then
+  append_issue \
+    "Mailgun EU API base unreachable (TLS or network failure)" \
+    "curl completes with exit 0 and an HTTP status from Mailgun" \
+    "curl exit ${curl_rc} for ${API_URL}" \
+    "curl to ${API_URL} failed before a reliable HTTP status was recorded." \
+    4 \
+    "Check egress to the EU region, DNS for api.eu.mailgun.net, and Mailgun status before debugging domain configuration."
+elif [[ "$http_code" == "000" ]]; then
+  append_issue \
+    "Mailgun EU API base returned no HTTP status" \
+    "curl returns a non-zero HTTP status code" \
+    "Recorded HTTP code ${http_code}" \
+    "curl completed without a usable HTTP code for ${API_URL}." \
+    4 \
+    "Investigate network path; compare with US reachability and the public status page."
+elif [[ "$http_code" != "401" ]]; then
+  append_issue \
+    "Mailgun EU API base returned unexpected HTTP ${http_code}" \
+    "Unauthenticated GET returns HTTP 401 with JSON from Mailgun" \
+    "HTTP ${http_code}; body prefix: ${body_head}" \
+    "Expected HTTP 401 for unauthenticated GET ${API_URL}." \
+    4 \
+    "Compare with documented API behavior; verify you require EU routing and that proxies are not altering responses."
+else
+  if ! jq -e 'type' "$tmp_body" >/dev/null 2>&1; then  # parse the whole body: a 400-byte prefix of longer valid JSON would not parse
+    append_issue \
+      "Mailgun EU API base response was not JSON" \
+      "HTTP 401 body parses as JSON" \
+      "Non-JSON body prefix: ${body_head}" \
+      "HTTP ${http_code} received but body did not parse as JSON." \
+      3 \
+      "Validate that traffic reaches Mailgun EU and is not rewritten by a proxy or HTML error page."
+  fi
+fi
+
+echo "$issues_json" | jq '.'
>"$OUTPUT_FILE" +echo "Wrote ${OUTPUT_FILE} ($(echo "$issues_json" | jq 'length') issue(s))" diff --git a/codebundles/mailgun-platform-status-health/check-mailgun-api-us-reachability.sh b/codebundles/mailgun-platform-status-health/check-mailgun-api-us-reachability.sh new file mode 100755 index 00000000..7745e793 --- /dev/null +++ b/codebundles/mailgun-platform-status-health/check-mailgun-api-us-reachability.sh @@ -0,0 +1,87 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# ----------------------------------------------------------------------------- +# Unauthenticated probe of the US Mailgun API base. Expects HTTP 401 with a +# JSON body (no API key) — confirms TLS, routing, and that the edge responds. +# Env: MAILGUN_STATUS_REGION_FOCUS — skip when set to "eu" only. +# ----------------------------------------------------------------------------- + +OUTPUT_FILE="${OUTPUT_FILE:-api_us_reachability_output.json}" +MAILGUN_STATUS_REGION_FOCUS="${MAILGUN_STATUS_REGION_FOCUS:-both}" +API_URL="https://api.mailgun.net/v3/domains" + +issues_json='[]' + +append_issue() { + local title="$1" + local expected="$2" + local actual="$3" + local details="$4" + local severity="$5" + local next_steps="$6" + issues_json=$(echo "$issues_json" | jq \ + --arg title "$title" \ + --arg expected "$expected" \ + --arg actual "$actual" \ + --arg details "$details" \ + --argjson severity "$severity" \ + --arg next_steps "$next_steps" \ + '. += [{title: $title, expected: $expected, actual: $actual, details: $details, severity: $severity, next_steps: $next_steps}]') +} + +if [[ "$MAILGUN_STATUS_REGION_FOCUS" == "eu" ]]; then + echo '[]' | jq '.' >"$OUTPUT_FILE" + echo "Skipped US API check (MAILGUN_STATUS_REGION_FOCUS=${MAILGUN_STATUS_REGION_FOCUS})" + exit 0 +fi + +tmp_body=$(mktemp) +http_code="000" +set +e +http_code=$(curl -sS -o "$tmp_body" -w '%{http_code}' --connect-timeout 10 --max-time 60 \ + -H 'Accept: application/json' "$API_URL") +curl_rc=$? 
+set -e
+
+body_head=$(head -c 400 "$tmp_body" | tr -d '\r' || true)  # display-only prefix used in issue text
+trap 'rm -f "$tmp_body"' EXIT  # keep the full body for JSON validation below; remove it on every exit path
+
+if [[ "$curl_rc" -ne 0 ]]; then
+  append_issue \
+    "Mailgun US API base unreachable (TLS or network failure)" \
+    "curl completes with exit 0 and an HTTP status from Mailgun" \
+    "curl exit ${curl_rc} for ${API_URL}" \
+    "curl to ${API_URL} failed before a reliable HTTP status was recorded." \
+    4 \
+    "Check corporate egress, DNS for api.mailgun.net, TLS interception, and Mailgun status before debugging domain configuration."
+elif [[ "$http_code" == "000" ]]; then
+  append_issue \
+    "Mailgun US API base returned no HTTP status" \
+    "curl returns a non-zero HTTP status code" \
+    "Recorded HTTP code ${http_code}" \
+    "curl completed without a usable HTTP code for ${API_URL}." \
+    4 \
+    "Investigate network path and proxies; compare with EU reachability task and the public status page."
+elif [[ "$http_code" != "401" ]]; then
+  append_issue \
+    "Mailgun US API base returned unexpected HTTP ${http_code}" \
+    "Unauthenticated GET returns HTTP 401 with JSON from Mailgun" \
+    "HTTP ${http_code}; body prefix: ${body_head}" \
+    "Expected HTTP 401 for unauthenticated GET ${API_URL}." \
+    4 \
+    "Compare with documented API behavior; check for intercepting proxies, regional routing changes, or ongoing incidents."
+else
+  if ! jq -e 'type' "$tmp_body" >/dev/null 2>&1; then  # parse the whole body: a 400-byte prefix of longer valid JSON would not parse
+    append_issue \
+      "Mailgun US API base response was not JSON" \
+      "HTTP 401 body parses as JSON" \
+      "Non-JSON body prefix: ${body_head}" \
+      "HTTP ${http_code} received but body did not parse as JSON." \
+      3 \
+      "Validate that traffic reaches Mailgun and is not rewritten by a proxy or HTML error page."
+  fi
+fi
+
+echo "$issues_json" | jq '.'
>"$OUTPUT_FILE" +echo "Wrote ${OUTPUT_FILE} ($(echo "$issues_json" | jq 'length') issue(s))" diff --git a/codebundles/mailgun-platform-status-health/check-mailgun-incident-feed.sh b/codebundles/mailgun-platform-status-health/check-mailgun-incident-feed.sh new file mode 100755 index 00000000..657e59a6 --- /dev/null +++ b/codebundles/mailgun-platform-status-health/check-mailgun-incident-feed.sh @@ -0,0 +1,89 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# ----------------------------------------------------------------------------- +# Uses Mailgun Statuspage incidents JSON (public) to highlight major/critical +# incidents that resolved within the lookback window — useful context even when +# the page is green again. Skips active incidents handled by the unresolved API. +# Env: MAILGUN_STATUS_LOOKBACK_HOURS (default 24) +# ----------------------------------------------------------------------------- + +OUTPUT_FILE="${OUTPUT_FILE:-incident_feed_output.json}" +MAILGUN_STATUS_LOOKBACK_HOURS="${MAILGUN_STATUS_LOOKBACK_HOURS:-24}" +INCIDENTS_URL="https://status.mailgun.com/api/v2/incidents.json" + +issues_json='[]' + +append_issue() { + local title="$1" + local expected="$2" + local actual="$3" + local details="$4" + local severity="$5" + local next_steps="$6" + issues_json=$(echo "$issues_json" | jq \ + --arg title "$title" \ + --arg expected "$expected" \ + --arg actual "$actual" \ + --arg details "$details" \ + --argjson severity "$severity" \ + --arg next_steps "$next_steps" \ + '. += [{title: $title, expected: $expected, actual: $actual, details: $details, severity: $severity, next_steps: $next_steps}]') +} + +now_epoch=$(date +%s) +cutoff_epoch=$((now_epoch - MAILGUN_STATUS_LOOKBACK_HOURS * 3600)) + +if ! 
inc_raw=$(curl -fsS --connect-timeout 10 --max-time 60 "$INCIDENTS_URL" 2>/dev/null); then + append_issue \ + "Cannot fetch Mailgun incidents feed JSON" \ + "HTTPS GET to incidents.json succeeds" \ + "GET ${INCIDENTS_URL} failed" \ + "Unable to load the incident history feed from Statuspage." \ + 3 \ + "Verify outbound HTTPS; use https://status.mailgun.com as a fallback." +else + count=$(echo "$inc_raw" | jq '.incidents | length') + idx=0 + while [[ "$idx" -lt "${count:-0}" ]]; do + inc=$(echo "$inc_raw" | jq -c ".incidents[$idx]") + impact=$(echo "$inc" | jq -r '.impact // "none"') + istatus=$(echo "$inc" | jq -r '.status // "unknown"') + name=$(echo "$inc" | jq -r '.name // "Incident"') + resolved_at=$(echo "$inc" | jq -r '.resolved_at // empty') + shortlink=$(echo "$inc" | jq -r '.shortlink // empty') + + if [[ "$impact" != "major" && "$impact" != "critical" ]]; then + idx=$((idx + 1)) + continue + fi + + if [[ "$istatus" != "resolved" ]]; then + idx=$((idx + 1)) + continue + fi + + if [[ -z "$resolved_at" ]]; then + idx=$((idx + 1)) + continue + fi + + res_epoch=$(date -d "$resolved_at" +%s 2>/dev/null || echo 0) + if [[ "$res_epoch" -lt "$cutoff_epoch" ]]; then + idx=$((idx + 1)) + continue + fi + + append_issue \ + "Recent resolved Mailgun ${impact} incident: ${name}" \ + "No major or critical incidents resolved within the configured lookback window" \ + "impact=${impact}; status=${istatus}; resolved_at=${resolved_at}" \ + "Within the last ${MAILGUN_STATUS_LOOKBACK_HOURS}h a ${impact} incident reached resolved state. Link: ${shortlink}" \ + 2 \ + "Review post-incident behavior for your integration (retries, queues); confirm metrics and logs look healthy after the window." + idx=$((idx + 1)) + done +fi + +echo "$issues_json" | jq '.' 
>"$OUTPUT_FILE" +echo "Wrote ${OUTPUT_FILE} ($(echo "$issues_json" | jq 'length') issue(s))" diff --git a/codebundles/mailgun-platform-status-health/check-mailgun-status-incidents.sh b/codebundles/mailgun-platform-status-health/check-mailgun-status-incidents.sh new file mode 100755 index 00000000..10256370 --- /dev/null +++ b/codebundles/mailgun-platform-status-health/check-mailgun-status-incidents.sh @@ -0,0 +1,147 @@ +#!/usr/bin/env bash +set -euo pipefail +set -x +# ----------------------------------------------------------------------------- +# Reads Mailgun Statuspage JSON (public) for overall status, degraded components, +# unresolved incidents, and active scheduled maintenance. Writes a JSON array of +# issues for the runbook to surface. +# Env: MAILGUN_STATUS_LOOKBACK_HOURS (optional, used for report context only) +# ----------------------------------------------------------------------------- + +OUTPUT_FILE="${OUTPUT_FILE:-status_incidents_output.json}" +MAILGUN_STATUS_LOOKBACK_HOURS="${MAILGUN_STATUS_LOOKBACK_HOURS:-24}" + +STATUS_URL="https://status.mailgun.com/api/v2/status.json" +SUMMARY_URL="https://status.mailgun.com/api/v2/summary.json" +UNRESOLVED_URL="https://status.mailgun.com/api/v2/incidents/unresolved.json" +MAINT_URL="https://status.mailgun.com/api/v2/scheduled-maintenances/active.json" + +issues_json='[]' + +append_issue() { + local title="$1" + local expected="$2" + local actual="$3" + local details="$4" + local severity="$5" + local next_steps="$6" + issues_json=$(echo "$issues_json" | jq \ + --arg title "$title" \ + --arg expected "$expected" \ + --arg actual "$actual" \ + --arg details "$details" \ + --argjson severity "$severity" \ + --arg next_steps "$next_steps" \ + '. += [{title: $title, expected: $expected, actual: $actual, details: $details, severity: $severity, next_steps: $next_steps}]') +} + +if ! 
status_raw=$(curl -fsS --connect-timeout 10 --max-time 60 "$STATUS_URL" 2>/dev/null); then + append_issue \ + "Cannot fetch Mailgun status page JSON" \ + "HTTPS GET to the public Statuspage summary succeeds" \ + "curl to ${STATUS_URL} failed" \ + "GET ${STATUS_URL} failed (network, DNS, or HTTP error). Confirm outbound HTTPS access to status.mailgun.com." \ + 4 \ + "Retry from a host with internet access; verify firewall and proxy rules for status.mailgun.com." +else + indicator=$(echo "$status_raw" | jq -r '.status.indicator // "unknown"') + description=$(echo "$status_raw" | jq -r '.status.description // ""') + if [[ "$indicator" != "none" ]]; then + sev=3 + if [[ "$indicator" == "minor" ]]; then sev=2; fi + if [[ "$indicator" == "critical" ]]; then sev=4; fi + append_issue \ + "Mailgun status indicator is not green (${indicator})" \ + "Statuspage indicator is none (all systems operational)" \ + "indicator=${indicator}; description=${description}" \ + "Statuspage indicator: ${indicator}. Summary: ${description}. Lookback context: ${MAILGUN_STATUS_LOOKBACK_HOURS}h." \ + "$sev" \ + "Review https://status.mailgun.com for live updates, subscribe to notifications, and pause risky mail changes until green." + fi +fi + +if ! summary_raw=$(curl -fsS --connect-timeout 10 --max-time 60 "$SUMMARY_URL" 2>/dev/null); then + append_issue \ + "Cannot fetch Mailgun status summary JSON" \ + "HTTPS GET to the Statuspage summary.json succeeds" \ + "GET ${SUMMARY_URL} failed" \ + "Unable to evaluate per-component health from Statuspage summary." \ + 4 \ + "Check connectivity to status.mailgun.com and retry; confirm corporate proxies allow Statuspage APIs." 
+else
+  while IFS= read -r line; do
+    [[ -z "$line" ]] && continue
+    name=$(echo "$line" | cut -d$'\t' -f1)
+    cstat=$(echo "$line" | cut -d$'\t' -f2)
+    if [[ "$cstat" != "operational" ]]; then
+      append_issue \
+        "Mailgun component not operational: ${name}" \
+        "All Statuspage components report operational status" \
+        "component=${name} status=${cstat}" \
+        "Component '${name}' reports status '${cstat}' in the Statuspage summary API." \
+        3 \
+        "Monitor https://status.mailgun.com, defer non-urgent sends if the component maps to your integration, and retest reachability tasks after recovery."
+    fi
+  done < <(echo "$summary_raw" | jq -r '.components[]? | select(.name|type=="string") | [.name,.status] | @tsv' 2>/dev/null || true)  # only .components entries; a recursive name+status match would also pick up incidents and maintenances
+fi
+
+if ! unres_raw=$(curl -fsS --connect-timeout 10 --max-time 60 "$UNRESOLVED_URL" 2>/dev/null); then
+  append_issue \
+    "Cannot fetch Mailgun unresolved incidents" \
+    "HTTPS GET to incidents/unresolved.json succeeds" \
+    "GET ${UNRESOLVED_URL} failed" \
+    "Unable to list unresolved incidents from the Statuspage API." \
+    3 \
+    "Verify access to the Mailgun status API; use the web status page as a fallback."
+else
+  count=$(echo "$unres_raw" | jq '.incidents | length')
+  if [[ "${count:-0}" -gt 0 ]]; then
+    idx=0
+    while [[ "$idx" -lt "$count" ]]; do
+      inc=$(echo "$unres_raw" | jq -c ".incidents[$idx]")
+      name=$(echo "$inc" | jq -r '.name // "Incident"')
+      impact=$(echo "$inc" | jq -r '.impact // "unknown"')
+      istatus=$(echo "$inc" | jq -r '.status // "unknown"')
+      shortlink=$(echo "$inc" | jq -r '.shortlink // empty')
+      append_issue \
+        "Active Mailgun incident: ${name}" \
+        "No unresolved incidents on the public status page" \
+        "impact=${impact}; status=${istatus}; link=${shortlink}" \
+        "Unresolved incident from Statuspage: impact=${impact}, status=${istatus}.
Link: ${shortlink}" \ + 3 \ + "Follow the incident timeline on the status page, adjust traffic or retries as advised, and confirm regional API tasks once resolved." + idx=$((idx + 1)) + done + fi +fi + +if ! maint_raw=$(curl -fsS --connect-timeout 10 --max-time 60 "$MAINT_URL" 2>/dev/null); then + append_issue \ + "Cannot fetch Mailgun active scheduled maintenances" \ + "HTTPS GET to scheduled-maintenances/active.json succeeds" \ + "GET ${MAINT_URL} failed" \ + "Unable to list active maintenance windows." \ + 2 \ + "Confirm HTTPS access; check the web status page for maintenance banners." +else + mcount=$(echo "$maint_raw" | jq '.scheduled_maintenances | length') + if [[ "${mcount:-0}" -gt 0 ]]; then + midx=0 + while [[ "$midx" -lt "$mcount" ]]; do + m=$(echo "$maint_raw" | jq -c ".scheduled_maintenances[$midx]") + mname=$(echo "$m" | jq -r '.name // "Maintenance"') + mstat=$(echo "$m" | jq -r '.status // "unknown"') + append_issue \ + "Mailgun scheduled maintenance in progress: ${mname}" \ + "No active maintenance window affecting the platform" \ + "maintenance=${mname} status=${mstat}" \ + "Active maintenance window reported by Statuspage (status=${mstat})." \ + 2 \ + "Plan around the window, expect possible API or control-plane noise, and re-run checks after maintenance completes." + midx=$((midx + 1)) + done + fi +fi + +echo "$issues_json" | jq '.' >"$OUTPUT_FILE" +echo "Wrote ${OUTPUT_FILE} ($(echo "$issues_json" | jq 'length') issue(s))" diff --git a/codebundles/mailgun-platform-status-health/runbook.robot b/codebundles/mailgun-platform-status-health/runbook.robot new file mode 100644 index 00000000..03f95ecf --- /dev/null +++ b/codebundles/mailgun-platform-status-health/runbook.robot @@ -0,0 +1,195 @@ +*** Settings *** +Documentation Detects Mailgun-wide service disruptions and loss of regional API reachability using public status sources and unauthenticated HTTP probes before authenticated domain checks. 
+Metadata    Author    rw-codebundle-agent
+Metadata    Display Name    Mailgun Platform Status & Reachability
+Metadata    Supports    Mailgun network_service platform-status
+Force Tags    Mailgun    network_service    platform-status
+
+Library    String
+Library    BuiltIn
+Library    RW.Core
+Library    RW.CLI
+Library    RW.platform
+
+Suite Setup    Suite Initialization
+
+
+*** Tasks ***
+Check Mailgun Status Page for Published Incidents (region focus `${MAILGUN_STATUS_REGION_FOCUS}`)
+    [Documentation]    Fetches Mailgun Statuspage JSON for overall health, degraded components, unresolved incidents, and active maintenance windows that can explain delivery or API issues without using API keys.
+    [Tags]    mailgun    status    platform    access:read-only    data:logs-config
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=check-mailgun-status-incidents.sh
+    ...    env=${env}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    show_in_rwl_cheatsheet=true
+    ...    cmd_override=MAILGUN_STATUS_LOOKBACK_HOURS="${MAILGUN_STATUS_LOOKBACK_HOURS}" ./check-mailgun-status-incidents.sh
+
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat status_incidents_output.json
+    ...    env=${env}
+
+    TRY
+        ${issue_list}=    Evaluate    json.loads($issues.stdout)    json
+    EXCEPT
+        Log    Failed to parse JSON for task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len($issue_list) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=${issue['expected']}
+            ...    actual=${issue['actual']}
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    Mailgun status snapshot (stdout):
+    RW.Core.Add Pre To Report    ${result.stdout}
+
+Check Mailgun Public Incident Feed for Recent Critical Events (lookback `${MAILGUN_STATUS_LOOKBACK_HOURS}`h)
+    [Documentation]    Reads the Statuspage incidents JSON for major or critical incidents resolved inside the lookback window to surface recent platform risk even when the banner is green.
+    [Tags]    mailgun    incidents    feed    access:read-only    data:logs-config
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=check-mailgun-incident-feed.sh
+    ...    env=${env}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    show_in_rwl_cheatsheet=true
+    ...    cmd_override=MAILGUN_STATUS_LOOKBACK_HOURS="${MAILGUN_STATUS_LOOKBACK_HOURS}" ./check-mailgun-incident-feed.sh
+
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat incident_feed_output.json
+    ...    env=${env}
+
+    TRY
+        ${issue_list}=    Evaluate    json.loads($issues.stdout)    json
+    EXCEPT
+        Log    Failed to parse JSON for task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len($issue_list) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=${issue['expected']}
+            ...    actual=${issue['actual']}
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    Mailgun incident feed results:
+    RW.Core.Add Pre To Report    ${result.stdout}
+
+Verify Mailgun US API Endpoint Reachability (focus `${MAILGUN_STATUS_REGION_FOCUS}`)
+    [Documentation]    Performs a TLS and HTTP probe to api.mailgun.net expecting HTTP 401 without credentials, confirming US regional routing and availability.
+    [Tags]    mailgun    api    us    reachability    access:read-only    data:metrics
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=check-mailgun-api-us-reachability.sh
+    ...    env=${env}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    show_in_rwl_cheatsheet=true
+    ...    cmd_override=MAILGUN_STATUS_REGION_FOCUS="${MAILGUN_STATUS_REGION_FOCUS}" ./check-mailgun-api-us-reachability.sh
+
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat api_us_reachability_output.json
+    ...    env=${env}
+
+    TRY
+        ${issue_list}=    Evaluate    json.loads($issues.stdout)    json
+    EXCEPT
+        Log    Failed to parse JSON for task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len($issue_list) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=${issue['expected']}
+            ...    actual=${issue['actual']}
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    Mailgun US API reachability:
+    RW.Core.Add Pre To Report    ${result.stdout}
+
+Verify Mailgun EU API Endpoint Reachability (focus `${MAILGUN_STATUS_REGION_FOCUS}`)
+    [Documentation]    Same unauthenticated probe for api.eu.mailgun.net when EU routing matters to the workspace.
+    [Tags]    mailgun    api    eu    reachability    access:read-only    data:metrics
+
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=check-mailgun-api-eu-reachability.sh
+    ...    env=${env}
+    ...    timeout_seconds=180
+    ...    include_in_history=false
+    ...    show_in_rwl_cheatsheet=true
+    ...    cmd_override=MAILGUN_STATUS_REGION_FOCUS="${MAILGUN_STATUS_REGION_FOCUS}" ./check-mailgun-api-eu-reachability.sh
+
+    ${issues}=    RW.CLI.Run Cli
+    ...    cmd=cat api_eu_reachability_output.json
+    ...    env=${env}
+
+    TRY
+        ${issue_list}=    Evaluate    json.loads($issues.stdout)    json
+    EXCEPT
+        Log    Failed to parse JSON for task, defaulting to empty list.    WARN
+        ${issue_list}=    Create List
+    END
+
+    IF    len($issue_list) > 0
+        FOR    ${issue}    IN    @{issue_list}
+            RW.Core.Add Issue
+            ...    severity=${issue['severity']}
+            ...    expected=${issue['expected']}
+            ...    actual=${issue['actual']}
+            ...    title=${issue['title']}
+            ...    reproduce_hint=${result.cmd}
+            ...    details=${issue['details']}
+            ...    next_steps=${issue['next_steps']}
+        END
+    END
+
+    RW.Core.Add Pre To Report    Mailgun EU API reachability:
+    RW.Core.Add Pre To Report    ${result.stdout}
+
+
+*** Keywords ***
+Suite Initialization
+    ${MAILGUN_STATUS_REGION_FOCUS}=    RW.Core.Import User Variable    MAILGUN_STATUS_REGION_FOCUS
+    ...    type=string
+    ...    description=Which regional reachability checks matter: us, eu, or both.
+    ...    pattern=\w*
+    ...    default=both
+    ...    enum=[us,eu,both]
+    ${MAILGUN_STATUS_LOOKBACK_HOURS}=    RW.Core.Import User Variable    MAILGUN_STATUS_LOOKBACK_HOURS
+    ...    type=string
+    ...    description=Hours of incident history to treat as recent for feed correlation.
+    ...    pattern=^\d+$
+    ...    default=24
+
+    Set Suite Variable    ${MAILGUN_STATUS_REGION_FOCUS}    ${MAILGUN_STATUS_REGION_FOCUS}
+    Set Suite Variable    ${MAILGUN_STATUS_LOOKBACK_HOURS}    ${MAILGUN_STATUS_LOOKBACK_HOURS}
+
+    ${env_dict}=    Create Dictionary
+    ...    MAILGUN_STATUS_REGION_FOCUS=${MAILGUN_STATUS_REGION_FOCUS}
+    ...    MAILGUN_STATUS_LOOKBACK_HOURS=${MAILGUN_STATUS_LOOKBACK_HOURS}
+    Set Suite Variable    ${env}    ${env_dict}
diff --git a/codebundles/mailgun-platform-status-health/sli-mailgun-platform-score.sh b/codebundles/mailgun-platform-status-health/sli-mailgun-platform-score.sh
new file mode 100755
index 00000000..22a5e032
--- /dev/null
+++ b/codebundles/mailgun-platform-status-health/sli-mailgun-platform-score.sh
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+set -euo pipefail
+# -----------------------------------------------------------------------------
+# Lightweight aggregate for SLI: page green + no unresolved incidents + regional
+# API probes (401 + JSON). Prints one JSON object on stdout for sli.robot.
+# Env: MAILGUN_STATUS_REGION_FOCUS (default both), MAILGUN_STATUS_LOOKBACK_HOURS (unused; reserved)
+# Timing: up to four sequential requests, each capped by --max-time 15, so a
+# caller-side timeout has to allow for roughly 60 seconds in the worst case.
+# -----------------------------------------------------------------------------
+
+MAILGUN_STATUS_REGION_FOCUS="${MAILGUN_STATUS_REGION_FOCUS:-both}"
+
+# page_score is 1 only when the status indicator is "none" and the unresolved
+# incident list is empty; a failed fetch or parse leaves it at 0.
+page_score=0
+if status_json=$(curl -fsS --connect-timeout 5 --max-time 15 "https://status.mailgun.com/api/v2/status.json" 2>/dev/null); then
+  if echo "$status_json" | jq -e '.status.indicator == "none"' >/dev/null 2>&1; then
+    if unres_json=$(curl -fsS --connect-timeout 5 --max-time 15 "https://status.mailgun.com/api/v2/incidents/unresolved.json" 2>/dev/null); then
+      if echo "$unres_json" | jq -e '(.incidents | length) == 0' >/dev/null 2>&1; then
+        page_score=1
+      fi
+    fi
+  fi
+fi
+
+# probe_api URL: prints 1 when the request returns HTTP 401 with a body that
+# parses as JSON, otherwise prints 0.
+probe_api() {
+  local url="$1"
+  local tmp
+  tmp=$(mktemp)
+  local code rc
+  set +e
+  code=$(curl -sS -o "$tmp" -w '%{http_code}' --connect-timeout 5 --max-time 15 -H 'Accept: application/json' "$url")
+  rc=$?
+  set -e
+  # Parse the complete body; a byte-limited prefix of a longer valid JSON
+  # document would be cut mid-token and wrongly counted as non-JSON.
+  local body_is_json=0
+  if jq -e . "$tmp" >/dev/null 2>&1; then
+    body_is_json=1
+  fi
+  rm -f "$tmp"
+  if [[ "$rc" -ne 0 || "$code" != "401" ]]; then
+    echo "0"
+    return
+  fi
+  echo "$body_is_json"
+}
+
+# -1 marks a region that was not probed; it is reported as null below.
+us_score=-1
+eu_score=-1
+us_included=0
+eu_included=0
+if [[ "$MAILGUN_STATUS_REGION_FOCUS" == "both" || "$MAILGUN_STATUS_REGION_FOCUS" == "us" ]]; then
+  us_score=$(probe_api "https://api.mailgun.net/v3/domains")
+  us_included=1
+fi
+if [[ "$MAILGUN_STATUS_REGION_FOCUS" == "both" || "$MAILGUN_STATUS_REGION_FOCUS" == "eu" ]]; then
+  eu_score=$(probe_api "https://api.eu.mailgun.net/v3/domains")
+  eu_included=1
+fi
+
+# Average the page score with every region score that was actually probed.
+sum=$page_score
+dims=1
+if [[ "$us_included" -eq 1 ]]; then
+  sum=$((sum + us_score))
+  dims=$((dims + 1))
+fi
+if [[ "$eu_included" -eq 1 ]]; then
+  sum=$((sum + eu_score))
+  dims=$((dims + 1))
+fi
+
+health_score=$(awk -v s="$sum" -v d="$dims" 'BEGIN { if (d < 1) { print 0 } else { printf "%.4f", s / d } }')
+
+jq -n \
+  --argjson page "$page_score" \
+  --argjson us_raw "$us_score" \
+  --argjson eu_raw "$eu_score" \
+  --argjson us_included "$us_included" \
+  --argjson eu_included "$eu_included" \
+  --argjson health "$health_score" \
+  '{page: $page, us: (if $us_included == 0 then null else $us_raw end), eu: (if $eu_included == 0 then null else $eu_raw end), us_included: ($us_included == 1), eu_included: ($eu_included == 1), health_score: $health}'
diff --git a/codebundles/mailgun-platform-status-health/sli.robot b/codebundles/mailgun-platform-status-health/sli.robot
new file mode 100644
index 00000000..26884553
--- /dev/null
+++ b/codebundles/mailgun-platform-status-health/sli.robot
@@ -0,0 +1,78 @@
+*** Settings ***
+Documentation    Measures Mailgun platform health from public status APIs and unauthenticated regional API probes. Produces a value between 0 (failing) and 1 (healthy).
+Metadata    Author    rw-codebundle-agent
+Metadata    Display Name    Mailgun Platform Status SLI
+Metadata    Supports    Mailgun network_service platform-status
+Force Tags    Mailgun    network_service    platform-status
+
+Library    BuiltIn
+Library    RW.Core
+Library    RW.CLI
+Library    RW.platform
+
+Suite Setup    Suite Initialization
+
+
+*** Tasks ***
+Score Mailgun Platform Health for Region Focus `${MAILGUN_STATUS_REGION_FOCUS}`
+    [Documentation]    Runs lightweight curl checks for status green, zero unresolved incidents, and expected 401 JSON responses on regional API bases. Aggregates binary scores into a 0-1 metric.
+    [Tags]    mailgun    platform    sli    access:read-only    data:metrics
+
+    # The script issues up to four sequential curl calls capped at 15s each, so the
+    # timeout must exceed 60s; otherwise it can be cut off before printing its JSON.
+    ${result}=    RW.CLI.Run Bash File
+    ...    bash_file=sli-mailgun-platform-score.sh
+    ...    env=${env}
+    ...    timeout_seconds=75
+    ...    include_in_history=false
+    ...    show_in_rwl_cheatsheet=true
+    ...    cmd_override=MAILGUN_STATUS_REGION_FOCUS="${MAILGUN_STATUS_REGION_FOCUS}" ./sli-mailgun-platform-score.sh
+
+    TRY
+        ${d}=    Evaluate    json.loads($result.stdout)    json
+    EXCEPT
+        Log    Failed to parse SLI JSON, scoring 0.    WARN
+        ${d}=    Evaluate    {"page": 0, "health_score": 0, "us_included": False, "eu_included": False}    json
+    END
+
+    ${page}=    Evaluate    float($d['page'])
+    RW.Core.Push Metric    ${page}    sub_name=page
+
+    ${inc_us}=    Evaluate    $d.get('us_included')    json
+    IF    ${inc_us}
+        ${us}=    Evaluate    float($d['us'])
+        RW.Core.Push Metric    ${us}    sub_name=us_api
+    END
+
+    ${inc_eu}=    Evaluate    $d.get('eu_included')    json
+    IF    ${inc_eu}
+        ${eu}=    Evaluate    float($d['eu'])
+        RW.Core.Push Metric    ${eu}    sub_name=eu_api
+    END
+
+    ${health}=    Evaluate    float($d['health_score'])
+    RW.Core.Add to Report    Mailgun platform health score: ${health}
+    RW.Core.Push Metric    ${health}
+
+
+*** Keywords ***
+Suite Initialization
+    ${MAILGUN_STATUS_REGION_FOCUS}=    RW.Core.Import User Variable    MAILGUN_STATUS_REGION_FOCUS
+    ...    type=string
+    ...    description=Which regional reachability checks matter: us, eu, or both.
+    ...    pattern=\w*
+    ...    default=both
+    ...    enum=[us,eu,both]
+    ${MAILGUN_STATUS_LOOKBACK_HOURS}=    RW.Core.Import User Variable    MAILGUN_STATUS_LOOKBACK_HOURS
+    ...    type=string
+    ...    description=Hours of incident history (reserved for parity with runbook; SLI uses live status only).
+    ...    pattern=^\d+$
+    ...    default=24
+
+    Set Suite Variable    ${MAILGUN_STATUS_REGION_FOCUS}    ${MAILGUN_STATUS_REGION_FOCUS}
+    Set Suite Variable    ${MAILGUN_STATUS_LOOKBACK_HOURS}    ${MAILGUN_STATUS_LOOKBACK_HOURS}
+
+    ${env_dict}=    Create Dictionary
+    ...    MAILGUN_STATUS_REGION_FOCUS=${MAILGUN_STATUS_REGION_FOCUS}
+    ...    MAILGUN_STATUS_LOOKBACK_HOURS=${MAILGUN_STATUS_LOOKBACK_HOURS}
+    Set Suite Variable    ${env}    ${env_dict}