Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions codebundles/mailgun-platform-status-health/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Generated by task scripts at runtime (JSON issue payloads for the runbook)
status_incidents_output.json
incident_feed_output.json
api_us_reachability_output.json
api_eu_reachability_output.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Generation rules for the mailgun-platform-status-health CodeBundle.
# A single rule: for `network_service` resources on the `mailgun` platform,
# emit an SLX, an SLI, and a runbook.
apiVersion: runwhen.com/v1
kind: GenerationRules
spec:
platform: mailgun
generationRules:
- resourceTypes:
- network_service
matchRules:
# The regex ".+" is tested against the resource `name` property, so any
# resource with a non-empty name should match.
# NOTE(review): confirm how the generator applies `mode: substring`.
- type: pattern
pattern: ".+"
properties: ["name"]
mode: substring
slxs:
- baseName: mailgun-platform-status
# NOTE(review): the "workspace" qualifier presumably scopes the SLX name to the
# workspace (one SLX per workspace) - confirm against the generator docs.
qualifiers: ["workspace"]
baseTemplateName: mailgun-platform-status-health
levelOfDetail: basic
outputItems:
- type: slx
- type: sli
- type: runbook
# Only the runbook item names its template file explicitly.
templateName: mailgun-platform-status-health-taskset.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# ServiceLevelIndicator template (Jinja-rendered YAML) for the Mailgun platform
# status bundle. Template variables used below: slx_name, default_location,
# repo_url, ref, and custom.mailgun_status_*.
apiVersion: runwhen.com/v1
kind: ServiceLevelIndicator
metadata:
name: {{slx_name}}
labels:
{% include "common-labels.yaml" %}
annotations:
{% include "common-annotations.yaml" %}
runwhen.com/sli: "true"
spec:
displayUnitsLong: Health Score
displayUnitsShort: score
locations:
- {{default_location}}
description: Composite 0–1 score from Mailgun Statuspage health and unauthenticated US/EU API probes.
codeBundle:
# repoUrl and ref use the template variables when they are truthy; otherwise
# they fall back to the rw-cli-codecollection repository and the main branch.
{% if repo_url %}
repoUrl: {{repo_url}}
{% else %}
repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
{% endif %}
{% if ref %}
ref: {{ref}}
{% else %}
ref: main
{% endif %}
pathToRobot: codebundles/mailgun-platform-status-health/sli.robot
intervalStrategy: intermezzo
# NOTE(review): presumably the SLI is evaluated every 180 seconds - confirm
# against the intervalStrategy semantics.
intervalSeconds: 180
# Both values are rendered as quoted strings; the Jinja default filter supplies
# 'both' / '24' when the corresponding custom.* variable is undefined.
configProvided:
- name: MAILGUN_STATUS_REGION_FOCUS
value: "{{ custom.mailgun_status_region_focus | default('both') }}"
- name: MAILGUN_STATUS_LOOKBACK_HOURS
value: "{{ custom.mailgun_status_lookback_hours | default('24') }}"
# Empty list: no secrets are passed to the SLI.
secretsProvided: []
alertConfig:
tasks:
persona: eager-edgar
sessionTTL: 10m
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# ServiceLevelX template (Jinja-rendered YAML) describing the Mailgun platform
# status SLX. Template variables used below: slx_name, match_resource, and
# workspace.owner_email.
apiVersion: runwhen.com/v1
kind: ServiceLevelX
metadata:
name: {{slx_name}}
labels:
{% include "common-labels.yaml" %}
annotations:
{% include "common-annotations.yaml" %}
spec:
icon: Email
# The alias embeds the matched resource's name.
alias: Mailgun platform status {{match_resource.resource.name}}
asMeasuredBy: Public Statuspage data and unauthenticated regional API probes (401 + JSON).
# NOTE(review): SLX_PLACEHOLDER looks like a filler entry so configProvided is
# non-empty - confirm whether the SLX schema requires it.
configProvided:
- name: SLX_PLACEHOLDER
value: SLX_PLACEHOLDER
owners:
- {{workspace.owner_email}}
statement: Mailgun’s public status and regional API bases should be healthy before domain-level checks.
additionalContext:
qualified_name: "{{ match_resource.qualified_name }}"
# Static classification tags attached to the SLX.
tags:
- name: platform
value: mailgun
- name: service
value: mailgun
- name: scope
value: account
- name: access
value: read-only
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Runbook template (Jinja-rendered YAML) for the Mailgun platform status
# bundle. Template variables used below: slx_name, default_location, repo_url,
# ref, and custom.mailgun_status_*.
apiVersion: runwhen.com/v1
kind: Runbook
metadata:
name: {{slx_name}}
labels:
{% include "common-labels.yaml" %}
annotations:
{% include "common-annotations.yaml" %}
spec:
location: {{default_location}}
description: Monitors Mailgun platform status and regional API reachability using public endpoints only.
codeBundle:
# repoUrl and ref use the template variables when they are truthy; otherwise
# they fall back to the rw-cli-codecollection repository and the main branch.
{% if repo_url %}
repoUrl: {{repo_url}}
{% else %}
repoUrl: https://github.com/runwhen-contrib/rw-cli-codecollection.git
{% endif %}
{% if ref %}
ref: {{ref}}
{% else %}
ref: main
{% endif %}
pathToRobot: codebundles/mailgun-platform-status-health/runbook.robot
# Both values are rendered as quoted strings; the Jinja default filter supplies
# 'both' / '24' when the corresponding custom.* variable is undefined.
configProvided:
- name: MAILGUN_STATUS_REGION_FOCUS
value: "{{ custom.mailgun_status_region_focus | default('both') }}"
- name: MAILGUN_STATUS_LOOKBACK_HOURS
value: "{{ custom.mailgun_status_lookback_hours | default('24') }}"
# Empty list: no secrets are passed to the runbook.
secretsProvided: []
47 changes: 47 additions & 0 deletions codebundles/mailgun-platform-status-health/.test/Taskfile.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
version: "3"

# Test tasks for the mailgun-platform-status-health bundle. The bundle probes
# public Mailgun URLs only, so most tasks just validate the bundle layout.
tasks:
  default:
    desc: "Validate bundle structure (public endpoints; no test cloud resources)"
    cmds:
      - ./validate-bundle-structure.sh

  clean:
    desc: "Cleanup local test artifacts"
    cmds:
      - rm -f workspaceInfo.yaml
      - rm -rf output

  build-infra:
    desc: "No provisioned infrastructure — bundle probes public Mailgun URLs only"
    cmds:
      - ./validate-bundle-structure.sh

  # Fails when tracked files in the bundle directory (BASE_DIR, relative to
  # this Taskfile) differ from HEAD, ignoring anything under a .test directory.
  # BASE_DIR is a Task template variable, so it is interpolated with
  # {{.BASE_DIR}} (it is not a shell environment variable) and handed to git as
  # a pathspec: `git diff --name-only` prints repo-root-relative paths, which
  # can never be matched by a "^../" prefix regex.
  check-unpushed-commits:
    desc: Check for uncommitted changes outside .test before integration testing
    vars:
      BASE_DIR: "../"
    cmds:
      - |
        UNCOMMITTED=$(git diff --name-only HEAD -- "{{.BASE_DIR}}" 2>/dev/null | grep -v "/\.test/" || true)
        if [ -n "$UNCOMMITTED" ]; then
          echo "Uncommitted changes in bundle (excluding .test):"
          echo "$UNCOMMITTED"
          exit 1
        fi
    silent: true

  generate-rwl-config:
    desc: "Placeholder — workspace wiring is environment-specific for non-Kubernetes bundles"
    cmds:
      - echo "See RunWhen workspace docs to attach this CodeBundle."

  run-rwl-discovery:
    desc: "Placeholder — discovery is optional for static network_service bundles"
    cmds:
      - ./validate-bundle-structure.sh

  clean-rwl-discovery:
    desc: "Remove discovery output"
    cmds:
      - rm -rf output
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/usr/bin/env bash
# Sanity check for Mailgun platform bundle layout (no cloud resources required).
#
# Verifies that every file the bundle relies on exists under the bundle root
# (the parent of this script's directory). Every missing path is reported on
# stderr before exiting 1, so a failure names the file(s) instead of aborting
# silently on the first failed check. Prints a confirmation line and exits 0
# when all files are present.
set -euo pipefail
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

# Paths are relative to the bundle root.
required_files=(
  "runbook.robot"
  "sli.robot"
  ".runwhen/generation-rules/mailgun-platform-status-health.yaml"
  ".runwhen/templates/mailgun-platform-status-health-slx.yaml"
  ".runwhen/templates/mailgun-platform-status-health-taskset.yaml"
  ".runwhen/templates/mailgun-platform-status-health-sli.yaml"
)

missing=0
for rel_path in "${required_files[@]}"; do
  if [[ ! -f "$ROOT/$rel_path" ]]; then
    echo "Missing required bundle file: $ROOT/$rel_path" >&2
    missing=1
  fi
done

if [[ "$missing" -ne 0 ]]; then
  exit 1
fi
echo "mailgun-platform-status-health bundle structure OK"
48 changes: 48 additions & 0 deletions codebundles/mailgun-platform-status-health/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Mailgun Platform Status & Reachability

This CodeBundle detects Mailgun-wide service disruptions and loss of regional API reachability before authenticated domain checks run. It uses only Mailgun’s public Statuspage JSON endpoints and unauthenticated HTTPS probes (no API keys), so it works in environments where secrets are restricted.

## Overview

- **Live status page**: Reads `status.json`, `summary.json`, unresolved incidents, and active scheduled maintenance from `https://status.mailgun.com` to flag non-green indicators, degraded components, active incidents, and maintenance windows.
- **Incident history**: Scans the public incidents feed for major or critical incidents that **resolved** within the configured lookback window (context after an outage even when the banner is green again).
- **US API reachability**: GET `https://api.mailgun.net/v3/domains` without credentials; expects HTTP **401** and JSON (confirms TLS, routing, and API edge behavior).
- **EU API reachability**: Same check against `https://api.eu.mailgun.net/v3/domains` when EU routing is in scope.
- **SLI**: Produces a 0–1 health score from page status, unresolved incidents, and regional probes (see `sli.robot`).

## Configuration

### Required Variables

None. The bundle uses only public endpoints and optional configuration below.

### Optional Variables

- `MAILGUN_STATUS_REGION_FOCUS`: Which regional API reachability checks to run: `us`, `eu`, or `both` (default: `both`). When set to `us`, the EU probe is skipped; when set to `eu`, the US probe is skipped.
- `MAILGUN_STATUS_LOOKBACK_HOURS`: Hours of history to consider “recent” for the incident-feed task (default: `24`). Major/critical incidents resolved within this window are surfaced as lower-severity informational issues.

### Secrets

None.

## Tasks Overview

### Check Mailgun Status Page for Published Incidents

Uses Statuspage APIs for overall indicator, per-component status from `summary.json`, any unresolved incidents, and active scheduled maintenance. Raises issues when the page is not all-green, components are degraded, incidents are open, or maintenance is active.

### Check Mailgun Public Incident Feed for Recent Critical Events

Uses the incidents JSON feed to list major or critical incidents that reached **resolved** status inside the lookback window (recent blast-radius context).

### Verify Mailgun US API Endpoint Reachability

Probes the US API base without an API key and expects HTTP 401 with a JSON body. Skipped when `MAILGUN_STATUS_REGION_FOCUS` is set to `eu`.

### Verify Mailgun EU API Endpoint Reachability

Same probe for the EU regional base. Skipped when `MAILGUN_STATUS_REGION_FOCUS` is `us` only.

### SLI (sli.robot)

Aggregates binary checks into a single health score for alerting; see `.runwhen/templates/mailgun-platform-status-health-sli.yaml`.
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#!/usr/bin/env bash
# -----------------------------------------------------------------------------
# EU regional reachability probe for the Mailgun API (api.eu.mailgun.net).
# An unauthenticated request is expected to be answered with HTTP 401 + JSON.
#
# Environment:
#   OUTPUT_FILE                  where the JSON issue list is written
#                                (default: api_eu_reachability_output.json)
#   MAILGUN_STATUS_REGION_FOCUS  the probe is skipped when this is "us"
#                                (default: both)
# -----------------------------------------------------------------------------
set -euxo pipefail

# Apply defaults in place when the variables are unset or empty.
: "${OUTPUT_FILE:=api_eu_reachability_output.json}"
: "${MAILGUN_STATUS_REGION_FOCUS:=both}"
API_URL='https://api.eu.mailgun.net/v3/domains'

# Accumulator for issue objects; starts as an empty JSON array.
issues_json='[]'

# append_issue TITLE EXPECTED ACTUAL DETAILS SEVERITY NEXT_STEPS
#
# Appends one issue object to the JSON array held in the global $issues_json.
# SEVERITY is injected with --argjson, so it must itself be valid JSON (e.g. a
# bare integer); every other field is injected as a JSON string.
append_issue() {
    local issue_title="$1" issue_expected="$2" issue_actual="$3"
    local issue_details="$4" issue_severity="$5" issue_next_steps="$6"

    issues_json=$(
        jq \
            --arg title "$issue_title" \
            --arg expected "$issue_expected" \
            --arg actual "$issue_actual" \
            --arg details "$issue_details" \
            --argjson severity "$issue_severity" \
            --arg next_steps "$issue_next_steps" \
            '. += [{title: $title, expected: $expected, actual: $actual, details: $details, severity: $severity, next_steps: $next_steps}]' \
            <<<"$issues_json"
    )
}

# A region focus of exactly "us" puts the EU probe out of scope: write an empty
# issue list so the output file is still valid JSON, report the skip, and stop.
case "$MAILGUN_STATUS_REGION_FOCUS" in
    us)
        jq '.' <<<'[]' >"$OUTPUT_FILE"
        echo "Skipped EU API check (MAILGUN_STATUS_REGION_FOCUS=${MAILGUN_STATUS_REGION_FOCUS})"
        exit 0
        ;;
esac

# --- Probe -------------------------------------------------------------------
# Send one unauthenticated GET to $API_URL, capturing the HTTP status code,
# curl's exit status, and the response body (in a temp file).
tmp_body=$(mktemp)
# Remove the temp file on every exit path, including aborts under set -e.
trap 'rm -f "$tmp_body"' EXIT
http_code="000"
# curl failures are classified below instead of aborting the script.
set +e
http_code=$(curl -sS -o "$tmp_body" -w '%{http_code}' --connect-timeout 10 --max-time 60 \
    -H 'Accept: application/json' "$API_URL")
curl_rc=$?
set -e

# Short, CR-stripped excerpt used only in human-readable issue text.
body_head=$(head -c 400 "$tmp_body" | tr -d '\r' || true)

# Validate the complete body. Parsing only the 400-byte excerpt would report a
# valid JSON document longer than 400 bytes as "not JSON".
body_is_json=0
if jq -e . "$tmp_body" >/dev/null 2>&1; then
    body_is_json=1
fi

# --- Classify ----------------------------------------------------------------
# Exactly one issue is raised for the first failing condition, in order:
# transport failure, missing status code, unexpected status, non-JSON 401 body.
if [[ "$curl_rc" -ne 0 ]]; then
    append_issue \
        "Mailgun EU API base unreachable (TLS or network failure)" \
        "curl completes with exit 0 and an HTTP status from Mailgun" \
        "curl exit ${curl_rc} for ${API_URL}" \
        "curl to ${API_URL} failed before a reliable HTTP status was recorded." \
        4 \
        "Check egress to the EU region, DNS for api.eu.mailgun.net, and Mailgun status before debugging domain configuration."
elif [[ "$http_code" == "000" ]]; then
    append_issue \
        "Mailgun EU API base returned no HTTP status" \
        "curl returns a non-zero HTTP status code" \
        "Recorded HTTP code ${http_code}" \
        "curl completed without a usable HTTP code for ${API_URL}." \
        4 \
        "Investigate network path; compare with US reachability and the public status page."
elif [[ "$http_code" != "401" ]]; then
    append_issue \
        "Mailgun EU API base returned unexpected HTTP ${http_code}" \
        "Unauthenticated GET returns HTTP 401 with JSON from Mailgun" \
        "HTTP ${http_code}; body prefix: ${body_head}" \
        "Expected HTTP 401 for unauthenticated GET ${API_URL}." \
        4 \
        "Compare with documented API behavior; verify you require EU routing and that proxies are not altering responses."
elif [[ "$body_is_json" -eq 0 ]]; then
    append_issue \
        "Mailgun EU API base response was not JSON" \
        "HTTP 401 body parses as JSON" \
        "Non-JSON body prefix: ${body_head}" \
        "HTTP ${http_code} received but body did not parse as JSON." \
        3 \
        "Validate that traffic reaches Mailgun EU and is not rewritten by a proxy or HTML error page."
fi

# Persist the accumulated issue list (pretty-printed) and report how many
# issues were written.
jq '.' <<<"$issues_json" >"$OUTPUT_FILE"
issue_count=$(jq 'length' <<<"$issues_json")
echo "Wrote ${OUTPUT_FILE} (${issue_count} issue(s))"
Loading