diff --git a/package.json b/package.json
index 64e1d25..61e3d48 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "devhelm",
-  "version": "0.6.0",
+  "version": "0.6.1",
   "description": "DevHelm CLI — manage monitors, deployments, and infrastructure as code",
   "author": "DevHelm ",
   "license": "MIT",
@@ -17,7 +17,8 @@
   "files": [
     "bin",
     "dist",
-    "oclif.manifest.json"
+    "oclif.manifest.json",
+    "skills"
   ],
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
@@ -25,7 +26,8 @@
     "typegen": "openapi-typescript docs/openapi/monitoring-api.json -o src/lib/api.generated.ts",
     "zodgen": "node scripts/generate-zod.mjs",
     "descgen": "node scripts/extract-descriptions.mjs",
-    "build": "npm run typegen && npm run zodgen && npm run descgen && tsc -b && oclif manifest",
+    "skillgen": "node scripts/generate-skill-references.mjs",
+    "build": "npm run typegen && npm run zodgen && npm run descgen && npm run skillgen && tsc -b && oclif manifest",
     "lint": "eslint src/ test/",
     "lint:fix": "eslint src/ test/ --fix",
     "test": "vitest run",
diff --git a/scripts/generate-skill-references.mjs b/scripts/generate-skill-references.mjs
new file mode 100644
index 0000000..b0d84cd
--- /dev/null
+++ b/scripts/generate-skill-references.mjs
@@ -0,0 +1,245 @@
+#!/usr/bin/env node
+/**
+ * Generate `skills/devhelm-<skill>/references/_generated/<resource>.fields.md`
+ * from the vendored OpenAPI spec.
+ *
+ * Each skill reference is a focused field listing for one resource type,
+ * covering the Create / Update request shapes and the primary Dto (response)
+ * shape. The agent reads these alongside the hand-written reference to get
+ * the exact current field surface without us re-documenting fields in prose.
+ *
+ * Usage: node scripts/generate-skill-references.mjs
+ *
+ * Output is idempotent: re-running produces byte-identical files. The
+ * openapi-drift test (`test/skills/openapi-drift.test.ts`) depends on this.
+ */
+
+import { readFileSync, writeFileSync, mkdirSync, rmSync, existsSync } from 'node:fs';
+import { dirname, join } from 'node:path';
+import { fileURLToPath } from 'node:url';
+import { preprocessSpec } from './lib/preprocess.mjs';
+
+const __dirname = dirname(fileURLToPath(import.meta.url));
+const ROOT = join(__dirname, '..');
+const SPEC_PATH = join(ROOT, 'docs/openapi/monitoring-api.json');
+
+/**
+ * Map each skill + resource to the OpenAPI schema names we care about.
+ * Schema names follow Springdoc conventions:
+ *   Create<Singular>Request / Update<Singular>Request / <Singular>Dto
+ * where <Singular> is the PascalCase singular. We look up all three; missing
+ * ones are skipped silently (not every resource has a Create endpoint).
+ */
+const RESOURCES = {
+  'devhelm-configure': {
+    monitors: { singular: 'Monitor' },
+    'alert-channels': { singular: 'AlertChannel' },
+    'notification-policies': { singular: 'NotificationPolicy' },
+    'resource-groups': { singular: 'ResourceGroup' },
+    dependencies: { singular: 'Dependency' },
+    secrets: { singular: 'Secret' },
+    tags: { singular: 'Tag' },
+    webhooks: { singular: 'Webhook' },
+    environments: { singular: 'Environment' },
+  },
+  'devhelm-investigate': {
+    'check-results': { singular: 'CheckResult', readOnly: true },
+    incidents: { singular: 'Incident' },
+    'audit-events': { singular: 'AuditEvent', readOnly: true },
+  },
+  'devhelm-communicate': {
+    'status-pages': { singular: 'StatusPage' },
+    'status-page-components': { singular: 'StatusPageComponent' },
+    'status-page-incidents': { singular: 'StatusPageIncident' },
+    'status-page-subscribers': { singular: 'StatusPageSubscriber' },
+    'status-page-domains': { singular: 'StatusPageDomain' },
+  },
+  'devhelm-manage': {
+    'api-keys': { singular: 'ApiKey' },
+    environments: { singular: 'Environment' },
+    workspaces: { singular: 'Workspace', readOnly: true },
+    entitlements: { singular: 'Entitlements', readOnly: true },
+  },
+};
+
+/**
+ * Parse the vendored OpenAPI document at SPEC_PATH and hand it to
+ * `preprocessSpec` (return value unused — presumably mutates in place).
+ */
+function loadSpec() {
+  const raw = JSON.parse(readFileSync(SPEC_PATH, 'utf8'));
+  preprocessSpec(raw);
+  return raw;
+}
+
+/**
+ * Resolve `$ref` one hop. We intentionally don't deep-resolve to keep the
+ * field listing flat — nested objects show as their schema name, which the
+ * reader can look up in the same file or in a sibling generated file.
+ */
+function shortRef(ref) {
+  if (!ref) return undefined;
+  const parts = ref.split('/');
+  return parts[parts.length - 1];
+}
+
+/**
+ * Produce a concise human-readable type expression for a property.
+ * Enums get their values inline (up to 8); larger enums get summarised.
+ * Maps with a typed `additionalProperties` render as `Map<string, ValueType>`.
+ */
+function typeOf(prop) {
+  if (!prop) return '?';
+  if (prop.$ref) return shortRef(prop.$ref);
+
+  if (prop.enum) {
+    if (prop.enum.length <= 8) {
+      return prop.enum.map((v) => JSON.stringify(v)).join(' \\| ');
+    }
+    return `${prop.type ?? 'string'} (${prop.enum.length} enum values — see OpenAPI spec)`;
+  }
+
+  if (prop.type === 'array') {
+    const itemType = prop.items ? typeOf(prop.items) : 'any';
+    return `${itemType}[]`;
+  }
+
+  if (prop.type === 'object') {
+    if (prop.additionalProperties && typeof prop.additionalProperties === 'object') {
+      return `Map<string, ${typeOf(prop.additionalProperties)}>`;
+    }
+    return 'object';
+  }
+
+  let t = prop.type ?? 'any';
+  if (prop.format) t += ` (${prop.format})`;
+  return t;
+}
+
+/**
+ * Make text safe for a single Markdown table cell: escape `|` and turn
+ * every line break (LF, CRLF or lone CR) into one space.
+ */
+function escapePipe(s) {
+  return String(s).replace(/\|/g, '\\|').replace(/\r\n?|\n/g, ' ');
+}
+
+/**
+ * Render `schema.properties` as a Markdown table (one row per field). Anything
+ * that is not a plain object schema with `properties` yields a one-line
+ * blockquote pointing at the OpenAPI spec instead.
+ */
+function renderSchemaTable(schemaName, schema) {
+  if (!schema || schema.type !== 'object' || !schema.properties) {
+    return `> Schema \`${schemaName}\` is not a simple object; see OpenAPI spec.\n`;
+  }
+
+  const required = new Set(schema.required ?? []);
+  const props = Object.entries(schema.properties);
+  if (props.length === 0) return '> No fields.\n';
+
+  const header =
+    '| Field | Type | Required | Nullable | Description |\n' +
+    '|---|---|---|---|---|';
+
+  const rows = props.map(([name, prop]) => {
+    const t = typeOf(prop);
+    const req = required.has(name) ? '✓' : '';
+    const nullable = prop.nullable === true ? '✓' : '';
+    const desc = prop.description ? escapePipe(prop.description) : '';
+    return `| \`${name}\` | ${t} | ${req} | ${nullable} | ${desc} |`;
+  });
+
+  return [header, ...rows, ''].join('\n');
+}
+
+/**
+ * Return `{ name, schema }` for the first entry of `names` that exists under
+ * `components.schemas`, or `null` if none does.
+ */
+function findSchema(spec, names) {
+  const all = spec.components?.schemas ?? {};
+  for (const n of names) {
+    if (all[n]) return { name: n, schema: all[n] };
+  }
+  return null;
+}
+
+/**
+ * Build the Markdown body for one resource: title, do-not-edit banner, the
+ * Create and Update (or Patch) request tables unless `readOnly`, then the
+ * response shape. Adds a placeholder note when no schema matched at all.
+ */
+function renderResource(spec, resourceName, { singular, readOnly }) {
+  const lines = [];
+  lines.push(`# ${resourceName} — field reference`);
+  lines.push('');
+  lines.push(
+    '> Auto-generated from the DevHelm OpenAPI spec. Do not edit by hand.',
+  );
+  lines.push(
+    '> Regenerate with `node scripts/generate-skill-references.mjs`.',
+  );
+  lines.push('');
+
+  if (!readOnly) {
+    const create = findSchema(spec, [`Create${singular}Request`]);
+    if (create) {
+      lines.push(`## \`Create${singular}Request\``);
+      lines.push('');
+      lines.push(renderSchemaTable(create.name, create.schema));
+    }
+
+    const update = findSchema(spec, [
+      `Update${singular}Request`,
+      `Patch${singular}Request`,
+    ]);
+    if (update) {
+      lines.push(`## \`${update.name}\``);
+      lines.push('');
+      lines.push(renderSchemaTable(update.name, update.schema));
+    }
+  }
+
+  const dto = findSchema(spec, [`${singular}Dto`, singular]);
+  if (dto) {
+    lines.push(`## \`${dto.name}\` (response shape)`);
+    lines.push('');
+    lines.push(renderSchemaTable(dto.name, dto.schema));
+  }
+
+  if (
+    lines.filter((l) => l.startsWith('## ')).length === 0
+  ) {
+    lines.push('> No schemas found for this resource in the current spec.');
+    lines.push('');
+  }
+
+  return lines.join('\n');
+}
+
+/**
+ * Wipe and recreate each skill's `_generated` directory, then write one
+ * `<resource>.fields.md` per RESOURCES entry and log the total.
+ */
+function main() {
+  const spec = loadSpec();
+
+  let total = 0;
+  for (const [skill, resources] of Object.entries(RESOURCES)) {
+    const outDir = join(ROOT, 'skills', skill, 'references', '_generated');
+    if (existsSync(outDir)) rmSync(outDir, { recursive: true, force: true });
+    mkdirSync(outDir, { recursive: true });
+
+    for (const [resource, cfg] of Object.entries(resources)) {
+      const body = renderResource(spec, resource, cfg);
+      const outPath = join(outDir, `${resource}.fields.md`);
+      writeFileSync(outPath, body + '\n', 'utf8');
+      total += 1;
+    }
+  }
+
+  console.log(`Generated ${total} skill field references.`);
+}
+
+main();
diff --git a/skills/devhelm-communicate/SKILL.md b/skills/devhelm-communicate/SKILL.md
new file mode 100644
index 0000000..b37485e
--- /dev/null
+++ b/skills/devhelm-communicate/SKILL.md
@@ -0,0 +1,221 @@
+---
+name: devhelm-communicate
+description: Set up and operate public DevHelm status pages — create pages, add components grouped by service, attach custom domains, manage subscribers, and publish incident updates. Use whenever the user wants a status page, says "status.example.com", "tell users about an outage", "post an update", "publish an incident", "add a subscriber", or "customize the page".
+---
+
+# DevHelm — Communicate
+
+You help the user run the **public-facing** side of DevHelm: status
+pages, components, public incidents, subscribers, and custom domains.
+
+For the private write side (creating the underlying monitors / alerts),
+switch to `devhelm-configure`. For debugging a failure *before*
+communicating, switch to `devhelm-investigate`.
+
+---
+
+## Preconditions
+
+1. `devhelm --version` succeeds.
+2. `devhelm auth me` succeeds.
+3. **Workspace has at least one monitor** — a status page without
+   monitors has nothing to render. If there are none, offer to create
+   one via `devhelm-configure` first.
+
+---
+
+## The two main journeys
+
+### Journey A — "First status page" (onboarding happy path)
+
+**User prompt:** *"Set up a status page"*,
+*"Get me on status.example.com"*, *"Publish a public status page"*.
+
+**Step 1 — Inventory existing monitors.**
+
+```bash
+devhelm monitors list --output=json
+```
+
+If 0 monitors → stop, hand off to `devhelm-configure`.
+
+**Step 2 — Propose a component layout.**
+
+The cleanest default is **one component per monitor**.
If the user has +clear groupings (e.g. `api-prod`, `api-staging`, `dashboard-prod`), +propose **groups** for environments or services. + +Show the user the proposed layout as a compact tree and ask for a +single yes/no: + +``` +Proposed layout for status page: + + Production + API → monitor api-prod + Dashboard → monitor dashboard-prod + Staging + API → monitor api-staging + +Publish with this layout? (y/n, or name changes you want) +``` + +**Step 3 — Confirm slug + name.** + +- **Slug**: defaults to the org's slug (get it from `devhelm auth me` + → `organization.slug`). Ask only if the user hasn't named one. +- **Name**: defaults to `<Org name> Status`. + +**Step 4 — Create the page, then groups, then components, then publish.** + +```bash +# 4a. Create the page (unpublished by default) +devhelm status-pages create \ + --name="Acme Status" \ + --slug=acme \ + --headline="Acme service status" + +# 4b. Create component groups (if using them) +devhelm status-pages groups create \ + --name="Production" + +# 4c. Create components (one per monitor) +devhelm status-pages components create \ + --name="API" \ + --monitor-id=<monitor-id> \ + --group-id=<group-id> + +# 4d. Publish +devhelm status-pages update --published=true +``` + +**Step 5 — Verify.** + +```bash +devhelm status-pages get +``` + +Report the public URL: `https://<slug>.devhelm.io` (or whatever the +response returns as the canonical URL). + +**Step 6 — Offer next steps.** + +*"Want me to attach a custom domain like `status.example.com`? Or add a +subscriber so someone gets emailed when you post updates?"* — one, not +both. + +### Journey B — "Publish an incident update" + +**User prompt:** *"Post an update to our status page"*, *"Tell users +API is down"*, *"Publish an incident"*. + +**Step 1 — Identify the status page.** + +If the user only has one published page, use it. Otherwise: + +```bash +devhelm status-pages list --output=table +``` + +Ask which one if ambiguous.
+ +**Step 2 — Existing incident or new one?** + +- **Existing auto-created incident** (from a monitor going down): + list recent ones with `devhelm status-pages incidents list + ` and ask the user which to post under. +- **New manual incident**: create with `devhelm status-pages incidents + create ...` (planned maintenance, external provider + outages, etc.). + +**Step 3 — Compose the update.** + +Incident update bodies are **public-facing text**. Follow these rules: + +- Lead with *what users should do right now* (*"Retry your request in + 5 minutes"*, *"No action needed"*). +- Name the impact in user terms, not internal service names (*"Login + is temporarily failing"*, not *"auth-svc pods are crashlooping"*). +- Include a plain timestamp (*"Started ~14:30 UTC"*). +- Keep it under ~280 characters for the first post; longer follow-ups + are fine. +- Never include PII, API keys, stack traces, or customer names. + +**Step 4 — Post it.** + +```bash +devhelm status-pages incidents updates create \ + --status=INVESTIGATING \ + --body="" \ + --notify-subscribers=true +``` + +Valid statuses: `INVESTIGATING`, `IDENTIFIED`, `MONITORING`, +`RESOLVED`. The `--notify-subscribers=true` flag sends emails — +always ask before setting it on the first post of a session. + +**Step 5 — Verify + share URL.** + +Show the permalink to the incident on the public page. + +--- + +## Custom domains + +```bash +devhelm status-pages domains add --hostname=status.example.com +devhelm status-pages domains list +``` + +The response includes a CNAME target and a TLS verification record. +Tell the user to add both to their DNS, then run `devhelm status-pages +domains verify `. Do NOT loop polling for them — instruct +and stop. + +For full field details: `@references/custom-domains.md`. 
+ +--- + +## Subscribers + +```bash +devhelm status-pages subscribers create --email=foo@bar.com +devhelm status-pages subscribers list +``` + +Subscribers receive emails whenever an incident update is posted with +`--notify-subscribers=true`. Full field details: +`@references/subscribers.md`. + +--- + +## Safety rails + +1. **Never publish a status page without explicit confirmation.** Show + the layout first, wait for yes. +2. **Never post an incident update without showing the body first** and + asking *"post this with status=X and notify-subscribers=?"*. + Public communication is irreversible (updates can be deleted but + subscribers already got the email). +3. **Never set `notify-subscribers=true` on the first post of a fresh + incident without confirming.** Accidental notifications are the #1 + regret. +4. **Never include PII, API keys, stack traces, internal service + names, or customer identifiers** in incident bodies. If the user + pastes something sensitive, redact and ask before posting. +5. **No localhost URLs in custom domains.** Same reason as + `devhelm-configure`. +6. **Never delete a published status page** unless the user explicitly + said "delete this page" using that word. "Unpublish" is + `devhelm status-pages update --published=false`, which is + reversible; prefer it. + +--- + +## References + +- `@references/status-pages.md` +- `@references/components.md` +- `@references/incidents.md` +- `@references/subscribers.md` +- `@references/custom-domains.md` diff --git a/skills/devhelm-communicate/references/_generated/status-page-components.fields.md b/skills/devhelm-communicate/references/_generated/status-page-components.fields.md new file mode 100644 index 0000000..ee7dbab --- /dev/null +++ b/skills/devhelm-communicate/references/_generated/status-page-components.fields.md @@ -0,0 +1,54 @@ +# status-page-components — field reference + +> Auto-generated from the DevHelm OpenAPI spec. Do not edit by hand. 
+> Regenerate with `node scripts/generate-skill-references.mjs`. + +## `CreateStatusPageComponentRequest` + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `name` | string | ✓ | | Component display name | +| `description` | string | | ✓ | Optional description shown on expand | +| `type` | "MONITOR" \| "GROUP" \| "STATIC" | ✓ | | Component type: MONITOR, GROUP, or STATIC | +| `monitorId` | string (uuid) | | ✓ | Monitor ID (required when type=MONITOR) | +| `resourceGroupId` | string (uuid) | | ✓ | Resource group ID (required when type=GROUP) | +| `groupId` | string (uuid) | | ✓ | Component group ID for visual grouping | +| `showUptime` | boolean | | ✓ | Whether to show the uptime bar (default: true) | +| `displayOrder` | integer (int32) | | ✓ | Position in the component list | +| `excludeFromOverall` | boolean | | ✓ | Exclude from overall status calculation (default: false, use true for third-party deps) | +| `startDate` | string (date) | | ✓ | Date from which to start showing uptime; defaults to component creation. Set earlier to backdate (e.g. 
launch day); clamped at the monitor's createdAt for MONITOR-type components | + +## `UpdateStatusPageComponentRequest` + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `name` | string | | ✓ | New component name; null preserves current | +| `description` | string | | ✓ | New description; null preserves current, empty string clears | +| `groupId` | string (uuid) | | ✓ | Move to a different group; null preserves current | +| `removeFromGroup` | boolean | | ✓ | Remove the component from its group (default: false) | +| `showUptime` | boolean | | ✓ | Whether to show the uptime bar; null preserves current | +| `displayOrder` | integer (int32) | | ✓ | New position in the component list; null preserves current | +| `excludeFromOverall` | boolean | | ✓ | Exclude from overall status calculation; null preserves current | +| `startDate` | string (date) | | ✓ | Date from which to start showing uptime; null preserves current. Bars never extend earlier than the underlying monitor's createdAt regardless of value | + +## `StatusPageComponentDto` (response shape) + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `id` | string (uuid) | ✓ | | | +| `statusPageId` | string (uuid) | ✓ | | | +| `groupId` | string (uuid) | | ✓ | | +| `name` | string | ✓ | | | +| `description` | string | | ✓ | | +| `type` | "MONITOR" \| "GROUP" \| "STATIC" | ✓ | | | +| `monitorId` | string (uuid) | | ✓ | | +| `resourceGroupId` | string (uuid) | | ✓ | | +| `currentStatus` | "OPERATIONAL" \| "DEGRADED_PERFORMANCE" \| "PARTIAL_OUTAGE" \| "MAJOR_OUTAGE" \| "UNDER_MAINTENANCE" | ✓ | | | +| `showUptime` | boolean | ✓ | | | +| `displayOrder` | integer (int32) | ✓ | | | +| `pageOrder` | integer (int32) | ✓ | | | +| `excludeFromOverall` | boolean | ✓ | | | +| `startDate` | string (date-time) | | ✓ | | +| `createdAt` | string (date-time) | ✓ | | | +| `updatedAt` | string (date-time) | ✓ | | | + diff --git 
a/skills/devhelm-communicate/references/_generated/status-page-domains.fields.md b/skills/devhelm-communicate/references/_generated/status-page-domains.fields.md new file mode 100644 index 0000000..5f9b42e --- /dev/null +++ b/skills/devhelm-communicate/references/_generated/status-page-domains.fields.md @@ -0,0 +1,7 @@ +# status-page-domains — field reference + +> Auto-generated from the DevHelm OpenAPI spec. Do not edit by hand. +> Regenerate with `node scripts/generate-skill-references.mjs`. + +> No schemas found for this resource in the current spec. + diff --git a/skills/devhelm-communicate/references/_generated/status-page-incidents.fields.md b/skills/devhelm-communicate/references/_generated/status-page-incidents.fields.md new file mode 100644 index 0000000..16ac994 --- /dev/null +++ b/skills/devhelm-communicate/references/_generated/status-page-incidents.fields.md @@ -0,0 +1,56 @@ +# status-page-incidents — field reference + +> Auto-generated from the DevHelm OpenAPI spec. Do not edit by hand. +> Regenerate with `node scripts/generate-skill-references.mjs`. 
+ +## `CreateStatusPageIncidentRequest` + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `title` | string | ✓ | | Customer-facing incident title | +| `status` | "INVESTIGATING" \| "IDENTIFIED" \| "MONITORING" \| "RESOLVED" | | ✓ | Initial status (default: INVESTIGATING) | +| `impact` | "NONE" \| "MINOR" \| "MAJOR" \| "CRITICAL" | ✓ | | Impact level: NONE, MINOR, MAJOR, or CRITICAL | +| `body` | string | ✓ | | Initial update body in markdown | +| `affectedComponents` | AffectedComponent[] | | ✓ | Component IDs affected by this incident | +| `scheduled` | boolean | | ✓ | Whether this is a scheduled maintenance (default: false) | +| `scheduledFor` | string (date-time) | | ✓ | Maintenance start time (required when scheduled=true) | +| `scheduledUntil` | string (date-time) | | ✓ | Maintenance end time | +| `autoResolve` | boolean | | ✓ | Auto-resolve at scheduledUntil (default: false) | +| `notifySubscribers` | boolean | | ✓ | Whether to email confirmed subscribers about this incident (default: true) | + +## `UpdateStatusPageIncidentRequest` + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `title` | string | | ✓ | New title; null preserves current | +| `status` | "INVESTIGATING" \| "IDENTIFIED" \| "MONITORING" \| "RESOLVED" | | ✓ | New status; null preserves current | +| `impact` | "NONE" \| "MINOR" \| "MAJOR" \| "CRITICAL" | | ✓ | New impact level; null preserves current | +| `affectedComponents` | AffectedComponent[] | | ✓ | Updated affected components; null preserves current | +| `postmortemBody` | string | | ✓ | Postmortem body in markdown; empty string clears | +| `postmortemUrl` | string | | ✓ | URL to an external postmortem document; empty string clears | + +## `StatusPageIncidentDto` (response shape) + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `id` | string (uuid) | ✓ | | | +| `statusPageId` | string (uuid) | ✓ | | | +| `title` | string | ✓ | | | +| `status` | 
"INVESTIGATING" \| "IDENTIFIED" \| "MONITORING" \| "RESOLVED" | ✓ | | | +| `impact` | "NONE" \| "MINOR" \| "MAJOR" \| "CRITICAL" | ✓ | | | +| `scheduled` | boolean | ✓ | | | +| `scheduledFor` | string (date-time) | | ✓ | | +| `scheduledUntil` | string (date-time) | | ✓ | | +| `autoResolve` | boolean | ✓ | | | +| `incidentId` | string (uuid) | | ✓ | | +| `startedAt` | string (date-time) | ✓ | | | +| `publishedAt` | string (date-time) | | ✓ | | +| `resolvedAt` | string (date-time) | | ✓ | | +| `createdByUserId` | integer (int32) | | ✓ | | +| `postmortemBody` | string | | ✓ | | +| `postmortemUrl` | string | | ✓ | | +| `affectedComponents` | StatusPageIncidentComponentDto[] | | ✓ | | +| `updates` | StatusPageIncidentUpdateDto[] | | ✓ | | +| `createdAt` | string (date-time) | ✓ | | | +| `updatedAt` | string (date-time) | ✓ | | | + diff --git a/skills/devhelm-communicate/references/_generated/status-page-subscribers.fields.md b/skills/devhelm-communicate/references/_generated/status-page-subscribers.fields.md new file mode 100644 index 0000000..d070a11 --- /dev/null +++ b/skills/devhelm-communicate/references/_generated/status-page-subscribers.fields.md @@ -0,0 +1,14 @@ +# status-page-subscribers — field reference + +> Auto-generated from the DevHelm OpenAPI spec. Do not edit by hand. +> Regenerate with `node scripts/generate-skill-references.mjs`. 
+ +## `StatusPageSubscriberDto` (response shape) + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `id` | string (uuid) | ✓ | | | +| `email` | string | ✓ | | | +| `confirmed` | boolean | ✓ | | | +| `createdAt` | string (date-time) | ✓ | | | + diff --git a/skills/devhelm-communicate/references/_generated/status-pages.fields.md b/skills/devhelm-communicate/references/_generated/status-pages.fields.md new file mode 100644 index 0000000..9541826 --- /dev/null +++ b/skills/devhelm-communicate/references/_generated/status-pages.fields.md @@ -0,0 +1,48 @@ +# status-pages — field reference + +> Auto-generated from the DevHelm OpenAPI spec. Do not edit by hand. +> Regenerate with `node scripts/generate-skill-references.mjs`. + +## `CreateStatusPageRequest` + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `name` | string | ✓ | | Human-readable name for this status page | +| `slug` | string | ✓ | | URL slug (lowercase, hyphens, globally unique) | +| `description` | string | | ✓ | Optional description shown below the page header | +| `branding` | any | | ✓ | | +| `visibility` | "PUBLIC" \| "PASSWORD" \| "IP_RESTRICTED" | | ✓ | Page visibility: PUBLIC, PASSWORD, or IP_RESTRICTED (default: PUBLIC) | +| `enabled` | boolean | | ✓ | Whether the page is enabled (default: true) | +| `incidentMode` | "MANUAL" \| "REVIEW" \| "AUTOMATIC" | | ✓ | Incident mode: MANUAL, REVIEW, or AUTOMATIC (default: AUTOMATIC) | + +## `UpdateStatusPageRequest` + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `name` | string | | ✓ | New name; null preserves current | +| `description` | string | | ✓ | New description; null preserves current, empty string clears | +| `branding` | any | | ✓ | | +| `visibility` | "PUBLIC" \| "PASSWORD" \| "IP_RESTRICTED" | | ✓ | Page visibility; null preserves current | +| `enabled` | boolean | | ✓ | Whether the page is enabled; null preserves current | +| `incidentMode` | 
"MANUAL" \| "REVIEW" \| "AUTOMATIC" | | ✓ | Incident mode: MANUAL, REVIEW, or AUTOMATIC; null preserves current | + +## `StatusPageDto` (response shape) + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `id` | string (uuid) | ✓ | | | +| `organizationId` | integer (int32) | ✓ | | | +| `workspaceId` | integer (int32) | ✓ | | | +| `name` | string | ✓ | | | +| `slug` | string | ✓ | | | +| `description` | string | | ✓ | | +| `branding` | StatusPageBranding | ✓ | | | +| `visibility` | "PUBLIC" \| "PASSWORD" \| "IP_RESTRICTED" | ✓ | | | +| `enabled` | boolean | ✓ | | | +| `incidentMode` | "MANUAL" \| "REVIEW" \| "AUTOMATIC" | ✓ | | | +| `componentCount` | integer (int32) | | ✓ | | +| `subscriberCount` | integer (int64) | | ✓ | | +| `overallStatus` | "OPERATIONAL" \| "DEGRADED_PERFORMANCE" \| "PARTIAL_OUTAGE" \| "MAJOR_OUTAGE" \| "UNDER_MAINTENANCE" | | ✓ | | +| `createdAt` | string (date-time) | ✓ | | | +| `updatedAt` | string (date-time) | ✓ | | | + diff --git a/skills/devhelm-communicate/references/components.md b/skills/devhelm-communicate/references/components.md new file mode 100644 index 0000000..41a6715 --- /dev/null +++ b/skills/devhelm-communicate/references/components.md @@ -0,0 +1,105 @@ +# Status Page Components + +A **component** is a row on the status page. Each component has: + +- A user-facing name (*"API"*, *"Dashboard"*, *"Checkout"*). +- A source of truth — usually a **monitor** or a **dependency**, but + can also be manual (useful for non-automated services). +- An optional parent **group** for sectioning. +- A display order. 
+ +## Create + +```bash +# Backed by a monitor +devhelm status-pages components create \ + --name="API" \ + --monitor-id=<monitor-id> \ + --group-id=<group-id> \ + --description="Public API" + +# Backed by a dependency +devhelm status-pages components create \ + --name="Stripe Payments" \ + --dependency-id=<dependency-id> + +# Manual (status set by user, not auto-computed) +devhelm status-pages components create \ + --name="Internal Admin" \ + --manual-status=OPERATIONAL +``` + +## Groups + +Groups section the page. Create them first, then attach components: + +```bash +devhelm status-pages groups create \ + --name="Production" \ + --display-order=1 + +devhelm status-pages components create \ + --name="API" \ + --monitor-id=<monitor-id> \ + --group-id=<group-id> +``` + +Display order: ascending integer. The page renders groups in order; +components are ordered within their group. + +## Propose a component layout (onboarding flow) + +When the user says *"set up a status page"* and has N monitors: + +1. Read `devhelm monitors list --output=json`. +2. Infer groupings from tags (`env`, `service`) and resource-group + membership. +3. Propose one component per monitor, within groups derived from the + dominant grouping tag. + +Show as a tree, ask for one yes/no before creating anything. + +## Update / delete / reorder + +```bash +devhelm status-pages components update --name="..." --display-order=2 +devhelm status-pages components delete +``` + +Delete is reversible only via undelete API within 24h (not exposed in +CLI yet) — warn the user. + +## Manual status override + +For manual components or when the user wants to force a status +regardless of the underlying monitor: + +```bash +devhelm status-pages components update \ + --manual-status=DEGRADED_PERFORMANCE \ + --manual-status-reason="Known issue with X, ETA 30min" +``` + +Valid statuses: `OPERATIONAL`, `DEGRADED_PERFORMANCE`, `PARTIAL_OUTAGE`, +`MAJOR_OUTAGE`, `UNDER_MAINTENANCE`.
+ +To return to monitor-driven status: + +```bash +devhelm status-pages components update --manual-status=null +``` + +## Common gotchas + +- **Monitor binding is 1:1 per page** — the same monitor can back + components on multiple pages (if you run public + internal pages), + but not twice on the same page. API rejects duplicates. +- **Group deletion** with components inside → components become + orphaned (group-less), not deleted. Prune them separately. +- **Auto-incidents from manual components** — don't fire, because + there's no monitor driving them. Manual incidents only. + +## Complete field reference + +`@_generated/status-page-components.fields.md`. Runtime pull: +`devhelm skills schema status-page-components`. diff --git a/skills/devhelm-communicate/references/custom-domains.md b/skills/devhelm-communicate/references/custom-domains.md new file mode 100644 index 0000000..bd5163c --- /dev/null +++ b/skills/devhelm-communicate/references/custom-domains.md @@ -0,0 +1,90 @@ +# Custom Domains + +Status pages can be served on a custom domain like +`status.example.com` instead of the default `.devhelm.io`. + +## Add a domain + +```bash +devhelm status-pages domains add --hostname=status.example.com +``` + +The response returns: + +- `cnameTarget` — the value the user must CNAME their `status` + record to (e.g. `custom.devhelm.io`). +- `tlsVerificationRecord` — a DNS TXT record value used for ACME + DNS-01 challenge verification. +- `status` — `PENDING_DNS` initially. + +## Configure DNS + +Tell the user to add two records at their DNS provider: + +``` +Type Name Value +CNAME status.example.com +TXT _devhelm-challenge.status.example.com +``` + +TTL: any. Propagation: typically 5–60 minutes; often faster. + +**Do NOT poll from this skill.** Instruct the user, then stop. The +user comes back when DNS is live. + +## Verify + +Once DNS is set: + +```bash +devhelm status-pages domains verify +``` + +This triggers a fresh check. 
Possible outcomes: + +- `VERIFIED` → TLS cert issued via ACME; page is live on the custom + domain within ~1 minute. +- `DNS_NOT_FOUND` → records not visible from our side yet. Wait more. +- `CNAME_MISMATCH` → CNAME points somewhere else. Re-check record. +- `CHALLENGE_FAILED` → TXT record wrong or missing. + +## List / inspect / remove + +```bash +devhelm status-pages domains list +devhelm status-pages domains get +devhelm status-pages domains remove +``` + +Removing a domain immediately stops serving; the default +`.devhelm.io` URL remains active. + +## Multi-domain + +Multiple custom domains per page are supported (e.g. +`status.example.com` + `status.example.io`). Only one is the +canonical — pass `--canonical=true` on the one that should be in +Open Graph metadata and emails: + +```bash +devhelm status-pages domains update --canonical=true +``` + +## Gotchas + +- **Apex/root domain** (`example.com` as the status page) needs an + `ALIAS` / `ANAME` record — not all DNS providers support these. + Work around by serving from `status.example.com` instead. +- **Cloudflare proxy (orange cloud)** must be **off** for our ACME + challenge to resolve. Set to DNS-only (gray cloud) until + verification, then flip back on. +- **AAAA records** for the user's existing domain don't interfere, + but any CAA record restricting issuers must include `letsencrypt.org` + (our current issuer). +- **TLS renewals** are automatic; we renew 30 days before expiry. No + user action needed once initial verification succeeds. + +## Complete field reference + +`@_generated/status-page-domains.fields.md`. Runtime pull: +`devhelm skills schema status-page-domains`. 
diff --git a/skills/devhelm-communicate/references/incidents.md b/skills/devhelm-communicate/references/incidents.md new file mode 100644 index 0000000..0d88382 --- /dev/null +++ b/skills/devhelm-communicate/references/incidents.md @@ -0,0 +1,132 @@ +# Public Incidents (on status pages) + +This reference is for **public-facing incidents on a status page** — +distinct from the internal incident records covered in +`devhelm-investigate` → `@references/incidents.md`. Public incidents +show up on the status page and optionally notify subscribers. + +Two flavors: + +- **Auto-created**: a monitor tied to a status page component went + DOWN; DevHelm opened a public incident automatically. The user can + edit/update it. +- **Manual**: the user creates one for scheduled maintenance or + vendor-side outages that monitors can't detect. + +## List on a page + +```bash +devhelm status-pages incidents list --state=OPEN,MONITORING +devhelm status-pages incidents get +``` + +## Create a manual incident + +```bash +devhelm status-pages incidents create \ + --title="Scheduled maintenance — DB upgrade" \ + --status=SCHEDULED \ + --scheduled-start=2026-05-01T02:00:00Z \ + --scheduled-end=2026-05-01T03:00:00Z \ + --affected-components=, \ + --body="Database upgrade. Expect 5-10m of read-only mode." +``` + +### Incident kinds + +| Kind | Use | Required fields | +|---|---|---| +| `REAL_TIME` | Something is broken right now | `status=INVESTIGATING`, `body` | +| `SCHEDULED` | Upcoming planned work | `scheduled_start`, `scheduled_end`, `body` | +| `HISTORICAL` | Retrospective entry for an outage already over | `status=RESOLVED`, `body` with timeline | + +## Update an incident (post an update) + +```bash +devhelm status-pages incidents updates create \ + --status=IDENTIFIED \ + --body="Root cause: upstream DNS resolver. Working on failover." 
\ + --notify-subscribers=true +``` + +Valid status values: + +- `SCHEDULED` → upcoming maintenance +- `INVESTIGATING` → just started, cause unknown +- `IDENTIFIED` → cause known, fix in progress +- `MONITORING` → fix applied, watching +- `RESOLVED` → over + +Each update is appended to the incident; the page shows the full +timeline. + +## Writing update bodies — the rules + +1. **Lead with what users should do.** *"Retry your request in 5 + minutes"*, *"No action needed"*. +2. **Name the impact in user terms.** Not *"auth-svc pods + crashlooping"* — say *"login is temporarily failing"*. +3. **Include a plain timestamp.** *"Started ~14:30 UTC"*. +4. **Keep the first post under ~280 characters.** Longer follow-ups + are fine. +5. **Never include PII, API keys, stack traces, internal service + names, or customer identifiers.** + +### Example — good + +> **Login temporarily failing.** Some users can't sign in since ~14:32 +> UTC. No data loss. We've identified the cause and are rolling back a +> recent change. Next update in 15 minutes. + +### Example — bad + +> auth-svc pod stuck in CrashLoopBackoff due to OOMKilled. user +> alice@acme.com reported. working on rollback of PR #1234. + +## Edit a past update + +```bash +devhelm status-pages incidents updates update \ + --body="Corrected text..." +``` + +**Edits are reflected immediately on the public page.** Subscribers +who already received the original email are *not* re-notified of the +edit. Prefer posting a new update over editing for clarity. + +## Resolve + +```bash +devhelm status-pages incidents updates create \ + --status=RESOLVED \ + --body="The fix is deployed. All monitors green since 14:58 UTC. + Sorry for the disruption." \ + --notify-subscribers=true +``` + +For auto-created incidents, you can also let them auto-resolve when +the monitor returns to UP — this posts a default resolve message. 
+Control per page: + +```bash +devhelm status-pages update --auto-resolve-incidents=true +``` + +If the user wants a custom resolve message, disable auto-resolve and +post manually. + +## Delete an incident + +```bash +devhelm status-pages incidents delete +``` + +Reversible within 24h via dashboard (not CLI). Deletion removes the +incident from the public timeline but preserves the internal record +for audit. Never delete an incident to "hide" an outage — users who +got subscriber emails still have the email. + +## Complete field reference + +`@_generated/status-page-incidents.fields.md`. Runtime pull: +`devhelm skills schema status-page-incidents`. diff --git a/skills/devhelm-communicate/references/status-pages.md b/skills/devhelm-communicate/references/status-pages.md new file mode 100644 index 0000000..e16fe01 --- /dev/null +++ b/skills/devhelm-communicate/references/status-pages.md @@ -0,0 +1,109 @@ +# Status Pages + +A **status page** is a public web page that shows the real-time +status of a customer's services. It renders: + +- A top-level UP/DEGRADED/DOWN banner computed from member + components. +- Components (one per monitor or dependency). +- Ongoing + recent incidents with customer-facing updates. +- Historical uptime (last 90 days of per-day status). +- A subscribe form (if enabled). + +DevHelm hosts pages at `https://.devhelm.io`. Custom domains +(`status.example.com`) are supported — see +`@references/custom-domains.md`. + +## Create a page (unpublished) + +```bash +devhelm status-pages create \ + --name="Acme Status" \ + --slug=acme \ + --headline="Acme service status" \ + --description="Real-time status of Acme products" \ + --published=false +``` + +Defaults: + +- `slug` — defaults to the org's slug if not supplied. Immutable after + publish (external bookmarks would break). +- `headline` — defaults to ` service status`. +- `published` — defaults to `false`. Users must opt in to publish. 
+ +## List / get / update / delete + +```bash +devhelm status-pages list +devhelm status-pages get +devhelm status-pages update --headline="..." --published=true +devhelm status-pages delete # irreversible; unpublish first +``` + +## Theme & branding + +Light customization is available via flags: + +```bash +devhelm status-pages update \ + --logo-url=https://cdn.example.com/logo.svg \ + --primary-color="#0066CC" \ + --favicon-url=https://cdn.example.com/favicon.ico +``` + +Heavier customization (custom CSS, custom layout) is dashboard-only +for now. + +## Auto-created incidents + +When a monitor tied to a status-page component goes DOWN, DevHelm +**auto-creates a public incident** on the page. This means the page +reflects reality without the user having to manually post. + +Control this per page: + +```bash +devhelm status-pages update --auto-create-incidents=true +``` + +Or per component (see `@references/components.md`). + +## Publish flow + +The user-visible flow is four steps: + +1. `devhelm status-pages create --published=false` +2. `devhelm status-pages components create ...` (one per monitor) +3. Preview with `devhelm status-pages preview ` (opens the + dashboard's draft-preview URL). +4. `devhelm status-pages update --published=true` + +Do not skip step 3 if the user hasn't seen the page layout before — +status pages are public and mistakes are visible. + +## Un-publishing + +```bash +devhelm status-pages update --published=false +``` + +Reversible. The custom domain (if any) serves a 404 while unpublished. +Use this instead of delete for "temporarily hide the page". + +## Incidents on the page + +Two sources: + +- **Auto-created** — from monitor failures. +- **Manual** — `devhelm status-pages incidents create ...` + for maintenance windows, vendor-side outages, etc. + +Both are rendered in the same section. Manual incidents need an +explicit resolve; auto-created ones resolve when the underlying +monitor returns to UP. 
+ +## Complete field reference + +`@_generated/status-pages.fields.md`. Runtime pull: +`devhelm skills schema status-pages`. diff --git a/skills/devhelm-communicate/references/subscribers.md b/skills/devhelm-communicate/references/subscribers.md new file mode 100644 index 0000000..f9277e0 --- /dev/null +++ b/skills/devhelm-communicate/references/subscribers.md @@ -0,0 +1,83 @@ +# Subscribers + +**Subscribers** are end users (customers) who want to receive +notifications whenever incident updates are posted to a status page. + +- Three channels: `email`, `webhook`, `rss` (RSS is URL-only; no + explicit subscribe). +- Opt-in only. Users self-subscribe via the status page's subscribe + form, or an operator adds them programmatically. +- Per-component filtering: subscribers can choose to follow only + specific components rather than everything. + +## Add (operator-initiated) + +```bash +devhelm status-pages subscribers create \ + --email=foo@example.com \ + --components=, # optional; defaults to all +``` + +- Sends a confirmation email. +- Subscriber status is `PENDING` until they click confirm. +- `--skip-confirmation` is available for bulk imports but should + only be used when the user has the subscriber's consent on file. + +## List + +```bash +devhelm status-pages subscribers list \ + --status=ACTIVE,PENDING,UNSUBSCRIBED \ + --output=table +``` + +## Delete (unsubscribe) + +```bash +devhelm status-pages subscribers delete +``` + +Sends a farewell email confirming unsubscription. Subscribers can +also self-unsubscribe via the one-click link in every email. + +## Webhook subscribers + +```bash +devhelm status-pages subscribers create \ + --webhook-url=https://hooks.example.com/devhelm-status \ + --secret= +``` + +Receives the same event payloads as outbound webhooks +(`@references/webhooks.md`) filtered to status-page incidents on the +specified page. + +## Import from another provider + +No bulk-import CLI command yet. 
For small lists, shell-loop: + +```bash +while read email; do + devhelm status-pages subscribers create --email="$email" +done < emails.txt +``` + +For large lists, direct the user to the dashboard's CSV import. + +## Privacy / legal rails + +1. **Consent.** Never add a subscriber without their consent on + file. `--skip-confirmation` bypasses the opt-in email; use only + with documented consent. +2. **PII in transcripts.** Don't echo full subscriber emails in chat + output. Truncate to `f***@example.com` when listing; show full + only for operator-initiated adds where the user just provided the + address. +3. **Unsubscribe is one-click.** Don't work around it. Manual + resubscribe after unsubscribe requires the user to re-subscribe + from the page, not an operator to re-add them. + +## Complete field reference + +`@_generated/status-page-subscribers.fields.md`. Runtime pull: +`devhelm skills schema status-page-subscribers`. diff --git a/skills/devhelm-configure/SKILL.md b/skills/devhelm-configure/SKILL.md new file mode 100644 index 0000000..4b64f7f --- /dev/null +++ b/skills/devhelm-configure/SKILL.md @@ -0,0 +1,227 @@ +--- +name: devhelm-configure +description: Create and manage DevHelm monitoring resources — monitors, alert channels, notification policies, resource groups, dependencies, secrets, tags, webhooks, and environments. Use when the user wants to set up uptime or heartbeat monitoring, wire Slack/email/PagerDuty/webhook alerts, group monitors by team/service, track third-party status pages as dependencies, store secrets for monitor auth, or otherwise configure any DevHelm resource. Auto-detects monitoring-as-code repos (devhelm.yml, Terraform) and picks the right surface. +--- + +# DevHelm — Configure + +You help the user create and manage monitoring resources in DevHelm. This +skill covers the **write side** of the platform: everything that creates, +updates, or deletes a resource. 
+ +For read/debug flows (status, failures, incidents, uptime), switch to the +`devhelm-investigate` skill. For status pages, switch to +`devhelm-communicate`. For API keys / plan / workspace admin, switch to +`devhelm-manage`. + +--- + +## Preconditions (run once per session) + +1. **CLI installed.** Run `devhelm --version`. If it fails, stop and tell + the user: `npm install -g devhelm`. +2. **Authenticated.** Run `devhelm auth me`. If it fails with a 401 or + reports no active context, stop and tell the user: `devhelm auth login` + (or `devhelm auth login --token=` if they have one). Do not + attempt to create anything without an active context. +3. **Optional env override.** If the user's prompt names an environment + (e.g. "in staging"), resolve it with `devhelm environments list` and + pass `--environment ` on every subsequent command. + +--- + +## Step 1 — Detect the mode + +Check the repo, in this order, and stop at the first match: + +| Signal | Mode | Rationale | +|---|---|---| +| `devhelm.yml` or `devhelm.yaml` exists at repo root | **MaC-YAML** | User has already committed to declarative. | +| Any `*.tf` file references `devhelm_*` resources | **MaC-Terraform** | User has already committed to Terraform. | +| `.github/workflows/*.yml` contains `devhelm deploy` | **MaC-YAML** | CI-driven declarative flow. | +| User's prompt names ≥3 resources to create in one turn | **MaC-YAML** | Bulk is always worth the reviewability. | +| None of the above, user is in a git repo | **Imperative CLI** | Offer to bootstrap MaC after success. | +| Not in a git repo | **Imperative CLI only** | No bootstrap offer. | + +Fast checks (do NOT scan every file): + +```bash +test -f devhelm.yml || test -f devhelm.yaml && echo YAML +grep -rl --include='*.tf' 'devhelm_' . 2>/dev/null | head -1 +grep -l 'devhelm deploy' .github/workflows/*.yml 2>/dev/null | head -1 +``` + +**Announce the detected mode in one sentence** before taking any action. 
+Example: *"Detected `devhelm.yml` in this repo — I'll add the monitor +there and run `devhelm plan` before applying."* Don't ladder through +multi-turn confirmation — proceed unless the user pushes back. + +Always re-run detection at the start of each *new* user turn; repos +change between requests. + +--- + +## Step 2 — Identify the resource(s) + +Map the user's intent to a resource type, then load the matching +reference. Each reference has a **hand-written** authoritative section +and a **generated** field list (`_generated/.fields.md`) that +tracks the current OpenAPI spec. + +| Intent vocabulary | Resource | Reference | +|---|---|---| +| "monitor", "check", "uptime of X", "health of X" | monitors | `@references/monitors.md` | +| "alert", "Slack channel", "PagerDuty", "email me", "webhook alert" | alert-channels | `@references/alert-channels.md` | +| "policy", "escalation", "notify on-call", "route alerts" | notification-policies | `@references/notification-policies.md` | +| "group", "per team", "per service", "bundle monitors" | resource-groups | `@references/resource-groups.md` | +| "track github", "slack status", "depends on X status page" | dependencies | `@references/dependencies.md` | +| "store secret", "API token for auth", "credential" | secrets | `@references/secrets.md` | +| "label", "env=prod", "team=payments" | tags | `@references/tags.md` | +| "outbound webhook", "notify external service" | webhooks | `@references/webhooks.md` | +| "staging env", "production env", "environment" | environments | `@references/environments.md` | + +If the intent is ambiguous (e.g. "set up monitoring for Slack"), ask +**one** clarifier — "Do you want to monitor your own API that depends on +Slack (monitor), or show Slack's public status on your page (dependency)?" +— then proceed. 
+ +--- + +## Step 3 — Execute + +### Mode: Imperative CLI (single resource, fastest to green) + +Every resource type supports the same five verbs, plus `test` where applicable: + +```bash +devhelm create +devhelm list [--output json|yaml|table] +devhelm get +devhelm update +devhelm delete +devhelm test # where applicable (monitors, webhooks, alert-channels) +``` + +Flags are driven by the OpenAPI spec. To see current flags for any +resource: `devhelm create --help`. To see the full field +schema programmatically: `devhelm skills schema `. + +**Defaults to apply unless the user specified otherwise:** + +- HTTP monitors → `frequency=60`, `regions=us-east`, `method=GET`, + `follow_redirects=true`, `assertions=[{type: "STATUS_CODE", operator: "EQUALS", target: "200"}]`. +- Heartbeat monitors → `grace_period=300`. +- Notification policies — create a sensible default that fans out to any + alert channels the user has already configured (list them first). + +After creation, always run the resource's `test` command (if it exists) +and report the result. + +### Mode: MaC-YAML (one or many, declarative) + +Read `@references/mac-yaml.md` for the full recipe. Core loop: + +1. Read existing `devhelm.yml` — never overwrite fields the user already + set. +2. Show the user the **diff** you're about to write (use the Read / + Write tools, not a shell `sed`). +3. Run `devhelm plan` and include its output verbatim in your reply. +4. Wait for explicit confirmation unless user already said "go ahead". +5. Run `devhelm deploy` and summarise the changeset. + +### Mode: MaC-Terraform + +Read `@references/mac-terraform.md` for the full recipe. Core loop: + +1. Add the HCL block to the most topical `*.tf` file, or a new + `monitors.tf` if the repo is unstructured. +2. Run `terraform plan` and include its output. +3. Wait for confirmation. +4. Run `terraform apply`. + +### Verify (all modes) + +- Monitors: `devhelm monitors test ` immediately after create, + then `devhelm monitors get ` a few seconds later to show status. 
+- Alert channels: `devhelm alert-channels test `. +- Webhooks: `devhelm webhooks test `. +- Everything else: `devhelm get `. + +Report the resource name, ID, dashboard URL, and verification result. + +--- + +## Step 4 — Offer the next logical step + +After creating a monitor, suggest **one** next step, tailored to +context: + +- No alert channels exist → *"Want me to wire up a Slack/email alert + channel so failures notify you?"* +- 1 monitor, no group → skip; grouping makes sense at ≥3. +- ≥3 monitors, no `devhelm.yml` → *"Want me to bootstrap `devhelm.yml` + so these live as code?"* +- ≥3 monitors, no status page → *"Want me to publish a public status + page showing these?"* (hand off to `devhelm-communicate`). + +One suggestion, not a menu. The user can ignore it without guilt. + +--- + +## Safety rails (non-negotiable) + +1. **Never edit `devhelm.yml` without showing the diff first.** Use the + Read+Write tools; do not `sed` config files in the shell. +2. **Never create more than 5 resources in one turn** without explicit + per-resource confirmation. 6+ = ask. +3. **Never delete** an existing resource unless the user explicitly used + the word "delete" or "remove". "Update" and "replace" are not delete + signals. +4. **Never run `devhelm deploy` without `devhelm plan` immediately + before**, and never omit the plan output from your reply. +5. **Reject localhost / private IP / link-local targets** on HTTP + monitors with a one-line explanation: *"DevHelm probes run from + public datacenters and can't reach 127.0.0.1 / 10.0.0.0/8 / + 172.16.0.0/12 / 192.168.0.0/16 / 169.254.0.0/16. Use a heartbeat monitor instead if + you want to verify an internal service from its own host."* Then + offer to create the heartbeat. +6. **Never expose the full value of an existing API key or secret.** + They're only visible once at creation time; past that, only the + last-4 characters and ID are returned by the API. 
+ +--- + +## Error handling + +The CLI uses structured exit codes (see `DevhelmError` taxonomy): + +| Exit | Meaning | Your response | +|---|---|---| +| 0 | Success | Report. | +| 4 | Local validation | The user's input failed a Zod schema. Show the field + fix. Don't retry blindly. | +| 10 | Plan has changes | Expected during `plan`. Show diff. | +| 11 | API error (4xx/5xx) | Include the API error's `code`, `message`, and `requestId` verbatim in your reply. | +| 12 | Transport error | Network/TLS/DNS. Retry once; if it fails again, ask the user to check connectivity. | +| 13 | Partial failure (deploy) | Some resources applied, some didn't. Report both sides. | + +Any non-zero exit → stop and report. Never silently retry `deploy` or +`create`. + +--- + +## References + +- `@references/monitors.md` +- `@references/alert-channels.md` +- `@references/notification-policies.md` +- `@references/resource-groups.md` +- `@references/dependencies.md` +- `@references/secrets.md` +- `@references/tags.md` +- `@references/webhooks.md` +- `@references/environments.md` +- `@references/mac-yaml.md` +- `@references/mac-terraform.md` + +For absolute-latest field schemas (useful when the CLI is a stale +version): `devhelm skills schema `. diff --git a/skills/devhelm-configure/references/_generated/alert-channels.fields.md b/skills/devhelm-configure/references/_generated/alert-channels.fields.md new file mode 100644 index 0000000..e49e8c9 --- /dev/null +++ b/skills/devhelm-configure/references/_generated/alert-channels.fields.md @@ -0,0 +1,33 @@ +# alert-channels — field reference + +> Auto-generated from the DevHelm OpenAPI spec. Do not edit by hand. +> Regenerate with `node scripts/generate-skill-references.mjs`. 
+ +## `CreateAlertChannelRequest` + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `name` | string | ✓ | | Human-readable name for this alert channel | +| `config` | any | ✓ | | | + +## `UpdateAlertChannelRequest` + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `name` | string | ✓ | | New channel name (full replacement, not partial update) | +| `config` | any | ✓ | | | + +## `AlertChannelDto` (response shape) + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `id` | string (uuid) | ✓ | | Unique alert channel identifier | +| `name` | string | ✓ | | Human-readable channel name | +| `channelType` | "email" \| "webhook" \| "slack" \| "pagerduty" \| "opsgenie" \| "teams" \| "discord" | ✓ | | Channel integration type (e.g. SLACK, PAGERDUTY, EMAIL) | +| `displayConfig` | any | | ✓ | | +| `createdAt` | string (date-time) | ✓ | | Timestamp when the channel was created | +| `updatedAt` | string (date-time) | ✓ | | Timestamp when the channel was last updated | +| `configHash` | string | | ✓ | SHA-256 hash of the channel config; use for change detection | +| `lastDeliveryAt` | string (date-time) | | ✓ | Timestamp of the most recent delivery attempt | +| `lastDeliveryStatus` | string | | ✓ | Outcome of the most recent delivery (SUCCESS, FAILED, etc.) | + diff --git a/skills/devhelm-configure/references/_generated/dependencies.fields.md b/skills/devhelm-configure/references/_generated/dependencies.fields.md new file mode 100644 index 0000000..0f9a726 --- /dev/null +++ b/skills/devhelm-configure/references/_generated/dependencies.fields.md @@ -0,0 +1,7 @@ +# dependencies — field reference + +> Auto-generated from the DevHelm OpenAPI spec. Do not edit by hand. +> Regenerate with `node scripts/generate-skill-references.mjs`. + +> No schemas found for this resource in the current spec. 
+ diff --git a/skills/devhelm-configure/references/_generated/environments.fields.md b/skills/devhelm-configure/references/_generated/environments.fields.md new file mode 100644 index 0000000..0cb3330 --- /dev/null +++ b/skills/devhelm-configure/references/_generated/environments.fields.md @@ -0,0 +1,36 @@ +# environments — field reference + +> Auto-generated from the DevHelm OpenAPI spec. Do not edit by hand. +> Regenerate with `node scripts/generate-skill-references.mjs`. + +## `CreateEnvironmentRequest` + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `name` | string | ✓ | | Human-readable environment name | +| `slug` | string | ✓ | | URL-safe identifier (lowercase alphanumeric, hyphens, underscores) | +| `variables` | Map | | ✓ | Initial key-value variable pairs for this environment | +| `isDefault` | boolean | | ✓ | Whether this is the default environment for new monitors (default: false) | + +## `UpdateEnvironmentRequest` + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `name` | string | | ✓ | New environment name; null preserves current | +| `variables` | Map | | ✓ | Replace all variables; null preserves current | +| `isDefault` | boolean | | ✓ | Whether this is the default environment; null preserves current | + +## `EnvironmentDto` (response shape) + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `id` | string (uuid) | ✓ | | Unique environment identifier | +| `orgId` | integer (int32) | ✓ | | Organization this environment belongs to | +| `name` | string | ✓ | | Human-readable environment name | +| `slug` | string | ✓ | | URL-safe identifier | +| `variables` | Map | ✓ | | Key-value variable pairs available for interpolation | +| `createdAt` | string (date-time) | ✓ | | Timestamp when the environment was created | +| `updatedAt` | string (date-time) | ✓ | | Timestamp when the environment was last updated | +| `monitorCount` | integer (int32) | ✓ | | Number of 
monitors using this environment | +| `isDefault` | boolean | ✓ | | Whether this is the default environment for new monitors | + diff --git a/skills/devhelm-configure/references/_generated/monitors.fields.md b/skills/devhelm-configure/references/_generated/monitors.fields.md new file mode 100644 index 0000000..41daeb5 --- /dev/null +++ b/skills/devhelm-configure/references/_generated/monitors.fields.md @@ -0,0 +1,65 @@ +# monitors — field reference + +> Auto-generated from the DevHelm OpenAPI spec. Do not edit by hand. +> Regenerate with `node scripts/generate-skill-references.mjs`. + +## `CreateMonitorRequest` + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `name` | string | ✓ | | Human-readable name for this monitor | +| `type` | "HTTP" \| "DNS" \| "MCP_SERVER" \| "TCP" \| "ICMP" \| "HEARTBEAT" | ✓ | | Monitor protocol type | +| `config` | any | ✓ | | | +| `frequencySeconds` | integer (int32) | | ✓ | Check frequency in seconds (30–86400); null defaults to plan minimum (60s on most paid plans) | +| `enabled` | boolean | | ✓ | Whether the monitor is active (default: true) | +| `regions` | string[] | | ✓ | Probe regions to run checks from, e.g. 
us-east, eu-west | +| `managedBy` | "DASHBOARD" \| "CLI" \| "TERRAFORM" | ✓ | | Who manages this monitor: DASHBOARD or CLI | +| `environmentId` | string (uuid) | | ✓ | Environment to associate with this monitor | +| `assertions` | CreateAssertionRequest[] | | ✓ | Assertions to evaluate against each check result | +| `auth` | any | | ✓ | | +| `incidentPolicy` | any | | ✓ | | +| `alertChannelIds` | string (uuid)[] | | ✓ | Alert channels to notify when this monitor triggers | +| `tags` | any | | ✓ | | + +## `UpdateMonitorRequest` + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `name` | string | | ✓ | New monitor name; null preserves current | +| `config` | any | | ✓ | | +| `frequencySeconds` | integer (int32) | | ✓ | New check frequency in seconds (30–86400); null preserves current | +| `enabled` | boolean | | ✓ | Enable or disable the monitor; null preserves current | +| `regions` | string[] | | ✓ | New probe regions; null preserves current | +| `managedBy` | "DASHBOARD" \| "CLI" \| "TERRAFORM" | | ✓ | New management source; null preserves current | +| `environmentId` | string (uuid) | | ✓ | New environment ID; null preserves current (use clearEnvironmentId to unset) | +| `clearEnvironmentId` | boolean | | ✓ | Set to true to remove the environment association | +| `assertions` | CreateAssertionRequest[] | | ✓ | Replace all assertions; null preserves current | +| `auth` | any | | ✓ | | +| `clearAuth` | boolean | | ✓ | Set to true to remove authentication | +| `incidentPolicy` | any | | ✓ | | +| `alertChannelIds` | string (uuid)[] | | ✓ | Replace alert channel list; null preserves current | +| `tags` | any | | ✓ | | + +## `MonitorDto` (response shape) + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `id` | string (uuid) | ✓ | | Unique monitor identifier | +| `organizationId` | integer (int32) | ✓ | | Organization this monitor belongs to | +| `name` | string | ✓ | | Human-readable name for this monitor | 
+| `type` | "HTTP" \| "DNS" \| "MCP_SERVER" \| "TCP" \| "ICMP" \| "HEARTBEAT" | ✓ | | | +| `config` | any | ✓ | | | +| `frequencySeconds` | integer (int32) | ✓ | | Check frequency in seconds (30–86400) | +| `enabled` | boolean | ✓ | | Whether the monitor is active | +| `regions` | string[] | ✓ | | Probe regions where checks are executed | +| `managedBy` | "DASHBOARD" \| "CLI" \| "TERRAFORM" | ✓ | | Management source: DASHBOARD or CLI | +| `createdAt` | string (date-time) | ✓ | | Timestamp when the monitor was created | +| `updatedAt` | string (date-time) | ✓ | | Timestamp when the monitor was last updated | +| `assertions` | MonitorAssertionDto[] | | ✓ | Assertions evaluated against each check result; null on list responses | +| `tags` | TagDto[] | | ✓ | Tags applied to this monitor | +| `pingUrl` | string | | ✓ | Heartbeat ping URL; populated for HEARTBEAT monitors only | +| `environment` | any | | ✓ | | +| `auth` | any | | ✓ | | +| `incidentPolicy` | any | | ✓ | | +| `alertChannelIds` | string (uuid)[] | | ✓ | Alert channel IDs linked to this monitor; populated on single-monitor responses | + diff --git a/skills/devhelm-configure/references/_generated/notification-policies.fields.md b/skills/devhelm-configure/references/_generated/notification-policies.fields.md new file mode 100644 index 0000000..7610a78 --- /dev/null +++ b/skills/devhelm-configure/references/_generated/notification-policies.fields.md @@ -0,0 +1,39 @@ +# notification-policies — field reference + +> Auto-generated from the DevHelm OpenAPI spec. Do not edit by hand. +> Regenerate with `node scripts/generate-skill-references.mjs`. 
+ +## `CreateNotificationPolicyRequest` + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `name` | string | ✓ | | Human-readable name for this policy | +| `matchRules` | MatchRule[] | | ✓ | Match rules to evaluate (all must pass; omit or empty for catch-all) | +| `escalation` | EscalationChain | ✓ | | | +| `enabled` | boolean | | ✓ | Whether this policy is enabled (default true) | +| `priority` | integer (int32) | | ✓ | Evaluation priority; higher value = evaluated first (default 0) | + +## `UpdateNotificationPolicyRequest` + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `name` | string | | ✓ | Human-readable name for this policy; null preserves current | +| `matchRules` | MatchRule[] | | ✓ | Match rules to evaluate (all must pass; omit or empty for catch-all) | +| `escalation` | any | | ✓ | | +| `enabled` | boolean | | ✓ | Whether this policy is enabled; null preserves current | +| `priority` | integer (int32) | | ✓ | Evaluation priority; higher value = evaluated first; null preserves current | + +## `NotificationPolicyDto` (response shape) + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `id` | string (uuid) | ✓ | | Unique notification policy identifier | +| `organizationId` | integer (int32) | ✓ | | Organization this policy belongs to | +| `name` | string | ✓ | | Human-readable name for this policy | +| `matchRules` | MatchRule[] | ✓ | | Match rules (all must pass; empty = catch-all) | +| `escalation` | EscalationChain | ✓ | | | +| `enabled` | boolean | ✓ | | Whether this policy is active | +| `priority` | integer (int32) | ✓ | | Evaluation order; higher value = evaluated first | +| `createdAt` | string (date-time) | ✓ | | Timestamp when the policy was created | +| `updatedAt` | string (date-time) | ✓ | | Timestamp when the policy was last updated | + diff --git a/skills/devhelm-configure/references/_generated/resource-groups.fields.md 
b/skills/devhelm-configure/references/_generated/resource-groups.fields.md new file mode 100644 index 0000000..75cd0d2 --- /dev/null +++ b/skills/devhelm-configure/references/_generated/resource-groups.fields.md @@ -0,0 +1,66 @@ +# resource-groups — field reference + +> Auto-generated from the DevHelm OpenAPI spec. Do not edit by hand. +> Regenerate with `node scripts/generate-skill-references.mjs`. + +## `CreateResourceGroupRequest` + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `name` | string | ✓ | | Human-readable name for this group | +| `description` | string | | ✓ | Optional description | +| `alertPolicyId` | string (uuid) | | ✓ | Optional notification policy to apply for this group | +| `defaultFrequency` | integer (int32) | | ✓ | Default check frequency in seconds applied to members (30–86400) | +| `defaultRegions` | string[] | | ✓ | Default regions applied to member monitors | +| `defaultRetryStrategy` | any | | ✓ | | +| `defaultAlertChannels` | string (uuid)[] | | ✓ | Default alert channel IDs applied to member monitors | +| `defaultEnvironmentId` | string (uuid) | | ✓ | Default environment ID applied to member monitors | +| `healthThresholdType` | "COUNT" \| "PERCENTAGE" | | ✓ | Health threshold type: COUNT or PERCENTAGE | +| `healthThresholdValue` | number | | ✓ | Health threshold value: count (0+) or percentage (0–100) | +| `suppressMemberAlerts` | boolean | | ✓ | Suppress member-level alert notifications when group manages alerting | +| `confirmationDelaySeconds` | integer (int32) | | ✓ | Confirmation delay in seconds before group incident creation (0–600) | +| `recoveryCooldownMinutes` | integer (int32) | | ✓ | Recovery cooldown in minutes after group incident resolves (0–60) | + +## `UpdateResourceGroupRequest` + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `name` | string | ✓ | | Human-readable name for this group | +| `description` | string | | ✓ | Optional description; null 
clears the existing value | +| `alertPolicyId` | string (uuid) | | ✓ | Optional notification policy to apply for this group; null clears the existing value | +| `defaultFrequency` | integer (int32) | | ✓ | Default check frequency in seconds for members (30–86400); null clears | +| `defaultRegions` | string[] | | ✓ | Default regions for member monitors; null clears | +| `defaultRetryStrategy` | any | | ✓ | | +| `defaultAlertChannels` | string (uuid)[] | | ✓ | Default alert channel IDs for member monitors; null clears | +| `defaultEnvironmentId` | string (uuid) | | ✓ | Default environment ID for member monitors; null clears | +| `healthThresholdType` | "COUNT" \| "PERCENTAGE" | | ✓ | Health threshold type: COUNT or PERCENTAGE; null disables threshold | +| `healthThresholdValue` | number | | ✓ | Health threshold value; null disables threshold | +| `suppressMemberAlerts` | boolean | | ✓ | Suppress member-level alert notifications; null preserves current value | +| `confirmationDelaySeconds` | integer (int32) | | ✓ | Confirmation delay in seconds; null clears | +| `recoveryCooldownMinutes` | integer (int32) | | ✓ | Recovery cooldown in minutes; null clears | + +## `ResourceGroupDto` (response shape) + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `id` | string (uuid) | ✓ | | Unique resource group identifier | +| `organizationId` | integer (int32) | ✓ | | Organization this group belongs to | +| `name` | string | ✓ | | Human-readable group name | +| `slug` | string | ✓ | | URL-safe group identifier | +| `description` | string | | ✓ | Optional group description | +| `alertPolicyId` | string (uuid) | | ✓ | Notification policy applied to this group | +| `defaultFrequency` | integer (int32) | | ✓ | Default check frequency in seconds for member monitors | +| `defaultRegions` | string[] | | ✓ | Default regions for member monitors | +| `defaultRetryStrategy` | any | | ✓ | | +| `defaultAlertChannels` | string (uuid)[] | | ✓ | Default alert 
channel IDs for member monitors | +| `defaultEnvironmentId` | string (uuid) | | ✓ | Default environment ID for member monitors | +| `healthThresholdType` | "COUNT" \| "PERCENTAGE" | | ✓ | Health threshold type: COUNT or PERCENTAGE | +| `healthThresholdValue` | number | | ✓ | Health threshold value | +| `suppressMemberAlerts` | boolean | ✓ | | When true, member-level incidents skip notification dispatch; only group alerts fire | +| `confirmationDelaySeconds` | integer (int32) | | ✓ | Seconds to wait after health threshold breach before creating group incident | +| `recoveryCooldownMinutes` | integer (int32) | | ✓ | Cooldown minutes after group incident resolves before a new one can open | +| `health` | ResourceGroupHealthDto | ✓ | | | +| `members` | ResourceGroupMemberDto[] | | ✓ | Member list with individual statuses; populated on detail GET only | +| `createdAt` | string (date-time) | ✓ | | Timestamp when the group was created | +| `updatedAt` | string (date-time) | ✓ | | Timestamp when the group was last updated | + diff --git a/skills/devhelm-configure/references/_generated/secrets.fields.md b/skills/devhelm-configure/references/_generated/secrets.fields.md new file mode 100644 index 0000000..185332e --- /dev/null +++ b/skills/devhelm-configure/references/_generated/secrets.fields.md @@ -0,0 +1,30 @@ +# secrets — field reference + +> Auto-generated from the DevHelm OpenAPI spec. Do not edit by hand. +> Regenerate with `node scripts/generate-skill-references.mjs`. 
+ +## `CreateSecretRequest` + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `key` | string | ✓ | | Unique secret key within the workspace (max 255 chars) | +| `value` | string | ✓ | | Secret value, stored encrypted (max 32KB) | + +## `UpdateSecretRequest` + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `value` | string | ✓ | | New secret value, stored encrypted (max 32KB) | + +## `SecretDto` (response shape) + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `id` | string (uuid) | ✓ | | Unique secret identifier | +| `key` | string | ✓ | | Secret key name, unique within the workspace | +| `dekVersion` | integer (int32) | ✓ | | DEK version at the time of last encryption | +| `valueHash` | string | ✓ | | SHA-256 hex digest of the current plaintext; use for change detection | +| `createdAt` | string (date-time) | ✓ | | Timestamp when the secret was created | +| `updatedAt` | string (date-time) | ✓ | | Timestamp when the secret was last updated | +| `usedByMonitors` | MonitorReference[] | | ✓ | Monitors that reference this secret; null on create/update responses | + diff --git a/skills/devhelm-configure/references/_generated/tags.fields.md b/skills/devhelm-configure/references/_generated/tags.fields.md new file mode 100644 index 0000000..be5ea9e --- /dev/null +++ b/skills/devhelm-configure/references/_generated/tags.fields.md @@ -0,0 +1,30 @@ +# tags — field reference + +> Auto-generated from the DevHelm OpenAPI spec. Do not edit by hand. +> Regenerate with `node scripts/generate-skill-references.mjs`. 
+ +## `CreateTagRequest` + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `name` | string | ✓ | | Tag name, unique within the org | +| `color` | string | | ✓ | Hex color code (defaults to #6B7280 if omitted) | + +## `UpdateTagRequest` + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `name` | string | | ✓ | New tag name | +| `color` | string | | ✓ | New hex color code | + +## `TagDto` (response shape) + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `id` | string (uuid) | ✓ | | Unique tag identifier | +| `organizationId` | integer (int32) | ✓ | | Organization this tag belongs to | +| `name` | string | ✓ | | Tag name, unique within the org | +| `color` | string | ✓ | | Hex color code for display (e.g. #6B7280) | +| `createdAt` | string (date-time) | ✓ | | Timestamp when the tag was created | +| `updatedAt` | string (date-time) | ✓ | | Timestamp when the tag was last updated | + diff --git a/skills/devhelm-configure/references/_generated/webhooks.fields.md b/skills/devhelm-configure/references/_generated/webhooks.fields.md new file mode 100644 index 0000000..489d662 --- /dev/null +++ b/skills/devhelm-configure/references/_generated/webhooks.fields.md @@ -0,0 +1,7 @@ +# webhooks — field reference + +> Auto-generated from the DevHelm OpenAPI spec. Do not edit by hand. +> Regenerate with `node scripts/generate-skill-references.mjs`. + +> No schemas found for this resource in the current spec. + diff --git a/skills/devhelm-configure/references/alert-channels.md b/skills/devhelm-configure/references/alert-channels.md new file mode 100644 index 0000000..28fcab1 --- /dev/null +++ b/skills/devhelm-configure/references/alert-channels.md @@ -0,0 +1,124 @@ +# Alert Channels + +An **alert channel** is *where* notifications go. It's decoupled from +*when* (that's notification-policies) and *what* (that's monitors + +incidents). 
Ship the channel first, wire it up via a policy afterwards. + +## Types + +| Type | Needs | Use when | +|---|---|---| +| `SLACK` | Incoming webhook URL | Team-channel alerting in Slack | +| `DISCORD` | Incoming webhook URL | Gaming / community teams | +| `EMAIL` | Recipient email address | Direct one-off, or fallback | +| `PAGERDUTY` | PagerDuty integration key | On-call escalation | +| `OPSGENIE` | OpsGenie API key | On-call escalation (alternative) | +| `WEBHOOK` | URL + optional secret | Custom routing, SIEM, bespoke pipelines | +| `MS_TEAMS` | Teams incoming webhook | Microsoft-shop teams | +| `TELEGRAM` | Bot token + chat ID | Personal / small team | + +## Create + +```bash +# Slack +devhelm alert-channels create \ + --name=slack-platform \ + --type=SLACK \ + --webhook-url=https://hooks.slack.com/services/T000/B000/XXXX + +# Email +devhelm alert-channels create \ + --name=oncall-email \ + --type=EMAIL \ + --recipient=oncall@example.com + +# PagerDuty +devhelm alert-channels create \ + --name=pd-prod \ + --type=PAGERDUTY \ + --integration-key= + +# Generic webhook with HMAC +devhelm alert-channels create \ + --name=siem \ + --type=WEBHOOK \ + --url=https://siem.example.com/ingest/devhelm \ + --secret= +``` + +## Verify + +Always run this immediately after creating — sends a real test +notification so the user can confirm the channel works end-to-end: + +```bash +devhelm alert-channels test +``` + +If the test fails, the error response tells you which step broke +(invalid webhook URL, auth rejected, etc.). Report it verbatim. + +## Wire it up + +A channel does nothing until a **notification policy** references it. +The simplest flow: + +```bash +devhelm notification-policies create \ + --name=default \ + --alert-channels= \ + --trigger-count=2 \ + --applies-to-all=true +``` + +See `@references/notification-policies.md` for the full model. 
+ +## Credentials go in secrets + +Don't paste raw webhook URLs or integration keys into `devhelm.yml` or +Terraform — store them as secrets and reference: + +```yaml +alert_channels: + - name: slack-platform + type: SLACK + webhook_url: ${{secrets.SLACK_PLATFORM_WEBHOOK}} +``` + +```hcl +resource "devhelm_alert_channel" "slack_platform" { + name = "slack-platform" + type = "SLACK" + webhook_url = var.slack_platform_webhook # set via Terraform variables +} +``` + +## Common gotchas + +- **Slack webhook revoked** → Slack-side. The test command surfaces a + 401; user must regen in Slack. +- **PagerDuty "routing_key" vs "integration_key"** — DevHelm uses the + Events API v2 integration key. If the user pastes a service-key + that starts with `P`, that's the wrong one. +- **Email rate-limits** — bulk alerting via EMAIL can hit provider + limits; for high-volume paths, prefer SLACK or PAGERDUTY. +- **Webhook HMAC** — the optional `secret` becomes the shared secret + for the HMAC-SHA256 signature sent in `X-DevHelm-Signature`. Users + verifying on their side should compare bytes, not strings, and use + constant-time comparison. + +## Delete behavior + +Deleting a channel that's referenced by a policy returns a 409 with +the referencing policies listed. Offer to unlink first: + +```bash +devhelm alert-channels delete # may fail with 409 +# Inspect references: +devhelm notification-policies list --alert-channel= +``` + +## Complete field reference + +`@_generated/alert-channels.fields.md` — regenerated from OpenAPI. +Runtime pull: `devhelm skills schema alert-channels`. diff --git a/skills/devhelm-configure/references/dependencies.md b/skills/devhelm-configure/references/dependencies.md new file mode 100644 index 0000000..88283d2 --- /dev/null +++ b/skills/devhelm-configure/references/dependencies.md @@ -0,0 +1,81 @@ +# Dependencies + +A **dependency** tracks a third-party service's public status feed and +mirrors it into DevHelm. 
Use it when: + +- Your product depends on Slack, Stripe, GitHub, AWS, Heroku, etc. and + you want outages there to show up in your own monitoring + status + page without you building a scraper. +- You want a single uptime board that says *"our service is UP, but + Stripe's Payments is DOWN, here's what users are seeing"*. + +Dependencies are distinct from **monitors** — they don't probe +anything from DevHelm's end; they consume the vendor's published feed +(public status page RSS, JSON API, or vendor-specific format). + +## Create + +```bash +devhelm dependencies create \ + --name="Stripe Payments" \ + --provider=stripe \ + --component=payments +``` + +The `provider` is DevHelm's internal slug for the vendor (see +`@_generated/dependencies.fields.md` for the full list — it's large +and grows with adapters). The `component` narrows to a specific +sub-service within that vendor's status page. + +## List / get / delete + +```bash +devhelm dependencies list +devhelm dependencies get +devhelm dependencies delete +``` + +## Attach to a status page + +Dependencies show up as **read-only components** on the user's status +page — customers see "we depend on X, and X is currently Y". + +```bash +devhelm status-pages components create \ + --dependency-id= \ + --name="Stripe Payments" +``` + +See `devhelm-communicate` skill → `@references/components.md` for the +full attach flow. + +## Alerting on a dependency failure + +Dependencies can drive notification policies just like monitors — if +the vendor goes red, you get paged. Scope a policy with the +dependency's tags: + +```bash +devhelm notification-policies create \ + --name=third-party \ + --tags=type=dependency \ + --alert-channels=slack-platform +``` + +## Common gotchas + +- **Feed latency.** Vendor status feeds update every 30–120s in our + experience; don't expect sub-minute resolution. 
+- **Unsupported vendor.** If the user names a vendor we don't have an + adapter for, tell them so and ask if we can add it (support-ticket + path) — don't try to scrape a random page. +- **Component naming drift.** Vendors rename their components + occasionally (e.g. Slack merged "Login" into "Workspace + administration"). The dependency record will start returning + `UNKNOWN` status; `devhelm dependencies get <id>` surfaces it. + Re-create with the new component name. + +## Complete field reference + +`@_generated/dependencies.fields.md`. Runtime pull: +`devhelm skills schema dependencies`. diff --git a/skills/devhelm-configure/references/environments.md b/skills/devhelm-configure/references/environments.md new file mode 100644 index 0000000..f6c0c38 --- /dev/null +++ b/skills/devhelm-configure/references/environments.md @@ -0,0 +1,108 @@ +# Environments + +An **environment** is a label that scopes DevHelm resources by +logical deployment tier — typically `production`, `staging`, `dev`. + +Environments are not separate workspaces. They share one tenancy, with +a scoping label that: + +- Filters dashboard views (`Environment: staging`). +- Scopes notification policies (a `production` policy won't fire for + `staging` monitors). +- Gates `devhelm deploy` so you can target one at a time. + +If the user wants **hard isolation** (different API keys, different +billing, different team roster), they should use multiple workspaces. +Environments are for one team managing multiple tiers. 
+ +## Create + +```bash +devhelm environments create \ + --name="Production" \ + --slug=production \ + --description="Customer-facing" + +devhelm environments create --name=Staging --slug=staging +``` + +## List / select + +```bash +devhelm environments list +``` + +Active selection for CLI / MaC operations: + +```bash +# Per-command flag +devhelm monitors list --environment=staging + +# Per-session env var +export DEVHELM_ENVIRONMENT=staging +devhelm monitors list + +# Persistent, per-context (stored in ~/.devhelm/contexts.json) +devhelm environments use staging +``` + +## Assign resources + +```bash +devhelm monitors create --environment=production ... +devhelm monitors update <id> --environment=production +``` + +YAML: + +```yaml +# devhelm.yml — single-environment file +environment: production + +monitors: + - name: api + ... +``` + +Or mixed (rarer): + +```yaml +monitors: + - name: api-prod + environment: production + ... + - name: api-staging + environment: staging + ... +``` + +The top-level `environment:` applies as a default; a per-resource +`environment:` overrides it. + +## Delete + +```bash +devhelm environments delete <slug> +``` + +Fails with 409 if any resources reference it — response lists them. +Move them first (`--environment=<other-slug>`, or clear the environment), then +retry. + +## Common gotchas + +- **Default environment.** New orgs come with a single `production` + environment. The user doesn't have to create one; they can just + start. +- **`devhelm deploy` scope.** Without `--environment`, deploy operates + on the currently-selected env. Be explicit in CI: + `devhelm deploy --environment=production`. +- **Cross-environment references.** A notification policy scoped to + `production` won't cover `staging` monitors. If the user wants one + policy for both, either omit the environment scope on the policy, + or create two scoped policies. + +## Complete field reference + +`@_generated/environments.fields.md`. Runtime pull: +`devhelm skills schema environments`. 
diff --git a/skills/devhelm-configure/references/mac-terraform.md b/skills/devhelm-configure/references/mac-terraform.md new file mode 100644 index 0000000..5326503 --- /dev/null +++ b/skills/devhelm-configure/references/mac-terraform.md @@ -0,0 +1,156 @@ +# Monitoring as Code — Terraform + +DevHelm has an official Terraform provider +(`registry.terraform.io/devhelmhq/devhelm`). Use this mode when the +user's repo already uses Terraform for infrastructure and they want +monitoring managed in the same state. + +## Provider setup + +```hcl +terraform { + required_providers { + devhelm = { + source = "devhelmhq/devhelm" + version = "~> 0.3" + } + } +} + +provider "devhelm" { + # Token resolution order: + # 1. provider config `api_token = var.devhelm_token` + # 2. env var DEVHELM_API_TOKEN + # 3. ~/.devhelm/contexts.json (if running locally) +} +``` + +In CI, always pass the token via env or TF variable — never hard-code. + +## Supported resource types + +As of the current provider version: + +| Terraform resource | DevHelm resource | +|---|---| +| `devhelm_monitor` | Monitor (HTTP/HEARTBEAT/TCP/DNS/MCP/RSS_STATUS) | +| `devhelm_alert_channel` | Alert channel | +| `devhelm_notification_policy` | Notification policy | +| `devhelm_resource_group` | Resource group | +| `devhelm_dependency` | Dependency | +| `devhelm_secret` | Secret (write-only — can't read value back) | +| `devhelm_webhook` | Outbound webhook | +| `devhelm_environment` | Environment | +| `devhelm_status_page` | Public status page | +| `devhelm_status_page_component` | Status page component | + +Check the provider's registry page for the canonical list if the user +asks about a resource not here. 
+ +## Minimal example + +```hcl +resource "devhelm_alert_channel" "slack" { + name = "slack-platform" + type = "SLACK" + webhook_url = var.slack_webhook +} + +resource "devhelm_notification_policy" "default" { + name = "default" + applies_to_all = true + trigger_count = 2 + alert_channel_ids = [devhelm_alert_channel.slack.id] +} + +resource "devhelm_monitor" "api_prod" { + name = "api-prod" + type = "HTTP" + url = "https://api.example.com/health" + frequency = 60 + regions = ["us-east", "eu-west"] + tags = { env = "prod", team = "platform" } + + assertion { + type = "STATUS_CODE" + operator = "EQUALS" + target = "200" + } + assertion { + type = "RESPONSE_TIME" + operator = "LESS_THAN" + target = "500" + } +} +``` + +## The canonical flow + +```bash +terraform plan +terraform apply +``` + +Same pattern as YAML mode, just with Terraform's tooling. + +## Secrets in Terraform + +Two options: + +1. **DevHelm secrets** (recommended): create a `devhelm_secret` + once (manually or via a separate workspace), then reference via + `${{secrets.NAME}}` in string fields. The provider doesn't need + the value. +2. **Terraform variables + providers like vault**: pass the value + directly as a TF variable (`sensitive = true`). The value ends up + in Terraform state — encrypt your state backend. + +Prefer option 1 for anything reused; option 2 for one-off setup +values (Slack webhook URL). + +## Importing existing resources + +```bash +terraform import devhelm_monitor.api_prod +``` + +Every DevHelm resource exposes a stable `id`; you can find it via +`devhelm list`. + +For wholesale import of a pre-existing configuration: + +```bash +devhelm init --from-platform --format=terraform > devhelm.tf +``` + +## State considerations + +DevHelm state is **server-authoritative**. If a user edits a resource +via the dashboard, `terraform plan` sees drift on the next run and +proposes to revert. Either: + +- Tell the user to edit in Terraform only (preferred for code-managed + resources). 
+- Use `ignore_changes` per-field if a specific value must be + dashboard-managed (e.g. `tags = { ... last_modified_by = ... }`). + +## Gotchas + +- **Resource renames** trigger destroy-then-create by default. Use + `terraform state mv <old-address> <new-address>` before the rename to preserve the + underlying resource. +- **Assertion blocks are ordered**; reordering them causes apparent drift. + Keep them in the same order between runs. +- **`devhelm_secret.value` is write-only**; provider can't read it + back on refresh. If rotation is needed, explicitly set a new + `value` in the TF config. +- **Provider version pinning** matters — new spec fields land in new + provider versions. When the user regenerates from `devhelm init`, + the output targets the currently-installed provider; pin it. + +## Reference + +- Full provider docs: + `https://registry.terraform.io/providers/devhelmhq/devhelm/latest/docs` +- Per-resource field lists: `@references/<resource>.md` in this + directory (YAML-shaped but the field names match Terraform 1:1). diff --git a/skills/devhelm-configure/references/mac-yaml.md b/skills/devhelm-configure/references/mac-yaml.md new file mode 100644 index 0000000..461d79a --- /dev/null +++ b/skills/devhelm-configure/references/mac-yaml.md @@ -0,0 +1,177 @@ +# Monitoring as Code — YAML + +DevHelm's declarative surface. One or more `devhelm.yml` files +describe the desired state; `devhelm plan` shows the diff against +the live config; `devhelm deploy` applies it. + +Use this mode when: + +- The repo already has `devhelm.yml` (or `devhelm.yaml`). +- The user is creating 3+ resources in one turn. +- The user said anything that implies reviewability: "let's put this + in code", "track in git", "review via PR", "CI should deploy". + +## File layout conventions + +| Pattern | Use | +|---|---| +| One file `devhelm.yml` at repo root | Small setups (<20 resources). | +| One file per resource type (`monitors.yml`, `alert-channels.yml`) | Mid-size (20–200). Pass with `-f`. 
| +| Directory `devhelm/` with one file per service | Large / per-team ownership (CODEOWNERS-friendly). | + +`devhelm plan`/`deploy` auto-discover `devhelm.yml` at root. For other +layouts, pass explicit files: + +```bash +devhelm plan -f devhelm/monitors/api.yml -f devhelm/alert-channels.yml +devhelm plan -f 'devhelm/**/*.yml' # glob supported +``` + +## Minimal template + +```yaml +# devhelm.yml +version: "1" + +alert_channels: + - name: slack-platform + type: SLACK + webhook_url: ${{secrets.SLACK_PLATFORM_WEBHOOK}} + +notification_policies: + - name: default + applies_to_all: true + trigger_count: 2 + alert_channels: [slack-platform] + +monitors: + - name: api-prod + type: HTTP + url: https://api.example.com/health + frequency: 60 + regions: [us-east, eu-west] + tags: { env: prod } +``` + +Resources reference each other **by name**. Name is immutable (used as +the idempotency key) — renaming requires a `moved` block (see below). + +## The canonical flow + +```bash +devhelm plan # show diff (exit codes: see below) +devhelm deploy # apply +``` + +Always `plan` first. In CI, use `--detailed-exitcode` on plan: + +- 0 = no changes +- 2 = plan succeeded, changes pending +- any other non-zero code = error + +and drive `deploy` conditionally. + +## Renaming — the `moved` block + +If the user wants to rename `api-prod` → `api-production`, add: + +```yaml +moved: + - from: monitors.api-prod + to: monitors.api-production +``` + +…and update the name in the resource block. Without this, DevHelm +destroys `api-prod` and creates a new `api-production` (losing +history). + +## Pruning + +By default, resources removed from YAML remain in the platform (to +avoid destructive surprises). To delete them: + +```bash +devhelm plan --prune # preview deletions +devhelm deploy --prune # apply +``` + +Pruning only applies to resources in namespaces the YAML explicitly manages. +Resources the YAML has never mentioned are left alone. 
+ +## State + +DevHelm keeps server-side state; the CLI maintains a local +`devhelm.lock` for optimistic concurrency. If another user `deploy`s +while you're editing, `plan` will show your changes against the newer +server state, and `deploy` will refuse if the lock is stale — pull, +re-plan, re-deploy. + +## Secrets in YAML + +Reference with `${{secrets.NAME}}`. Never paste raw values into YAML. +See `@references/secrets.md`. + +## Multi-environment + +Two patterns: + +```yaml +# Pattern A — one file per environment, select via CLI +# devhelm/production.yml +environment: production +monitors: [...] + +# devhelm/staging.yml +environment: staging +monitors: [...] +``` + +```bash +devhelm deploy -f devhelm/production.yml +devhelm deploy -f devhelm/staging.yml +``` + +```yaml +# Pattern B — one file with per-resource environment +monitors: + - name: api-prod + environment: production + ... + - name: api-staging + environment: staging + ... +``` + +Pattern A is cleaner for separate CI jobs; Pattern B is simpler for a +single repo managed by one team. + +## Bootstrapping from an existing platform + +```bash +devhelm init --from-platform # writes devhelm.yml +``` + +Pulls everything currently configured into a single YAML file. Use +this when the user has set things up in the dashboard and now wants +it in code. + +## Gotchas + +- **YAML anchors** are supported but often confuse Git-diff reviewers. + Prefer explicit duplication or templating in your build pipeline. +- **Comments** survive roundtripping (we use a format-preserving YAML + parser) but only on edits — resource additions emit flat keys. +- **Integer vs string fields.** `frequency: 60` (int) not + `frequency: "60"`. The Zod schema rejects quoted numerics with a + helpful message. +- **`-f` vs glob.** Shell quoting matters: + `-f 'devhelm/**/*.yml'` — single-quoted so the shell doesn't expand + prematurely; the CLI does the glob. + +## Reference + +- CLI flags: `devhelm plan --help`, `devhelm deploy --help`. 
+- State commands: `devhelm state list`, `devhelm state rm <address>`, + `devhelm state mv <from> <to>` for advanced rebinding. +- Field reference per resource: the `@references/<resource>.md` files + in this directory. diff --git a/skills/devhelm-configure/references/monitors.md b/skills/devhelm-configure/references/monitors.md new file mode 100644 index 0000000..590bf78 --- /dev/null +++ b/skills/devhelm-configure/references/monitors.md @@ -0,0 +1,166 @@ +# Monitors + +A **monitor** checks that something is up. DevHelm supports six types; +pick the narrowest one that answers the user's question. + +## Types + +| Type | What it checks | Use when | +|---|---|---| +| `HTTP` | HTTP(S) endpoint returns a 2xx (or matches custom assertions) | The service has a public HTTP endpoint | +| `HEARTBEAT` | A job pings DevHelm within its expected interval | Cron jobs, queue consumers, internal services | +| `TCP` | TCP port accepts a connection | Databases, non-HTTP services, SSL handshake | +| `DNS` | A DNS record resolves to the expected value(s) | DNS propagation, CDN origin | +| `MCP` | An MCP server responds to a probe | MCP tool health | +| `RSS_STATUS` | A third-party vendor's public status feed reports UP | Heroku, Slack, Stripe, etc. (use with `dependencies` for pSEO) | + +### How to choose + +- **Public URL you control** → HTTP (default). +- **Internal job, not reachable from outside** → HEARTBEAT. +- **Raw port** → TCP (rare; prefer HTTP). +- **You care only about DNS** → DNS. +- **You want to track a vendor's status** → usually `dependencies`, + not a monitor directly. Read `@references/dependencies.md`. 
+ +## Minimum viable monitor + +```bash +# HTTP (the 90% case) +devhelm monitors create \ + --name="api-prod" \ + --type=HTTP \ + --url=https://api.example.com/health + +# Heartbeat +devhelm monitors create \ + --name="nightly-backup" \ + --type=HEARTBEAT \ + --grace-period=3600 +``` + +Defaults you don't need to specify: + +- `frequency=60` (seconds between checks, 30–86400) +- `regions=["us-east"]` (one region; add more with `--regions`) +- `method=GET` +- `follow_redirects=true` +- `assertions=[{type: "STATUS_CODE", operator: "EQUALS", target: "200"}]` +- `enabled=true` + +## Assertions + +Assertions are the **definition of up**. Without custom assertions, +"HTTP 2xx" is the only check. Common additions: + +```bash +# "2xx AND body contains 'ok' AND responds within 500ms" +devhelm monitors create \ + --name=api-prod \ + --url=https://api.example.com/health \ + --assertion='{"type":"STATUS_CODE","operator":"EQUALS","target":"200"}' \ + --assertion='{"type":"RESPONSE_BODY","operator":"CONTAINS","target":"ok"}' \ + --assertion='{"type":"RESPONSE_TIME","operator":"LESS_THAN","target":"500"}' +``` + +Assertion types: `STATUS_CODE`, `RESPONSE_BODY`, `RESPONSE_TIME`, +`HEADER`, `JSON_BODY` (with JSONPath `property`), `SSL_CERTIFICATE` +(expiry check). + +Operators: `EQUALS`, `NOT_EQUALS`, `CONTAINS`, `NOT_CONTAINS`, +`GREATER_THAN`, `LESS_THAN`, `MATCHES_REGEX`, `IS_EMPTY`, `IS_NOT_EMPTY`. + +## Regions + +Run from multiple regions to catch origin-side vs. network-side issues. + +```bash +--regions us-east,eu-west,ap-southeast +``` + +A monitor is DOWN only if the incident policy says so — the default +requires 2+ regions to fail. See `@references/notification-policies.md`. + +## Authentication + +For monitors that need auth headers, tokens, or basic auth, **store +the credential as a secret first** (`@references/secrets.md`), then +reference it: + +```bash +devhelm secrets create --name=STRIPE_TEST_KEY --value=... 
+devhelm monitors create \ + --name=stripe-webhook \ + --url=https://webhooks.example.com/stripe \ + --headers='[{"name":"Authorization","value":"${{secrets.STRIPE_TEST_KEY}}"}]' +``` + +Never hard-code a credential in a CLI flag or YAML file — it ends up +in shell history / git. + +## Frequency vs. plan + +Frequency is gated by plan. Free allows ≥300s, Pro allows ≥60s, Scale +and Enterprise allow ≥30s. If the user asks for a frequency below their +plan's minimum (e.g. a 60s check on Free), the API returns a 403 with a plan-hint — surface it. + +## Common gotchas + +- **Localhost URLs are rejected.** DevHelm probes from public + datacenters. If the user needs to monitor something internal, use + HEARTBEAT. +- **Self-signed TLS** — HTTP monitors validate certs by default. Add + `--skip-tls-verify` only if the user explicitly asked (and warn them + this weakens the signal). +- **Redirects** — `follow_redirects=true` is the default; assertions + evaluate the *final* response. If the user wants to assert on the + redirect itself, pass `--no-follow-redirects`. +- **Cold-start 5xx** — serverless functions often return 502/503 on the + first cold check. Raise `trigger_count` in the incident policy + rather than loosening the monitor's assertions. + +## YAML form + +```yaml +# devhelm.yml +version: "1" +monitors: + - name: api-prod + type: HTTP + url: https://api.example.com/health + frequency: 60 + regions: [us-east, eu-west] + tags: { env: prod, team: platform } + assertions: + - { type: STATUS_CODE, operator: EQUALS, target: "200" } + - { type: RESPONSE_TIME, operator: LESS_THAN, target: "500" } + alert_channels: [slack-platform] +``` + +Deploy with `devhelm plan` → `devhelm deploy`. The YAML is idempotent +by name. 
+ +## Terraform form + +```hcl +resource "devhelm_monitor" "api_prod" { + name = "api-prod" + type = "HTTP" + url = "https://api.example.com/health" + frequency = 60 + regions = ["us-east", "eu-west"] + + assertion { + type = "STATUS_CODE" + operator = "EQUALS" + target = "200" + } +} +``` + +## Complete field reference + +Current spec: `@_generated/monitors.fields.md` (regenerated at CLI +build time from `docs/openapi/monitoring-api.json`). + +Runtime fresh-pull: `devhelm skills schema monitors`. diff --git a/skills/devhelm-configure/references/notification-policies.md b/skills/devhelm-configure/references/notification-policies.md new file mode 100644 index 0000000..8ba4cec --- /dev/null +++ b/skills/devhelm-configure/references/notification-policies.md @@ -0,0 +1,155 @@ +# Notification Policies + +A **notification policy** is *when and how* an alert channel gets +notified. It wraps: + +- **Which monitors** it applies to (all, or by tag, or by resource + group). +- **What triggers it** — number of consecutive failed checks + (`trigger_count`), regions required to fail. +- **What confirms it** — extra failures after trigger before state + flips to CONFIRMED (`confirm_count`). +- **What resolves it** — consecutive passes required to clear + (`resolve_count`). +- **Who to notify** — one or more alert channels. +- **Escalation** — optional second tier after a delay. + +The policy is the **source of truth for incident state transitions**. +The detection engine evaluates the policy against incoming check +results; transitions flow through the forensic model (see +`devhelm-investigate` skill → `@references/incidents.md`). + +## Minimum viable policy + +```bash +devhelm notification-policies create \ + --name=default \ + --applies-to-all=true \ + --trigger-count=2 \ + --resolve-count=2 \ + --alert-channels= +``` + +This alerts Slack after 2 consecutive failed checks anywhere, and +clears after 2 consecutive passes. 
+ +## Scoping + +Three mutually-exclusive targeting modes: + +| Flag | Meaning | +|---|---| +| `--applies-to-all=true` | All monitors in the workspace | +| `--tags=key=value,key2=value2` | Only monitors with those tags | +| `--resource-group=` | Only monitors in that group | + +If multiple policies match a monitor, **all of them** fire (additive, +not exclusive). Prefer one general default + targeted overrides over +many narrowly-scoped ones. + +## Region requirements + +By default, a failure from **one region** counts. To reduce +probe-side false positives, require multiple regions: + +```bash +--regions-required=2 +``` + +Monitors with only 1 region configured can't satisfy a +regions-required=2 policy; the API returns a 400 with the offending +monitor names. + +## Escalation + +Optional second-tier channel after a delay: + +```bash +devhelm notification-policies create \ + --name=prod-escalated \ + --tags=env=prod \ + --trigger-count=2 \ + --alert-channels=slack-platform \ + --escalation-channels=pd-prod \ + --escalation-after-seconds=900 # 15 minutes +``` + +If the incident is still CONFIRMED 15 minutes later, PagerDuty gets +paged. Escalation fires once per incident; re-occurring incidents +start the timer fresh. + +## Severity + +Policies can set a default severity for incidents they create: + +```bash +--severity=DOWN # "the service is down" +--severity=DEGRADED # "something's slow or partially broken" +``` + +Overridable per-monitor via the monitor's own `severity` field. 
+ +## Common patterns + +### Default: one catch-all + explicit prod overrides + +```yaml +notification_policies: + - name: catch-all + applies_to_all: true + trigger_count: 2 + alert_channels: [slack-noise] + + - name: prod + tags: { env: prod } + trigger_count: 2 + regions_required: 2 + alert_channels: [slack-platform] + escalation_channels: [pd-prod] + escalation_after_seconds: 600 +``` + +### Quiet hours — not supported as a single flag + +DevHelm doesn't have a "silence window" on policies. Implement +scheduled silencing by having the CI pipeline pause the monitor +(`devhelm monitors pause`) during known-maintenance windows, or +subscribe to the webhook and filter in your downstream. + +### "Only notify on confirmed, never on trigger" + +```bash +--notify-on-trigger=false \ +--notify-on-confirm=true +``` + +Trigger is the first-signal (one probe region, trigger_count failures). +Confirm is the second-signal (additional regions / failures). Many +teams want only confirmed notifications to cut noise; the default is +both. + +## Validation quirks + +- `trigger_count ≥ 1`, `confirm_count ≥ 0`, `resolve_count ≥ 1`. +- If `regions_required > 1`, the monitor must be configured with at + least that many regions. API pre-validates. +- Escalation channels must be a different set from the primary + channels (API rejects overlaps to prevent double-paging). + +## Testing + +There's no `test` verb on a policy directly — you test by: + +1. Pausing a monitor → unpausing with deliberately broken config + (e.g. URL=`https://httpstat.us/503`). +2. Observing the incident + alert-channel delivery. +3. Restoring the monitor. + +The `devhelm-investigate` skill's `forensics trace ` +command is the debugger for policy behavior — it shows exactly which +rule evaluation triggered, confirmed, or resolved an incident. + +## Complete field reference + +`@_generated/notification-policies.fields.md`. Runtime pull: +`devhelm skills schema notification-policies`. 
diff --git a/skills/devhelm-configure/references/resource-groups.md b/skills/devhelm-configure/references/resource-groups.md new file mode 100644 index 0000000..9684cd5 --- /dev/null +++ b/skills/devhelm-configure/references/resource-groups.md @@ -0,0 +1,97 @@ +# Resource Groups + +A **resource group** is a named bundle of monitors (and other +resources) that share an ownership boundary — a team, a service, a +product line. + +Use groups when: + +- You want a unified uptime number for a logical service ("Checkout + service = 3 monitors at 99.93%"). +- Multiple teams share a workspace and you want per-team dashboards + + policies. +- You're publishing a status page and want components grouped by + service rather than per-monitor. + +**Don't confuse with tags.** Tags are free-form key/value metadata +that any number of resources can share; they're for filtering. +Resource groups are single-parent containers with ownership +semantics. A monitor has one group (or none) but many tags. + +## Create + +```bash +devhelm resource-groups create \ + --name="Checkout" \ + --slug=checkout \ + --description="Payment + cart flow" +``` + +## Add monitors + +Two ways: + +```bash +# At monitor creation +devhelm monitors create --name=checkout-api --resource-group=checkout ... + +# After the fact +devhelm monitors update --resource-group=checkout +``` + +## Read / list + +```bash +devhelm resource-groups list +devhelm resource-groups get # includes member counts +devhelm monitors list --resource-group= +``` + +## YAML + +```yaml +resource_groups: + - name: Checkout + slug: checkout + description: Payment + cart flow + +monitors: + - name: checkout-api + type: HTTP + url: https://checkout.example.com/health + resource_group: checkout +``` + +## Use in status pages + +Status page components can be grouped by `resource_group` for +automatic sectioning — see `devhelm-communicate` skill → +`@references/components.md`. 
+ +## Use in notification policies + +Policies can scope to a group: + +```bash +devhelm notification-policies create \ + --name=checkout-oncall \ + --resource-group=checkout \ + --trigger-count=2 \ + --alert-channels=pd-checkout +``` + +## Common gotchas + +- **Deleting a group with members** — the API returns 409. Move + monitors to another group or to none (`--resource-group=null`) + first. +- **Slug stability** — slugs are used in dashboard URLs and pSEO + (if enabled). The API allows slug updates, but existing external links + break. Avoid changing a slug unless necessary. +- **One group per monitor** — if the user wants cross-team shared + monitors, use tags and multiple notification policies instead. + +## Complete field reference + +`@_generated/resource-groups.fields.md`. Runtime pull: +`devhelm skills schema resource-groups`. diff --git a/skills/devhelm-configure/references/secrets.md b/skills/devhelm-configure/references/secrets.md new file mode 100644 index 0000000..8b9c88c --- /dev/null +++ b/skills/devhelm-configure/references/secrets.md @@ -0,0 +1,90 @@ +# Secrets + +A **secret** is an encrypted string you can reference from monitor +configurations (HTTP headers, request bodies, DNS queries) and alert +channel configurations (webhook URLs, auth tokens) without pasting the +value into YAML / Terraform / shell history. + +Values are encrypted at rest with envelope encryption; the dashboard +and API never return the plaintext after creation. Think of them like +`GITHUB_TOKEN` — create-only, one-time display, rotate when leaked. + +## Create + +```bash +devhelm secrets create \ + --name=STRIPE_TEST_KEY \ + --value=sk_test_xxxxx +``` + +The value can also be piped from stdin — from an environment variable or +file, so the literal never ends up in shell history: + +```bash +printf '%s' "$STRIPE_TEST_KEY" | devhelm secrets create --name=STRIPE_TEST_KEY --value=- +``` + +Naming convention: **UPPER_SNAKE_CASE**, matching how they'll be +referenced. The name is part of the `${{secrets.NAME}}` template +syntax. 
+ +## Reference + +In YAML: + +```yaml +monitors: + - name: webhooks + type: HTTP + url: https://webhooks.example.com + headers: + - { name: Authorization, value: "Bearer ${{secrets.STRIPE_TEST_KEY}}" } +``` + +In the CLI on a monitor / alert-channel create: + +```bash +devhelm monitors create \ + --headers='[{"name":"Authorization","value":"Bearer ${{secrets.STRIPE_TEST_KEY}}"}]' +``` + +The `${{secrets.NAME}}` token is resolved at check-execution time, not +at create time. Typos in the name don't fail validation; they fail at +runtime (the monitor will emit a failing check with an assertion +failure reason of "missing secret"). Always verify a newly-referenced +secret with `devhelm monitors test `. + +## List / rotate / delete + +```bash +devhelm secrets list # names + IDs + lastUpdatedAt — no values +devhelm secrets get # same; no value +devhelm secrets update --value= +devhelm secrets delete # fails if referenced; unlink first +``` + +Rotation best practice: **create the new secret under a new name**, +update the monitors to reference it, then delete the old one. In-place +`--value` updates are supported but harder to audit. + +## Common gotchas + +- **Never paste a secret into a chat transcript or commit.** If the + user pastes one, do not echo it back; mask everything past the + first 6 characters. +- **Secret values don't appear in `devhelm plan`** — the plan shows + the template token (`${{secrets.NAME}}`) unchanged, which is + correct behavior. Users sometimes panic when they don't see the + secret "applied" in the plan; explain the template model. +- **Deletion protection.** Secrets referenced by any monitor / alert + channel can't be deleted. API returns 409 with the references. +- **Environments.** Secrets are workspace-scoped, not + environment-scoped. If you need prod/staging separation, either: + (a) use two workspaces, or (b) name them with an environment + prefix (`PROD_STRIPE_KEY`, `STAGING_STRIPE_KEY`) and reference + accordingly. 
+ +## Complete field reference + +`@_generated/secrets.fields.md`. Runtime pull: +`devhelm skills schema secrets`. diff --git a/skills/devhelm-configure/references/tags.md b/skills/devhelm-configure/references/tags.md new file mode 100644 index 0000000..a0a765a --- /dev/null +++ b/skills/devhelm-configure/references/tags.md @@ -0,0 +1,66 @@ +# Tags + +Tags are **free-form key/value metadata** you can attach to monitors, +alert channels, resource groups, and most other resource types. They +drive: + +- **Filtering**: `devhelm monitors list --tags=env=prod,team=platform`. +- **Notification-policy scoping**: a policy with + `--tags=env=prod` applies only to prod monitors. +- **Dashboard + status-page sectioning** in some views. + +Tags are not the same as **resource groups**. Groups have ownership +semantics (one-per-monitor, parent container). Tags are many-to-many +metadata. + +## Set tags + +Inline at creation: + +```bash +devhelm monitors create --name=api --tags='env=prod,team=platform' ... +``` + +Or as JSON: + +```bash +devhelm monitors update <id> --tags='{"env":"prod","team":"platform","service":"checkout"}' +``` + +YAML: + +```yaml +monitors: + - name: api + tags: + env: prod + team: platform + service: checkout +``` + +## Rules + +- Keys: lowercase alphanumeric + `_` / `-`, ≤32 chars. +- Values: any string ≤128 chars. No newlines. +- A resource can have up to 32 tags. The first 4 show in list views; + the rest are filter-only. +- Tag updates are **full replacements** of the tag map, not merges. + Send the complete map on every update; any tag you omit is removed. 
+ +## Common patterns + +| Pattern | Tags | +|---|---| +| Environment separation | `env=prod`, `env=staging`, `env=dev` | +| Team ownership | `team=platform`, `team=payments` | +| Service boundary | `service=checkout`, `service=auth` | +| Severity class | `tier=t0`, `tier=t1`, `tier=t2` | +| Rollout phase | `phase=ga`, `phase=beta`, `phase=preview` | + +Most customers converge on `env` + `team` + `service`. Keep it +disciplined — tag-explosion makes filters useless. + +## Complete field reference + +There's no standalone "tags" resource. Tags are a field on other +resources. See the field reference for the resource you're tagging. diff --git a/skills/devhelm-configure/references/webhooks.md b/skills/devhelm-configure/references/webhooks.md new file mode 100644 index 0000000..a00b889 --- /dev/null +++ b/skills/devhelm-configure/references/webhooks.md @@ -0,0 +1,104 @@ +# Webhooks + +An **outbound webhook** is how DevHelm pushes events to external +systems. It's distinct from `alert-channels` of type `WEBHOOK`: + +- `alert-channels` type `WEBHOOK` → fires on *alert policy matches* + (incident triggers, confirms, resolves). One event type, tailored to + pager-like consumers. +- `webhooks` (this resource) → subscribes to a **stream of platform + events** (monitor.created, monitor.updated, incident.updated, + check.failed, etc.). For SIEMs, data warehouses, custom pipelines. + +If the user says "send alerts to my Slack webhook", that's an +**alert channel**. If they say "mirror all our monitoring events into +BigQuery", that's a **webhook**. 
+ +## Create + +```bash +devhelm webhooks create \ + --url=https://events.example.com/devhelm \ + --events='monitor.*,incident.*' \ + --secret=<signing-secret> +``` + +### Events + +Event names follow `<resource>.<verb>`: + +- `monitor.created`, `monitor.updated`, `monitor.deleted`, + `monitor.paused`, `monitor.resumed` +- `check.failed`, `check.recovered` (high volume — don't subscribe + without a reason) +- `incident.triggered`, `incident.confirmed`, `incident.resolved`, + `incident.reopened`, `incident.update.created` +- `alert.delivered`, `alert.failed` +- `status_page.published`, `status_page.incident.created` + +Wildcards: `monitor.*` = all monitor verbs; `*` = everything. + +Full event catalog + payload shapes: +`@_generated/webhooks.fields.md`. + +## Signature verification + +Every webhook delivery includes: + +- `X-DevHelm-Delivery-Id` (UUID) +- `X-DevHelm-Event` (the event name) +- `X-DevHelm-Timestamp` (unix seconds) +- `X-DevHelm-Signature` (HMAC-SHA256 hex of + `.` using the configured `secret`) + +Verify on receipt with a constant-time string compare. Reject +deliveries whose timestamp drifts >5 minutes from the receiver's clock. + +## Delivery semantics + +- **At-least-once.** Deliveries retry on 5xx or timeout with + exponential backoff (1s, 5s, 30s, 2m, 10m). After 5 failed retries, + the delivery is marked FAILED and surfaces in + `devhelm webhooks deliveries list --failed`. +- **Ordering is not guaranteed.** Use `X-DevHelm-Timestamp` or the + event's own ID to order on your side. +- **Idempotency key:** `X-DevHelm-Delivery-Id` is unique per retry + attempt; the event's own ID is unique per logical event. Dedupe + on the latter. + +## Test + +```bash +devhelm webhooks test <id> +``` + +Sends a synthetic `webhook.test` event. Response includes the +HTTP status and any error body from the subscriber. Always run this +after create. 
+ +## Inspect deliveries + +```bash +devhelm webhooks deliveries list --limit=50 +devhelm webhooks deliveries get <delivery-id> # full request/response +devhelm webhooks deliveries replay <delivery-id> +``` + +Useful for debugging when the user says *"we didn't receive the +event"*. + +## Common gotchas + +- **Wildcards are greedy.** `check.*` includes `check.failed` at full + monitor frequency — if 500 monitors run every 60s, that's up to 720k + deliveries/day (500 × 1,440) in the worst case. Always scope tight. +- **Response status for ACK.** 200–299 = accepted. 4xx = permanent + failure, no retry. 5xx or timeout = retry. A slow 200 still blocks + redelivery. +- **HTTPS required** in production. HTTP URLs are allowed for local + dev but warn the user. + +## Complete field reference + +`@_generated/webhooks.fields.md`. Runtime pull: +`devhelm skills schema webhooks`. diff --git a/skills/devhelm-investigate/SKILL.md b/skills/devhelm-investigate/SKILL.md new file mode 100644 index 0000000..e8c595b --- /dev/null +++ b/skills/devhelm-investigate/SKILL.md @@ -0,0 +1,206 @@ +--- +name: devhelm-investigate +description: Diagnose DevHelm monitor failures and answer status questions — why is monitor X failing, what's red right now, show recent check results, list active incidents, inspect uptime over a window, or trace audit history. Use whenever the user asks "why is X down?", "is everything green?", "what happened at 3:14?", "show the last 10 failures", or any read/debug question about monitoring state. +--- + +# DevHelm — Investigate + +You help the user understand the **current and past state** of their +monitoring. This skill is strictly read-only: never create, update, or +delete anything here. For write flows, switch to `devhelm-configure` or +`devhelm-communicate`. + +--- + +## Preconditions + +1. `devhelm --version` succeeds. +2. `devhelm auth me` succeeds. If not, tell the user to `devhelm auth + login` and stop. + +--- + +## Surface selection + +DevHelm exposes the same read data through two surfaces. 
Choose based +on context: + +| Situation | Surface | Why | +|---|---|---| +| A DevHelm MCP server is configured in the host (Cursor/Claude Code) | **MCP tools** | Zero subprocess cost, structured results, cheaper for follow-up questions. | +| No MCP, or the user is outside a conversational agent | **CLI** (`devhelm list/get/results`) | Universal fallback; every piece of MCP data has a CLI equivalent. | +| User explicitly says "use CLI" | CLI | Respect it. | + +**How to check for MCP:** try calling `list_monitors` (or any DevHelm +MCP tool). If the tool exists in the agent's catalog, MCP is live. If +not, fall back to CLI without further checks. + +The rest of this skill refers to **actions** (read check results, list +incidents, etc.) — the actual tool name differs between MCP and CLI. +The references enumerate the CLI commands; MCP tool names mirror them +(`list_monitors`, `get_monitor`, `list_check_results`, `list_incidents`, +`get_incident`, `list_audit_events`, `query_uptime`). + +--- + +## Triage workflow (the common case) + +**User prompt:** *"Why is api-prod failing?"* / *"Why is monitor X red?"* + +### Step 1 — Locate the monitor + +- If the user named it precisely → `devhelm monitors list --name=` + (or MCP `list_monitors` with the name filter). +- If ambiguous (multiple matches), show the matches and ask which one. +- If not found, report so and offer to create one (hand off to + `devhelm-configure`). + +### Step 2 — Pull current state + +```bash +devhelm monitors get +``` + +Extract: `status` (UP / DOWN / DEGRADED / PAUSED), `lastCheckAt`, +`assertions`, `enabled`. + +- `status == PAUSED` → tell the user the monitor is paused and when; + don't go deeper. +- `status == UP` → tell the user the monitor is currently green and ask + what window they're asking about. + +### Step 3 — Pull the last N check results + +Default window: last 25 results, or last 60 minutes — whichever is +wider. 
+ +```bash +devhelm monitors results --limit=25 --output=json +``` + +For each **failed** check result, extract: + +- `executedAt` (when) +- `region` (which probe region) +- `statusCode` / `responseTimeMs` +- `failureReason` (the assertion that failed, with the observed value) + +### Step 4 — Correlate with incidents + +```bash +devhelm incidents list --monitor= --state=OPEN,CONFIRMED --output=json +devhelm incidents list --monitor= --since=24h --output=json +``` + +If there's an active incident → include its ID, title, state, and +first-failure timestamp. Link to `@references/incidents.md` for +incident fields. + +### Step 5 — Use forensics for confirmed incidents + +For any incident in CONFIRMED / RESOLVED state, the forensic trace +explains *exactly* which check results triggered the state machine and +which policy rule matched: + +```bash +devhelm forensics trace +``` + +The trace returns a sequence of rule evaluations, state transitions, +and the policy snapshot at the moment of trigger — read +`@references/incidents.md` §forensics for how to render it. + +### Step 6 — Respond + +Produce a structured summary: + +``` +Monitor: api-prod (mon_abc123) — DOWN +Target: https://api.example.com/health +Last check: 2026-04-27T15:42:11Z from us-east → FAILED + - Assertion STATUS_CODE EQUALS 200 failed (observed 503) + - 8 of the last 10 checks failed, starting 15:34:02Z + +Incident: inc_xyz789 (CONFIRMED, severity DOWN) + - First failure: 15:34:02Z (3 consecutive from us-east) + - Confirmed: 15:35:30Z (trigger + 2 more from eu-west) + - Age: 8 minutes + - Forensics: devhelm forensics trace inc_xyz789 + +Dashboard: https://app.devhelm.io/monitors/mon_abc123 + +Likely cause: the target is returning 503 consistently across both +regions, which rules out a probe-side issue. Check your origin. + +Next step: `devhelm incidents updates ` to post a public +status message, or switch to skill devhelm-configure if you want to +silence alerting temporarily. 
+``` + +Pack everything into one reply. Don't make the user ask three times. + +--- + +## Other common questions + +### "Is everything green?" + +```bash +devhelm status # dashboard overview: up/down/degraded counts +devhelm monitors list --status=DOWN,DEGRADED +``` + +Report the totals; list only the non-green monitors. + +### "Show me the last 24 hours" + +```bash +devhelm incidents list --since=24h --output=table +``` + +If none, say so. If many, group by monitor. + +### "What happened at 15:30 UTC?" + +```bash +devhelm audit events --since='2026-04-27T15:00:00Z' --until='2026-04-27T15:45:00Z' --output=table +``` + +Audit events cover user actions (config changes), not check results. +For check results at a specific time, use `monitors results +--since=... --until=...`. + +### "What's our uptime this month?" + +```bash +devhelm monitors uptime --window=30d --output=json +``` + +Return percentage + total downtime duration. For a service-level view +across many monitors, aggregate with `resource-groups`. + +--- + +## Safety rails + +1. **Read-only.** If the user asks for anything that modifies state + (pause, delete, ack), stop and hand off to `devhelm-configure` or + `devhelm-communicate`. +2. **Never expose API keys or secrets.** If an audit event references a + secret value, redact to last-4. +3. **Time-scope your queries.** Don't pull unbounded history; always + cap with `--limit` or `--since/--until`. Default window: last 24h. +4. **Cite data by ID.** When reporting, include monitor ID, incident + ID, check result timestamp — the user can drill down later. +5. **Don't guess causes.** If the data is ambiguous, say so and suggest + a follow-up query. Never invent an explanation that isn't in the + check results or incident record. 
+ +--- + +## References + +- `@references/check-results.md` +- `@references/incidents.md` +- `@references/uptime-queries.md` +- `@references/audit-log.md` diff --git a/skills/devhelm-investigate/references/_generated/audit-events.fields.md b/skills/devhelm-investigate/references/_generated/audit-events.fields.md new file mode 100644 index 0000000..103e49b --- /dev/null +++ b/skills/devhelm-investigate/references/_generated/audit-events.fields.md @@ -0,0 +1,19 @@ +# audit-events — field reference + +> Auto-generated from the DevHelm OpenAPI spec. Do not edit by hand. +> Regenerate with `node scripts/generate-skill-references.mjs`. + +## `AuditEventDto` (response shape) + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `id` | integer (int64) | ✓ | | Unique audit event identifier | +| `actorId` | integer (int32) | | ✓ | User ID who performed the action; null for system actions | +| `actorEmail` | string | | ✓ | Email of the actor; null for system actions | +| `action` | string | ✓ | | Audit action type (e.g. monitor.created, api_key.revoked) | +| `resourceType` | string | | ✓ | Type of resource affected (e.g. monitor, api_key) | +| `resourceId` | string | | ✓ | ID of the affected resource | +| `resourceName` | string | | ✓ | Human-readable name of the affected resource | +| `metadata` | any | | ✓ | | +| `createdAt` | string (date-time) | ✓ | | Timestamp when the action was performed | + diff --git a/skills/devhelm-investigate/references/_generated/check-results.fields.md b/skills/devhelm-investigate/references/_generated/check-results.fields.md new file mode 100644 index 0000000..bfdd3ab --- /dev/null +++ b/skills/devhelm-investigate/references/_generated/check-results.fields.md @@ -0,0 +1,19 @@ +# check-results — field reference + +> Auto-generated from the DevHelm OpenAPI spec. Do not edit by hand. +> Regenerate with `node scripts/generate-skill-references.mjs`. 
+ +## `CheckResultDto` (response shape) + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `id` | string (uuid) | ✓ | | Unique identifier of the check result | +| `timestamp` | string (date-time) | ✓ | | Timestamp when the check was executed (ISO 8601) | +| `region` | string | ✓ | | Region where the check was executed | +| `responseTimeMs` | integer (int32) | | ✓ | Response time in milliseconds | +| `passed` | boolean | ✓ | | Whether the check passed | +| `failureReason` | string | | ✓ | Reason for failure when passed=false | +| `severityHint` | string | | ✓ | Severity hint: 'down' for hard failures, 'degraded' for warn-only failures, null when passing | +| `details` | any | | ✓ | | +| `checkId` | string (uuid) | | ✓ | Unique execution trace ID for cross-service correlation | + diff --git a/skills/devhelm-investigate/references/_generated/incidents.fields.md b/skills/devhelm-investigate/references/_generated/incidents.fields.md new file mode 100644 index 0000000..7e3a235 --- /dev/null +++ b/skills/devhelm-investigate/references/_generated/incidents.fields.md @@ -0,0 +1,44 @@ +# incidents — field reference + +> Auto-generated from the DevHelm OpenAPI spec. Do not edit by hand. +> Regenerate with `node scripts/generate-skill-references.mjs`. + +## `IncidentDto` (response shape) + +| Field | Type | Required | Nullable | Description | +|---|---|---|---|---| +| `id` | string (uuid) | ✓ | | Unique incident identifier | +| `monitorId` | string (uuid) | | ✓ | Monitor that triggered the incident; null for service or manual incidents | +| `organizationId` | integer (int32) | ✓ | | Organization this incident belongs to | +| `source` | "AUTOMATIC" \| "MANUAL" \| "MONITORS" \| "STATUS_DATA" \| "RESOURCE_GROUP" | ✓ | | Incident origin: MONITOR, SERVICE, or MANUAL | +| `status` | "WATCHING" \| "TRIGGERED" \| "CONFIRMED" \| "RESOLVED" | ✓ | | Current lifecycle status (OPEN, RESOLVED, etc.) 
| +| `severity` | "DOWN" \| "DEGRADED" \| "MAINTENANCE" | ✓ | | Severity level: DOWN, DEGRADED, or MAINTENANCE | +| `title` | string | | ✓ | Short summary of the incident; null for auto-generated incidents | +| `triggeredByRule` | string | | ✓ | Human-readable description of the trigger rule that fired | +| `affectedRegions` | string[] | ✓ | | Probe regions that observed the failure | +| `reopenCount` | integer (int32) | ✓ | | Number of times this incident has been reopened | +| `createdByUserId` | integer (int32) | | ✓ | User who created the incident (manual incidents only) | +| `statusPageVisible` | boolean | ✓ | | Whether this incident is visible on the status page | +| `serviceIncidentId` | string (uuid) | | ✓ | Linked vendor service incident ID; null for monitor incidents | +| `serviceId` | string (uuid) | | ✓ | Linked service catalog ID; null for monitor incidents | +| `externalRef` | string | | ✓ | External reference ID (e.g. PagerDuty incident ID) | +| `affectedComponents` | string[] | | ✓ | Service components affected by this incident | +| `shortlink` | string | | ✓ | Short URL linking to the incident details | +| `resolutionReason` | "MANUAL" \| "AUTO_RECOVERED" \| "AUTO_RESOLVED" | | ✓ | How the incident was resolved (AUTO_RECOVERED, MANUAL, etc.) | +| `startedAt` | string (date-time) | | ✓ | Timestamp when the incident was detected or created | +| `confirmedAt` | string (date-time) | | ✓ | Timestamp when the incident was confirmed (multi-region confirmation) | +| `resolvedAt` | string (date-time) | | ✓ | Timestamp when the incident was resolved | +| `cooldownUntil` | string (date-time) | | ✓ | Cooldown window end; new incidents suppressed until this time | +| `createdAt` | string (date-time) | ✓ | | Timestamp when the incident record was created | +| `updatedAt` | string (date-time) | ✓ | | Timestamp when the incident was last updated | +| `monitorName` | string | | ✓ | Name of the associated monitor; populated on list responses. 
Omitted from JSON (undefined to SDKs) on detail responses, treat missing as null. | +| `serviceName` | string | | ✓ | Name of the associated service; populated on list responses. Omitted from JSON (undefined to SDKs) on detail responses, treat missing as null. | +| `serviceSlug` | string | | ✓ | Slug of the associated service; populated on list responses. Omitted from JSON (undefined to SDKs) on detail responses, treat missing as null. | +| `monitorType` | string | | ✓ | Type of the associated monitor; populated on list responses. Omitted from JSON (undefined to SDKs) on detail responses, treat missing as null. | +| `resourceGroupId` | string (uuid) | | ✓ | Resource group that owns this incident; null when not group-managed | +| `resourceGroupName` | string | | ✓ | Name of the resource group; populated on list responses. Omitted from JSON (undefined to SDKs) on detail responses, treat missing as null. | +| `triggeringCheckId` | string (uuid) | | ✓ | Scheduler-minted check execution ID whose result confirmed this incident; joins to check_results, rule_evaluations, and incident_state_transitions. Omitted from JSON (undefined to SDKs) when null, treat missing as null. | +| `triggeredByRuleSnapshotHashHex` | string | | ✓ | Hex SHA-256 of the canonical policy snapshot that fired; combined with triggeredByRuleIndex points to the exact TriggerRule. Omitted from JSON when null, treat missing as null. | +| `triggeredByRuleIndex` | integer (int32) | | ✓ | Index of the fired rule inside the policy's trigger_rules array. Omitted from JSON when null, treat missing as null. | +| `engineVersion` | string | | ✓ | Detection engine semver that evaluated the rule. Omitted from JSON when null, treat missing as null. 
| + diff --git a/skills/devhelm-investigate/references/audit-log.md b/skills/devhelm-investigate/references/audit-log.md new file mode 100644 index 0000000..195e156 --- /dev/null +++ b/skills/devhelm-investigate/references/audit-log.md @@ -0,0 +1,102 @@ +# Audit Log + +The **audit log** is an immutable record of user actions and +significant system events in the workspace — who changed what, when, +from what IP, via which surface (dashboard / API / CLI / Terraform / +pipeline). + +Use when the user asks: + +- *"Who changed X?"* / *"When was this last modified?"* +- *"What happened around 15:30 UTC?"* +- *"Did we auto-pause this?"* +- *"Who deleted the production monitor?"* + +## Query + +```bash +devhelm audit events \ + --since=24h \ + --actor=user:alice@example.com \ + --resource-type=MONITOR \ + --resource-id= \ + --action=UPDATE,DELETE \ + --output=json +``` + +Common filter combinations: + +| Filter | Answers | +|---|---| +| `--since=1h` | "Anything happen in the last hour?" | +| `--actor=` | "What did Alice do this week?" | +| `--resource-id=` | "Full history of this specific monitor." | +| `--action=DELETE` | "All deletions." | +| `--surface=CLI,TERRAFORM` | "All code-driven changes." | + +## Event shape + +Each audit event includes: + +| Field | Meaning | +|---|---| +| `id` | Stable UUID. | +| `occurredAt` | When the action was accepted by the API. | +| `actor` | User, service-account, API key, or SYSTEM. | +| `surface` | DASHBOARD, API, CLI, TERRAFORM, PIPELINE, SYSTEM. | +| `action` | CREATE, UPDATE, DELETE, PAUSE, RESUME, LOGIN, etc. | +| `resourceType` | MONITOR, ALERT_CHANNEL, NOTIFICATION_POLICY, … | +| `resourceId` / `resourceName` | The target. | +| `metadata` | Action-specific structured data — see below. | +| `requestId` | For support correlation with server-side logs. | + +### metadata for common actions + +- `UPDATE` → `metadata.changedFields[]` with `before` / `after` per + field. 
Field values are redacted for sensitive types (secrets, API + keys). +- `MEMBER_ROLE_CHANGED` → `metadata.oldRole`, `metadata.newRole`, + `metadata.targetUserId`. +- `LOGIN` → `metadata.ip`, `metadata.userAgent`, + `metadata.mfaMethod`. +- `PAUSE` / `RESUME` → `metadata.reason` (if user supplied one) and + `metadata.autoPaused` (bool — system-triggered pauses). + +Full field reference: `@_generated/audit-events.fields.md`. + +## Correlation with incidents + +Audit events are user-driven; incidents are monitoring-driven. To +answer *"did someone change config right before this incident?"*: + +```bash +INC_ID=inc_xyz +START=$(devhelm incidents get $INC_ID --output=json | jq -r '.startedAt') + +devhelm audit events \ + --until="$START" \ + --since=1h \ + --resource-type=MONITOR,NOTIFICATION_POLICY \ + --action=UPDATE,DELETE +``` + +Windowing 1 hour back from the incident start catches the common +"deploy-caused outage" pattern. + +## Retention + +- Free: 30 days +- Pro: 90 days +- Scale: 1 year +- Enterprise: 2 years + export to customer-owned S3 bucket (optional) + +## Safety rails (restating from the skill) + +- **Never expose API key values or secret contents** visible in + `metadata`. They're redacted at the API boundary, but if you see + any field that contains a literal token, truncate past the first + 6 characters. +- **Respect role boundaries.** VIEWER users can see audit events for + their own actions and system events, but not others' PII (e.g. IPs + of other users). The API enforces this; you'll see redacted fields + in responses. diff --git a/skills/devhelm-investigate/references/check-results.md b/skills/devhelm-investigate/references/check-results.md new file mode 100644 index 0000000..3b3f542 --- /dev/null +++ b/skills/devhelm-investigate/references/check-results.md @@ -0,0 +1,101 @@ +# Check Results + +A **check result** is one execution of a monitor from one region — the +atomic unit of monitoring data. 
Every monitor produces one check +result per region per frequency interval. High-frequency monitors +across many regions generate a lot; always scope queries. + +## List + +```bash +devhelm monitors results \ + --limit=25 \ + --status=PASSED,FAILED,DEGRADED \ + --region=us-east,eu-west \ + --since=2026-04-27T00:00:00Z \ + --until=2026-04-27T23:59:59Z \ + --output=json +``` + +Defaults: last 25 results, all statuses, all regions. + +## Key fields (per result) + +| Field | Meaning | +|---|---| +| `id` | Check result ID (UUID). Stable; cited by incidents + forensics. | +| `monitorId` | Parent monitor. | +| `executedAt` | Start timestamp (UTC ISO 8601). | +| `region` | Probe region slug. | +| `status` | `PASSED`, `FAILED`, `DEGRADED`. | +| `responseTimeMs` | Wall-clock duration. | +| `statusCode` | HTTP status (HTTP monitors only). | +| `failedAssertions[]` | Which assertions failed and what they observed. | +| `raw` | Request/response snapshot (present for failed results only). | + +For the full generated field list: +`@_generated/check-results.fields.md`. + +## Interpretation + +- **Single-region failure amid passes** → likely probe-side / network + blip. The policy's `regions_required` determines if this becomes an + incident. +- **All-region failure** → origin-side. Walk the `failedAssertions[]` + to find the actual cause (status code, body mismatch, response time + breach, SSL issue). +- **Intermittent `responseTimeMs` spikes** → investigate with the + `uptime` query over a wider window (`@references/uptime-queries.md`) + before calling it a problem; brief spikes are often GC or scale-up. +- **DEGRADED status** → the monitor has a DEGRADED threshold defined + (e.g. `responseTime > 500` but still 2xx). Policy config + determines whether DEGRADED alerts. + +## Forensics: linking results to incidents + +Every incident's forensic trace cites the exact check result IDs that +moved the state machine. 
For a CONFIRMED incident `inc_X`: + +```bash +devhelm forensics trace inc_X --output=json +``` + +The trace array contains entries like: + +```json +{ + "timestamp": "...", + "transition": "trigger", + "triggeringCheckIds": ["cr_123", "cr_124", "cr_125"], + "rule": { + "triggerCount": 3, + "regionsRequired": 1 + } +} +``` + +Pair that with `devhelm monitors results <monitor-id>` to find those +specific results and explain *what the user's service was doing at +the moment the incident fired*. + +## Retention + +Check results are retained per plan: + +- Free: 3 days +- Pro: 30 days +- Scale: 90 days +- Enterprise: 1 year + +Beyond retention, only aggregated uptime stats remain. If the user +asks about a failure from 6 months ago on Pro, tell them the +per-check raw data is gone; the `devhelm monitors uptime <monitor-id>` +window will still cover it as aggregates. + +## CLI output tips + +- `--output=json` for programmatic / diff-able output. +- `--output=table` for quick terminal viewing — DEGRADED / FAILED + rows are coloured. +- `--output=yaml` for pasting into support tickets (reads nicely in + fixed-width but preserves structure). diff --git a/skills/devhelm-investigate/references/incidents.md b/skills/devhelm-investigate/references/incidents.md new file mode 100644 index 0000000..8a4c1e4 --- /dev/null +++ b/skills/devhelm-investigate/references/incidents.md @@ -0,0 +1,146 @@ +# Incidents + +An **incident** is a persistent state record that tracks an outage or +degradation through its lifecycle. It's created by the detection +engine when a monitor's check results satisfy a notification policy's +trigger condition. + +## Lifecycle + +``` +OPEN (internal) + ↓ (policy trigger condition met) +TRIGGERED + ↓ (policy confirm_count met) +CONFIRMED + ↓ (policy resolve condition met OR user resolves) +RESOLVED + ↓ (new failure within cooldown) +REOPENED → CONFIRMED → RESOLVED ... 
+``` + +The detection engine is event-sourced: every transition is immutable, +auditable, and has a `reason` field plus a `details.source` field +(`"pipeline"` for automated, `"public-api"` for user-driven). + +## List + +```bash +devhelm incidents list \ + --state=OPEN,TRIGGERED,CONFIRMED \ + --monitor=<monitor-id> \ + --resource-group=<resource-group> \ + --since=24h \ + --output=json +``` + +Sensible defaults for a triage prompt: + +- `--state=TRIGGERED,CONFIRMED` (active only) +- `--since=24h` + +## Get + +```bash +devhelm incidents get <incident-id> --output=json +``` + +Key fields: + +- `state` — current state (above enum). +- `severity` — `DOWN`, `DEGRADED`. +- `monitorId`, `monitorName` — the triggering monitor. +- `startedAt` — first failed check timestamp. +- `confirmedAt` — when state flipped to CONFIRMED (if ever). +- `resolvedAt` — when state flipped to RESOLVED (if ever). +- `reopenCount` — how many times it's come back. +- `triggeringCheckId` — the check result that first met the trigger + condition. +- `triggeredByRule` — the rule enum (`consecutive_failures`, + `region_threshold`, etc.). +- `triggeredByRuleSnapshotHashHex` — hash of the policy snapshot at + trigger time. Use with `forensics trace` to reconstruct the exact + policy the engine evaluated. + +Full field reference: `@_generated/incidents.fields.md`. + +## Forensic trace + +The **single most useful debugging tool** for "why did this +incident fire?". Shows every rule evaluation, state transition, and +the policy snapshot at the moment of each transition. + +```bash +devhelm forensics trace <incident-id> --output=json +``` + +Trace entries include: + +- `timestamp` — when the evaluation occurred +- `transition` — one of `trigger`, `confirm`, `resolve`, + `auto_clear`, `reopen` +- `triggeringCheckIds[]` — the specific check result(s) that drove + this transition +- `rule` — the policy rule that matched (incl. 
thresholds) +- `snapshot` — immutable policy snapshot (only on `trigger`) +- `details.source` — `pipeline` (automated) or `public-api` + (user-driven; e.g. manual resolve) + +Render this as a compact timeline in replies: + +``` +15:34:02Z trigger checks=[cr_1,cr_2,cr_3] rule=consecutive_failures(3) +15:34:32Z confirm checks=[cr_4,cr_5] rule=region_threshold(2) +15:41:12Z resolve checks=[cr_10,cr_11] source=pipeline +``` + +## User-driven actions + +### Resolve + +```bash +devhelm incidents resolve <incident-id> --reason="Deployment rolled back" +``` + +Posted transition reason appears in the forensic trace with +`details.source="public-api"`. + +### Reopen + +```bash +devhelm incidents reopen <incident-id> --body="Still seeing failures after rollback" +``` + +Changes state → CONFIRMED. Writes a public update if the incident is +linked to a status page. + +### Post an update + +```bash +devhelm incidents updates create <incident-id> \ + --status=INVESTIGATING \ + --body="We're looking into this now" \ + --notify-subscribers=false +``` + +Updates on private incidents (not linked to a status page) are +internal-only. Updates on status-page-linked incidents are +customer-facing — switch to `devhelm-communicate` skill for those. + +## Correlation tips + +- **Multiple incidents, same monitor, ≤1 hour apart** → probably one + underlying issue that's flapping. Check `reopenCount` > 0; if + multiple separate incidents with `reopenCount=0` each, the resolve + condition may be too aggressive. +- **Multiple incidents across monitors, same timestamp** → shared + dependency. Cross-reference with `devhelm dependencies list` or + look for an AWS/GCP region-wide event. +- **Intermittent confirmed incidents** → consider raising + `trigger_count` on the policy, or requiring multi-region failures. + +## Retention + +Incidents are retained for the life of the workspace (no +per-plan retention). Updates and forensic traces are retained per +check-result retention (since they cite check result IDs). 
diff --git a/skills/devhelm-investigate/references/uptime-queries.md b/skills/devhelm-investigate/references/uptime-queries.md new file mode 100644 index 0000000..47336d6 --- /dev/null +++ b/skills/devhelm-investigate/references/uptime-queries.md @@ -0,0 +1,94 @@ +# Uptime Queries + +Uptime queries aggregate check results into **percentages and total +downtime durations** over a time window. Use when the user asks: + +- *"What's our uptime this month?"* +- *"Compare prod vs. staging uptime last 30 days."* +- *"Did we hit our SLO?"* + +## Per-monitor uptime + +```bash +devhelm monitors uptime \ + --window=30d \ + --region=us-east,eu-west \ + --output=json +``` + +Response: + +```json +{ + "monitorId": "mon_...", + "window": "30d", + "uptimePct": 99.87, + "downtimeSeconds": 3360, + "totalSeconds": 2592000, + "regions": { + "us-east": { "uptimePct": 99.92, "downtimeSeconds": 2080 }, + "eu-west": { "uptimePct": 99.82, "downtimeSeconds": 4680 } + }, + "incidents": [ + { "id": "inc_...", "durationSeconds": 1200, "severity": "DOWN" }, + { "id": "inc_...", "durationSeconds": 2160, "severity": "DEGRADED" } + ] +} +``` + +Windows: `1h`, `24h`, `7d`, `30d`, `90d`, or explicit +`--since=` + `--until=`. Beyond plan retention, the +response returns `null` with a `reason`. + +## Service-level (resource group) uptime + +```bash +devhelm resource-groups uptime --window=30d +``` + +Aggregates all member monitors. Default aggregation: **min** across +monitors (i.e. the worst monitor defines the service's uptime). This +matches most SLO definitions; to use **avg** or **sum** instead, pass +`--agg=avg` or `--agg=sum`. + +## Workspace-wide + +No single CLI command — aggregate manually across resource groups or +per-monitor queries, or use the MCP `query_uptime` tool which accepts +a workspace-wide scope. + +## Interpretation tips + +- **99.9% over 30 days** = 43 minutes of downtime. Remind users of + this when they ask "we had two 20-min outages last month, what's + our uptime?" 
+- **Regional asymmetry** — if one region shows materially lower + uptime than others over the same window, it's usually a local + probe-network issue, not a service issue. Cross-check the status of + the DevHelm region (`devhelm dependencies list | grep devhelm-region` + if applicable). +- **DEGRADED time** — counted against uptime by default. Pass + `--exclude=DEGRADED` if the SLO definition only cares about hard + downtime. +- **Planned maintenance** — paused monitors are excluded from the + window. If the user paused a monitor for 2h during a deployment, + that 2h is subtracted from `totalSeconds`, not counted as downtime. + +## SLO reports + +For monthly SLO compliance reports: + +```bash +devhelm resource-groups uptime checkout --window=30d --output=json \ + | jq '{service: "checkout", uptime: .uptimePct, budget_used: + (100 - .uptimePct) / (100 - 99.9) * 100}' +``` + +The CLI doesn't ship an SLO-budget calculator natively — compose with +`jq` (or similar) and commit the query alongside your runbook. + +## MCP equivalent + +If MCP is available, use `query_uptime` — accepts the same window + +scope and returns structured JSON. Preferred for conversational +follow-ups ("and what about staging?"). diff --git a/skills/devhelm-manage/SKILL.md b/skills/devhelm-manage/SKILL.md new file mode 100644 index 0000000..3db5053 --- /dev/null +++ b/skills/devhelm-manage/SKILL.md @@ -0,0 +1,140 @@ +--- +name: devhelm-manage +description: Manage DevHelm workspace-level administration — API keys, environments, workspace settings, plan/entitlements, and team roster (read-only). Use when the user wants to create or rotate an API key, check what plan they're on, list environments, see team members, or inspect workspace-wide limits. +--- + +# DevHelm — Manage + +You help the user with **workspace administration**: API keys, plan / +billing surface, environments, and team visibility. + +This skill is **not** for creating monitoring resources — for that use +`devhelm-configure`. 
It's also not for debugging — use +`devhelm-investigate`. This skill is small and safety-conscious by +design: most of its operations touch credentials or plan state. + +--- + +## Preconditions + +1. `devhelm --version` succeeds. +2. `devhelm auth me` succeeds. +3. For any destructive operation (revoke key, delete environment), + double-check the caller has the role to do it — `devhelm auth me` + returns `role`. If role is `VIEWER` or `MEMBER`, stop and tell the + user their role can't perform the action. + +--- + +## Common operations + +### API keys + +| User intent | Command | +|---|---| +| "Create a new API key" | `devhelm api-keys create --name="