feat: add SRE incident response agent using Anthropic Managed Agents

neelay-aign · claude · neelay-aign · commit 678bb693e915 · 2026-04-14T14:30:40.000+02:00
Add a background SRE agent that triages BetterStack incidents for the
Python SDK and creates fix PRs via the GitHub MCP server.

Architecture: BetterStack webhook -> GitHub repository_dispatch ->
GH Actions workflow -> Managed Agent session on Anthropic infra.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/.github/workflows/sre-incident-response.yml b/.github/workflows/sre-incident-response.yml
@@ -0,0 +1,46 @@
+name: SRE Incident Response
+
+on:
+  repository_dispatch:
+    types: [betterstack-incident]
+  workflow_dispatch:
+    inputs:
+      incident_id:
+        description: "BetterStack incident ID to triage (leave empty if simulating)"
+        required: false
+      simulate:
+        description: "Use a simulated incident instead of fetching from API"
+        type: boolean
+        default: true
+
+concurrency:
+  group: sre-incident-${{ github.event.client_payload.incident_id || inputs.incident_id || 'manual' }}
+  cancel-in-progress: false
+
+jobs:
+  triage:
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    permissions:
+      contents: read
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: astral-sh/setup-uv@v6
+
+      - name: Install dependencies
+        working-directory: sre-agent
+        run: uv sync
+
+      - name: Run SRE agent
+        working-directory: sre-agent
+        run: uv run python -m sre_agent.main
+        env:
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          SRE_BETTERSTACK_API_TOKEN: ${{ secrets.BETTERSTACK_API_TOKEN }}
+          SRE_AGENT_ID: ${{ secrets.SRE_AGENT_ID }}
+          SRE_ENVIRONMENT_ID: ${{ secrets.SRE_ENVIRONMENT_ID }}
+          SRE_VAULT_ID: ${{ secrets.SRE_VAULT_ID }}
+          SRE_GITHUB_REPO: aignostics/python-sdk
+          INCIDENT_ID: ${{ github.event.client_payload.incident_id || inputs.incident_id }}
+          SIMULATE: ${{ inputs.simulate || 'false' }}
diff --git a/sre-agent/pyproject.toml b/sre-agent/pyproject.toml
@@ -0,0 +1,24 @@
+[project]
+name = "sre-agent"
+version = "0.1.0"
+description = "SRE incident response agent for the Aignostics Python SDK"
+requires-python = ">=3.12"
+dependencies = [
+    "anthropic>=0.52.0",
+    "pydantic-settings>=2.0.0",
+]
+
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.0.0",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/sre_agent"]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
diff --git a/sre-agent/skills/sre-runbook/SKILL.md b/sre-agent/skills/sre-runbook/SKILL.md
@@ -0,0 +1,41 @@
+---
+name: sre-runbook
+description: Repo-specific triage context for Aignostics Python SDK incidents
+---
+
+# Aignostics Python SDK -- SRE Triage Guide
+
+## Incident Types
+
+### "Scheduled Audit" incidents
+- Cause: A dependency has a known CVE, or a license violation was detected.
+- The audit runs hourly via .github/workflows/_scheduled-audit.yml.
+- Tools used: pip-audit, pip-licenses, trivy.
+- Common fix: bump the vulnerable dependency in pyproject.toml,
+  then run `uv lock --upgrade-package <pkg>`.
+
+### "Scheduled Testing" incidents (staging)
+- Cause: Unit, integration, or e2e tests failed against staging.
+- Runs every 6 hours via .github/workflows/_scheduled-test-hourly.yml.
+- Check the workflow run logs for which test(s) failed.
+- Common causes: flaky tests, dependency updates, API contract changes.
+
+### "Scheduled Testing" incidents (production)
+- Cause: Tests failed against production environment.
+- Runs daily via .github/workflows/_scheduled-test-daily.yml.
+- Common causes: platform API changes, credential expiry.
+- These often require human intervention -- create an issue, not a PR.
+
+## Repo-Specific Context
+- Package manager: uv (not pip). Use `uv sync`, `uv add`, `uv run`.
+- Linting: `make lint` (ruff + mypy + pyright)
+- Testing: `make test_unit`, `make test_integration`, `make test_e2e`
+- Security audit: `make audit` (pip-audit + pip-licenses + trivy)
+- Dependency bumps: edit pyproject.toml, run `uv lock --upgrade-package <pkg>`
+- CI workflows live in .github/workflows/
+- Scheduled tests send heartbeats to BetterStack (see _scheduled-test-*.yml)
+
+## PR Conventions
+- Conventional commits: feat(...), fix(...), chore(deps): ...
+- Always add labels: "sre-agent", "skip:test:long_running"
+- Create DRAFT PRs only
diff --git a/sre-agent/src/sre_agent/__init__.py b/sre-agent/src/sre_agent/__init__.py
@@ -0,0 +1 @@
+"""SRE incident response agent for the Aignostics Python SDK."""
diff --git a/sre-agent/src/sre_agent/__main__.py b/sre-agent/src/sre_agent/__main__.py
@@ -0,0 +1,5 @@
+"""Allow running with `python -m sre_agent`."""
+
+from sre_agent.main import main
+
+main()
diff --git a/sre-agent/src/sre_agent/_config.py b/sre-agent/src/sre_agent/_config.py
@@ -0,0 +1,23 @@
+"""Configuration for the SRE incident response agent."""
+
+from __future__ import annotations
+
+from pydantic import SecretStr
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+class SREAgentSettings(BaseSettings):
+    """Agent settings read from ``SRE_``-prefixed environment variables."""
+
+    model_config = SettingsConfigDict(env_prefix="SRE_")  # e.g. agent_id <- SRE_AGENT_ID
+
+    # Anthropic Managed Agent resource IDs (printed by _setup.py); required, no defaults
+    agent_id: str
+    environment_id: str
+    vault_id: str
+
+    # BetterStack API token (for fetching incident details); SecretStr masks it when printed
+    betterstack_api_token: SecretStr
+
+    # GitHub repo ("owner/name") to mount in the agent session
+    github_repo: str = "aignostics/python-sdk"
diff --git a/sre-agent/src/sre_agent/_setup.py b/sre-agent/src/sre_agent/_setup.py
@@ -0,0 +1,120 @@
+"""One-time setup: create agent, environment, skill, and vault on Anthropic.
+
+Usage:
+    SRE_GITHUB_PAT=ghp_... uv run python -m sre_agent._setup
+
+Prints the resource IDs to store as GitHub Actions secrets.
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+from pathlib import Path
+
+import anthropic
+
+SYSTEM_PROMPT = """\
+You are an SRE incident response agent for the Aignostics Python SDK
+(github.com/aignostics/python-sdk).
+
+You have been triggered by a BetterStack incident alert. The alert
+includes the incident name, cause, and -- critically -- the GitHub
+Actions run URL from the failed workflow.
+
+Your job:
+
+1. Read the incident details: name, cause, and the failed run URL.
+2. Use the GitHub MCP to read the failed workflow run's logs. This is
+   your primary source of diagnostic information.
+3. Investigate the root cause:
+   - Read the workflow run logs to identify the specific failure.
+   - Check recent commits on main (git log in the mounted repo).
+   - Read the relevant GitHub Actions workflow YAML files.
+   - Use web_search to look up error messages, CVE details, or docs.
+4. Determine if the issue is fixable with a code change.
+5. If fixable: use the GitHub MCP to create a branch, commit the fix,
+   and open a draft PR with your analysis in the body.
+6. If not fixable or uncertain: use the GitHub MCP to create an issue
+   with your triage findings and recommended next steps.
+
+Constraints:
+- Always create DRAFT PRs, never regular PRs. Humans must review and merge.
+- Always cite evidence for your root cause analysis.
+- For dependency CVEs: bump the minimum safe version, run the audit check.
+- For test failures: check if the test is flaky (search for prior failures)
+  before proposing a code fix.
+- Never modify credentials, secrets, or authentication code.
+- Add the label "sre-agent" to any PR or issue you create.
+- Add the label "skip:test:long_running" to any PR you create.
+
+The repo uses: uv (package manager), pytest (testing), ruff (linting),
+mypy + pyright (type checking). CI runs on GitHub Actions.
+"""
+
+SKILL_DIR = Path(__file__).resolve().parent.parent.parent / "skills" / "sre-runbook"
+
+
+def main() -> None:  # Create the skill, environment, vault and agent, then print their IDs.
+    github_pat = os.environ.get("SRE_GITHUB_PAT", "")
+    if not github_pat:  # unset and empty are treated the same
+        print("Error: SRE_GITHUB_PAT environment variable is required.", file=sys.stderr)
+        sys.exit(1)
+
+    client = anthropic.Anthropic()
+
+    # 1. Upload the runbook skill (SKILL.md read from SKILL_DIR)
+    skill_md = (SKILL_DIR / "SKILL.md").read_bytes()
+    skill = client.beta.skills.create(
+        display_title="sre-runbook",
+        files=[("sre-runbook/SKILL.md", skill_md, "text/markdown")],
+    )
+    print(f"Skill created: {skill.id} (version {skill.latest_version})")
+
+    # 2. Create a cloud environment with limited networking
+    environment = client.beta.environments.create(
+        name="sre-incident-response",
+        config={"type": "cloud", "networking": {"type": "limited"}},
+    )
+    print(f"Environment created: {environment.id}")
+
+    # 3. Create a vault and store the GitHub PAT in it as the "github" credential
+    vault = client.beta.vaults.create(name="sre-github")
+    client.beta.vaults.credentials.create(
+        vault_id=vault.id,
+        name="github",
+        token=github_pat,
+    )
+    print(f"Vault created: {vault.id}")
+
+    # 4. Create the agent with the GitHub MCP server, its toolset, and the uploaded skill
+    agent = client.beta.agents.create(
+        name="Aignostics SRE Incident Responder",
+        model="claude-sonnet-4-6",
+        system=SYSTEM_PROMPT,
+        mcp_servers=[
+            {
+                "type": "url",
+                "name": "github",
+                "url": "https://api.githubcopilot.com/mcp/",
+            },
+        ],
+        tools=[
+            {"type": "agent_toolset_20260401"},
+            {"type": "mcp_toolset", "mcp_server_name": "github"},
+        ],
+        skills=[
+            {"type": "custom", "skill_id": skill.id, "version": skill.latest_version},
+        ],
+    )
+    print(f"Agent created: {agent.id} (version {agent.version})")
+
+    # Print the resource IDs in NAME=value form, to be stored as GitHub Actions secrets
+    print("\n--- Store these as GitHub Actions secrets ---")
+    print(f"SRE_AGENT_ID={agent.id}")
+    print(f"SRE_ENVIRONMENT_ID={environment.id}")
+    print(f"SRE_VAULT_ID={vault.id}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sre-agent/src/sre_agent/main.py b/sre-agent/src/sre_agent/main.py
diff --git a/sre-agent/tests/test_main.py b/sre-agent/tests/test_main.py
diff --git a/sre-agent/uv.lock b/sre-agent/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+"""SRE incident response agent for the Aignostics Python SDK."""`
-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +"""Allow running with `python -m sre_agent`."""
++
 +from sre_agent.main import main
++
 +main()