diff --git a/.claude/skills/obs-alerts/SKILL.md b/.claude/skills/obs-alerts/SKILL.md new file mode 100644 index 0000000..abc7001 --- /dev/null +++ b/.claude/skills/obs-alerts/SKILL.md @@ -0,0 +1,61 @@ +--- +name: obs-alerts +description: Show active alerts and their resolution steps. Use this skill whenever the user asks about alerts, wants to know if anything is broken or firing, asks about Alertmanager state, mentions alert silences, or wants to troubleshoot a specific alert like OTelCollectorDown, LokiDown, HighSessionCost, or SensitiveFileAccess. Also use when the user says "any alerts?" or "is everything ok" in the context of the obs stack. +disable-model-invocation: false +--- + +# Obs Alerts + +Active alerts, silences, and resolution guidance. + +## Queries to run + +Use `scripts/obs-api.sh`. Run independent queries in parallel. + +### Active alerts + +```bash +scripts/obs-api.sh am /api/v2/alerts --raw --jq 'if length == 0 then "No active alerts" else .[] | "\(.labels.alertname)\t\(.labels.severity // "?")\t\(.status.state)\t\(.startsAt)" end' +``` + +### Silences + +```bash +scripts/obs-api.sh am /api/v2/silences --raw --jq '.[] | select(.status.state == "active") | "\(.matchers | map("\(.name)=\(.value)") | join(", "))\tuntil \(.endsAt)"' +``` + +### Prometheus rule states (firing/pending) + +```bash +scripts/obs-api.sh prom /api/v1/rules --raw --jq '.data.groups[].rules[] | select(.state == "firing" or .state == "pending") | "\(.name)\t\(.state)\t\(.annotations.summary // "")"' +``` + +### Total rule count + +```bash +scripts/obs-api.sh prom /api/v1/rules --raw --jq '[.data.groups[].rules | length] | add' +``` + +## Resolution hints + +For alert-specific fix instructions, read `references/resolution-hints.md` in this skill directory. 
+ +## Output columns + +| Column | Source | +|--------|--------| +| Alert | alertname label | +| Severity | severity label | +| Status | firing/pending | +| Since | startsAt timestamp | +| Description | summary annotation | + +For output format options (table/csv/json), read `.claude/skills/obs-shared/assets/output-formats.md`. + +## Presentation + +1. Alert table: Alert | Severity | Status | Since +2. If alerts are firing, read `references/resolution-hints.md` and provide the fix +3. If silences are active, mention them +4. If no alerts: "All clear. 0 of 16 rules firing." +5. Show total rule count for context diff --git a/.claude/skills/obs-alerts/references/resolution-hints.md b/.claude/skills/obs-alerts/references/resolution-hints.md new file mode 100644 index 0000000..5dda32c --- /dev/null +++ b/.claude/skills/obs-alerts/references/resolution-hints.md @@ -0,0 +1,41 @@ +# Alert Resolution Hints + +Known alerts and suggested fixes for the obs stack. + +## Infrastructure Tier + +| Alert | Fix | +|-------|-----| +| **OTelCollectorDown** | `docker compose restart otel-collector` — check logs: `docker compose logs otel-collector --tail=20` | +| **CollectorExportFailedSpans** | Check collector logs for export errors, verify Tempo is up | +| **CollectorExportFailedMetrics** | Check collector logs, verify Prometheus is up and scraping | +| **CollectorExportFailedLogs** | Check collector logs, verify Loki is up | +| **CollectorHighMemory** | Reduce batch size in `configs/otel-collector/config.yaml` or increase container memory limit | +| **PrometheusHighMemory** | Reduce retention or increase memory limit in docker-compose.yaml | + +## Pipeline Tier + +| Alert | Fix | +|-------|-----| +| **LokiDown** | `docker compose restart loki` — check disk space, WAL at `/loki/ruler-wal` | +| **ShepherdServicesDown** | OTel Collector exporter endpoint (:8889) down — usually means collector needs restart | +| **TempoDown** | `docker compose restart tempo` — check memory (2G limit), 
verify WAL directory | +| **PrometheusTargetDown** | Check which target is down: `scripts/obs-api.sh prom /api/v1/targets --raw --jq '.data.activeTargets[] | select(.health=="down")'` | +| **LokiRecordingRulesFailing** | Check Loki ruler logs, verify `configs/loki/rules/fake/codex.yaml` syntax | + +## Services Tier + +| Alert | Fix | +|-------|-----| +| **HighSessionCost** | Review session in Grafana Cost dashboard — model choice or long session? Consider switching to cheaper model. | +| **HighTokenBurn** | Check for runaway loops or large file reads. Look at Operations dashboard event stream. | +| **HighToolErrorRate** | Check Quality dashboard — which tools are failing? Common: Read on deleted files, Bash timeout. | +| **SensitiveFileAccess** | Check Operations dashboard for which files were accessed. Review PreToolUse guard patterns. | +| **NoTelemetryReceived** | Hooks not installed or CLI not in use. Run `./hooks/install.sh` and verify with `./scripts/test-signal.sh`. | + +## Inhibit Rules + +These suppress downstream alerts to reduce noise: +- `OTelCollectorDown` → suppresses `ShepherdServicesDown` + all business-logic alerts +- `LokiDown` → suppresses `LokiRecordingRulesFailing` + `HighTokenBurn` +- `ShepherdServicesDown` → suppresses `NoTelemetryReceived`, `HighToolErrorRate`, `HighSessionCost` diff --git a/.claude/skills/obs-cost/SKILL.md b/.claude/skills/obs-cost/SKILL.md new file mode 100644 index 0000000..01c0a38 --- /dev/null +++ b/.claude/skills/obs-cost/SKILL.md @@ -0,0 +1,78 @@ +--- +name: obs-cost +description: Show AI coding cost and token usage. Use this skill whenever the user asks how much they spent, wants a cost breakdown by provider or model, asks about token consumption, wants to compare Claude vs Gemini costs, or mentions anything about budget, spend, billing, or usage dollars. Supports time ranges like today, yesterday, week, 24h. Even if the user just says "how much did I spend" or "cost report" — use this skill. 
+disable-model-invocation: false +--- + +# Obs Cost Report + +Cost and token usage breakdown by provider and model. + +## Arguments + +User may specify a time range. Default: `24h`. +Mapping: `today` → `24h`, `yesterday` → offset query, `week` → `7d`. +Replace `[24h]` in queries below with the appropriate range. + +## Queries to run + +Use `scripts/obs-api.sh` for all queries. Run independent queries in parallel. + +### Claude cost by model + +```bash +scripts/obs-api.sh prom /api/v1/query --raw --jq '.data.result[] | select(.metric.model != "") | "\(.metric.model)\t\(.value[1])"' --data-urlencode 'query=sort_desc(sum by (model) (max_over_time(shepherd_claude_code_cost_usage_USD_total[24h])))' +``` + +### Claude cost total + +```bash +scripts/obs-api.sh prom /api/v1/query --raw --jq '.data.result[0].value[1] // "0"' --data-urlencode 'query=sum(max_over_time(shepherd_claude_code_cost_usage_USD_total[24h]))' +``` + +### Claude tokens by type + +```bash +scripts/obs-api.sh prom /api/v1/query --raw --jq '.data.result[] | select(.metric.type != "") | "\(.metric.type)\t\(.value[1])"' --data-urlencode 'query=sort_desc(sum by (type) (max_over_time(shepherd_claude_code_token_usage_tokens_total[24h])))' +``` + +### Gemini tokens by type + +```bash +scripts/obs-api.sh prom /api/v1/query --raw --jq '.data.result[] | select(.metric.type != "") | "\(.metric.type)\t\(.value[1])"' --data-urlencode 'query=sort_desc(sum by (type) (max_over_time(shepherd_gemini_cli_token_usage_total[24h])))' +``` + +### Codex tokens + +```bash +scripts/obs-api.sh prom /api/v1/query --raw --jq '.data.result[0].value[1] // "0"' --data-urlencode 'query=sum(sum_over_time(shepherd:codex:tokens_input:1m[24h]))' +scripts/obs-api.sh prom /api/v1/query --raw --jq '.data.result[0].value[1] // "0"' --data-urlencode 'query=sum(sum_over_time(shepherd:codex:tokens_output:1m[24h]))' +``` + +### Session count per provider + +```bash +scripts/obs-api.sh prom /api/v1/query --raw --jq '.data.result[0].value[1] // "0"' 
--data-urlencode 'query=count(max_over_time(shepherd_claude_code_session_count_total[24h]))' +scripts/obs-api.sh prom /api/v1/query --raw --jq '.data.result[0].value[1] // "0"' --data-urlencode 'query=count(max_over_time(shepherd_gemini_cli_session_count_total[24h]))' +scripts/obs-api.sh prom /api/v1/query --raw --jq '.data.result[0].value[1] // "0"' --data-urlencode 'query=sum(sum_over_time(shepherd:codex:sessions:1m[24h]))' +``` + +## Output columns + +| Section | Columns | +|---------|---------| +| Cost by model | Model \| Cost ($) | +| Tokens | Provider \| Type \| Count | +| Sessions | Provider \| Count | + +For output format options (table/csv/json), read `.claude/skills/obs-shared/assets/output-formats.md`. + +## Presentation + +1. **Total cost** (sum across providers) +2. **Cost by model** table +3. **Token breakdown** per provider (input/output/cache) +4. **Session count** per provider +5. Calculate cost-per-session and tokens-per-dollar where meaningful +6. If all values are 0: "No activity in the last 24h. Stack may not be receiving telemetry — try `/obs-status`." +7. Note: only Claude emits cost metrics. Gemini and Codex show tokens only. diff --git a/.claude/skills/obs-query/SKILL.md b/.claude/skills/obs-query/SKILL.md new file mode 100644 index 0000000..b16349b --- /dev/null +++ b/.claude/skills/obs-query/SKILL.md @@ -0,0 +1,51 @@ +--- +name: obs-query +description: Execute PromQL or LogQL queries against Prometheus and Loki. Use this skill whenever the user wants to run a custom query, check a specific metric value, search logs, asks "what is the value of metric X", wants to explore data in Prometheus or Loki, or pastes a PromQL/LogQL expression. Also use when the user asks to query traces, check recording rules output, or debug metric values that don't match dashboard expectations. +disable-model-invocation: false +--- + +# Obs Query + +Execute arbitrary PromQL or LogQL queries and present results. 
+ +## Query type detection + +- **LogQL**: starts with `{` (stream selector) → route to Loki +- **PromQL**: everything else → route to Prometheus + +## How to execute + +### PromQL instant query + +```bash +scripts/obs-api.sh prom /api/v1/query --raw --jq '.data.result' --data-urlencode "query=" +``` + +### PromQL range query + +```bash +scripts/obs-api.sh prom /api/v1/query_range --raw --jq '.data.result' --data-urlencode "query=" --data-urlencode "start=$(date -v-1H +%s)" --data-urlencode "end=$(date +%s)" --data-urlencode "step=60" +``` + +### LogQL query + +```bash +scripts/obs-api.sh loki /loki/api/v1/query_range --raw --jq '.data.result' --data-urlencode "query=" --data-urlencode "limit=20" +``` + +## Examples + +For a comprehensive list of PromQL and LogQL examples, read `references/examples.md` in this skill directory. + +## Output + +For output format options (table/csv/json), read `.claude/skills/obs-shared/assets/output-formats.md`. + +## Instructions + +1. Take the user's query from the skill argument (everything after `/obs-query`) +2. If no query provided, read `references/examples.md` and show common examples +3. Detect query type and execute with appropriate endpoint +4. Format results as table (instant vectors), summary (range), or log lines +5. If query fails, show the error and suggest fixes +6. **Safety**: read-only GET queries only — never POST, PUT, DELETE, or admin endpoints diff --git a/.claude/skills/obs-query/references/examples.md b/.claude/skills/obs-query/references/examples.md new file mode 100644 index 0000000..2159b29 --- /dev/null +++ b/.claude/skills/obs-query/references/examples.md @@ -0,0 +1,61 @@ +# PromQL & LogQL Examples + +Quick reference for querying the obs stack. 
+ +## PromQL (Prometheus) + +### Basics +- `up` — all scrape targets with health status +- `shepherd_tool_calls_total` — raw tool call counters +- `shepherd_events_total` — raw event counters + +### Hook Metrics (aggregated across sessions) +- `sum(increase(shepherd_tool_calls_total[1h]))` — total tool calls in last hour +- `topk(5, sum by (tool) (increase(shepherd_tool_calls_total[24h])))` — top 5 tools +- `sum by (source) (increase(shepherd_events_total[1h]))` — events by provider +- `sum(increase(shepherd_sensitive_file_access_total[24h]))` — sensitive access count + +### Native OTel — Claude +- `shepherd_claude_code_cost_usage_USD_total` — cost per session (has `session_id`, `model` labels) +- `shepherd_claude_code_token_usage_tokens_total` — tokens per session (`type`: input/output/cacheRead/cacheCreation) +- `count(max_over_time(shepherd_claude_code_session_count_total[24h]))` — count distinct sessions + +### Native OTel — Gemini +- `shepherd_gemini_cli_token_usage_total` — tokens (`type`: input/output/thought/cache/tool) +- `shepherd_gemini_cli_tool_call_count_total` — tool calls by `function_name` +- `shepherd_gemini_cli_api_request_count_total` — API requests by `status_code` + +### Recording Rules — Codex +- `shepherd:codex:sessions:1m` — session count (1m buckets, use `sum_over_time`) +- `shepherd:codex:tokens_input:1m` / `shepherd:codex:tokens_output:1m` — tokens +- `shepherd:codex:tool_calls_by_tool:1m` — tool calls by `tool_name` + +### Span Metrics (from Tempo) +- `traces_spanmetrics_calls_total{span_name="claude.session"}` — session trace counts +- `traces_spanmetrics_calls_total{span_name=~"claude.tool.*"}` — tool call traces +- `traces_spanmetrics_latency_bucket{span_name=~".*\\.tool\\..*"}` — tool duration histogram + +## LogQL (Loki) + +### Stream Selectors +- `{service_name="claude-code"}` — Claude Code logs +- `{service_name="codex_cli_rs"}` — Codex CLI logs +- `{service_name="gemini-cli"}` — Gemini CLI logs + +### Filtering & Parsing +- 
`{service_name="claude-code"} | json` — parse JSON log body +- `{service_name="claude-code"} | json | line_format "{{.body}}"` — show just the body +- `{service_name="claude-code"} |= "error"` — filter for errors +- `{service_name="claude-code"} | json | body_event_type="claude_code.tool_result"` — filter by event type + +### Aggregations +- `count_over_time({service_name="claude-code"}[1h])` — log count in last hour +- `rate({service_name="claude-code"}[5m])` — log rate +- `sum by (service_name) (count_over_time({service_name=~".+"}[1h]))` — logs per service + +## Key Gotchas + +- Native OTel metrics have per-session `session_id` label — use `max_over_time()` not `increase()` for totals +- `increase()` returns floats — wrap in `round()` for integer counters +- Codex recording rules: label is `tool_name` (not `tool`) +- Empty model label: filter with `model!=""` when grouping diff --git a/.claude/skills/obs-sessions/SKILL.md b/.claude/skills/obs-sessions/SKILL.md new file mode 100644 index 0000000..06c927b --- /dev/null +++ b/.claude/skills/obs-sessions/SKILL.md @@ -0,0 +1,55 @@ +--- +name: obs-sessions +description: Show recent AI coding sessions with details. Use this skill whenever the user asks about their recent sessions, wants to see session history, asks "what did I do today/yesterday", wants to know which models were used, how many tools were called per session, or asks about session duration and traces. Also use when the user wants to find a specific session or compare sessions across providers. +disable-model-invocation: false +--- + +# Obs Sessions + +Recent sessions across all providers with model, tools, cost, and duration. + +## Queries to run + +Use `scripts/obs-api.sh`. Run independent queries in parallel. 
+ +### Claude sessions (from native OTel cost metrics — has session_id + model) + +```bash +scripts/obs-api.sh prom /api/v1/query --raw --jq '.data.result[:20] | .[] | "\(.metric.session_id)\t\(.metric.model)\t\(.value[1])"' --data-urlencode 'query=sort_desc(max by (session_id, model) (shepherd_claude_code_cost_usage_USD_total))' +``` + +### Tempo traces (all providers — has traceID, service, duration) + +```bash +scripts/obs-api.sh tempo '/api/search?q=%7Bspan%3Aname%3D%22claude.session%22%7D&limit=10' --raw --jq '.traces[:10][] | "\(.traceID)\t\(.rootServiceName)\t\(.durationMs)ms"' +scripts/obs-api.sh tempo '/api/search?q=%7Bspan%3Aname%3D%22gemini.session%22%7D&limit=10' --raw --jq '.traces[:10][] | "\(.traceID)\t\(.rootServiceName)\t\(.durationMs)ms"' +scripts/obs-api.sh tempo '/api/search?q=%7Bspan%3Aname%3D%22codex.session%22%7D&limit=10' --raw --jq '.traces[:10][] | "\(.traceID)\t\(.rootServiceName)\t\(.durationMs)ms"' +``` + +### Session count per provider + +```bash +scripts/obs-api.sh prom /api/v1/query --raw --jq '.data.result[0].value[1] // "0"' --data-urlencode 'query=count(max_over_time(shepherd_claude_code_session_count_total[24h]))' +scripts/obs-api.sh prom /api/v1/query --raw --jq '.data.result[0].value[1] // "0"' --data-urlencode 'query=count(max_over_time(shepherd_gemini_cli_session_count_total[24h]))' +scripts/obs-api.sh prom /api/v1/query --raw --jq '.data.result[0].value[1] // "0"' --data-urlencode 'query=sum(sum_over_time(shepherd:codex:sessions:1m[24h]))' +``` + +## Output columns + +| Column | Source | +|--------|--------| +| Provider | claude/gemini/codex (from Tempo service name) | +| Session | first 8 chars of session_id or traceID | +| Model | from Claude cost metrics (others: from Tempo trace attributes if available) | +| Duration | from Tempo trace | +| Cost | Claude native OTel (others: `—`) | + +For output format options (table/csv/json), read `.claude/skills/obs-shared/assets/output-formats.md`. + +## Presentation + +1. 
Session table sorted by most recent, max 20 rows +2. Truncate session IDs to first 8 chars +3. Highlight sessions with high cost (>$1), many tools (>50), or errors +4. Mention Grafana Session Timeline dashboard for full trace view +5. If no sessions: "No sessions recorded. Use a CLI with hooks installed, then check `/obs-status`." diff --git a/.claude/skills/obs-shared/assets/output-formats.md b/.claude/skills/obs-shared/assets/output-formats.md new file mode 100644 index 0000000..39667d3 --- /dev/null +++ b/.claude/skills/obs-shared/assets/output-formats.md @@ -0,0 +1,59 @@ +# Output Formats + +All obs skills support three output formats. Default is **table** unless the user requests otherwise. + +## Pretty Table (default) + +Markdown table with aligned columns. Use `—` for missing values. Round numbers to 2 decimal places for costs, 0 for token/call counts. + +``` +| Column1 | Column2 | Column3 | +|---------|---------|---------| +| value | value | value | +``` + +Rules: +- Header row is bold-style (markdown table header) +- Status columns: use UP/DOWN or OK/ERROR (no emojis unless user asks) +- Cost columns: prefix with `$` +- Large numbers: use `k` suffix for thousands (e.g., `124k`) +- Timestamps: relative ("3m ago", "2h ago") unless user asks for absolute + +## CSV + +Comma-separated values. First row is header. Quote fields that contain commas. + +``` +column1,column2,column3 +value,"value, with comma",value +``` + +Rules: +- No markdown formatting +- Timestamps: ISO 8601 (`2026-03-11T21:00:00Z`) +- Numbers: raw (no `k` suffix, no `$` prefix) +- Missing values: empty field + +## JSON + +Array of objects. One object per row. 
+ +```json +[ + {"column1": "value", "column2": 123, "column3": null}, + {"column1": "value", "column2": 456, "column3": null} +] +``` + +Rules: +- Numbers are numbers (not strings) +- Missing values: `null` +- Timestamps: ISO 8601 strings +- Costs: raw float (no `$`) + +## How to detect user's format preference + +- Default: table +- User says "csv", "export", "spreadsheet" → CSV +- User says "json", "raw", "api" → JSON +- User says "table", "pretty", or nothing → table diff --git a/.claude/skills/obs-status/SKILL.md b/.claude/skills/obs-status/SKILL.md new file mode 100644 index 0000000..e1f8c7e --- /dev/null +++ b/.claude/skills/obs-status/SKILL.md @@ -0,0 +1,62 @@ +--- +name: obs-status +description: Check obs stack health and service status. Use this skill whenever the user asks about stack health, whether services are running, if telemetry is flowing, why dashboards show no data, or anything related to the obs stack being up or down. Also use when troubleshooting "no data" issues, checking if Prometheus/Loki/Tempo/Grafana are reachable, or after running docker compose up to verify everything started correctly. +disable-model-invocation: false +--- + +# Obs Stack Status + +Check health of all 6 obs stack services, scrape targets, alerts, and last telemetry. + +## Queries to run + +Use `scripts/obs-api.sh` to query each service. Run independent checks in parallel. + +### Step 1: Service health (run all in parallel) + +```bash +scripts/obs-api.sh grafana /api/health +scripts/obs-api.sh loki /ready +scripts/obs-api.sh prom /-/healthy +scripts/obs-api.sh am /-/healthy +scripts/obs-api.sh tempo /ready +``` + +If a command fails or returns empty, the service is down. 
+ +### Step 2: Prometheus scrape targets + +```bash +scripts/obs-api.sh prom /api/v1/targets --raw --jq '.data.activeTargets[] | "\(.labels.job)\t\(.health)"' +``` + +### Step 3: Active alerts + +```bash +scripts/obs-api.sh am /api/v2/alerts --raw --jq 'if length == 0 then "No active alerts" else .[] | "\(.labels.alertname)\t\(.labels.severity // "?")" end' +``` + +### Step 4: Last telemetry received + +```bash +scripts/obs-api.sh prom '/api/v1/query?query=max(shepherd_events_total)' --raw --jq '.data.result[0].value' +scripts/obs-api.sh prom '/api/v1/query?query=max(shepherd_claude_code_cost_usage_USD_total)' --raw --jq '.data.result[0].value' +``` + +## Output columns + +| Column | Source | +|--------|--------| +| Service | service name | +| Port | 3000, 3100, 9090, 9093, 3200, 4317/4318 | +| Status | UP if healthy response, DOWN if empty/error | + +For output format options (table/csv/json), read `.claude/skills/obs-shared/assets/output-formats.md`. + +## Presentation + +1. Service status table (name | port | status) +2. Scrape targets (job | health) +3. Active alerts with severity (if any) +4. Last telemetry — how long ago +5. If any service is DOWN: suggest `docker compose up -d` or `docker compose logs --tail=20` diff --git a/.claude/skills/obs-tools/SKILL.md b/.claude/skills/obs-tools/SKILL.md new file mode 100644 index 0000000..82a0a89 --- /dev/null +++ b/.claude/skills/obs-tools/SKILL.md @@ -0,0 +1,72 @@ +--- +name: obs-tools +description: Show tool usage statistics and error rates. Use this skill whenever the user asks about which tools are used most, tool error rates, wants to see Read/Edit/Bash/Grep call counts, asks about tool failures, sensitive file access, or wants a breakdown of tool usage by provider or repository. Also trigger when the user mentions "tool stats", "what tools did I use", or asks about tool performance. +disable-model-invocation: false +--- + +# Obs Tools Report + +Tool usage, error rates, and breakdown by provider and repo. 
+ +## Queries to run + +Use `scripts/obs-api.sh`. Run independent queries in parallel. + +### Top 15 tools by call count (24h) + +```bash +scripts/obs-api.sh prom /api/v1/query --raw --jq '.data.result[] | "\(.metric.tool)\t\(.value[1])"' --data-urlencode 'query=topk(15, sort_desc(sum by (tool) (round(increase(shepherd_tool_calls_total[24h])))))' +``` + +### Tool calls by provider (24h) + +```bash +scripts/obs-api.sh prom /api/v1/query --raw --jq '.data.result[] | "\(.metric.source)\t\(.value[1])"' --data-urlencode 'query=sort_desc(sum by (source) (round(increase(shepherd_tool_calls_total[24h]))))' +``` + +### Tool calls by repo (24h) + +```bash +scripts/obs-api.sh prom /api/v1/query --raw --jq '.data.result[] | select(.metric.git_repo != "") | "\(.metric.git_repo)\t\(.value[1])"' --data-urlencode 'query=sort_desc(sum by (git_repo) (round(increase(shepherd_tool_calls_total[24h]))))' +``` + +### Error rate by tool (24h) + +```bash +scripts/obs-api.sh prom /api/v1/query --raw --jq '.data.result[] | "\(.metric.tool)\t\(.value[1])%"' --data-urlencode 'query=sort_desc(sum by (tool) (round(increase(shepherd_tool_calls_total{tool_status="error"}[24h]))) / sum by (tool) (round(increase(shepherd_tool_calls_total[24h]))) * 100 > 0)' +``` + +### Totals (24h) + +```bash +scripts/obs-api.sh prom /api/v1/query --raw --jq '.data.result[0].value[1] // "0"' --data-urlencode 'query=round(sum(increase(shepherd_tool_calls_total[24h])))' +scripts/obs-api.sh prom /api/v1/query --raw --jq '.data.result[0].value[1] // "0"' --data-urlencode 'query=round(sum(increase(shepherd_tool_calls_total{tool_status="error"}[24h])))' +scripts/obs-api.sh prom /api/v1/query --raw --jq '.data.result[0].value[1] // "0"' --data-urlencode 'query=round(sum(increase(shepherd_sensitive_file_access_total[24h])))' +``` + +### Gemini native tool metrics (24h) + +```bash +scripts/obs-api.sh prom /api/v1/query --raw --jq '.data.result[] | "\(.metric.function_name)\t\(.value[1])"' --data-urlencode 'query=topk(10, 
sort_desc(sum by (function_name) (round(increase(shepherd_gemini_cli_tool_call_count_total[24h])))))' +``` + +## Output columns + +| Column | Source | +|--------|--------| +| Tool | tool name | +| Calls | total invocations | +| Errors | error count | +| Error Rate | errors/calls % | + +For output format options (table/csv/json), read `.claude/skills/obs-shared/assets/output-formats.md`. + +## Presentation + +1. **Summary line**: "847 tool calls, 3 errors (0.4%), 0 sensitive access" +2. **Top tools** table: Tool | Calls | Error Rate +3. **By provider** breakdown +4. **By repo** breakdown +5. Flag tools with error rate > 10% +6. If sensitive file access > 0, highlight as security note +7. If no data: "No tool calls recorded in the last 24h." diff --git a/.gitignore b/.gitignore index f7bda1a..52257ea 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,8 @@ *.swo # AI CLI -.claude/ +.claude/* +!.claude/skills/ .gemini/ # Environment diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a9b1b4..44826df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,16 +4,18 @@ All notable changes to shepard-obs-stack ("The Eye") are documented here. ## [Unreleased] -### Fixed +### Added +- **Claude Code skills** (6 slash commands): `/obs-status`, `/obs-cost`, `/obs-sessions`, + `/obs-tools`, `/obs-alerts`, `/obs-query` — query the obs stack directly from Claude Code + without switching to the browser. Covers health checks, cost reports, session summaries, + tool usage, active alerts, and free-form PromQL/LogQL queries. +- **`scripts/obs-api.sh`** — centralized API client for all obs stack services. Auth-ready: + supports `SHEPARD_API_TOKEN` (Bearer), `SHEPARD_CA_CERT` (TLS), `SHEPARD_GRAFANA_TOKEN` + via environment variables. Defaults to plain HTTP on localhost for single-machine use. - **LokiDown alert was checking OTel Collector, not Loki** — `up{job="shepherd-services"}` monitored the collector exporter (port 8889), not Loki itself. Now uses dedicated `up{job="loki"}` scrape job. 
Old check renamed to `ShepherdServicesDown`. -- **Compaction arithmetic error in stop.sh** — `grep -c` returns "0" with exit 1, - `|| echo "0"` produced `"0\n0"`, causing `-gt` comparison to fail. - -### Added - - **Test suite** (113 tests, 4 suites): shell syntax (23), config validation (25), hook behavior (41), session parsers (24). Run with `bash tests/run-all.sh`. - **CI workflow** (`.github/workflows/test.yml`): unit tests + Docker E2E smoke. @@ -25,11 +27,18 @@ All notable changes to shepard-obs-stack ("The Eye") are documented here. - **promtool validation** in CI — `promtool check rules` on all Prometheus alert files. - **Alert regression tests** — rule counts per file + expression guards (LokiDown, ShepherdServicesDown, OTelCollectorDown). +### Fixed + +- **LokiDown alert** — split from collector exporter check into dedicated Loki scrape job. +- **Compaction arithmetic error in stop.sh** — `grep -c` returns "0" with exit 1, + `|| echo "0"` produced `"0\n0"`, causing `-gt` comparison to fail. + ### Changed - Alert count: 15 → 16 rules (LokiDown split into LokiDown + ShepherdServicesDown). - Inhibit rules: `OTelCollectorDown` now also suppresses `ShepherdServicesDown`; `ShepherdServicesDown` suppresses `NoTelemetryReceived`, `HighToolErrorRate`, `HighSessionCost`. +- `.gitignore`: un-ignore `.claude/skills/` for tracking slash-command skills in the repo. ## [1.1.0] — 2026-03-05 diff --git a/CLAUDE.md b/CLAUDE.md index 318390a..7e49bcc 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -250,6 +250,25 @@ Session Timeline (08) shows synthetic traces parsed from all 3 CLI session logs Stat/table panels use Prometheus `traces_spanmetrics_calls_total` counters (generated by Tempo's span-metrics processor). Session Traces table uses Tempo trace search. Tool Duration Distribution uses Prometheus `traces_spanmetrics_latency_bucket`. 
+## Claude Code Skills + +Six slash-command skills in `.claude/skills/` for querying the stack from the conversation: + +| Skill | Purpose | `disable-model-invocation` | +|-------|---------|---------------------------| +| `/obs-status` | Service health, scrape targets, last telemetry, alerts | false | +| `/obs-cost` | Cost by provider/model, token breakdown (24h default) | false | +| `/obs-sessions` | Recent sessions from span-metrics + Tempo traces | false | +| `/obs-tools` | Top tools, error rates, by provider/repo | false | +| `/obs-alerts` | Active Alertmanager alerts + resolution hints | false | +| `/obs-query` | Free-form PromQL/LogQL execution | false | + +All skills use `scripts/obs-api.sh` — centralized API client with env var overrides for auth/TLS: +- `SHEPARD_PROMETHEUS_URL`, `SHEPARD_LOKI_URL`, `SHEPARD_TEMPO_URL`, `SHEPARD_GRAFANA_URL`, `SHEPARD_ALERTMANAGER_URL` +- `SHEPARD_API_TOKEN` (Bearer auth), `SHEPARD_CA_CERT` (TLS CA cert), `SHEPARD_GRAFANA_TOKEN` (Grafana API key) + +Skills are tracked in git (`.gitignore` un-ignores `.claude/skills/`). + ## Config Structure ``` diff --git a/README.md b/README.md index 2ee874e..644f936 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,7 @@ This fixes that. - [Dashboards](#dashboards) - [How It Works](#how-it-works) - [Hook Setup](#hook-setup) +- [Claude Code Skills](#claude-code-skills) - [Rust Accelerator](#rust-accelerator-optional) - [Alerting](#alerting) - [Services](#services) @@ -62,6 +63,7 @@ This fixes that. - **Eight Grafana dashboards** auto-provisioned: cost, tools, operations, quality, per-provider deep dives, and session timeline - **Minimal dependencies** — Docker, plus `bash`, `curl`, and `jq` on the host for hooks and tests. No Python, no Node, no cloud accounts - **Optional [Rust accelerator](https://github.com/shepard-system/shepard-hooks-rs)** — drop-in `shepard-hook` binary replaces bash+jq+curl. 
Hooks auto-detect it; falls back to bash if absent +- **Six Claude Code [skills](#claude-code-skills)** — `/obs-status`, `/obs-cost`, `/obs-sessions`, `/obs-tools`, `/obs-alerts`, `/obs-query` — query the stack without leaving your terminal - **Works offline** — everything runs on localhost, your data stays on your machine ## Quick Start @@ -153,6 +155,30 @@ The installer auto-detects installed CLIs and merges hook configuration into the | Codex CLI | `agent-turn-complete` | logs | | Gemini CLI | `AfterTool`, `AfterAgent`, `AfterModel`, `SessionEnd` | metrics + logs + traces | +## Claude Code Skills + +Six slash-command skills for querying the obs stack directly from Claude Code — no browser needed. + +| Skill | What it does | +|-------|-------------| +| `/obs-status` | Stack health: service status, scrape targets, last telemetry, active alerts | +| `/obs-cost` | Cost report by provider and model (supports `today`, `yesterday`, `week`, `24h`) | +| `/obs-sessions` | Recent sessions with model, duration, tool count, cost | +| `/obs-tools` | Top tools, error rates, usage by provider and repo | +| `/obs-alerts` | Active alerts with severity and resolution hints | +| `/obs-query` | Free-form PromQL or LogQL — run any query inline | + +Skills are installed automatically when you clone the repo (they live in `.claude/skills/`). All API calls go through `scripts/obs-api.sh` — a centralized helper that's ready for auth and TLS when you need it: + +```bash +# Default: plain HTTP to localhost (single-machine, no auth) +./scripts/obs-api.sh prometheus /api/v1/query --data-urlencode 'query=up' + +# With auth (set env vars when hardening or going multi-machine) +SHEPARD_API_TOKEN=secret ./scripts/obs-api.sh prometheus /api/v1/query ... +SHEPARD_CA_CERT=/path/to/ca.pem ./scripts/obs-api.sh loki /ready +``` + ## Rust Accelerator (optional) All hooks work out of the box with bash + jq + curl. 
For faster execution, you can optionally install the [Rust accelerator](https://github.com/shepard-system/shepard-hooks-rs) — a single static binary that replaces the entire bash pipeline: @@ -233,6 +259,13 @@ Native Telegram, Slack, and Discord receivers are included — uncomment and con shepard-obs-stack/ ├── docker-compose.yaml ├── .env.example +├── .claude/skills/ # Claude Code slash-command skills +│ ├── obs-status/ # /obs-status — stack health +│ ├── obs-cost/ # /obs-cost — cost report +│ ├── obs-sessions/ # /obs-sessions — session summary +│ ├── obs-tools/ # /obs-tools — tool usage +│ ├── obs-alerts/ # /obs-alerts — active alerts +│ └── obs-query/ # /obs-query — free-form PromQL/LogQL ├── hooks/ │ ├── bin/ # Rust accelerator binary (gitignored, downloaded) │ ├── lib/ # shared: accelerator, git context, OTLP metrics + traces, sensitive file detection, session parser @@ -244,6 +277,7 @@ shepard-obs-stack/ ├── scripts/ │ ├── init.sh # bootstrap │ ├── install-accelerator.sh # download Rust accelerator to hooks/bin/ +│ ├── obs-api.sh # centralized API client (auth-ready) │ ├── test-signal.sh # pipeline verification (11 checks) │ └── render-c4.sh # render PlantUML → SVG ├── tests/ diff --git a/scripts/obs-api.sh b/scripts/obs-api.sh new file mode 100755 index 0000000..a17d2d6 --- /dev/null +++ b/scripts/obs-api.sh @@ -0,0 +1,123 @@ +#!/usr/bin/env bash +# scripts/obs-api.sh — centralized API client for the obs stack +# +# Usage: +# obs-api.sh [--jq ] [extra-curl-args...] +# obs-api.sh prom /api/v1/query --jq '.data.result[]' --data-urlencode 'query=up' +# obs-api.sh loki /ready +# obs-api.sh am /api/v2/alerts --jq '.[].labels.alertname' +# +# Services: prometheus (prom), loki, tempo, grafana, alertmanager (am), collector (otel) +# +# Options: +# --jq Pipe output through jq with the given filter +# --raw Pass -r to jq (raw output, no quotes). Only with --jq. +# +# Auth-ready: set SHEPARD_API_TOKEN for Bearer auth, SHEPARD_CA_CERT for TLS. 
# All env vars have sensible localhost defaults for single-machine use.

set -u

# ── Service URLs (override via env) ─────────────────────────────────
SHEPARD_PROMETHEUS_URL="${SHEPARD_PROMETHEUS_URL:-http://localhost:9090}"
SHEPARD_LOKI_URL="${SHEPARD_LOKI_URL:-http://localhost:3100}"
SHEPARD_TEMPO_URL="${SHEPARD_TEMPO_URL:-http://localhost:3200}"
SHEPARD_GRAFANA_URL="${SHEPARD_GRAFANA_URL:-http://localhost:3000}"
SHEPARD_ALERTMANAGER_URL="${SHEPARD_ALERTMANAGER_URL:-http://localhost:9093}"
SHEPARD_COLLECTOR_URL="${SHEPARD_COLLECTOR_URL:-http://localhost:8888}"

# ── Auth (empty = no auth, set when hardening) ──────────────────────
SHEPARD_API_TOKEN="${SHEPARD_API_TOKEN:-}"
SHEPARD_CA_CERT="${SHEPARD_CA_CERT:-}"
SHEPARD_GRAFANA_TOKEN="${SHEPARD_GRAFANA_TOKEN:-}"

# ── Resolve service → base URL ──────────────────────────────────────
# Maps a service name (or its short alias) to a base URL.
# Arguments: $1 - service name (prometheus|prom, loki, tempo, grafana,
#                 alertmanager|am, collector|otel)
# Outputs:   base URL on stdout
# Returns:   1 (with a message on stderr) for an unknown service
resolve_url() {
  case "$1" in
    prometheus|prom) echo "$SHEPARD_PROMETHEUS_URL" ;;
    loki)            echo "$SHEPARD_LOKI_URL" ;;
    tempo)           echo "$SHEPARD_TEMPO_URL" ;;
    grafana)         echo "$SHEPARD_GRAFANA_URL" ;;
    alertmanager|am) echo "$SHEPARD_ALERTMANAGER_URL" ;;
    collector|otel)  echo "$SHEPARD_COLLECTOR_URL" ;;
    *) echo "Unknown service: $1" >&2; return 1 ;;
  esac
}

# ── Build auth headers ──────────────────────────────────────────────
# Emits curl auth/TLS arguments, one per line, for the caller's read
# loop to collect (a single blank line means "no auth configured").
# Arguments: $1 - service name
auth_args() {
  local service="$1"
  local args=()

  # Grafana has its own token (API key / service account); all other
  # services share the bearer token.
  if [[ "$service" == "grafana" && -n "$SHEPARD_GRAFANA_TOKEN" ]]; then
    args+=(-H "Authorization: Bearer $SHEPARD_GRAFANA_TOKEN")
  elif [[ -n "$SHEPARD_API_TOKEN" ]]; then
    args+=(-H "Authorization: Bearer $SHEPARD_API_TOKEN")
  fi

  # TLS CA certificate applies to every service.
  if [[ -n "$SHEPARD_CA_CERT" ]]; then
    args+=(--cacert "$SHEPARD_CA_CERT")
  fi

  # ${args[@]+...} keeps the expansion safe under `set -u` when the
  # array is empty: on bash < 4.4 (e.g. macOS's bash 3.2) a plain
  # "${args[@]}" on an empty array is an "unbound variable" error.
  printf '%s\n' ${args[@]+"${args[@]}"}
}

# ── Main ────────────────────────────────────────────────────────────
if [[ $# -lt 2 ]]; then
  echo "Usage: obs-api.sh <service> <path> [--jq <filter>] [--raw] [extra-curl-args...]" >&2
  echo "Services: prometheus, loki, tempo, grafana, alertmanager, collector" >&2
  exit 1
fi

SERVICE="$1"
shift
API_PATH="$1"
shift

# Parse --jq and --raw from the remaining args; everything else is
# passed straight through to curl (e.g. --data-urlencode 'query=up').
JQ_FILTER=""
JQ_RAW=false
EXTRA_ARGS=()

while [[ $# -gt 0 ]]; do
  case "$1" in
    --jq)
      # Guard the value: under `set -u`, reading a missing $2 would
      # crash with a raw "unbound variable" instead of a clear error.
      if [[ $# -lt 2 ]]; then
        echo "--jq requires a filter argument" >&2
        exit 1
      fi
      JQ_FILTER="$2"
      shift 2
      ;;
    --raw)
      JQ_RAW=true
      shift
      ;;
    *)
      EXTRA_ARGS+=("$1")
      shift
      ;;
  esac
done

BASE_URL=$(resolve_url "$SERVICE") || exit 1

# Build curl command: silent, fail on HTTP errors, bounded wait.
CURL_ARGS=(-sf --max-time 10)

# Collect auth args (newline-delimited; blank line = none configured).
while IFS= read -r arg; do
  [[ -n "$arg" ]] && CURL_ARGS+=("$arg")
done < <(auth_args "$SERVICE")

# Append caller-supplied curl args (empty-array-safe under set -u).
CURL_ARGS+=(${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"})

# Execute. Fetch first, then filter: piping curl into jq would make
# the pipeline's exit status jq's and silently mask curl failures.
if [[ -n "$JQ_FILTER" ]]; then
  JQ_ARGS=()
  if $JQ_RAW; then
    JQ_ARGS+=(-r)
  fi
  RESPONSE=$(curl "${CURL_ARGS[@]}" "${BASE_URL}${API_PATH}") || exit $?
  printf '%s' "$RESPONSE" | jq ${JQ_ARGS[@]+"${JQ_ARGS[@]}"} "$JQ_FILTER"
else
  curl "${CURL_ARGS[@]}" "${BASE_URL}${API_PATH}"
fi