codeplaneapp · SamuelLHuber · Mar 28, 2026 · Mar 28, 2026 · Mar 28, 2026 · Mar 28, 2026
diff --git a/.gitignore b/.gitignore
@@ -53,6 +53,8 @@ deno.lock
 reference/
 codex/
 ghostty/
+observability/data/
+tmp/verification/
 
 # Worktree script SQLite artifacts
 scripts/worktree-feature/*.db*

diff --git a/docs/guides/agent-trace-otel-verification.mdx b/docs/guides/agent-trace-otel-verification.mdx
@@ -0,0 +1,253 @@
+---
+title: Agent Trace OTEL Log Verification
+description: Reproducible local verification for Smithers canonical trace and provider session transcript OTEL log export with Loki.
+---
+
+## Local stack
+
+Start the local stack:
+
+```bash
+smithers observability --detach
+```
+
+Expected endpoints:
+
+- Grafana: `http://localhost:3001`
+- Prometheus: `http://localhost:9090`
+- Tempo: `http://localhost:3200`
+- Loki: `http://localhost:3100`
+- OTEL collector HTTP: `http://localhost:4318`
+
+Validate the stack:
+
+```bash
+docker compose -f observability/docker-compose.otel.yml ps
+curl -sf http://localhost:3100/ready
+curl -sf http://localhost:3001/api/datasources | jq 'map({name,type,url})'
+docker logs observability-otel-collector-1 | tail -n 80
+```
+
+## Enable OTEL export
+
+```bash
+export SMITHERS_OTEL_ENABLED=1
+export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318
+export OTEL_SERVICE_NAME=smithers-dev
+export SMITHERS_LOG_FORMAT=json
+```
+
+## Demo workflow
+
+Use the built-in reproducible workflow at `workflows/agent-trace-otel-demo.tsx`.
+It emits:
+
+- one Pi-like high-fidelity attempt with canonical trace events plus persisted session transcript rows
+- one Claude-like structured attempt with canonical trace events plus provider session transcript rows
+- one Codex-like structured attempt with canonical trace events plus provider session transcript rows
+- one Gemini-like structured attempt
+- one SDK-style final-only attempt
+- stable run annotations for Loki queries
+
+Run the success case:
+
+```bash
+smithers run workflows/agent-trace-otel-demo.tsx \
+  --run-id agent-trace-otel-demo \
+  --annotations '{"custom.demo":true,"custom.ticket":"OBS-123"}'
+```
+
+Run the malformed JSON failure case:
+
+```bash
+smithers run workflows/agent-trace-otel-demo.tsx \
+  --run-id agent-trace-otel-demo-fail \
+  --input '{"failureMode":"malformed-json"}' \
+  --annotations '{"custom.demo":true,"custom.ticket":"OBS-ERR"}'
+```
+
+Optional durable-local verification:
+
+```bash
+jq 'select(.type == "AgentTraceEvent" or .type == "AgentTraceSummary")' \
+  .smithers/executions/agent-trace-otel-demo/logs/stream.ndjson
+
+find .smithers/executions/agent-trace-otel-demo/logs/agent-trace -type f -maxdepth 1 -print
+```
+
+## Loki queries
+
+Smithers OTEL attributes are exposed to Loki as sanitized structured metadata fields such as:
+
+- `smithers_event_category`
+- `run_id`
+- `workflow_path`
+- `node_id`
+- `node_attempt`
+- `agent_family`
+- `agent_capture_mode`
+- `trace_completeness`
+- `event_kind`
+- `session_row_type`
+
+Smithers exports two related Loki log families:
+
+- `agent-trace`: normalized canonical execution events such as deltas, tool lifecycle, usage, and capture warnings/errors
+- `agent-session`: provider transcript/session rows observed live or backfilled from persisted session logs
+
+Both families share the same run/workflow/node/attempt/agent correlation fields and the same redaction rules.
+`artifact.created` remains local-only and is not exported to Loki.
+
+Use `{service_name="smithers-dev"}` as the stream selector, then filter on structured metadata in the LogQL pipeline. Use `| json` to inspect the structured log body.
+
+All events for one run:
+
+```logql
+{service_name="smithers-dev"} | run_id="agent-trace-otel-demo"
+```
+
+One node attempt:
+
+```logql
+{service_name="smithers-dev"} | run_id="agent-trace-otel-demo" | node_id="pi-rich-trace" | node_attempt="1"
+```
+
+Thinking deltas only:
+
+```logql
+{service_name="smithers-dev"} | run_id="agent-trace-otel-demo" | event_kind="assistant.thinking.delta"
+```
+
+Tool execution only:
+
+```logql
+{service_name="smithers-dev"} | run_id="agent-trace-otel-demo" | event_kind=~"tool\.execution\..*"
+```
+
+Capture errors only:
+
+```logql
+{service_name="smithers-dev"} | event_kind="capture.error"
+```
+
+Inspect the structured JSON body for one run:
+
+```logql
+{service_name="smithers-dev"} | run_id="agent-trace-otel-demo" | json
+```
+
+Canonical trace rows only:
+
+```logql
+{service_name="smithers-dev"} | smithers_event_category="agent-trace" | run_id="agent-trace-otel-demo"
+```
+
+Provider session rows only:
+
+```logql
+{service_name="smithers-dev"} | smithers_event_category="agent-session" | run_id="agent-trace-otel-demo"
+```
+
+Pi persisted session metadata:
+
+```logql
+{service_name="smithers-dev"} | run_id="agent-trace-otel-demo" | node_id="pi-rich-trace" | session_row_type="model_change"
+```
+
+Claude persisted session queue events:
+
+```logql
+{service_name="smithers-dev"} | run_id="agent-trace-otel-demo" | node_id="claude-structured-trace" | session_row_type="queue-operation"
+```
+
+Codex persisted session reasoning rows:
+
+```logql
+{service_name="smithers-dev"} | run_id="agent-trace-otel-demo" | node_id="codex-structured-trace" | session_row_type="event_msg"
+```
+
+Redaction proof query:
+
+```logql
+{service_name="smithers-dev"} | run_id="agent-trace-otel-demo" |= "REDACTED"
+```
+
+## API query examples
+
+Equivalent direct Loki API checks:
+
+```bash
+curl -sG 'http://localhost:3100/loki/api/v1/query_range' \
+  --data-urlencode 'query={service_name="smithers-dev"} | run_id="agent-trace-otel-demo"' \
+  --data-urlencode 'limit=200' | jq '.data.result[] | {stream, values: (.values | length)}'
+
+curl -sG 'http://localhost:3100/loki/api/v1/query_range' \
+  --data-urlencode 'query={service_name="smithers-dev"} | run_id="agent-trace-otel-demo" | event_kind="assistant.thinking.delta"' \
+  --data-urlencode 'limit=20' | jq '.data.result[]?.values[]?[1]'
+
+curl -sG 'http://localhost:3100/loki/api/v1/query_range' \
+  --data-urlencode 'query={service_name="smithers-dev"} | run_id="agent-trace-otel-demo" | node_id="codex-structured-trace" | session_row_type="event_msg"' \
+  --data-urlencode 'limit=20' | jq '.data.result[]?.values[]?[1]'
+```
+
+## Tempo trace checks
+
+Tempo search should show Smithers spans once a workflow has run:
+
+```bash
+curl -s http://localhost:3200/api/search | jq .
+curl -s http://localhost:3200/api/search/tags | jq .
+curl -s 'http://localhost:3200/api/search/tag/service.name/values' | jq .
+curl -s 'http://localhost:3200/api/search/tag/runId/values' | jq .
+```
+
+Inspect one trace directly:
+
+```bash
+TRACE_ID=$(curl -s http://localhost:3200/api/search \
+  | jq -r '.traces[] | select(.rootTraceName=="engine:run-workflow") | .traceID' \
+  | head -n 1)
+
+curl -s http://localhost:3200/api/traces/$TRACE_ID | jq .
+```
+
+Expected trace attributes include at least:
+
+- `service.name = smithers-dev`
+- `runId = <RUN_ID>`
+- `workflowPath = <workflow path>`
+
+## Verification checklist
+
+- stack starts successfully in Docker
+- Loki is present and queryable
+- collector logs pipeline is active
+- Pi traces show text deltas, thinking deltas, tool execution lifecycle, final message, usage, and run/node/attempt correlation
+- Pi session transcript rows are queryable in Loki
+- Claude session transcript rows are queryable in Loki
+- Codex session transcript rows are queryable in Loki
+- second agent family is exported with truthful `final-only` completeness classification
+- Gemini `stream-json` attempts preserve structured deltas truthfully
+- Kimi `stream-json` attempts preserve structured tool lifecycle truthfully
+- malformed or truncated structured streams emit `capture.error` and classify as `capture-failed`
+- artifact write failures emit `capture.warning` and degrade to `partial-observed` without losing durable DB truth
+- Tempo search shows Smithers spans and trace attributes including `runId`
+- Prometheus is still scraping the collector successfully
+- secrets are redacted from canonical events, OTEL log bodies, and persisted trace artifacts
+
+## Automated coverage
+
+The fixture-style and workflow-backed tests in `tests/agent-trace.test.tsx` now cover:
+
+- Pi rich success and malformed JSON failure
+- SDK truthful `final-only`
+- Claude Code structured stream preservation
+- Claude session transcript backfill/export
+- Codex structured delta + usage preservation
+- Codex session transcript backfill/export
+- Gemini structured `stream-json` and coarse `cli-json` classification
+- Kimi structured `stream-json` tool lifecycle preservation
+- Pi session transcript backfill/export
+- truncated structured stream failure classification
+- artifact write failure classification
+- OTEL body/attribute shaping for Loki query fields
diff --git a/docs/guides/monitoring-logs.mdx b/docs/guides/monitoring-logs.mdx
@@ -58,8 +58,9 @@ That starts:
 
 - OTLP collector on `http://localhost:4318`
 - Prometheus on `http://localhost:9090`
-- Grafana on `http://localhost:3000`
+- Grafana on `http://localhost:3001`
 - Tempo on `http://localhost:3200`
+- Loki on `http://localhost:3100`
 
 The collector exports metrics on `:8889`, Prometheus scrapes that endpoint, and Grafana comes pre-provisioned with a Smithers dashboard.
 

diff --git a/observability/docker-compose.otel.yml b/observability/docker-compose.otel.yml
@@ -2,7 +2,8 @@ version: "3.9"
 
 services:
   otel-collector:
-    image: otel/opentelemetry-collector-contrib:latest
+    image: otel/opentelemetry-collector-contrib:0.148.0
+    restart: unless-stopped
     command: ["--config", "/etc/otelcol/config.yml"]
     volumes:
       - ./otel-collector-config.yml:/etc/otelcol/config.yml:ro
@@ -11,37 +12,91 @@ services:
       - "4318:4318"   # HTTP (Smithers sends here)
       - "8889:8889"   # Prometheus exporter scrape endpoint
     depends_on:
-      - tempo
+      tempo:
+        condition: service_healthy
+      loki:
+        condition: service_healthy
+    healthcheck:
+      test: ["CMD", "/otelcol-contrib", "--version"]
+      interval: 10s
+      timeout: 5s
+      retries: 12
+      start_period: 10s
+
+  loki:
+    image: grafana/loki:3.3.2
+    restart: unless-stopped
+    command: ["-config.file=/etc/loki/local-config.yaml"]
+    volumes:
+      - ./loki/loki-config.yaml:/etc/loki/local-config.yaml:ro
+      - ./data/loki:/loki
+    ports:
+      - "3100:3100"
+    healthcheck:
+      test: ["CMD", "wget", "-q", "-O-", "http://127.0.0.1:3100/ready"]
+      interval: 10s
+      timeout: 5s
+      retries: 18
+      start_period: 10s
 
   prometheus:
     image: prom/prometheus:v3.2.0
+    restart: unless-stopped
     volumes:
       - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - ./data/prometheus:/prometheus
     ports:
       - "9090:9090"
     depends_on:
-      - otel-collector
+      otel-collector:
+        condition: service_healthy
+    healthcheck:
+      test: ["CMD", "wget", "-q", "-O-", "http://127.0.0.1:9090/-/ready"]
+      interval: 10s
+      timeout: 5s
+      retries: 18
+      start_period: 10s
 
   tempo:
     image: grafana/tempo:2.7.0
+    restart: unless-stopped
     command: ["-config.file=/etc/tempo/tempo.yml"]
     volumes:
       - ./tempo/tempo.yml:/etc/tempo/tempo.yml:ro
+      - ./data/tempo:/tmp/tempo
     ports:
       - "3200:3200"   # Tempo query API
       - "4317"        # OTLP gRPC (internal)
+    healthcheck:
+      test: ["CMD", "wget", "-q", "-O-", "http://127.0.0.1:3200/api/search"]
+      interval: 10s
+      timeout: 5s
+      retries: 24
+      start_period: 20s
 
   grafana:
     image: grafana/grafana:11.5.0
+    restart: unless-stopped
     environment:
       - GF_SECURITY_ADMIN_PASSWORD=admin
       - GF_AUTH_ANONYMOUS_ENABLED=true
       - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
     volumes:
       - ./grafana/provisioning:/etc/grafana/provisioning:ro
       - ./grafana/dashboards:/var/lib/grafana/dashboards:ro
+      - ./data/grafana:/var/lib/grafana
     ports:
       - "3001:3000"
     depends_on:
-      - prometheus
-      - tempo
+      prometheus:
+        condition: service_healthy
+      tempo:
+        condition: service_healthy
+      loki:
+        condition: service_healthy
+    healthcheck:
+      test: ["CMD", "wget", "-q", "-O-", "http://127.0.0.1:3000/api/health"]
+      interval: 10s
+      timeout: 5s
+      retries: 24
+      start_period: 20s