diff --git a/AGENTS.md b/AGENTS.md index 078cfac2..7483f266 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -28,7 +28,7 @@ The CLI (`cli/selftune/`) is the **agent's API**. The skill definition (`skill/S ```text selftune/ ├── cli/selftune/ # TypeScript package — the CLI -│ ├── index.ts # CLI entry point +│ ├── index.ts # CLI entry point (status, doctor, alpha upload, etc.) │ ├── init.ts # Agent identity bootstrap + config init │ ├── sync.ts # Source-truth sync orchestration │ ├── orchestrate.ts # Autonomy-first loop: sync → evolve → watch @@ -61,11 +61,11 @@ selftune/ │ │ └── openclaw-ingest.ts # OpenClaw session importer (experimental) │ ├── routes/ # HTTP route handlers (extracted from dashboard-server) │ ├── repair/ # Rebuild repaired skill-usage overlays -│ ├── localdb/ # SQLite schema, direct-write, queries, materialization +│ ├── localdb/ # SQLite schema, direct-write, queries, materialization, canonical_upload_staging │ │ ├── db.ts # Database lifecycle + singleton │ │ ├── direct-write.ts # Fail-open insert functions for all tables │ │ ├── queries.ts # Read queries for dashboard + CLI consumers -│ │ ├── schema.ts # Table DDL + indexes +│ │ ├── schema.ts # Table DDL + indexes (includes canonical_upload_staging) │ │ └── materialize.ts # JSONL → SQLite rebuild (startup/backfill only) │ ├── cron/ # Optional OpenClaw-specific scheduler adapter │ ├── memory/ # Evolution memory persistence @@ -84,11 +84,20 @@ selftune/ │ │ └── stopping-criteria.ts # Stopping criteria evaluator │ ├── monitoring/ # Post-deploy monitoring (M4) │ │ └── watch.ts +│ ├── alpha-identity.ts # Alpha user identity (UUID, consent, persistence) +│ ├── alpha-upload-contract.ts # Upload queue infrastructure types + PushUploadResult +│ ├── alpha-upload/ # Alpha remote data pipeline (V2 canonical push to cloud API) +│ │ ├── index.ts # Upload orchestration (prepareUploads, runUploadCycle) +│ │ ├── queue.ts # Local upload queue + watermark tracking +│ │ ├── stage-canonical.ts # JSONL + SQLite → 
canonical_upload_staging writer +│ │ ├── build-payloads.ts # Staging table → V2 canonical push payload builders +│ │ ├── client.ts # HTTP upload client with Bearer auth (never throws) +│ │ └── flush.ts # Queue flush with exponential backoff (409=success, 401/403=non-retryable) │ ├── contribute/ # Opt-in anonymized data export (M7) │ │ ├── bundle.ts # Bundle assembler │ │ ├── sanitize.ts # Privacy sanitization (conservative/aggressive) │ │ └── contribute.ts # CLI entry point + GitHub submission -│ ├── observability.ts # Health checks, log integrity +│ ├── observability.ts # Health checks, log integrity, alpha queue health │ ├── status.ts # Skill health summary (M6) │ ├── last.ts # Last session insight (M6) │ └── workflows/ # Workflow discovery and persistence @@ -98,9 +107,15 @@ selftune/ │ └── src/hooks/ # Data-fetching hooks against dashboard-server ├── bin/ # npm/node CLI entry point │ └── selftune.cjs -├── skill/ # Agent-facing selftune skill -│ ├── SKILL.md # Skill definition +├── skill/ # Agent-facing selftune skill (self-contained) +│ ├── SKILL.md # Skill definition + routing │ ├── settings_snippet.json +│ ├── agents/ # Specialized subagents (bundled, copied to ~/.claude/agents/ on init) +│ │ ├── diagnosis-analyst.md +│ │ ├── evolution-reviewer.md +│ │ ├── integration-guide.md +│ │ └── pattern-analyst.md +│ ├── assets/ # Config templates (activation rules, settings) │ ├── Workflows/ # Skill workflow routing docs │ │ ├── Contribute.md │ │ ├── Cron.md @@ -120,6 +135,7 @@ selftune/ │ │ └── Watch.md │ └── references/ │ ├── grading-methodology.md +│ ├── interactive-config.md │ ├── invocation-taxonomy.md │ └── logs.md ├── tests/ # Test suite (bun test) @@ -174,7 +190,7 @@ This prevents stale docs and broken contracts. 
| Dashboard contract (`dashboard-contract.ts`) | `apps/local-dashboard/src/types.ts`, dashboard components that consume the changed fields | | Hook behavior (`hooks/*.ts`) | `skill/Workflows/Initialize.md` hook table, `skill/settings_snippet.json` | | Orchestrate behavior | `skill/Workflows/Orchestrate.md`, `ARCHITECTURE.md` operating modes | -| Agent files (`.claude/agents/*.md`) | `skill/SKILL.md` Specialized Agents table | +| Agent files (`skill/agents/*.md`) | `skill/SKILL.md` Specialized Agents table | | New workflow file | `skill/SKILL.md` Workflow Routing table + Resource Index | | Evolution pipeline changes | `skill/Workflows/Evolve.md`, `docs/design-docs/evolution-pipeline.md` | | Platform adapter (ingestor) changes | `skill/Workflows/Ingest.md`, `README.md` Platforms section | diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 5fb783a5..ea778987 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -1,4 +1,4 @@ - + # Architecture — selftune @@ -50,6 +50,12 @@ flowchart LR SQLite -. WAL watch .-> API API -. SSE push .-> SPA[apps/local-dashboard] API --> CLI[status / last / badge] + + SQLite -. alpha enrolled .-> AlphaUpload[alpha-upload pipeline] + AlphaUpload --> Queue[(upload_queue table)] + Queue --> Flush[flush + retry] + Flush --> CloudAPI[cloud API — POST /api/v1/push] + CloudAPI --> Postgres[(Neon Postgres — canonical tables)] ``` ## Operating Rules @@ -58,6 +64,7 @@ flowchart LR - **Shared local evidence.** Downstream modules communicate through SQLite (primary operational store), append-only JSONL audit trails, and repaired overlays. - **Autonomy with safeguards.** Low-risk description evolution can deploy automatically, but validation, watch, and rollback remain mandatory. - **Local-first product surfaces.** `status`, `last`, and the dashboard read from local evidence, not external services. +- **Alpha data pipeline.** Opted-in users upload V2 canonical push payloads to the cloud API via `alpha-upload/`. 
Uploads are fail-open and never block the orchestrate loop. - **Generic scheduling first.** `selftune cron setup` is the main automation path (auto-detects platform). `selftune schedule` is a backward-compatible alias. ## Domain Map @@ -78,6 +85,7 @@ flowchart LR | Local DB | `cli/selftune/localdb/` | SQLite materialization and payload-oriented queries | B | | Dashboard | `cli/selftune/dashboard.ts`, `cli/selftune/dashboard-server.ts`, `apps/local-dashboard/` | Local SPA shell, v2 API with SSE live updates, overview/report/status UI | B | | Observability CLI | `cli/selftune/status.ts`, `cli/selftune/last.ts`, `cli/selftune/badge/` | Fast local readouts of health, recent activity, and badge state | B | +| Alpha Upload | `cli/selftune/alpha-upload/`, `cli/selftune/alpha-identity.ts` | Alpha data pipeline: queue, V2 payload build, flush, HTTP transport with API key auth | B | | Contribute | `cli/selftune/contribute/` | Opt-in anonymized export for community signal pooling | C | | Skill | `skill/` | Agent-facing routing table, workflows, and references | B | @@ -164,16 +172,17 @@ don't need agent intelligence or user interaction. ## Data Architecture -SQLite is the operational database for all reads. Hooks and sync write +SQLite is the operational database for local runtime reads. Hooks and sync write directly to SQLite via `localdb/direct-write.ts`. JSONL files are retained -as an append-only audit trail and can be used to rebuild SQLite on demand. +as append-only capture/audit material and can still be used for rebuild/export +paths while the migration is being closed. 
```text Primary Store: SQLite (~/.selftune/selftune.db) ├── Hooks write directly via localdb/direct-write.ts (primary write path) ├── Sync writes directly via localdb/direct-write.ts ├── All reads (orchestrate, evolve, grade, status, dashboard) query SQLite -└── WAL-mode watch powers SSE live updates +└── Target freshness model: WAL-mode watch powers SSE live updates Audit Trail: JSONL files (~/.claude/*.jsonl) ├── session_telemetry_log.jsonl Session telemetry records @@ -192,12 +201,23 @@ Core Loop: reads SQLite Rebuild Paths: ├── materialize.ts — runs once on startup for historical JSONL backfill └── selftune export — generates JSONL from SQLite on demand + +Alpha Upload Path (opted-in users only): +├── stage-canonical.ts — reads canonical JSONL + evolution evidence + orchestrate_runs into canonical_upload_staging table +├── build-payloads.ts — reads staging table via single monotonic cursor, produces V2 canonical push payloads +├── flush.ts — POSTs to cloud API (POST /api/v1/push) with Bearer auth, handles 409/401/403 +└── Cloud storage: Neon Postgres (raw_pushes for lossless ingest → canonical tables for analysis) ``` -Hooks and sync write to both SQLite (primary) and JSONL (audit trail) in -parallel. All reads go through SQLite. The materializer runs once on startup -to backfill any historical JSONL data not yet in the database. `selftune export` -can regenerate JSONL from SQLite when needed for portability or debugging. +Hooks and sync currently write to both SQLite (primary local runtime store) +and JSONL (capture/audit trail) in parallel. All local product reads go +through SQLite. The materializer runs once on startup to backfill any +historical JSONL data not yet in the database. `selftune export` can +regenerate JSONL from SQLite when needed for portability or debugging. + +Current freshness caveat: the shipped dashboard still uses legacy JSONL file +watchers for SSE invalidation in `dashboard-server.ts`. 
WAL-only invalidation +is the intended end-state, but it is not the sole live-refresh path yet. ## Repository Shape diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 67f98561..40a6eb1b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -147,7 +147,7 @@ When modifying JSONL log schemas or adding new fields, update all of these to ke | Symptom | Fix | |---------|-----| | Dashboard shows stale data | `selftune sync --force` | -| SQLite schema mismatch after code change | `rm ~/.selftune/selftune.db && selftune sync --force` (materializer rebuilds from JSONL) | +| SQLite schema mismatch after code change | `selftune export` first, then `rm ~/.selftune/selftune.db && selftune sync --force` | | Missing invocations after hook changes | Verify `~/.claude/settings.json` matchers, then `selftune doctor` | | Need to backfill from transcripts | `selftune ingest claude --force` | diff --git a/README.md b/README.md index 7c03aa32..63932a9d 100644 --- a/README.md +++ b/README.md @@ -125,6 +125,7 @@ Your agent runs these — you just say what you want ("improve my skills", "show | **auto** | `selftune cron setup` | Install OS-level scheduling (cron/launchd/systemd) | | | `selftune watch --skill ` | Monitor after deploy. Auto-rollback on regression. | | **other** | `selftune telemetry` | Manage anonymous usage analytics (status, enable, disable) | +| | `selftune alpha upload` | Run a manual alpha upload cycle and emit a JSON send summary | Full command reference: `selftune --help` diff --git a/apps/local-dashboard/package.json b/apps/local-dashboard/package.json index f6cd0014..5485b3d2 100644 --- a/apps/local-dashboard/package.json +++ b/apps/local-dashboard/package.json @@ -4,7 +4,7 @@ "version": "0.1.0", "type": "module", "scripts": { - "dev": "concurrently \"cd ../.. && bun --watch run cli/selftune/dashboard-server.ts --port 7888\" \"vite\"", + "dev": "concurrently \"cd ../.. 
&& bun --watch run cli/selftune/dashboard-server.ts --port 7888 --runtime-mode dev-server\" \"sh -c 'echo \\\"Waiting for dashboard server on localhost:7888...\\\"; i=0; max=150; until curl -fsS http://localhost:7888/api/health >/dev/null 2>&1; do i=$((i+1)); if [ \\\"$i\\\" -ge \\\"$max\\\" ]; then echo \\\"Dashboard server did not become healthy within 30s\\\"; exit 1; fi; sleep 0.2; done; echo \\\"Dashboard server healthy; starting Vite.\\\"; vite --strictPort'\"", "build": "vite build", "preview": "vite preview", "typecheck": "tsc --noEmit", diff --git a/apps/local-dashboard/src/App.tsx b/apps/local-dashboard/src/App.tsx index 5f0bc769..eabed16f 100644 --- a/apps/local-dashboard/src/App.tsx +++ b/apps/local-dashboard/src/App.tsx @@ -10,6 +10,7 @@ import { Overview } from "@/pages/Overview" import { SkillReport } from "@/pages/SkillReport" import { Status } from "@/pages/Status" import { useOverview } from "@/hooks/useOverview" +import { RuntimeFooter } from "@/components/runtime-footer" import { useSSE } from "@/hooks/useSSE" import type { SkillHealthStatus, SkillSummary } from "@/types" import { deriveStatus, sortByPassRateAndChecks } from "@selftune/ui/lib" @@ -90,6 +91,7 @@ function DashboardShell() { } /> + ) } diff --git a/apps/local-dashboard/src/components/app-sidebar.tsx b/apps/local-dashboard/src/components/app-sidebar.tsx index 4e404888..ce013df1 100644 --- a/apps/local-dashboard/src/components/app-sidebar.tsx +++ b/apps/local-dashboard/src/components/app-sidebar.tsx @@ -32,6 +32,7 @@ import { GlobeIcon, HeartPulseIcon, HelpCircleIcon, + LayoutDashboardIcon, SearchIcon, ServerIcon, XCircleIcon, @@ -200,6 +201,24 @@ export function AppSidebar({ + {/* Dashboard */} + + + + + } + > + + Dashboard + + + + + + {/* Skills */} Skills diff --git a/apps/local-dashboard/src/components/runtime-footer.tsx b/apps/local-dashboard/src/components/runtime-footer.tsx new file mode 100644 index 00000000..5f37a4f5 --- /dev/null +++ 
b/apps/local-dashboard/src/components/runtime-footer.tsx @@ -0,0 +1,59 @@ +import { useEffect, useState } from "react" +import type { HealthResponse } from "@/types" + +function isHealthResponse(value: unknown): value is HealthResponse { + if (typeof value !== "object" || value === null) return false + const record = value as Record + return ( + typeof record.workspace_root === "string" && + typeof record.git_sha === "string" && + typeof record.db_path === "string" && + typeof record.process_mode === "string" && + (record.watcher_mode === "jsonl" || record.watcher_mode === "none") + ) +} + +export function RuntimeFooter() { + const [health, setHealth] = useState(null) + + useEffect(() => { + fetch("/api/health") + .then((res) => res.json()) + .then((data: unknown) => { + if (isHealthResponse(data)) { + setHealth(data) + } + }) + .catch(() => { + /* non-critical — footer simply stays hidden */ + }) + }, []) + + if (!health) return null + const legacyWatcherMode = health.watcher_mode === "jsonl" + + return ( +
+
+ {health.workspace_root} + {health.git_sha} + {health.db_path} + mode: {health.process_mode} + + watcher: {health.watcher_mode} + + {legacyWatcherMode && ( + + warning: legacy JSONL watcher invalidation + + )} +
+
+ ) +} diff --git a/apps/local-dashboard/src/components/ui/sidebar.tsx b/apps/local-dashboard/src/components/ui/sidebar.tsx index 2c2a19d3..81d2503f 100644 --- a/apps/local-dashboard/src/components/ui/sidebar.tsx +++ b/apps/local-dashboard/src/components/ui/sidebar.tsx @@ -473,7 +473,7 @@ function SidebarMenuItem({ className, ...props }: React.ComponentProps<"li">) { } const sidebarMenuButtonVariants = cva( - "peer/menu-button group/menu-button flex w-full items-center gap-2 overflow-hidden rounded-md p-2 text-left text-sm ring-sidebar-ring outline-hidden transition-[width,height,padding] group-has-data-[sidebar=menu-action]/menu-item:pr-8 group-data-[collapsible=icon]:size-8! group-data-[collapsible=icon]:p-2! hover:bg-sidebar-accent hover:text-sidebar-accent-foreground focus-visible:ring-2 active:bg-sidebar-accent active:text-sidebar-accent-foreground disabled:pointer-events-none disabled:opacity-50 aria-disabled:pointer-events-none aria-disabled:opacity-50 data-open:hover:bg-sidebar-accent data-open:hover:text-sidebar-accent-foreground data-active:bg-sidebar-accent data-active:font-medium data-active:text-sidebar-accent-foreground [&_svg]:size-4 [&_svg]:shrink-0 [&>span:last-child]:truncate", + "peer/menu-button group/menu-button flex w-full items-center gap-2 overflow-hidden rounded-md p-2 text-left text-sm ring-sidebar-ring outline-hidden transition-[width,height,padding] group-has-data-[sidebar=menu-action]/menu-item:pr-8 group-data-[collapsible=icon]:size-8! group-data-[collapsible=icon]:p-2! 
hover:bg-sidebar-accent hover:text-sidebar-accent-foreground focus-visible:ring-2 active:bg-sidebar-accent active:text-sidebar-accent-foreground disabled:pointer-events-none disabled:opacity-50 aria-disabled:pointer-events-none aria-disabled:opacity-50 data-open:hover:bg-sidebar-accent data-open:hover:text-sidebar-accent-foreground data-active:bg-sidebar-accent data-active:font-medium data-active:text-sidebar-accent-foreground border-l-2 border-transparent data-active:border-primary data-active:rounded-l-none [&_svg]:size-4 [&_svg]:shrink-0 [&>span:last-child]:truncate", { variants: { variant: { @@ -677,7 +677,7 @@ function SidebarMenuSubButton({ props: mergeProps<"a">( { className: cn( - "flex h-7 min-w-0 -translate-x-px items-center gap-2 overflow-hidden rounded-md px-2 text-sidebar-foreground ring-sidebar-ring outline-hidden group-data-[collapsible=icon]:hidden hover:bg-sidebar-accent hover:text-sidebar-accent-foreground focus-visible:ring-2 active:bg-sidebar-accent active:text-sidebar-accent-foreground disabled:pointer-events-none disabled:opacity-50 aria-disabled:pointer-events-none aria-disabled:opacity-50 data-[size=md]:text-sm data-[size=sm]:text-xs data-active:bg-sidebar-accent data-active:text-sidebar-accent-foreground [&>span:last-child]:truncate [&>svg]:size-4 [&>svg]:shrink-0 [&>svg]:text-sidebar-accent-foreground", + "flex h-7 min-w-0 -translate-x-px items-center gap-2 overflow-hidden rounded-md px-2 text-sidebar-foreground ring-sidebar-ring outline-hidden group-data-[collapsible=icon]:hidden hover:bg-sidebar-accent hover:text-sidebar-accent-foreground focus-visible:ring-2 active:bg-sidebar-accent active:text-sidebar-accent-foreground disabled:pointer-events-none disabled:opacity-50 aria-disabled:pointer-events-none aria-disabled:opacity-50 data-[size=md]:text-sm data-[size=sm]:text-xs data-active:bg-sidebar-accent data-active:font-medium data-active:text-sidebar-accent-foreground border-l-2 border-transparent data-active:border-primary 
data-active:rounded-l-none [&>span:last-child]:truncate [&>svg]:size-4 [&>svg]:shrink-0 [&>svg]:text-sidebar-accent-foreground", className ), }, diff --git a/apps/local-dashboard/src/pages/Overview.test.tsx b/apps/local-dashboard/src/pages/Overview.test.tsx index 8cdd1188..c7481581 100644 --- a/apps/local-dashboard/src/pages/Overview.test.tsx +++ b/apps/local-dashboard/src/pages/Overview.test.tsx @@ -3,6 +3,9 @@ import { describe, expect, it, vi } from "vitest"; // Mock heavy external dependencies to avoid import timeouts vi.mock("@selftune/ui/components", () => ({ ActivityPanel: () => null, + EvidenceViewer: () => null, + EvolutionTimeline: () => null, + InfoTip: () => null, OrchestrateRunsPanel: () => null, SectionCards: () => null, SkillHealthGrid: () => null, @@ -18,14 +21,30 @@ vi.mock("@/components/ui/skeleton", () => ({ vi.mock("react-router-dom", () => ({ Link: () => null, + useNavigate: () => () => {}, + useParams: () => ({ name: "test-skill" }), + useSearchParams: () => [new URLSearchParams(), () => {}], })); vi.mock("lucide-react", () => ({ AlertCircleIcon: () => null, + AlertOctagonIcon: () => null, + ActivityIcon: () => null, + ArrowLeftIcon: () => null, + ChevronRightIcon: () => null, + ClockIcon: () => null, + CoinsIcon: () => null, + EyeIcon: () => null, + FlaskConicalIcon: () => null, + FolderIcon: () => null, + LayersIcon: () => null, + MessageSquareTextIcon: () => null, RefreshCwIcon: () => null, RocketIcon: () => null, - LayersIcon: () => null, - ActivityIcon: () => null, + ServerIcon: () => null, + TargetIcon: () => null, + TrendingDownIcon: () => null, + TrendingUpIcon: () => null, XIcon: () => null, })); diff --git a/apps/local-dashboard/src/pages/Overview.tsx b/apps/local-dashboard/src/pages/Overview.tsx index cf833c6b..cd083f52 100644 --- a/apps/local-dashboard/src/pages/Overview.tsx +++ b/apps/local-dashboard/src/pages/Overview.tsx @@ -1,5 +1,5 @@ import { useMemo, useState } from "react" -import { Link } from "react-router-dom" +import { 
Link, useNavigate } from "react-router-dom" import { ActivityPanel, OrchestrateRunsPanel, @@ -124,6 +124,7 @@ export function Overview({ onStatusFilterChange: (v: SkillHealthStatus | "ALL") => void overviewQuery: UseQueryResult }) { + const navigate = useNavigate() const { data, isPending, isError, error, refetch } = overviewQuery const orchestrateQuery = useOrchestrateRuns() @@ -189,6 +190,10 @@ export function Overview({ ? gradedSkills.reduce((sum, s) => sum + s.pass_rate, 0) / gradedSkills.length : null + const handleSelectProposal = (skillName: string, proposalId: string) => { + navigate(`/skills/${encodeURIComponent(skillName)}?proposal=${encodeURIComponent(proposalId)}`) + } + return (
@@ -216,6 +221,7 @@ export function Overview({ evolution={overview.evolution} pendingProposals={overview.pending_proposals} unmatchedQueries={overview.unmatched_queries} + onSelectProposal={handleSelectProposal} /> {orchestrateQuery.isPending ? ( diff --git a/apps/local-dashboard/src/pages/SkillReport.test.tsx b/apps/local-dashboard/src/pages/SkillReport.test.tsx index d7ea48d2..e18b8b58 100644 --- a/apps/local-dashboard/src/pages/SkillReport.test.tsx +++ b/apps/local-dashboard/src/pages/SkillReport.test.tsx @@ -26,9 +26,13 @@ vi.mock("@selftune/ui/primitives", () => ({ })); vi.mock("@selftune/ui/components", () => ({ + ActivityPanel: () => null, EvolutionTimeline: () => null, EvidenceViewer: () => null, InfoTip: () => null, + OrchestrateRunsPanel: () => null, + SectionCards: () => null, + SkillHealthGrid: () => null, })); vi.mock("@selftune/ui/lib", () => ({ @@ -44,26 +48,30 @@ vi.mock("@/components/ui/skeleton", () => ({ vi.mock("react-router-dom", () => ({ Link: () => null, + useNavigate: () => () => {}, useParams: () => ({ name: "test-skill" }), + useSearchParams: () => [new URLSearchParams(), () => {}], })); vi.mock("lucide-react", () => ({ AlertCircleIcon: () => null, - ArrowLeftIcon: () => null, - FlaskConicalIcon: () => null, ActivityIcon: () => null, - EyeIcon: () => null, - RefreshCwIcon: () => null, + ArrowLeftIcon: () => null, + ChevronRightIcon: () => null, + ClockIcon: () => null, + CoinsIcon: () => null, LayersIcon: () => null, + RefreshCwIcon: () => null, + RocketIcon: () => null, + XIcon: () => null, + FlaskConicalIcon: () => null, TrendingUpIcon: () => null, TrendingDownIcon: () => null, - CoinsIcon: () => null, - ChevronRightIcon: () => null, - ClockIcon: () => null, AlertOctagonIcon: () => null, TargetIcon: () => null, MessageSquareTextIcon: () => null, ServerIcon: () => null, + EyeIcon: () => null, FolderIcon: () => null, })); diff --git a/apps/local-dashboard/src/pages/SkillReport.tsx b/apps/local-dashboard/src/pages/SkillReport.tsx index 
69793c43..195a596b 100644 --- a/apps/local-dashboard/src/pages/SkillReport.tsx +++ b/apps/local-dashboard/src/pages/SkillReport.tsx @@ -1,5 +1,5 @@ import { useEffect, useState } from "react" -import { Link, useParams } from "react-router-dom" +import { Link, useParams, useSearchParams } from "react-router-dom" import { Badge, Button, @@ -189,13 +189,8 @@ function SessionGroup({ sessionId, meta, invocations, defaultExpanded }: { export function SkillReport() { const { name } = useParams<{ name: string }>() + const [searchParams, setSearchParams] = useSearchParams() const { data, isPending, isError, error, refetch } = useSkillReport(name) - const [selectedProposal, setSelectedProposal] = useState(null) - - // Reset local state when navigating between skills - useEffect(() => { - setSelectedProposal(null) - }, [name]) if (!name) { return ( @@ -280,8 +275,32 @@ export function SkillReport() { const hasEvolution = (selftune_stats?.run_count ?? 0) > 0 const missed = duration_stats?.missed_triggers ?? 0 - // Auto-select first proposal if none selected - const activeProposal = selectedProposal ?? (evolution.length > 0 ? evolution[0].proposal_id : null) + const proposalIds = new Set(evolution.map((entry) => entry.proposal_id)) + const requestedProposal = searchParams.get("proposal") + const activeProposal = requestedProposal && proposalIds.has(requestedProposal) + ? requestedProposal + : (evolution.length > 0 ? 
evolution[0].proposal_id : null) + + useEffect(() => { + const current = searchParams.get("proposal") + if (activeProposal && current !== activeProposal) { + const next = new URLSearchParams(searchParams) + next.set("proposal", activeProposal) + setSearchParams(next, { replace: true }) + return + } + if (!activeProposal && current) { + const next = new URLSearchParams(searchParams) + next.delete("proposal") + setSearchParams(next, { replace: true }) + } + }, [activeProposal, searchParams, setSearchParams]) + + const handleSelectProposal = (proposalId: string) => { + const next = new URLSearchParams(searchParams) + next.set("proposal", proposalId) + setSearchParams(next, { replace: true }) + } // Unique models/platforms from session metadata const uniqueModels = [...new Set((session_metadata ?? []).map((s) => s.model).filter(Boolean))] @@ -511,7 +530,7 @@ export function SkillReport() { )} @@ -584,7 +603,7 @@ export function SkillReport() {
+ {freshnessCheck?.status === "warn" && ( + + + + + Legacy freshness mode active + + + {freshnessCheck.message} + + + + )} + {/* Summary cards */}
diff --git a/apps/local-dashboard/src/types.ts b/apps/local-dashboard/src/types.ts index 4277773d..223facb9 100644 --- a/apps/local-dashboard/src/types.ts +++ b/apps/local-dashboard/src/types.ts @@ -19,6 +19,7 @@ export type { CanonicalInvocation, DoctorResult, HealthCheck, + HealthResponse, HealthStatus, OrchestrateRunsResponse, OverviewPayload, diff --git a/bun.lock b/bun.lock index 84e13e43..a396b3d3 100644 --- a/bun.lock +++ b/bun.lock @@ -56,6 +56,9 @@ "packages/telemetry-contract": { "name": "@selftune/telemetry-contract", "version": "1.0.0", + "dependencies": { + "zod": "^3.24.0", + }, }, "packages/ui": { "name": "@selftune/ui", @@ -1378,6 +1381,8 @@ "@inquirer/core/wrap-ansi": ["wrap-ansi@6.2.0", "", { "dependencies": { "ansi-styles": "^4.0.0", "string-width": "^4.1.0", "strip-ansi": "^6.0.0" } }, "sha512-r6lPcBGxZXlIcymEu7InxDMhdW0KDxpLgoFLcguasxCaJ/SOIZwINatK9KY/tf+ZrlywOKU0UDj3ATXUBfxJXA=="], + "@selftune/telemetry-contract/zod": ["zod@3.25.76", "", {}, "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ=="], + "@tailwindcss/oxide-wasm32-wasi/@emnapi/core": ["@emnapi/core@1.9.0", "", { "dependencies": { "@emnapi/wasi-threads": "1.2.0", "tslib": "^2.4.0" }, "bundled": true }, "sha512-0DQ98G9ZQZOxfUcQn1waV2yS8aWdZ6kJMbYCJB3oUBecjWYO1fqJ+a1DRfPF3O5JEkwqwP1A9QEN/9mYm2Yd0w=="], "@tailwindcss/oxide-wasm32-wasi/@emnapi/runtime": ["@emnapi/runtime@1.9.0", "", { "dependencies": { "tslib": "^2.4.0" }, "bundled": true }, "sha512-QN75eB0IH2ywSpRpNddCRfQIhmJYBCJ1x5Lb3IscKAL8bMnVAKnRg8dCoXbHzVLLH7P38N2Z3mtulB7W0J0FKw=="], diff --git a/cli/selftune/agent-guidance.ts b/cli/selftune/agent-guidance.ts new file mode 100644 index 00000000..3ec582ad --- /dev/null +++ b/cli/selftune/agent-guidance.ts @@ -0,0 +1,96 @@ +import { getAlphaLinkState } from "./alpha-identity.js"; +import type { AgentCommandGuidance, AlphaIdentity, AlphaLinkState } from "./types.js"; + +function emailArg(email?: string): string { + return email?.trim() ? 
email : ""; +} + +function buildAlphaInitCommand(options?: { + email?: string; + includeKey?: boolean; + force?: boolean; +}): string { + const parts = ["selftune", "init", "--alpha", "--alpha-email", emailArg(options?.email)]; + if (options?.includeKey) { + parts.push("--alpha-key", ""); + } + if (options?.force) { + parts.push("--force"); + } + return parts.join(" "); +} + +function buildGuidance( + code: string, + message: string, + nextCommand: string, + blocking: boolean, + suggestedCommands: string[], +): AgentCommandGuidance { + return { + code, + message, + next_command: nextCommand, + suggested_commands: suggestedCommands, + blocking, + }; +} + +export function getAlphaGuidanceForState( + state: AlphaLinkState, + options?: { email?: string }, +): AgentCommandGuidance { + switch (state) { + case "not_linked": + return buildGuidance( + "alpha_cloud_link_required", + "Alpha upload is not linked. Sign in to app.selftune.dev, enroll in alpha, mint an st_live_* credential, then store it locally.", + buildAlphaInitCommand({ email: options?.email, includeKey: true }), + true, + ["selftune status", "selftune doctor"], + ); + case "linked_not_enrolled": + return buildGuidance( + "alpha_enrollment_incomplete", + "Cloud account is linked but alpha enrollment is incomplete. 
Finish enrollment in app.selftune.dev, then refresh the local credential.", + buildAlphaInitCommand({ email: options?.email, includeKey: true, force: true }), + true, + ["selftune status", "selftune doctor"], + ); + case "enrolled_no_credential": + return buildGuidance( + "alpha_credential_required", + "Alpha enrollment exists, but the local upload credential is missing or invalid.", + buildAlphaInitCommand({ email: options?.email, includeKey: true, force: true }), + true, + ["selftune status", "selftune doctor"], + ); + case "ready": + return buildGuidance( + "alpha_upload_ready", + "Alpha upload is configured and ready.", + "selftune alpha upload", + false, + ["selftune status", "selftune doctor"], + ); + } +} + +export function getAlphaGuidance(identity: AlphaIdentity | null): AgentCommandGuidance { + if (!identity) { + return getAlphaGuidanceForState("not_linked"); + } + return getAlphaGuidanceForState(getAlphaLinkState(identity), { email: identity.email }); +} + +export function formatGuidanceLines( + guidance: AgentCommandGuidance, + options?: { indent?: string }, +): string[] { + const indent = options?.indent ?? " "; + const lines = [`${indent}Next command: ${guidance.next_command}`]; + if (guidance.suggested_commands.length > 0) { + lines.push(`${indent}Suggested commands: ${guidance.suggested_commands.join(", ")}`); + } + return lines; +} diff --git a/cli/selftune/alpha-identity.ts b/cli/selftune/alpha-identity.ts new file mode 100644 index 00000000..b7faaffd --- /dev/null +++ b/cli/selftune/alpha-identity.ts @@ -0,0 +1,157 @@ +/** + * Alpha program identity management — cached cloud identity model. + * + * Local config is a cache of cloud-linked identity, not the source of truth. + * The cloud_user_id field is the primary "linked" indicator. Legacy local-only + * identities (user_id without cloud_user_id) are detected by migrateLocalIdentity(). 
+ * + * Handles stable user identity generation, config persistence, + * and consent notice for the selftune alpha program. + */ + +import { randomUUID } from "node:crypto"; +import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs"; +import { dirname } from "node:path"; + +import type { AlphaIdentity, AlphaLinkState, SelftuneConfig } from "./types.js"; + +// --------------------------------------------------------------------------- +// User ID generation +// --------------------------------------------------------------------------- + +/** Generate a stable UUID for alpha user identity. */ +export function generateUserId(): string { + return randomUUID(); +} + +// --------------------------------------------------------------------------- +// Config read/write helpers +// --------------------------------------------------------------------------- + +/** + * Read the alpha identity block from the selftune config file. + * Returns null if config does not exist or has no alpha block. + */ +export function readAlphaIdentity(configPath: string): AlphaIdentity | null { + if (!existsSync(configPath)) return null; + + try { + const raw = readFileSync(configPath, "utf-8"); + const config = JSON.parse(raw) as SelftuneConfig; + return config.alpha ?? null; + } catch { + return null; + } +} + +/** + * Write the alpha identity block into the selftune config file. + * Reads existing config, merges the alpha block, and writes back. + * Creates parent directories if needed. + */ +export function writeAlphaIdentity(configPath: string, identity: AlphaIdentity): void { + let config: Record = {}; + + if (existsSync(configPath)) { + try { + config = JSON.parse(readFileSync(configPath, "utf-8")); + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + throw new Error( + `Unable to update alpha identity: ${configPath} is not valid JSON (${message})`, + ); + } + } + + config.alpha = identity; + + mkdirSync(dirname(configPath), { recursive: true }); + writeFileSync(configPath, JSON.stringify(config, null, 2), "utf-8"); +} + +// --------------------------------------------------------------------------- +// Link state helper — cloud-first model +// --------------------------------------------------------------------------- + +/** Check if an API key has the expected st_live_ or st_test_ prefix. */ +export function isValidApiKeyFormat(key: string): boolean { + return key.startsWith("st_live_") || key.startsWith("st_test_"); +} + +/** + * Derive the cloud link readiness state from an AlphaIdentity. + * + * State machine: + * null -> "not_linked" + * not enrolled, no cloud_user_id -> "not_linked" + * not enrolled, has cloud_user_id -> "linked_not_enrolled" + * enrolled, no valid api_key -> "enrolled_no_credential" + * enrolled, valid api_key -> "ready" + * + * cloud_user_id enriches the identity (confirms cloud link) but is not a gate. + * The direct-key path (--alpha-key) sets api_key without cloud_user_id, and + * that is a valid "ready" state. cloud_user_id can be backfilled later. + */ +export function getAlphaLinkState(identity: AlphaIdentity | null): AlphaLinkState { + if (!identity) return "not_linked"; + if (!identity.enrolled) return identity.cloud_user_id ? "linked_not_enrolled" : "not_linked"; + if (!identity.api_key || !isValidApiKeyFormat(identity.api_key)) return "enrolled_no_credential"; + // Enrolled + valid key = ready (cloud_user_id is bonus, not gate) + return "ready"; +} + +// --------------------------------------------------------------------------- +// Migration helper +// --------------------------------------------------------------------------- + +/** + * Detect legacy local-only alpha blocks and mark them as needing cloud link. 
+ * A legacy identity has email + user_id but no cloud_user_id. + */ +export function migrateLocalIdentity(identity: AlphaIdentity): { + needsCloudLink: boolean; + identity: AlphaIdentity; +} { + if (identity.cloud_user_id) { + return { needsCloudLink: false, identity }; + } + // Legacy: has local user_id but no cloud link + return { needsCloudLink: true, identity }; +} + +// --------------------------------------------------------------------------- +// Consent notice +// --------------------------------------------------------------------------- + +export const ALPHA_CONSENT_NOTICE = ` +======================================== + selftune Alpha Program +======================================== + +You are enrolling in the selftune alpha program. + +WHAT IS COLLECTED: + - Skill invocations and trigger metadata + - Session metadata (timestamps, tool counts, error counts) + - Evolution outcomes (proposals, pass rates, deployments) + - Raw user prompt/query text submitted during captured sessions + +WHAT IS NOT COLLECTED: + - File contents or source code + - Full transcript bodies beyond the captured prompt/query text + - Structured repository names or file paths as separate fields + +IMPORTANT: + Raw prompt/query text is uploaded unchanged for the friendly alpha cohort. + If your prompt includes repository names, file paths, or secrets, that text + may be included in the alpha data you choose to share. + +Your alpha identity (email, display name, and any upload API key) +is stored locally in ~/.selftune/config.json and used for alpha coordination +and authenticated uploads. + +TO UNENROLL: + selftune init --no-alpha + +======================================== +`; diff --git a/cli/selftune/alpha-upload-contract.ts b/cli/selftune/alpha-upload-contract.ts new file mode 100644 index 00000000..937021d9 --- /dev/null +++ b/cli/selftune/alpha-upload-contract.ts @@ -0,0 +1,52 @@ +/** + * Alpha upload contract — V2 canonical push payloads. 
+ * + * Defines the queue infrastructure types used by the upload pipeline. + * Payload shapes are now V2 canonical records assembled by buildPushPayloadV2() + * in canonical-export.ts — no bespoke Alpha* payload types needed. + */ + +// -- Response ----------------------------------------------------------------- + +export interface PushUploadResult { + success: boolean; + push_id?: string; + errors: string[]; + _status?: number; +} + +// -- Queue types (used by flush engine) --------------------------------------- + +export type QueueItemStatus = "pending" | "sending" | "sent" | "failed"; + +export type AlphaPayloadType = + | "sessions" + | "prompts" + | "invocations" + | "execution_facts" + | "evolution_evidence" + | "push"; // unified V2 push payload + +export interface QueueItem { + id: number; + payload_type: AlphaPayloadType; + payload_json: string; + status: QueueItemStatus; + attempts: number; + created_at: string; + updated_at: string; + last_error: string | null; +} + +export interface QueueOperations { + getPending(limit: number): QueueItem[]; + markSending(id: number): boolean; + markSent(id: number): boolean; + markFailed(id: number, error?: string): boolean; +} + +export interface FlushSummary { + sent: number; + failed: number; + skipped: number; +} diff --git a/cli/selftune/alpha-upload/build-payloads.ts b/cli/selftune/alpha-upload/build-payloads.ts new file mode 100644 index 00000000..ccb0cd84 --- /dev/null +++ b/cli/selftune/alpha-upload/build-payloads.ts @@ -0,0 +1,151 @@ +/** + * V2 canonical push payload builder (staging-based). + * + * Reads from the canonical_upload_staging table using a single monotonic + * cursor (local_seq). Each staged row contains the full canonical record + * JSON, so no fields are dropped or hardcoded during payload construction. + * + * Evolution evidence rows (record_kind = "evolution_evidence") are separated + * and placed in the canonical.evolution_evidence array. 
+ */ + +import type { Database } from "bun:sqlite"; +import type { CanonicalRecord } from "@selftune/telemetry-contract"; +import { buildPushPayloadV2 } from "../canonical-export.js"; +import type { EvolutionEvidenceEntry } from "../types.js"; + +// -- Types -------------------------------------------------------------------- + +export interface BuildV2Result { + payload: Record; + lastSeq: number; +} + +// -- Constants ---------------------------------------------------------------- + +const DEFAULT_LIMIT = 500; + +// -- Helpers ------------------------------------------------------------------ + +/** Parse a JSON string, returning null on failure. */ +function safeParseJson(json: string | null): T | null { + if (!json) return null; + try { + return JSON.parse(json) as T; + } catch { + return null; + } +} + +// -- Main builder ------------------------------------------------------------- + +/** + * Build a V2 canonical push payload from the staging table. + * + * Reads records from canonical_upload_staging WHERE local_seq > afterSeq, + * groups them by record_kind, and assembles a V2 push payload. + * + * Returns null when no new records exist after afterSeq. + */ +export function buildV2PushPayload( + db: Database, + afterSeq?: number, + limit: number = DEFAULT_LIMIT, +): BuildV2Result | null { + const whereClause = afterSeq !== undefined ? "WHERE local_seq > ?" : ""; + const params = afterSeq !== undefined ? [afterSeq, limit] : [limit]; + + const sql = ` + SELECT local_seq, record_kind, record_json + FROM canonical_upload_staging + ${whereClause} + ORDER BY local_seq ASC + LIMIT ? 
+ `; + + const rows = db.query(sql).all(...params) as Array<{ + local_seq: number; + record_kind: string; + record_json: string; + }>; + + if (rows.length === 0) return null; + + const canonicalRecords: CanonicalRecord[] = []; + const evidenceEntries: EvolutionEvidenceEntry[] = []; + const orchestrateRuns: Record[] = []; + let lastParsedSeq: number | null = null; + let hitMalformedRow = false; + + for (const row of rows) { + const parsed = safeParseJson>(row.record_json); + if (!parsed) { + hitMalformedRow = true; + break; + } + + if (row.record_kind === "evolution_evidence") { + const timestamp = + typeof parsed.timestamp === "string" && parsed.timestamp.trim().length > 0 + ? parsed.timestamp + : null; + const proposalId = + typeof parsed.proposal_id === "string" && parsed.proposal_id.trim().length > 0 + ? parsed.proposal_id + : null; + if (!timestamp || !proposalId) { + hitMalformedRow = true; + break; + } + + // Evolution evidence has its own shape + evidenceEntries.push({ + timestamp, + proposal_id: proposalId, + skill_name: parsed.skill_name as string, + skill_path: (parsed.skill_path as string) ?? "", + target: (parsed.target as EvolutionEvidenceEntry["target"]) ?? "description", + stage: (parsed.stage as EvolutionEvidenceEntry["stage"]) ?? 
"created", + rationale: parsed.rationale as string | undefined, + confidence: parsed.confidence as number | undefined, + details: parsed.details as string | undefined, + original_text: parsed.original_text as string | undefined, + proposed_text: parsed.proposed_text as string | undefined, + eval_set: parsed.eval_set_json as EvolutionEvidenceEntry["eval_set"], + validation: parsed.validation_json as EvolutionEvidenceEntry["validation"], + evidence_id: parsed.evidence_id as string | undefined, + }); + } else if (row.record_kind === "orchestrate_run") { + // Orchestrate run records -- pass through as-is + orchestrateRuns.push(parsed); + } else { + // Canonical telemetry records -- pass through as-is + canonicalRecords.push(parsed as unknown as CanonicalRecord); + } + + lastParsedSeq = row.local_seq; + } + + // If nothing parsed successfully, return null + if ( + canonicalRecords.length === 0 && + evidenceEntries.length === 0 && + orchestrateRuns.length === 0 + ) { + return null; + } + + const payload = buildPushPayloadV2(canonicalRecords, evidenceEntries, orchestrateRuns); + if (lastParsedSeq === null) { + return null; + } + const lastSeq = lastParsedSeq; + + if (hitMalformedRow && (process.env.DEBUG || process.env.NODE_ENV === "development")) { + console.error( + "[alpha-upload/build-payloads] encountered malformed staged row; cursor held at last valid seq", + ); + } + + return { payload, lastSeq }; +} diff --git a/cli/selftune/alpha-upload/client.ts b/cli/selftune/alpha-upload/client.ts new file mode 100644 index 00000000..ad91610d --- /dev/null +++ b/cli/selftune/alpha-upload/client.ts @@ -0,0 +1,113 @@ +/** + * Alpha upload HTTP client. + * + * POSTs V2 canonical push payloads to the cloud API's POST /api/v1/push. + * Uses native fetch (Bun built-in). Never throws -- returns a + * PushUploadResult indicating success or failure. 
+ */ + +import type { PushUploadResult } from "../alpha-upload-contract.js"; +import { getSelftuneVersion } from "../utils/selftune-meta.js"; + +function isPushUploadResult(value: unknown): value is PushUploadResult { + if (typeof value !== "object" || value === null) return false; + const record = value as Record; + return ( + typeof record.success === "boolean" && + Array.isArray(record.errors) && + record.errors.every((entry) => typeof entry === "string") && + (record.push_id === undefined || typeof record.push_id === "string") && + (record._status === undefined || typeof record._status === "number") + ); +} + +function isAcceptedPushResponse(value: unknown): value is { status: "accepted"; push_id: string } { + if (typeof value !== "object" || value === null) return false; + const record = value as Record; + return record.status === "accepted" && typeof record.push_id === "string"; +} + +/** + * Upload a single V2 push payload to the given endpoint. + * + * Returns a typed result. Never throws -- network errors and HTTP + * failures are captured in the result. + */ +export async function uploadPushPayload( + payload: Record, + endpoint: string, + apiKey?: string, +): Promise { + try { + const headers: Record = { + "Content-Type": "application/json", + "User-Agent": `selftune/${getSelftuneVersion()}`, + }; + + if (apiKey) { + headers.Authorization = `Bearer ${apiKey}`; + } + + const response = await fetch(endpoint, { + method: "POST", + headers, + body: JSON.stringify(payload), + signal: AbortSignal.timeout(30_000), + }); + + if (response.ok) { + // Read body as text first — Bun consumes the stream on .json(), + // so a failed .json() followed by .text() would throw. 
+ const body = await response.text(); + if (body.length === 0) { + return { + success: true, + push_id: (payload as { push_id?: string }).push_id, + errors: [], + _status: response.status, + }; + } + try { + const parsed: unknown = JSON.parse(body); + if (isPushUploadResult(parsed)) { + return { ...parsed, _status: parsed._status ?? response.status }; + } + if (isAcceptedPushResponse(parsed)) { + return { + success: true, + push_id: parsed.push_id, + errors: [], + _status: response.status, + }; + } + return { + success: false, + errors: ["Invalid JSON response shape for PushUploadResult"], + _status: response.status, + }; + } catch { + return { + success: false, + errors: [`Unexpected non-JSON response body: ${body.slice(0, 200)}`], + _status: response.status, + }; + } + } + + // Non-2xx response -- read error text for diagnostics + const errorText = await response.text().catch(() => "unknown error"); + return { + success: false, + errors: [`HTTP ${response.status}: ${errorText.slice(0, 200)}`], + _status: response.status, + }; + } catch (err) { + // Network-level failure (DNS, timeout, connection refused, etc.) + const message = err instanceof Error ? err.message : String(err); + return { + success: false, + errors: [message], + _status: 0, + }; + } +} diff --git a/cli/selftune/alpha-upload/flush.ts b/cli/selftune/alpha-upload/flush.ts new file mode 100644 index 00000000..d76efd0d --- /dev/null +++ b/cli/selftune/alpha-upload/flush.ts @@ -0,0 +1,191 @@ +/** + * Alpha upload flush engine. + * + * Drains the local upload queue by reading pending items, uploading + * them via the HTTP client, and updating their status. Implements + * retry with exponential backoff for transient (5xx/network) failures. 
+ * + * Special status handling: + * - 409 (duplicate push_id) is treated as success + * - 401/403 (auth failures) are non-retryable with descriptive errors + * - 4xx (client errors) are not retried + */ + +import type { FlushSummary, QueueOperations } from "../alpha-upload-contract.js"; +import { uploadPushPayload } from "./client.js"; + +// --------------------------------------------------------------------------- +// Options +// --------------------------------------------------------------------------- + +/** Options for the flush engine. */ +export interface FlushOptions { + /** Maximum number of items to read per flush batch (default: 50). */ + batchSize?: number; + /** Maximum upload attempts per item before marking permanently failed (default: 5). */ + maxRetries?: number; + /** When true, log what would be sent without making HTTP calls (default: false). */ + dryRun?: boolean; + /** API key for Bearer auth on the cloud endpoint. */ + apiKey?: string; +} + +// --------------------------------------------------------------------------- +// Constants +// --------------------------------------------------------------------------- + +const DEFAULT_BATCH_SIZE = 50; +const DEFAULT_MAX_RETRIES = 5; +const INITIAL_BACKOFF_MS = 1_000; +const MAX_BACKOFF_MS = 16_000; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +/** Returns true for HTTP status codes that are transient and worth retrying. */ +function isRetryable(status: number): boolean { + return status === 0 || status === 429 || status >= 500; +} + +/** Returns true for auth errors that should not be retried. */ +function isAuthError(status: number): boolean { + return status === 401 || status === 403; +} + +/** Sleep for the given number of milliseconds. 
*/ +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +/** Calculate exponential backoff with cap. */ +function backoffMs(attempt: number): number { + const ms = INITIAL_BACKOFF_MS * 2 ** attempt; + return Math.min(ms, MAX_BACKOFF_MS); +} + +/** Extract HTTP status from result. */ +function getStatus(result: Record): number { + return (result as { _status?: number })._status ?? (result.success ? 200 : 0); +} + +// --------------------------------------------------------------------------- +// Flush engine +// --------------------------------------------------------------------------- + +/** + * Flush the upload queue -- read pending items, upload them, update status. + */ +export async function flushQueue( + queue: QueueOperations, + endpoint: string, + options?: FlushOptions, +): Promise { + const batchSize = options?.batchSize ?? DEFAULT_BATCH_SIZE; + const maxRetries = options?.maxRetries ?? DEFAULT_MAX_RETRIES; + const dryRun = options?.dryRun ?? 
false; + const apiKey = options?.apiKey; + + const summary: FlushSummary = { sent: 0, failed: 0, skipped: 0 }; + + const items = queue.getPending(batchSize); + + if (items.length === 0) { + return summary; + } + + for (const item of items) { + const markFailedSafely = (message: string): void => { + if (!queue.markFailed(item.id, message)) { + console.error(`[alpha upload] Failed to persist queue failure state for item ${item.id}`); + } + }; + + if (item.attempts >= maxRetries) { + markFailedSafely("exhausted retries"); + summary.failed++; + continue; + } + + if (dryRun) { + summary.skipped++; + continue; + } + + let payload: Record; + try { + payload = JSON.parse(item.payload_json) as Record; + } catch { + markFailedSafely("corrupt payload JSON"); + summary.failed++; + continue; + } + + if (!queue.markSending(item.id)) { + console.error(`[alpha upload] Failed to mark queue item ${item.id} as sending`); + summary.failed++; + continue; + } + + let succeeded = false; + const attemptsRemaining = maxRetries - item.attempts; + + for (let attempt = 0; attempt < attemptsRemaining; attempt++) { + if (attempt > 0) { + await sleep(backoffMs(attempt - 1)); + } + + const result = await uploadPushPayload(payload, endpoint, apiKey); + const status = getStatus(result as unknown as Record); + + if (result.success) { + if (!queue.markSent(item.id)) { + markFailedSafely("local queue state update failed after successful upload"); + summary.failed++; + } else { + summary.sent++; + } + succeeded = true; + break; + } + + // 409 Conflict = duplicate push_id, treat as success + if (status === 409) { + if (!queue.markSent(item.id)) { + markFailedSafely("local queue state update failed after duplicate upload"); + summary.failed++; + } else { + summary.sent++; + } + succeeded = true; + break; + } + + // Auth errors are non-retryable + if (isAuthError(status)) { + const authMessage = + status === 401 + ? "Authentication failed: invalid or missing API key. 
Run 'selftune init --alpha --alpha-key ' to set your API key." + : "Authorization denied: your API key does not have permission to upload. Run 'selftune doctor' to verify enrollment and cloud link, then re-run 'selftune init --alpha --alpha-email --alpha-key ' if needed."; + markFailedSafely(authMessage); + summary.failed++; + succeeded = true; + break; + } + + if (!isRetryable(status)) { + markFailedSafely(result.errors[0] ?? `Upload failed with HTTP ${status}`); + summary.failed++; + succeeded = true; + break; + } + } + + if (!succeeded) { + markFailedSafely("exhausted retries"); + summary.failed++; + } + } + + return summary; +} diff --git a/cli/selftune/alpha-upload/index.ts b/cli/selftune/alpha-upload/index.ts new file mode 100644 index 00000000..593e20ac --- /dev/null +++ b/cli/selftune/alpha-upload/index.ts @@ -0,0 +1,194 @@ +/** + * Alpha upload orchestration module. + * + * Coordinates the full upload cycle: + * 1. Stage canonical records from JSONL + evolution evidence into staging table + * 2. Read new staged records since watermark via single cursor + * 3. Build a V2 canonical push payload + * 4. Enqueue it in the local upload queue + * 5. 
Flush the queue to POST /api/v1/push + * + * Guards: + * - Only runs when alpha enrolled (config.alpha?.enrolled === true) + * - Fail-open: never throws, returns empty summary on errors + * - Reads endpoint from config or SELFTUNE_ALPHA_ENDPOINT env var + */ + +import type { Database } from "bun:sqlite"; + +import type { + QueueItem as ContractQueueItem, + FlushSummary, + QueueOperations, +} from "../alpha-upload-contract.js"; +import { buildV2PushPayload } from "./build-payloads.js"; +import { flushQueue } from "./flush.js"; +import { + enqueueUpload, + getPendingUploads, + markFailed, + markSending, + markSent, + readWatermark, + writeWatermark, +} from "./queue.js"; +import { stageCanonicalRecords } from "./stage-canonical.js"; + +// --------------------------------------------------------------------------- +// Constants +// --------------------------------------------------------------------------- + +const DEFAULT_ENDPOINT = "https://api.selftune.dev/api/v1/push"; + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +export interface PrepareResult { + enqueued: number; + types: string[]; +} + +export interface UploadCycleOptions { + enrolled: boolean; + userId?: string; + agentType?: string; + selftuneVersion?: string; + endpoint?: string; + dryRun?: boolean; + apiKey?: string; + /** Override canonical log path (for testing). */ + canonicalLogPath?: string; +} + +export interface UploadCycleSummary { + enrolled: boolean; + prepared: number; + sent: number; + failed: number; + skipped: number; +} + +// --------------------------------------------------------------------------- +// prepareUploads -- stage, build V2 payload, enqueue +// --------------------------------------------------------------------------- + +/** + * Stage canonical records, read new staged rows since watermark, + * build a single V2 push payload, and enqueue it. 
Never throws. + */ +export function prepareUploads( + db: Database, + _userId: string, + _agentType: string, + _selftuneVersion: string, + canonicalLogPath?: string, +): PrepareResult { + const result: PrepareResult = { enqueued: 0, types: [] }; + + try { + // Step 1: Stage canonical records from JSONL + evolution evidence + stageCanonicalRecords(db, canonicalLogPath); + + // Step 2: Read watermark (single cursor for all record types) + const afterSeq = readWatermark(db, "canonical") ?? undefined; + + // Step 3: Build payload from staging table + const build = buildV2PushPayload(db, afterSeq); + + if (!build) return result; + + // Step 4: Enqueue the payload + advance watermark atomically + const tx = db.transaction(() => { + const ok = enqueueUpload(db, "push", JSON.stringify(build.payload)); + if (!ok) { + throw new Error("enqueueUpload failed"); + } + + if (!writeWatermark(db, "canonical", build.lastSeq)) { + throw new Error("writeWatermark failed"); + } + + result.enqueued = 1; + result.types.push("canonical"); + }); + tx(); + } catch (err) { + if (process.env.DEBUG || process.env.NODE_ENV === "development") { + console.error("[alpha-upload] prepareUploads failed:", err); + } + } + + return result; +} + +// --------------------------------------------------------------------------- +// runUploadCycle -- the full cycle: prepare -> flush -> return summary +// --------------------------------------------------------------------------- + +/** + * Run a full upload cycle: stage + read new data, enqueue it, flush to remote. + * Guards on enrollment -- returns empty summary if not enrolled. + * Never throws. 
+ */ +export async function runUploadCycle( + db: Database, + options: UploadCycleOptions, +): Promise { + const emptySummary: UploadCycleSummary = { + enrolled: options.enrolled, + prepared: 0, + sent: 0, + failed: 0, + skipped: 0, + }; + + // Guard: must be enrolled + if (!options.enrolled) { + return emptySummary; + } + + try { + const userId = options.userId ?? "unknown"; + const agentType = options.agentType ?? "unknown"; + const selftuneVersion = options.selftuneVersion ?? "0.0.0"; + const endpoint = process.env.SELFTUNE_ALPHA_ENDPOINT ?? options.endpoint ?? DEFAULT_ENDPOINT; + const dryRun = options.dryRun ?? false; + const apiKey = options.apiKey; + + // Step 1: Prepare -- stage, build V2 payload, enqueue + const prepared = prepareUploads( + db, + userId, + agentType, + selftuneVersion, + options.canonicalLogPath, + ); + + // Step 2: Flush -- drain the queue to the remote endpoint + const queueOps: QueueOperations = { + getPending: (limit: number) => getPendingUploads(db, limit) as ContractQueueItem[], + markSending: (id: number) => markSending(db, [id]), + markSent: (id: number) => markSent(db, [id]), + markFailed: (id: number, error?: string) => markFailed(db, id, error ?? "unknown"), + }; + + const flush: FlushSummary = await flushQueue(queueOps, endpoint, { + dryRun, + apiKey, + }); + + return { + enrolled: true, + prepared: prepared.enqueued, + sent: flush.sent, + failed: flush.failed, + skipped: flush.skipped, + }; + } catch (err) { + if (process.env.DEBUG || process.env.NODE_ENV === "development") { + console.error("[alpha-upload] runUploadCycle failed:", err); + } + return emptySummary; + } +} diff --git a/cli/selftune/alpha-upload/queue.ts b/cli/selftune/alpha-upload/queue.ts new file mode 100644 index 00000000..c8b075f4 --- /dev/null +++ b/cli/selftune/alpha-upload/queue.ts @@ -0,0 +1,252 @@ +/** + * Alpha upload queue — local queue and watermark storage layer. + * + * Queues payload items for upload to the alpha remote endpoint. 
+ * No HTTP code — this module only manages the SQLite queue state. + * + * All public functions follow the fail-open pattern from direct-write.ts: + * they catch errors internally and return boolean success / safe defaults. + */ + +import type { Database } from "bun:sqlite"; + +// -- Types -------------------------------------------------------------------- + +export interface QueueItem { + id: number; + payload_type: string; + payload_json: string; + status: string; + attempts: number; + created_at: string; + updated_at: string; + last_error: string | null; +} + +export interface QueueStats { + pending: number; + sending: number; + sent: number; + failed: number; +} + +// -- Queue operations --------------------------------------------------------- + +/** + * Insert a new pending item into the upload queue. + * Returns true on success, false on failure (fail-open). + */ +export function enqueueUpload(db: Database, payloadType: string, payloadJson: string): boolean { + try { + const now = new Date().toISOString(); + db.run( + `INSERT INTO upload_queue (payload_type, payload_json, status, attempts, created_at, updated_at) + VALUES (?, ?, 'pending', 0, ?, ?)`, + [payloadType, payloadJson, now, now], + ); + return true; + } catch (err) { + if (process.env.DEBUG || process.env.NODE_ENV === "development") { + console.error("[alpha-upload/queue] enqueueUpload failed:", err); + } + return false; + } +} + +/** + * Get pending upload items, oldest first. + * Default limit is 50. 
+ */ +export function getPendingUploads(db: Database, limit = 50): QueueItem[] { + try { + return db + .query( + `SELECT id, payload_type, payload_json, status, attempts, created_at, updated_at, last_error + FROM upload_queue + WHERE status = 'pending' + ORDER BY id ASC + LIMIT ?`, + ) + .all(limit) as QueueItem[]; + } catch (err) { + if (process.env.DEBUG || process.env.NODE_ENV === "development") { + console.error("[alpha-upload/queue] getPendingUploads failed:", err); + } + return []; + } +} + +/** + * Transition pending items to sending status. + * Only transitions items that are currently 'pending'. + */ +export function markSending(db: Database, ids: number[]): boolean { + if (ids.length === 0) return true; + try { + const now = new Date().toISOString(); + const placeholders = ids.map(() => "?").join(","); + db.run( + `UPDATE upload_queue + SET status = 'sending', updated_at = ? + WHERE id IN (${placeholders}) AND status = 'pending'`, + [now, ...ids], + ); + return true; + } catch (err) { + if (process.env.DEBUG || process.env.NODE_ENV === "development") { + console.error("[alpha-upload/queue] markSending failed:", err); + } + return false; + } +} + +/** + * Transition sending items to sent status. + * Also updates the watermark per payload_type to the max id in the batch. + */ +export function markSent(db: Database, ids: number[]): boolean { + if (ids.length === 0) return true; + try { + const now = new Date().toISOString(); + const placeholders = ids.map(() => "?").join(","); + + db.run("BEGIN TRANSACTION"); + try { + const sendingRows = db + .query( + `SELECT id, payload_type + FROM upload_queue + WHERE id IN (${placeholders}) AND status = 'sending'`, + ) + .all(...ids) as Array<{ id: number; payload_type: string }>; + + // Mark items as sent + db.run( + `UPDATE upload_queue + SET status = 'sent', updated_at = ? 
+ WHERE id IN (${placeholders}) AND status = 'sending'`, + [now, ...ids], + ); + + // Update watermarks only for rows that actually transitioned from "sending". + const maxByType = new Map(); + for (const row of sendingRows) { + const current = maxByType.get(row.payload_type) ?? 0; + if (row.id > current) { + maxByType.set(row.payload_type, row.id); + } + } + + for (const [payloadType, maxId] of maxByType.entries()) { + db.run( + `INSERT INTO upload_watermarks (payload_type, last_uploaded_id, updated_at) + VALUES (?, ?, ?) + ON CONFLICT(payload_type) DO UPDATE SET + last_uploaded_id = excluded.last_uploaded_id, + updated_at = excluded.updated_at`, + [payloadType, maxId, now], + ); + } + + db.run("COMMIT"); + } catch (err) { + db.run("ROLLBACK"); + throw err; + } + return true; + } catch (err) { + if (process.env.DEBUG || process.env.NODE_ENV === "development") { + console.error("[alpha-upload/queue] markSent failed:", err); + } + return false; + } +} + +/** + * Transition a sending item to failed status. + * Increments the attempts counter and records the error message. + */ +export function markFailed(db: Database, id: number, error: string): boolean { + try { + const now = new Date().toISOString(); + db.run( + `UPDATE upload_queue + SET status = 'failed', attempts = attempts + 1, last_error = ?, updated_at = ? + WHERE id = ? AND status = 'sending'`, + [error, now, id], + ); + return true; + } catch (err) { + if (process.env.DEBUG || process.env.NODE_ENV === "development") { + console.error("[alpha-upload/queue] markFailed failed:", err); + } + return false; + } +} + +/** + * Get counts of items by status. 
+ */ +export function getQueueStats(db: Database): QueueStats { + try { + const row = db + .query( + `SELECT + COALESCE(SUM(CASE WHEN status = 'pending' THEN 1 ELSE 0 END), 0) as pending, + COALESCE(SUM(CASE WHEN status = 'sending' THEN 1 ELSE 0 END), 0) as sending, + COALESCE(SUM(CASE WHEN status = 'sent' THEN 1 ELSE 0 END), 0) as sent, + COALESCE(SUM(CASE WHEN status = 'failed' THEN 1 ELSE 0 END), 0) as failed + FROM upload_queue`, + ) + .get() as QueueStats; + return row; + } catch (err) { + if (process.env.DEBUG || process.env.NODE_ENV === "development") { + console.error("[alpha-upload/queue] getQueueStats failed:", err); + } + return { pending: 0, sending: 0, sent: 0, failed: 0 }; + } +} + +// -- Watermark operations ----------------------------------------------------- + +/** + * Read the last uploaded ID for a given payload type. + * Returns null if no watermark exists. + */ +export function readWatermark(db: Database, payloadType: string): number | null { + try { + const row = db + .query("SELECT last_uploaded_id FROM upload_watermarks WHERE payload_type = ?") + .get(payloadType) as { last_uploaded_id: number } | null; + return row?.last_uploaded_id ?? null; + } catch (err) { + if (process.env.DEBUG || process.env.NODE_ENV === "development") { + console.error("[alpha-upload/queue] readWatermark failed:", err); + } + return null; + } +} + +/** + * Upsert the watermark for a given payload type. + */ +export function writeWatermark(db: Database, payloadType: string, lastId: number): boolean { + try { + const now = new Date().toISOString(); + db.run( + `INSERT INTO upload_watermarks (payload_type, last_uploaded_id, updated_at) + VALUES (?, ?, ?) 
+ ON CONFLICT(payload_type) DO UPDATE SET + last_uploaded_id = excluded.last_uploaded_id, + updated_at = excluded.updated_at`, + [payloadType, lastId, now], + ); + return true; + } catch (err) { + if (process.env.DEBUG || process.env.NODE_ENV === "development") { + console.error("[alpha-upload/queue] writeWatermark failed:", err); + } + return false; + } +} diff --git a/cli/selftune/alpha-upload/stage-canonical.ts b/cli/selftune/alpha-upload/stage-canonical.ts new file mode 100644 index 00000000..6860e27b --- /dev/null +++ b/cli/selftune/alpha-upload/stage-canonical.ts @@ -0,0 +1,242 @@ +/** + * Canonical upload staging writer. + * + * Reads canonical records from the JSONL source-of-truth log and evolution + * evidence from SQLite, then inserts them into a single monotonic staging + * table for lossless upload batching. + * + * The staging table preserves the full canonical record JSON -- no field + * dropping, no hardcoding of provenance fields. + */ + +import type { Database } from "bun:sqlite"; +import { createHash } from "node:crypto"; +import type { CanonicalRecord } from "@selftune/telemetry-contract"; +import { isCanonicalRecord } from "@selftune/telemetry-contract"; +import { CANONICAL_LOG } from "../constants.js"; +import { getOrchestrateRuns, queryEvolutionEvidence } from "../localdb/queries.js"; +import { readJsonl } from "../utils/jsonl.js"; + +// -- Helpers ------------------------------------------------------------------ + +/** + * Generate a deterministic execution_fact_id from the record's natural key. + * + * Uses a SHA-256 hash of the composite key (session_id, occurred_at, prompt_id) + * so that re-staging the same record always produces the same ID. + */ +export function generateExecutionFactId(record: Record<string, unknown>): string { + const key = `${record.session_id}:${record.occurred_at}:${record.prompt_id ?? 
""}`; + return `ef_${createHash("sha256").update(key).digest("hex").slice(0, 16)}`; +} + +/** + * Generate a deterministic evidence_id from the evidence record's natural key. + * + * Uses a SHA-256 hash of the composite key (proposal_id, target, stage, + * skill_name, timestamp) so that re-staging the same evidence event always produces the + * same ID — but distinct events (e.g., two "validate" stages at different + * times) get different IDs. + */ +export function generateEvidenceId(record: Record): string { + const key = `${record.proposal_id ?? ""}:${record.target ?? ""}:${record.stage ?? ""}:${record.skill_name ?? ""}:${record.timestamp ?? record.normalized_at ?? ""}`; + return `ev_${createHash("sha256").update(key).digest("hex").slice(0, 16)}`; +} + +/** + * Enrich a raw parsed record: if it is an execution_fact missing + * execution_fact_id, inject a deterministic one. + * + * Returns the (possibly enriched) record unchanged for all other kinds. + */ +function enrichRecord(raw: Record): Record { + if (raw.record_kind !== "execution_fact") return raw; + if ( + raw.execution_fact_id && + typeof raw.execution_fact_id === "string" && + raw.execution_fact_id.length > 0 + ) { + return raw; + } + return { ...raw, execution_fact_id: generateExecutionFactId(raw) }; +} + +/** + * Read canonical records from JSONL, enriching execution_facts that are + * missing execution_fact_id before applying the canonical record validator. + * + * This ensures older canonical logs (written before execution_fact_id was + * required) can still be staged and uploaded. + */ +function readAndEnrichCanonicalRecords(logPath: string): CanonicalRecord[] { + const rawRecords = readJsonl>(logPath); + const enriched = rawRecords.map(enrichRecord); + return enriched.filter(isCanonicalRecord) as CanonicalRecord[]; +} + +/** + * Extract a stable record_id from a canonical record. 
+ * + * Uses the natural primary key for each record kind: + * - session: session_id + * - prompt: prompt_id + * - skill_invocation: skill_invocation_id + * - execution_fact: execution_fact_id + * - normalization_run: run_id + */ +function extractRecordId(record: CanonicalRecord): string { + switch (record.record_kind) { + case "session": + return record.session_id; + case "prompt": + return record.prompt_id; + case "skill_invocation": + return record.skill_invocation_id; + case "execution_fact": + return record.execution_fact_id; + case "normalization_run": + return record.run_id; + } +} + +/** + * Extract session_id from a canonical record (if the record has one). + */ +function extractSessionId(record: CanonicalRecord): string | null { + if ("session_id" in record) return record.session_id; + return null; +} + +/** + * Extract prompt_id from a canonical record (if the record has one). + */ +function extractPromptId(record: CanonicalRecord): string | null { + if ("prompt_id" in record) return record.prompt_id; + return null; +} + +/** + * Extract normalized_at from a canonical record. + */ +function extractNormalizedAt(record: CanonicalRecord): string { + return record.normalized_at; +} + +// -- Main staging function ---------------------------------------------------- + +/** + * Stage canonical records from the JSONL log and evolution evidence from SQLite + * into the canonical_upload_staging table. + * + * Uses INSERT OR IGNORE for dedup by (record_kind, record_id). 
+ * + * @param db - SQLite database handle + * @param logPath - Path to canonical JSONL log (defaults to CANONICAL_LOG) + * @returns Number of newly staged records + */ +export function stageCanonicalRecords(db: Database, logPath: string = CANONICAL_LOG): number { + let staged = 0; + const now = new Date().toISOString(); + + const stmt = db.prepare(` + INSERT OR IGNORE INTO canonical_upload_staging + (record_kind, record_id, record_json, session_id, prompt_id, normalized_at, staged_at) + VALUES (?, ?, ?, ?, ?, ?, ?) + `); + + // 1. Stage canonical records from JSONL (enriching missing execution_fact_id) + const records = readAndEnrichCanonicalRecords(logPath); + for (const record of records) { + const recordId = extractRecordId(record); + const result = stmt.run( + record.record_kind, + recordId, + JSON.stringify(record), + extractSessionId(record), + extractPromptId(record), + extractNormalizedAt(record), + now, + ); + if (result.changes > 0) staged++; + } + + // 2. Stage evolution evidence from SQLite + try { + const evidence = queryEvolutionEvidence(db); + for (const entry of evidence) { + const evidenceRecord: Record = { + skill_name: entry.skill_name, + skill_path: entry.skill_path, + proposal_id: entry.proposal_id, + target: entry.target, + stage: entry.stage, + rationale: entry.rationale, + confidence: entry.confidence, + details: entry.details, + original_text: entry.original_text, + proposed_text: entry.proposed_text, + eval_set_json: entry.eval_set, + validation_json: entry.validation, + timestamp: entry.timestamp, + }; + // Generate deterministic evidence_id if not already present + const evidenceId = generateEvidenceId(evidenceRecord); + evidenceRecord.evidence_id = evidenceId; + const recordId = evidenceId; + const recordJson = JSON.stringify(evidenceRecord); + + const result = stmt.run( + "evolution_evidence", + recordId, + recordJson, + null, // no session_id for evolution evidence + null, // no prompt_id + entry.timestamp, + now, + ); + if 
(result.changes > 0) staged++; + } + } catch (err) { + if (process.env.DEBUG || process.env.NODE_ENV === "development") { + console.error("[stage-canonical] failed to stage evolution evidence:", err); + } + } + + // 3. Stage orchestrate runs from SQLite + try { + const runs = getOrchestrateRuns(db, 10000); + for (const run of runs) { + const recordJson = JSON.stringify({ + run_id: run.run_id, + timestamp: run.timestamp, + elapsed_ms: run.elapsed_ms, + dry_run: run.dry_run, + approval_mode: run.approval_mode, + total_skills: run.total_skills, + evaluated: run.evaluated, + evolved: run.evolved, + deployed: run.deployed, + watched: run.watched, + skipped: run.skipped, + skill_actions: run.skill_actions, + }); + + const result = stmt.run( + "orchestrate_run", + run.run_id, + recordJson, + null, // no session_id for orchestrate runs + null, // no prompt_id + run.timestamp, + now, + ); + if (result.changes > 0) staged++; + } + } catch (err) { + if (process.env.DEBUG || process.env.NODE_ENV === "development") { + console.error("[stage-canonical] failed to stage orchestrate runs:", err); + } + } + + return staged; +} diff --git a/cli/selftune/auth/device-code.ts b/cli/selftune/auth/device-code.ts new file mode 100644 index 00000000..765fedb1 --- /dev/null +++ b/cli/selftune/auth/device-code.ts @@ -0,0 +1,110 @@ +/** + * Device-code authentication flow for CLI -> cloud linking. + * + * Flow: + * 1. CLI requests a device code from the cloud API + * 2. CLI prints verification URL + user code for the agent to relay + * 3. CLI attempts to open browser + * 4. CLI polls until approved, denied, or expired + */ + +export interface DeviceCodeGrant { + device_code: string; + user_code: string; + verification_url: string; + expires_in: number; + interval: number; +} + +export interface DeviceCodeResult { + api_key: string; + cloud_user_id: string; + org_id: string; +} + +/** + * Derive the cloud API base URL from SELFTUNE_ALPHA_ENDPOINT. 
+ * The endpoint is the push URL (e.g., https://api.selftune.dev/api/v1/push). + * Strip /push to get the base. + */ +export function getBaseUrl(): string { + const pushEndpoint = + process.env.SELFTUNE_ALPHA_ENDPOINT ?? "https://api.selftune.dev/api/v1/push"; + return pushEndpoint.replace(/\/push$/, ""); +} + +/** + * Request a new device code from the cloud API. + */ +export async function requestDeviceCode(): Promise<DeviceCodeGrant> { + const baseUrl = getBaseUrl(); + const response = await fetch(`${baseUrl}/device-code`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ client_id: "selftune-cli", scope: "push read" }), + }); + + if (!response.ok) { + throw new Error(`Device code request failed: ${response.status} ${response.statusText}`); + } + + return response.json() as Promise<DeviceCodeGrant>; +} + +/** + * Poll for device-code completion. Resolves when approved, rejects on expired/denied/timeout. + */ +export async function pollDeviceCode( + deviceCode: string, + interval: number, + expiresIn: number, +): Promise<DeviceCodeResult> { + const baseUrl = getBaseUrl(); + const deadline = Date.now() + expiresIn * 1000; + + while (Date.now() < deadline) { + await new Promise((resolve) => setTimeout(resolve, interval * 1000)); + + const response = await fetch(`${baseUrl}/device-code/poll`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ device_code: deviceCode, client_id: "selftune-cli" }), + }); + + // Parse body as JSON; on non-2xx responses the cloud may return + // JSON with a status field (e.g. 403 → { status: "denied" }) or + // non-JSON (e.g. 503 gateway error). Handle both gracefully. 
+ let result: Record; + try { + result = (await response.json()) as Record; + } catch { + // Non-JSON body — fall through to HTTP status check + if (!response.ok) { + throw new Error(`Poll failed: ${response.status}`); + } + // 2xx with unparseable body is unexpected; treat as pending + continue; + } + + if (result.status === "approved") { + return { + api_key: result.api_key, + cloud_user_id: result.cloud_user_id, + org_id: result.org_id, + }; + } + + if (result.status === "expired") throw new Error("Device code expired. Please retry."); + if (result.status === "denied") throw new Error("Device code denied by user."); + + // Non-2xx without a recognized status in the body is a genuine error + if (!response.ok) { + throw new Error(`Poll failed: ${response.status}`); + } + + // status === "pending" -- continue polling + process.stderr.write("."); + } + + throw new Error("Device code polling timed out."); +} diff --git a/cli/selftune/canonical-export.ts b/cli/selftune/canonical-export.ts index 9a5cd191..3331f220 100644 --- a/cli/selftune/canonical-export.ts +++ b/cli/selftune/canonical-export.ts @@ -83,6 +83,7 @@ export function loadCanonicalRecordsForExport( export function buildPushPayloadV2( records: CanonicalRecord[], evidenceEntries: EvolutionEvidenceEntry[] = [], + orchestrateRuns: Record[] = [], ): Record { const sessions = records.filter((record) => record.record_kind === "session"); const prompts = records.filter((record) => record.record_kind === "prompt"); @@ -103,17 +104,22 @@ export function buildPushPayloadV2( execution_facts: executionFacts, normalization_runs: normalizationRuns, evolution_evidence: evidenceEntries.map((entry) => ({ + evidence_id: entry.evidence_id, + timestamp: entry.timestamp, skill_name: entry.skill_name, + skill_path: entry.skill_path, proposal_id: entry.proposal_id, target: entry.target, stage: entry.stage, rationale: entry.rationale, confidence: entry.confidence, + details: entry.details, original_text: entry.original_text, 
proposed_text: entry.proposed_text, eval_set_json: entry.eval_set, validation_json: entry.validation, })), + orchestrate_runs: orchestrateRuns, }, }; } diff --git a/cli/selftune/constants.ts b/cli/selftune/constants.ts index 56172e19..d62e3253 100644 --- a/cli/selftune/constants.ts +++ b/cli/selftune/constants.ts @@ -5,10 +5,22 @@ import { homedir } from "node:os"; import { join } from "node:path"; -export const SELFTUNE_CONFIG_DIR = join(homedir(), ".selftune"); +const resolvedHome = process.env.SELFTUNE_HOME; +const defaultHome = resolvedHome ?? homedir(); +const claudeHomeDir = + process.env.SELFTUNE_CLAUDE_DIR ?? + (resolvedHome ? join(defaultHome, ".claude") : join(homedir(), ".claude")); +const openclawHomeDir = + process.env.SELFTUNE_OPENCLAW_DIR ?? + (resolvedHome ? join(defaultHome, ".openclaw") : join(homedir(), ".openclaw")); + +export const SELFTUNE_CONFIG_DIR = + (process.env.SELFTUNE_CONFIG_DIR || undefined) ?? + (resolvedHome ? join(defaultHome, ".selftune") : join(homedir(), ".selftune")); + export const SELFTUNE_CONFIG_PATH = join(SELFTUNE_CONFIG_DIR, "config.json"); -export const LOG_DIR = join(homedir(), ".claude"); +export const LOG_DIR = (process.env.SELFTUNE_LOG_DIR || undefined) ?? claudeHomeDir; export const TELEMETRY_LOG = join(LOG_DIR, "session_telemetry_log.jsonl"); export const SKILL_LOG = join(LOG_DIR, "skill_usage_log.jsonl"); @@ -106,22 +118,30 @@ export function canonicalSessionStatePath(sessionId: string): string { } /** Claude Code settings file path. */ -export const CLAUDE_SETTINGS_PATH = join(homedir(), ".claude", "settings.json"); +export const CLAUDE_SETTINGS_PATH = + process.env.SELFTUNE_CLAUDE_SETTINGS_PATH ?? join(claudeHomeDir, "settings.json"); /** Path to Claude Code projects directory containing session transcripts. */ -export const CLAUDE_CODE_PROJECTS_DIR = join(homedir(), ".claude", "projects"); +export const CLAUDE_CODE_PROJECTS_DIR = + process.env.SELFTUNE_CLAUDE_PROJECTS_DIR ?? 
join(claudeHomeDir, "projects"); /** Marker file tracking which Claude Code sessions have been ingested. */ -export const CLAUDE_CODE_MARKER = join(homedir(), ".claude", "claude_code_ingested_sessions.json"); +export const CLAUDE_CODE_MARKER = + process.env.SELFTUNE_CLAUDE_MARKER_PATH ?? + join(claudeHomeDir, "claude_code_ingested_sessions.json"); /** Marker file tracking which Codex rollout files have been ingested. */ -export const CODEX_INGEST_MARKER = join(homedir(), ".claude", "codex_ingested_rollouts.json"); +export const CODEX_INGEST_MARKER = + process.env.SELFTUNE_CODEX_MARKER_PATH ?? join(claudeHomeDir, "codex_ingested_rollouts.json"); /** Marker file tracking which OpenCode sessions have been ingested. */ -export const OPENCODE_INGEST_MARKER = join(homedir(), ".claude", "opencode_ingested_sessions.json"); +export const OPENCODE_INGEST_MARKER = + process.env.SELFTUNE_OPENCODE_MARKER_PATH ?? + join(claudeHomeDir, "opencode_ingested_sessions.json"); /** OpenClaw agents directory containing session data. */ -export const OPENCLAW_AGENTS_DIR = join(homedir(), ".openclaw", "agents"); +export const OPENCLAW_AGENTS_DIR = + process.env.SELFTUNE_OPENCLAW_AGENTS_DIR ?? join(openclawHomeDir, "agents"); /** Marker file tracking which OpenClaw sessions have been ingested. 
*/ export const OPENCLAW_INGEST_MARKER = join(SELFTUNE_CONFIG_DIR, "openclaw-ingest-marker.json"); diff --git a/cli/selftune/dashboard-contract.ts b/cli/selftune/dashboard-contract.ts index f849d270..42cc89e6 100644 --- a/cli/selftune/dashboard-contract.ts +++ b/cli/selftune/dashboard-contract.ts @@ -28,6 +28,7 @@ export interface EvalSnapshot { export interface EvolutionEntry { timestamp: string; proposal_id: string; + skill_name?: string; action: string; details: string; eval_snapshot?: EvalSnapshot | null; @@ -185,6 +186,25 @@ export interface OrchestrateRunsResponse { runs: OrchestrateRunReport[]; } +// -- Health endpoint response ------------------------------------------------- + +export interface HealthResponse { + ok: boolean; + service: string; + version: string; + spa: boolean; + v2_data_available: boolean; + workspace_root: string; + git_sha: string; + db_path: string; + log_dir: string; + config_dir: string; + watcher_mode: "jsonl" | "none"; + process_mode: "standalone" | "dev-server" | "test"; + host: string; + port: number; +} + // -- Doctor / health check types ---------------------------------------------- export type { DoctorResult, HealthCheck, HealthStatus } from "./types.js"; diff --git a/cli/selftune/dashboard-server.ts b/cli/selftune/dashboard-server.ts index e2fe3a6f..e26bb7da 100644 --- a/cli/selftune/dashboard-server.ts +++ b/cli/selftune/dashboard-server.ts @@ -20,10 +20,20 @@ import type { Database } from "bun:sqlite"; import { existsSync, type FSWatcher, watch as fsWatch, readFileSync } from "node:fs"; import { dirname, extname, isAbsolute, join, relative, resolve } from "node:path"; import type { BadgeFormat } from "./badge/badge-svg.js"; -import { EVOLUTION_AUDIT_LOG, QUERY_LOG, TELEMETRY_LOG } from "./constants.js"; -import type { OverviewResponse, SkillReportResponse } from "./dashboard-contract.js"; +import { + EVOLUTION_AUDIT_LOG, + LOG_DIR, + QUERY_LOG, + SELFTUNE_CONFIG_DIR, + TELEMETRY_LOG, +} from "./constants.js"; +import type 
{ + HealthResponse, + OverviewResponse, + SkillReportResponse, +} from "./dashboard-contract.js"; import { readEvidenceTrail } from "./evolution/evidence.js"; -import { closeSingleton, getDb } from "./localdb/db.js"; +import { closeSingleton, DB_PATH, getDb } from "./localdb/db.js"; import { materializeIncremental } from "./localdb/materialize.js"; import { queryEvolutionAudit, @@ -52,6 +62,7 @@ export interface DashboardServerOptions { host?: string; spaDir?: string; openBrowser?: boolean; + runtimeMode?: HealthResponse["process_mode"]; statusLoader?: () => StatusResult | Promise; evidenceLoader?: () => EvolutionEvidenceEntry[]; overviewLoader?: () => OverviewResponse; @@ -68,6 +79,21 @@ try { // fallback already set } +/** Resolve short git SHA once at startup (cached). */ +let cachedGitSha: string | null = null; +function getGitSha(): string { + if (cachedGitSha !== null) return cachedGitSha; + try { + const result = Bun.spawnSync(["git", "rev-parse", "--short", "HEAD"]); + cachedGitSha = result.stdout.toString().trim() || "unknown"; + } catch { + cachedGitSha = "unknown"; + } + return cachedGitSha; +} + +const WORKSPACE_ROOT = resolve(import.meta.dir, "..", ".."); + function findSpaDir(): string | null { const candidates = [ join(dirname(import.meta.dir), "..", "apps", "local-dashboard", "dist"), @@ -138,6 +164,7 @@ export async function startDashboardServer( const port = options?.port ?? 3141; const hostname = options?.host ?? "localhost"; const openBrowser = options?.openBrowser ?? true; + const runtimeMode = options?.runtimeMode ?? (import.meta.main ? "dev-server" : "test"); const getStatusResult = options?.statusLoader ?? computeStatusFromDb; const getEvidenceEntries = options?.evidenceLoader ?? 
readEvidenceTrail; const getOverviewResponse = options?.overviewLoader; @@ -212,6 +239,7 @@ export async function startDashboardServer( // -- File watchers on JSONL logs for push-based updates --------------------- const WATCHED_LOGS = [TELEMETRY_LOG, QUERY_LOG, EVOLUTION_AUDIT_LOG]; + const watchedLogPaths = new Set(WATCHED_LOGS); let fsDebounceTimer: ReturnType | null = null; const FS_DEBOUNCE_MS = 500; @@ -226,16 +254,48 @@ export async function startDashboardServer( } const fileWatchers: FSWatcher[] = []; - for (const logPath of WATCHED_LOGS) { - if (existsSync(logPath)) { - try { - fileWatchers.push(fsWatch(logPath, onLogFileChange)); - } catch { - // Non-fatal: fall back to polling if watch fails - } + const watchedFiles = new Set(); + let directoryWatcherActive = false; + + function registerFileWatcher(logPath: string): void { + if (watchedFiles.has(logPath) || !existsSync(logPath)) return; + try { + fileWatchers.push(fsWatch(logPath, onLogFileChange)); + watchedFiles.add(logPath); + } catch { + // Non-fatal: fall back to polling if watch fails } } + for (const logPath of WATCHED_LOGS) { + registerFileWatcher(logPath); + } + + try { + fileWatchers.push( + fsWatch(LOG_DIR, (_eventType, filename) => { + if (typeof filename !== "string" || filename.length === 0) return; + const fullPath = join(LOG_DIR, filename); + if (!watchedLogPaths.has(fullPath)) return; + registerFileWatcher(fullPath); + onLogFileChange(); + }), + ); + directoryWatcherActive = true; + } catch { + directoryWatcherActive = false; + } + + function getWatcherMode(): HealthResponse["watcher_mode"] { + return directoryWatcherActive || watchedFiles.size > 0 ? "jsonl" : "none"; + } + + if (runtimeMode !== "test" && getWatcherMode() === "jsonl") { + console.warn( + "Dashboard freshness mode: JSONL watcher invalidation (legacy). 
Live updates can miss SQLite-only writes until WAL cutover lands.", + ); + } + let cachedStatusResult: StatusResult | null = null; let lastStatusCacheRefreshAt = 0; let statusRefreshPromise: Promise | null = null; @@ -283,16 +343,23 @@ export async function startDashboardServer( // ---- GET /api/health ---- if (url.pathname === "/api/health" && req.method === "GET") { - return Response.json( - { - ok: true, - service: "selftune-dashboard", - version: selftuneVersion, - spa: Boolean(spaDir), - v2_data_available: Boolean(getOverviewResponse || db), - }, - { headers: corsHeaders() }, - ); + const healthResponse: HealthResponse = { + ok: true, + service: "selftune-dashboard", + version: selftuneVersion, + spa: Boolean(spaDir), + v2_data_available: Boolean(getOverviewResponse || db), + workspace_root: WORKSPACE_ROOT, + git_sha: getGitSha(), + db_path: DB_PATH, + log_dir: LOG_DIR, + config_dir: SELFTUNE_CONFIG_DIR, + watcher_mode: getWatcherMode(), + process_mode: runtimeMode, + host: hostname, + port: boundPort, + }; + return Response.json(healthResponse, { headers: corsHeaders() }); } // ---- GET /api/v2/events ---- SSE stream for live updates @@ -537,5 +604,10 @@ export async function startDashboardServer( // -- Direct execution (bun run dashboard-server.ts --port XXXX) --------------- if (import.meta.main) { const port = Number(process.argv.find((_, i, a) => a[i - 1] === "--port")) || 7888; - startDashboardServer({ port, openBrowser: false }); + const runtimeModeArg = process.argv.find((_, i, a) => a[i - 1] === "--runtime-mode"); + const runtimeMode = + runtimeModeArg === "standalone" || runtimeModeArg === "dev-server" || runtimeModeArg === "test" + ? 
runtimeModeArg + : "dev-server"; + startDashboardServer({ port, openBrowser: false, runtimeMode }); } diff --git a/cli/selftune/dashboard.ts b/cli/selftune/dashboard.ts index 80ac299e..6bb1e4b5 100644 --- a/cli/selftune/dashboard.ts +++ b/cli/selftune/dashboard.ts @@ -46,7 +46,7 @@ Usage: const openBrowser = !args.includes("--no-open"); const { startDashboardServer } = await import("./dashboard-server.js"); - const { stop } = await startDashboardServer({ port, openBrowser }); + const { stop } = await startDashboardServer({ port, openBrowser, runtimeMode: "standalone" }); await new Promise((resolve) => { let closed = false; const keepAlive = setInterval(() => {}, 1 << 30); diff --git a/cli/selftune/eval/synthetic-evals.ts b/cli/selftune/eval/synthetic-evals.ts index 88bb7e18..28d76f46 100644 --- a/cli/selftune/eval/synthetic-evals.ts +++ b/cli/selftune/eval/synthetic-evals.ts @@ -37,6 +37,7 @@ export function buildSyntheticPrompt( skillName: string, maxPositives: number, maxNegatives: number, + realExamples?: { positive: string[]; negative: string[] }, ): { system: string; user: string } { const system = `You are generating test queries for a coding agent skill. Given the skill description below, generate realistic user queries. @@ -55,13 +56,27 @@ For NEGATIVE queries (should NOT trigger this skill): Output as JSON array with no surrounding text: [{"query": "...", "should_trigger": true, "invocation_type": "explicit|implicit|contextual|negative"}]`; - const user = `Skill name: ${skillName} + let user = `Skill name: ${skillName} Skill content: ${skillContent} Generate exactly ${maxPositives} positive queries (should_trigger: true) and ${maxNegatives} negative queries (should_trigger: false). 
Return ONLY the JSON array.`; + if (realExamples && (realExamples.positive.length > 0 || realExamples.negative.length > 0)) { + const parts: string[] = ["\n\nReal user queries for style and phrasing reference:"]; + if (realExamples.positive.length > 0) { + parts.push("Queries that triggered this skill:"); + parts.push(...realExamples.positive.map((q) => ` - "${q}"`)); + } + if (realExamples.negative.length > 0) { + parts.push("Queries that did NOT trigger (general queries):"); + parts.push(...realExamples.negative.map((q) => ` - "${q}"`)); + } + parts.push("\nGenerate queries that match this natural phrasing style."); + user += parts.join("\n"); + } + return { system, user }; } @@ -160,11 +175,49 @@ export async function generateSyntheticEvals( const skillContent = readFileSync(skillPath, "utf-8"); + // Load real query examples from the database for few-shot style guidance. + // Uses dynamic imports since SQLite may not be available in all contexts. + let realExamples: { positive: string[]; negative: string[] } | undefined; + try { + const { getDb } = await import("../localdb/db.js"); + const { querySkillUsageRecords, queryQueryLog } = await import("../localdb/queries.js"); + const { isHighConfidencePositiveSkillRecord } = await import( + "../utils/skill-usage-confidence.js" + ); + + const db = getDb(); + + // Positives: high-confidence triggered records for this skill + const skillRecords = querySkillUsageRecords(db); + const positive = skillRecords + .filter((r) => isHighConfidencePositiveSkillRecord(r, skillName)) + .map((r) => r.query) + .filter((q): q is string => typeof q === "string" && q.length > 0) + .slice(0, 5); + + // Negatives: from all_queries, excluding known positives + const posSet = new Set(positive.map((q: string) => q.toLowerCase())); + const allQueries = queryQueryLog(db); + const negative = allQueries + .map((r) => r.query) + .filter( + (q): q is string => typeof q === "string" && q.length > 0 && !posSet.has(q.toLowerCase()), + ) + .slice(0, 
5); + + if (positive.length > 0) { + realExamples = { positive, negative }; + } + } catch { + // fail-open: synthetic gen works without real examples + } + const { system, user } = buildSyntheticPrompt( skillContent, skillName, maxPositives, maxNegatives, + realExamples, ); const raw = await callLlm(system, user, agent, options.modelFlag); diff --git a/cli/selftune/evolution/constitutional.ts b/cli/selftune/evolution/constitutional.ts new file mode 100644 index 00000000..4c5e131d --- /dev/null +++ b/cli/selftune/evolution/constitutional.ts @@ -0,0 +1,176 @@ +/** + * constitutional.ts + * + * Deterministic pre-validation gate for evolution proposals. Runs before + * confidence checks and LLM validation to reject obviously bad proposals + * cheaply — no LLM calls required. + * + * Four principles: + * 1. Size constraint — char limit + word-count ratio + * 2. No XML injection — reject embedded XML tags + * 3. No unbounded broadening — reject bare "all/any/every/everything" + * 4. Anchor preservation — preserve USE WHEN triggers and $skillName refs + */ + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +export interface ConstitutionalResult { + passed: boolean; + violations: string[]; +} + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +function wordCount(text: string): number { + return text.split(/\s+/).filter(Boolean).length; +} + +/** + * Extract the sentence containing the match index. Splits on sentence-ending + * punctuation (`.` `!` `?`) followed by whitespace, but avoids splitting on + * common abbreviations like "e.g." or "i.e.". + */ +function sentenceContaining(text: string, matchIndex: number): string { + // Split only when the next token looks like a new sentence. 
+ const sentences = text.split(/(?<=[.!?])\s+(?=[A-Z0-9"'‘“])/); + let offset = 0; + for (const sentence of sentences) { + const realOffset = text.indexOf(sentence, offset); + if (realOffset === -1) break; + if (matchIndex >= realOffset && matchIndex < realOffset + sentence.length) { + return sentence; + } + offset = realOffset + sentence.length; + } + return text; // fallback: treat entire text as one sentence +} + +const ENUMERATION_MARKERS = /\b(?:including|such as|like)\b|e\.g\.|,\s*\w+\s*,/i; + +// --------------------------------------------------------------------------- +// Main check +// --------------------------------------------------------------------------- + +export function checkConstitution( + proposed: string, + original: string, + _skillName: string, +): ConstitutionalResult { + const violations: string[] = []; + + // ------------------------------------------------------------------------- + // Principle 1: Size constraint + // ------------------------------------------------------------------------- + if (proposed.length > 8192) { + violations.push(`Size: ${proposed.length} chars exceeds 8192 limit`); + } + + const origWords = wordCount(original); + const propWords = wordCount(proposed); + + if (origWords > 0) { + const ratio = propWords / origWords; + if (ratio > 3.0) { + violations.push( + `Size: ${propWords} words is ${ratio.toFixed(1)}x original (${origWords} words), exceeds 3.0x limit`, + ); + } + if (ratio < 0.3) { + violations.push( + `Size: ${propWords} words is ${ratio.toFixed(1)}x original (${origWords} words), below 0.3x limit`, + ); + } + } + + // ------------------------------------------------------------------------- + // Principle 2: No XML injection + // ------------------------------------------------------------------------- + if (/<[a-zA-Z][^>]*>/.test(proposed)) { + violations.push("XML injection: proposed description contains XML/HTML tags"); + } + + // 
------------------------------------------------------------------------- + // Principle 3: No unbounded broadening + // ------------------------------------------------------------------------- + const broadenPattern = /\b(all|any|every|everything)\b/gi; + let match: RegExpExecArray | null = broadenPattern.exec(proposed); + while (match !== null) { + const sentence = sentenceContaining(proposed, match.index); + if (!ENUMERATION_MARKERS.test(sentence)) { + violations.push( + `Unbounded broadening: "${match[0]}" at position ${match.index} without enumeration qualifier`, + ); + } + match = broadenPattern.exec(proposed); + } + + // ------------------------------------------------------------------------- + // Principle 4: Anchor preservation + // ------------------------------------------------------------------------- + // Check for USE WHEN triggers + if (/USE WHEN/i.test(original) && !/USE WHEN/i.test(proposed)) { + violations.push( + 'Anchor: original contains "USE WHEN" trigger phrase that is missing in proposed', + ); + } + + // Check for $variable references + const dollarRefs = original.match(/\$[A-Za-z0-9_-]+/g); + if (dollarRefs) { + const proposedDollarRefs = new Set(proposed.match(/\$[A-Za-z0-9_-]+/g) ?? []); + for (const ref of dollarRefs) { + if (!proposedDollarRefs.has(ref)) { + violations.push(`Anchor: original contains "${ref}" reference that is missing in proposed`); + } + } + } + + return { + passed: violations.length === 0, + violations, + }; +} + +// --------------------------------------------------------------------------- +// Size-only check (for body evolution) +// --------------------------------------------------------------------------- + +/** + * Body-specific constitutional check. Only enforces the word-count ratio + * (0.3x–3.0x of original). The 8192-char absolute limit does not apply + * to body text since bodies are typically much larger than descriptions. 
+ */ +export function checkConstitutionSizeOnly( + proposed: string, + original: string, +): ConstitutionalResult { + const violations: string[] = []; + + const origWords = wordCount(original); + const propWords = wordCount(proposed); + + // Only enforce word-count ratio when the original is substantial enough + // for the ratio to be meaningful (at least 10 words). + if (origWords >= 10) { + const ratio = propWords / origWords; + if (ratio > 3.0) { + violations.push( + `Size: ${propWords} words is ${ratio.toFixed(1)}x original (${origWords} words), exceeds 3.0x limit`, + ); + } + if (ratio < 0.3) { + violations.push( + `Size: ${propWords} words is ${ratio.toFixed(1)}x original (${origWords} words), below 0.3x limit`, + ); + } + } + + return { + passed: violations.length === 0, + violations, + }; +} diff --git a/cli/selftune/evolution/evolve-body.ts b/cli/selftune/evolution/evolve-body.ts index a098816b..4324ea55 100644 --- a/cli/selftune/evolution/evolve-body.ts +++ b/cli/selftune/evolution/evolve-body.ts @@ -27,10 +27,11 @@ import type { } from "../types.js"; import { appendAuditEntry } from "./audit.js"; +import { checkConstitutionSizeOnly } from "./constitutional.js"; import { parseSkillSections, replaceBody, replaceSection } from "./deploy-proposal.js"; import { appendEvidenceEntry } from "./evidence.js"; import { extractFailurePatterns } from "./extract-patterns.js"; -import { generateBodyProposal } from "./propose-body.js"; +import { type ExecutionContext, generateBodyProposal } from "./propose-body.js"; import { generateRoutingProposal } from "./propose-routing.js"; import { refineBodyProposal } from "./refine-body.js"; import { validateBodyProposal } from "./validate-body.js"; @@ -227,6 +228,64 @@ export async function evolveBody( const missedQueries = failurePatterns.flatMap((p) => p.missed_queries); + // Compute execution context from session telemetry (fail-open) + let executionContext: ExecutionContext | undefined; + try { + const { 
querySessionTelemetry } = await import("../localdb/queries.js"); + const db = getDb(); + const allTelemetry = querySessionTelemetry(db); + + // Find session IDs that used this skill + const skillSessionIds = new Set( + skillUsage + .filter((r) => r.skill_name?.toLowerCase() === skillName.toLowerCase() && r.triggered) + .map((r) => r.session_id), + ); + + // Filter telemetry to skill sessions + const telemetryForSkill = allTelemetry.filter((t) => skillSessionIds.has(t.session_id)); + + if (telemetryForSkill.length > 0) { + const mean = (arr: number[]) => arr.reduce((a, b) => a + b, 0) / arr.length; + + const toolCallCounts = telemetryForSkill.map((t) => t.total_tool_calls ?? 0); + const errorCounts = telemetryForSkill.map((t) => t.errors_encountered ?? 0); + const turnCounts = telemetryForSkill.map((t) => t.assistant_turns ?? 0); + + // Count tool frequency across all sessions + const toolFreq = new Map(); + const failureToolFreq = new Map(); + + for (const t of telemetryForSkill) { + const tools: Record = t.tool_calls ?? {}; + const isFailure = (t.errors_encountered ?? 0) > 2; + + for (const [tool, count] of Object.entries(tools)) { + toolFreq.set(tool, (toolFreq.get(tool) ?? 0) + count); + if (isFailure) { + failureToolFreq.set(tool, (failureToolFreq.get(tool) ?? 
0) + count); + } + } + } + + const topN = (freq: Map, n: number) => + [...freq.entries()] + .sort((a, b) => b[1] - a[1]) + .slice(0, n) + .map(([k]) => k); + + executionContext = { + avgToolCalls: mean(toolCallCounts), + avgErrors: mean(errorCounts), + avgTurns: mean(turnCounts), + commonTools: topN(toolFreq, 5), + failureTools: topN(failureToolFreq, 3), + }; + } + } catch { + // fail-open: body evolution works without execution context + } + // Step 4: Generate -> validate -> refine loop let lastProposal: BodyEvolutionProposal | null = null; let lastValidation: BodyValidationResult | null = null; @@ -258,6 +317,7 @@ export async function evolveBody( teacherAgent, teacherModel, fewShotExamples, + executionContext, ); } } else if (lastProposal && lastValidation) { @@ -290,6 +350,38 @@ export async function evolveBody( eval_set: evalSet, }); + // Constitutional size check (deterministic, pre-validation — body only) + if (target === "body") { + const constitution = checkConstitutionSizeOnly( + proposal.proposed_body, + proposal.original_body, + ); + if (!constitution.passed) { + const reason = `Constitutional: ${constitution.violations.join("; ")}`; + recordAudit(proposal.proposal_id, "rejected", reason); + recordEvidence({ + timestamp: new Date().toISOString(), + proposal_id: proposal.proposal_id, + skill_name: skillName, + skill_path: skillPath, + target, + stage: "rejected", + rationale: proposal.rationale, + confidence: proposal.confidence, + details: reason, + original_text: proposal.original_body, + proposed_text: proposal.proposed_body, + }); + return { + proposal: lastProposal, + validation: null, + deployed: false, + auditEntries, + reason, + }; + } + } + // Check confidence threshold if (proposal.confidence < confidenceThreshold) { recordAudit( diff --git a/cli/selftune/evolution/evolve.ts b/cli/selftune/evolution/evolve.ts index 0e15a688..a2f8b6bc 100644 --- a/cli/selftune/evolution/evolve.ts +++ b/cli/selftune/evolution/evolve.ts @@ -9,7 +9,7 @@ import { 
copyFileSync, existsSync, readFileSync, writeFileSync } from "node:fs"; import { parseArgs } from "node:util"; -import { QUERY_LOG, SKILL_LOG, TELEMETRY_LOG } from "../constants.js"; +import { QUERY_LOG, SKILL_LOG } from "../constants.js"; import type { BaselineMeasurement } from "../eval/baseline.js"; import { measureBaseline } from "../eval/baseline.js"; import { buildEvalSet } from "../eval/hooks-to-evals.js"; @@ -40,6 +40,7 @@ import { parseFrontmatter, replaceFrontmatterDescription } from "../utils/frontm import { createEvolveTUI } from "../utils/tui.js"; import { appendAuditEntry } from "./audit.js"; +import { checkConstitution } from "./constitutional.js"; import { appendEvidenceEntry } from "./evidence.js"; import { extractFailurePatterns } from "./extract-patterns.js"; import { @@ -129,6 +130,7 @@ function createAuditEntry( details: string, evalSnapshot?: EvalPassRate, skillName?: string, + iterationsUsed?: number, ): EvolutionAuditEntry { return { timestamp: new Date().toISOString(), @@ -137,6 +139,7 @@ function createAuditEntry( details, ...(skillName ? { skill_name: skillName } : {}), ...(evalSnapshot ? { eval_snapshot: evalSnapshot } : {}), + ...(iterationsUsed != null ? { iterations_used: iterationsUsed } : {}), }; } @@ -210,8 +213,16 @@ export async function evolve( action: EvolutionAuditEntry["action"], details: string, evalSnapshot?: EvalPassRate, + iterationsUsed?: number, ): void { - const entry = createAuditEntry(proposalId, action, details, evalSnapshot, skillName); + const entry = createAuditEntry( + proposalId, + action, + details, + evalSnapshot, + skillName, + iterationsUsed, + ); auditEntries.push(entry); try { _appendAuditEntry(entry); @@ -353,6 +364,33 @@ export async function evolve( `Extracted ${failurePatterns.length} failure pattern(s) (${totalMissed} missed queries)`, ); + // Compute aggregate grading metrics for proposal context + const aggregateMetrics = options.gradingResults?.length + ? 
(() => { + const scores = options.gradingResults.map( + (r) => r.summary.mean_score ?? r.summary.pass_rate, + ); + const meanScore = scores.reduce((a, b) => a + b, 0) / scores.length; + const scoreStdDev = Math.sqrt( + scores.reduce((sum, s) => sum + (s - meanScore) ** 2, 0) / scores.length, + ); + const failedRate = + options.gradingResults.filter((r) => r.summary.failed > 0).length / + options.gradingResults.length; + const errors = options.gradingResults.map( + (r) => r.execution_metrics?.errors_encountered ?? 0, + ); + const meanErrors = errors.reduce((a, b) => a + b, 0) / errors.length; + return { + mean_score: meanScore, + score_std_dev: scoreStdDev, + failed_session_rate: failedRate, + mean_errors: meanErrors, + total_graded: options.gradingResults.length, + }; + })() + : undefined; + // ----------------------------------------------------------------------- // Step 5: Cold-start bootstrap or early exit if no patterns // ----------------------------------------------------------------------- @@ -423,6 +461,8 @@ export async function evolve( ); } + let iterationsCompleted = 0; + if (paretoEnabled && candidateCount > 1) { // Generate N candidates in parallel const candidates = await generateMultipleProposals( @@ -434,6 +474,7 @@ export async function evolve( agent, candidateCount, options.proposalModel, + aggregateMetrics, ); // Filter by confidence threshold @@ -473,6 +514,32 @@ export async function evolve( eval_set: evalSet, }); + // Constitutional check before validation (same gate as retry flow) + const constitution = checkConstitution( + proposal.proposed_description, + currentDescription, + skillName, + ); + if (!constitution.passed) { + const reason = `Constitutional: ${constitution.violations.join("; ")}`; + recordAudit(proposal.proposal_id, "rejected", reason); + recordEvidence({ + timestamp: new Date().toISOString(), + proposal_id: proposal.proposal_id, + skill_name: skillName, + skill_path: skillPath, + target: "description", + stage: "rejected", + 
rationale: proposal.rationale, + confidence: proposal.confidence, + details: reason, + original_text: proposal.original_description, + proposed_text: proposal.proposed_description, + eval_set: evalSet, + }); + continue; + } + const validation = await _validateProposal( proposal, evalSet, @@ -537,6 +604,7 @@ export async function evolve( lastProposal = best.proposal; lastValidation = best.validation; + iterationsCompleted = 1; // Pareto selection is a single-pass // Skip the standard retry loop — we already have our result } else { @@ -544,6 +612,7 @@ export async function evolve( let feedbackReason = ""; for (let iteration = 0; iteration < maxIterations; iteration++) { + iterationsCompleted = iteration + 1; // Step 7: Generate proposal const effectiveMissedQueries = feedbackReason ? [...missedQueries, `[Previous attempt failed: ${feedbackReason}]`] @@ -558,6 +627,7 @@ export async function evolve( skillPath, agent, options.proposalModel, + aggregateMetrics, ); llmCallCount++; @@ -585,6 +655,39 @@ export async function evolve( eval_set: evalSet, }); + // Step 8b: Constitutional check (deterministic, pre-validation) + const constitution = checkConstitution( + proposal.proposed_description, + currentDescription, + skillName, + ); + if (!constitution.passed) { + feedbackReason = `Constitutional: ${constitution.violations.join("; ")}`; + recordAudit(proposal.proposal_id, "rejected", feedbackReason); + recordEvidence({ + timestamp: new Date().toISOString(), + proposal_id: proposal.proposal_id, + skill_name: skillName, + skill_path: skillPath, + target: "description", + stage: "rejected", + rationale: proposal.rationale, + confidence: proposal.confidence, + details: feedbackReason, + }); + if (iteration === maxIterations - 1) { + finishTui(); + return withStats({ + proposal: lastProposal, + validation: null, + deployed: false, + auditEntries, + reason: feedbackReason, + }); + } + continue; + } + // Step 9: Check confidence threshold if (proposal.confidence < 
confidenceThreshold) { feedbackReason = `Confidence ${proposal.confidence} below threshold ${confidenceThreshold}`; @@ -758,6 +861,26 @@ export async function evolve( ); if (!baselineResult.adds_value) { + recordAudit( + lastProposal.proposal_id, + "rejected", + `Baseline gate failed: lift=${baselineResult.lift.toFixed(3)} below 0.05 threshold`, + ); + recordEvidence({ + timestamp: new Date().toISOString(), + proposal_id: lastProposal.proposal_id, + skill_name: skillName, + skill_path: skillPath, + target: "description", + stage: "rejected", + rationale: lastProposal.rationale, + confidence: lastProposal.confidence, + details: `Baseline gate failed: lift=${baselineResult.lift.toFixed(3)} below 0.05 threshold`, + validation: { + improved: false, + net_change: baselineResult.lift, + }, + }); finishTui(); return withStats({ proposal: lastProposal, @@ -777,17 +900,37 @@ export async function evolve( if (options.gateModel && lastProposal && lastValidation?.improved) { tui.step(`Gate validation (${options.gateModel})...`); gateValidation = await _gateValidateProposal(lastProposal, evalSet, agent, options.gateModel); + llmCallCount++; tui.done( `Gate (${options.gateModel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`, ); - recordAudit( - lastProposal.proposal_id, - "validated", - `Gate validation (${options.gateModel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`, - ); - if (!gateValidation.improved) { + recordAudit( + lastProposal.proposal_id, + "rejected", + `Gate validation failed (${options.gateModel}): net_change=${gateValidation.net_change.toFixed(3)}`, + ); + recordEvidence({ + timestamp: new Date().toISOString(), + proposal_id: lastProposal.proposal_id, + skill_name: skillName, + skill_path: skillPath, + target: "description", + stage: "rejected", + rationale: lastProposal.rationale, + confidence: lastProposal.confidence, + details: `Gate validation failed 
(${options.gateModel}): net_change=${gateValidation.net_change.toFixed(3)}`, + validation: { + improved: gateValidation.improved, + before_pass_rate: gateValidation.before_pass_rate, + after_pass_rate: gateValidation.after_pass_rate, + net_change: gateValidation.net_change, + regressions: gateValidation.regressions, + new_passes: gateValidation.new_passes, + per_entry_results: gateValidation.per_entry_results, + }, + }); finishTui(); return withStats({ proposal: lastProposal, @@ -799,6 +942,12 @@ export async function evolve( ...(baselineResult ? { baselineResult } : {}), }); } + + recordAudit( + lastProposal.proposal_id, + "validated", + `Gate validation (${options.gateModel}): improved=${gateValidation.improved}, net_change=${gateValidation.net_change.toFixed(3)}`, + ); } // ----------------------------------------------------------------------- @@ -826,12 +975,18 @@ export async function evolve( console.error("------------------------------\n"); } - recordAudit(lastProposal.proposal_id, "deployed", `Deployed proposal for ${skillName}`, { - total: evalSet.length, - passed: Math.round(lastValidation.after_pass_rate * evalSet.length), - failed: evalSet.length - Math.round(lastValidation.after_pass_rate * evalSet.length), - pass_rate: lastValidation.after_pass_rate, - }); + recordAudit( + lastProposal.proposal_id, + "deployed", + `Deployed proposal for ${skillName}`, + { + total: evalSet.length, + passed: Math.round(lastValidation.after_pass_rate * evalSet.length), + failed: evalSet.length - Math.round(lastValidation.after_pass_rate * evalSet.length), + pass_rate: lastValidation.after_pass_rate, + }, + iterationsCompleted, + ); recordEvidence({ timestamp: new Date().toISOString(), proposal_id: lastProposal.proposal_id, @@ -1135,7 +1290,7 @@ if (import.meta.main) { console.error( "\nTroubleshooting:\n" + " - Verify --skill-path points to a valid SKILL.md file\n" + - " - Ensure eval data exists (run `selftune evals` first) or pass --eval-set\n" + + " - Ensure eval 
data exists (run `selftune eval generate` first) or pass --eval-set\n" + " - Check that ANTHROPIC_API_KEY is set if using Claude\n" + " - Re-run with --verbose for full diagnostic output", ); diff --git a/cli/selftune/evolution/propose-body.ts b/cli/selftune/evolution/propose-body.ts index b272f6ff..3ed51682 100644 --- a/cli/selftune/evolution/propose-body.ts +++ b/cli/selftune/evolution/propose-body.ts @@ -37,6 +37,15 @@ Do NOT include any text outside the JSON object.`; // Prompt builder // --------------------------------------------------------------------------- +/** Execution telemetry context for body evolution proposals. */ +export interface ExecutionContext { + avgToolCalls: number; + avgErrors: number; + avgTurns: number; + commonTools: string[]; + failureTools: string[]; +} + /** Build the user prompt for full body generation. */ export function buildBodyGenerationPrompt( currentContent: string, @@ -44,6 +53,7 @@ export function buildBodyGenerationPrompt( missedQueries: string[], skillName: string, fewShotExamples?: string[], + executionContext?: ExecutionContext, ): string { const patternLines = failurePatterns.map((p) => { const queries = p.missed_queries.map((q) => ` - "${q}"`).join("\n"); @@ -66,6 +76,11 @@ export function buildBodyGenerationPrompt( const feedbackSection = feedbackLines.length > 0 ? `\n\nStructured Failure Analysis:\n${feedbackLines.join("\n")}` : ""; + // Build execution telemetry section if provided + const executionSection = executionContext + ? 
`\n\nExecution Profile (from recent sessions using this skill):\n Average tool calls per session: ${executionContext.avgToolCalls.toFixed(1)}\n Average errors per session: ${executionContext.avgErrors.toFixed(1)}\n Average assistant turns: ${executionContext.avgTurns.toFixed(1)}\n Most-used tools in successful sessions: ${executionContext.commonTools.join(", ") || "none"}\n Tools correlated with failures: ${executionContext.failureTools.join(", ") || "none"}` + : ""; + // Build few-shot examples section if provided const fewShotSection = fewShotExamples && fewShotExamples.length > 0 @@ -81,7 +96,7 @@ Failure Patterns: ${patternLines.join("\n\n")} All Missed Queries: -${missedLines}${feedbackSection}${fewShotSection} +${missedLines}${feedbackSection}${executionSection}${fewShotSection} Generate an improved full body for the "${skillName}" skill that would correctly handle the missed queries listed above. The body should include everything below the # Title line: description, ## Workflow Routing table, and any other sections. 
Output ONLY a JSON object with "proposed_body", "rationale", and "confidence" fields.`; } @@ -144,6 +159,7 @@ export async function generateBodyProposal( agent: string, modelFlag?: string, fewShotExamples?: string[], + executionContext?: ExecutionContext, ): Promise { const prompt = buildBodyGenerationPrompt( currentContent, @@ -151,6 +167,7 @@ export async function generateBodyProposal( missedQueries, skillName, fewShotExamples, + executionContext, ); const rawResponse = await callLlm(BODY_GENERATOR_SYSTEM, prompt, agent, modelFlag); const { proposed_body, rationale, confidence } = parseBodyProposalResponse(rawResponse); diff --git a/cli/selftune/evolution/propose-description.ts b/cli/selftune/evolution/propose-description.ts index b4297a5f..5fe6a187 100644 --- a/cli/selftune/evolution/propose-description.ts +++ b/cli/selftune/evolution/propose-description.ts @@ -36,12 +36,22 @@ Do NOT include any text outside the JSON object.`; // Prompt builder // --------------------------------------------------------------------------- +/** Aggregate session quality metrics passed into proposal prompts. */ +export interface AggregateMetrics { + mean_score: number; + score_std_dev: number; + failed_session_rate: number; + mean_errors: number; + total_graded: number; +} + /** Build the user prompt for the LLM with context about failures. */ export function buildProposalPrompt( currentDescription: string, failurePatterns: FailurePattern[], missedQueries: string[], skillName: string, + aggregateMetrics?: AggregateMetrics, ): string { const patternLines = failurePatterns.map((p) => { const queries = p.missed_queries.map((q) => ` - "${q}"`).join("\n"); @@ -67,6 +77,10 @@ export function buildProposalPrompt( const feedbackSection = feedbackLines.length > 0 ? `\n\nStructured Failure Analysis:\n${feedbackLines.join("\n")}` : ""; + const metricsSection = aggregateMetrics + ? 
`\n\nSession Quality Context:\n Mean grading score: ${aggregateMetrics.mean_score.toFixed(2)}/1.0 (σ=${aggregateMetrics.score_std_dev.toFixed(2)})\n Failed session rate: ${(aggregateMetrics.failed_session_rate * 100).toFixed(0)}%\n Mean execution errors per session: ${aggregateMetrics.mean_errors.toFixed(1)}\n Sessions graded: ${aggregateMetrics.total_graded}` + : ""; + return `Skill Name: ${skillName} Current Description: @@ -76,7 +90,7 @@ Failure Patterns: ${patternLines.join("\n\n")} All Missed Queries: -${missedLines}${feedbackSection} +${missedLines}${feedbackSection}${metricsSection} Propose an improved description for the "${skillName}" skill that would correctly route the missed queries listed above. Output ONLY a JSON object with "proposed_description", "rationale", and "confidence" fields.`; } @@ -142,6 +156,7 @@ export async function generateMultipleProposals( agent: string, count = 3, modelFlag?: string, + aggregateMetrics?: AggregateMetrics, ): Promise { const variations = buildPromptVariations( currentDescription, @@ -149,6 +164,7 @@ export async function generateMultipleProposals( missedQueries, skillName, count, + aggregateMetrics, ); const proposals = await Promise.all( @@ -187,6 +203,7 @@ export function buildPromptVariations( missedQueries: string[], skillName: string, count: number, + aggregateMetrics?: AggregateMetrics, ): string[] { const biases: string[] = [ "Focus especially on improving explicit invocation (direct mentions of the skill).", @@ -199,6 +216,7 @@ export function buildPromptVariations( failurePatterns, missedQueries, skillName, + aggregateMetrics, ); const variations: string[] = []; @@ -219,8 +237,15 @@ export async function generateProposal( skillPath: string, agent: string, modelFlag?: string, + aggregateMetrics?: AggregateMetrics, ): Promise { - const prompt = buildProposalPrompt(currentDescription, failurePatterns, missedQueries, skillName); + const prompt = buildProposalPrompt( + currentDescription, + failurePatterns, + 
missedQueries, + skillName, + aggregateMetrics, + ); const rawResponse = await callLlm(PROPOSER_SYSTEM, prompt, agent, modelFlag); const { proposed_description, rationale, confidence } = parseProposalResponse(rawResponse); diff --git a/cli/selftune/hooks/skill-eval.ts b/cli/selftune/hooks/skill-eval.ts index 92b17b10..09249c09 100644 --- a/cli/selftune/hooks/skill-eval.ts +++ b/cli/selftune/hooks/skill-eval.ts @@ -270,7 +270,7 @@ function detectAgentType(transcriptPath: string): string { async function processSkillToolUse( payload: PostToolUsePayload, - logPath: string, + _logPath: string, canonicalLogPath: string, promptStatePath?: string, ): Promise { diff --git a/cli/selftune/index.ts b/cli/selftune/index.ts index 3a943b60..e798c8aa 100644 --- a/cli/selftune/index.ts +++ b/cli/selftune/index.ts @@ -24,6 +24,7 @@ * selftune export — Export SQLite data to JSONL files * selftune export-canonical — Export canonical telemetry for downstream ingestion * selftune telemetry — Manage anonymous usage analytics (status, enable, disable) + * selftune alpha — Alpha program management (upload) * selftune hook — Run a hook by name (prompt-log, session-stop, etc.) */ @@ -56,6 +57,7 @@ Commands: repair-skill-usage Rebuild trustworthy skill usage from transcripts export Export SQLite data to JSONL files export-canonical Export canonical telemetry for downstream ingestion + alpha Alpha program management (upload) telemetry Manage anonymous usage analytics (status, enable, disable) hook Run a hook by name (prompt-log, session-stop, etc.) 
@@ -551,6 +553,193 @@ Options: await cliMain(); break; } + case "alpha": { + const sub = process.argv[2]; + if (!sub || sub === "--help" || sub === "-h") { + console.log(`selftune alpha — Alpha program management + +Usage: + selftune alpha [options] + +Subcommands: + upload Run a manual alpha data upload cycle + relink Re-authenticate with the cloud (revokes old key, issues new one) + +Run 'selftune alpha --help' for subcommand-specific options.`); + process.exit(0); + } + process.argv = [process.argv[0], process.argv[1], ...process.argv.slice(3)]; + switch (sub) { + case "upload": { + const { parseArgs } = await import("node:util"); + let values: ReturnType["values"]; + try { + ({ values } = parseArgs({ + options: { + "dry-run": { type: "boolean", default: false }, + help: { type: "boolean", short: "h", default: false }, + }, + strict: true, + })); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + console.error(`Invalid arguments: ${message}`); + process.exit(1); + } + if (values.help) { + console.log(`selftune alpha upload — Run a manual alpha data upload cycle + +Usage: + selftune alpha upload [--dry-run] + +Options: + --dry-run Log what would be uploaded without sending + -h, --help Show this help message + +Output: + JSON summary: { enrolled, prepared, sent, failed, skipped, guidance? 
}`); + process.exit(0); + } + + const { SELFTUNE_CONFIG_PATH } = await import("./constants.js"); + const { getAlphaGuidance } = await import("./agent-guidance.js"); + const { readAlphaIdentity } = await import("./alpha-identity.js"); + const { getDb } = await import("./localdb/db.js"); + const { runUploadCycle } = await import("./alpha-upload/index.js"); + const { getSelftuneVersion, readConfiguredAgentType } = await import( + "./utils/selftune-meta.js" + ); + + const identity = readAlphaIdentity(SELFTUNE_CONFIG_PATH); + if (!identity?.enrolled) { + const guidance = getAlphaGuidance(identity); + console.log( + JSON.stringify( + { + enrolled: false, + prepared: 0, + sent: 0, + failed: 0, + skipped: 0, + guidance, + }, + null, + 2, + ), + ); + console.error(`[alpha upload] ${guidance.message}`); + console.error(`[alpha upload] Next: ${guidance.next_command}`); + process.exit(1); + } + + if (!identity.user_id?.trim() || !identity.api_key?.trim()) { + const guidance = getAlphaGuidance(identity); + console.log( + JSON.stringify( + { + enrolled: true, + prepared: 0, + sent: 0, + failed: 0, + skipped: 0, + guidance, + }, + null, + 2, + ), + ); + console.error(`[alpha upload] ${guidance.message}`); + console.error(`[alpha upload] Next: ${guidance.next_command}`); + process.exit(1); + } + + const db = getDb(); + + const result = await runUploadCycle(db, { + enrolled: true, + userId: identity.user_id, + agentType: readConfiguredAgentType(SELFTUNE_CONFIG_PATH, "unknown"), + selftuneVersion: getSelftuneVersion(), + dryRun: values["dry-run"] ?? false, + apiKey: identity.api_key, + }); + + console.log(JSON.stringify(result, null, 2)); + process.exit(result.failed > 0 ? 
1 : 0); + break; + } + case "relink": { + const { SELFTUNE_CONFIG_PATH } = await import("./constants.js"); + const { readAlphaIdentity, writeAlphaIdentity, generateUserId } = await import( + "./alpha-identity.js" + ); + const { requestDeviceCode, pollDeviceCode } = await import("./auth/device-code.js"); + const { chmodSync } = await import("node:fs"); + + const existingIdentity = readAlphaIdentity(SELFTUNE_CONFIG_PATH); + process.stderr.write("[alpha relink] Starting device-code authentication flow...\n"); + + const grant = await requestDeviceCode(); + + console.log( + JSON.stringify({ + level: "info", + code: "device_code_issued", + verification_url: grant.verification_url, + user_code: grant.user_code, + expires_in: grant.expires_in, + message: `Open ${grant.verification_url} and enter code: ${grant.user_code}`, + }), + ); + + // Try to open browser + try { + const url = `${grant.verification_url}?code=${grant.user_code}`; + Bun.spawn(["open", url], { stdout: "ignore", stderr: "ignore" }); + process.stderr.write("[alpha relink] Browser opened. Waiting for approval...\n"); + } catch { + process.stderr.write( + "[alpha relink] Could not open browser. Visit the URL above manually.\n", + ); + } + + process.stderr.write("[alpha relink] Polling"); + const result = await pollDeviceCode(grant.device_code, grant.interval, grant.expires_in); + process.stderr.write("\n[alpha relink] Approved!\n"); + + const updatedIdentity = { + enrolled: true, + user_id: existingIdentity?.user_id ?? 
generateUserId(), + cloud_user_id: result.cloud_user_id, + cloud_org_id: result.org_id, + email: existingIdentity?.email, + display_name: existingIdentity?.display_name, + consent_timestamp: new Date().toISOString(), + api_key: result.api_key, + }; + + writeAlphaIdentity(SELFTUNE_CONFIG_PATH, updatedIdentity); + chmodSync(SELFTUNE_CONFIG_PATH, 0o600); + + console.log( + JSON.stringify({ + level: "info", + code: "alpha_relinked", + replaced_existing_key: Boolean(existingIdentity?.api_key), + cloud_user_id: result.cloud_user_id, + message: "Successfully relinked. Old key revoked by cloud during approval.", + }), + ); + break; + } + default: + console.error( + `Unknown alpha subcommand: ${sub}\nRun 'selftune alpha --help' for available subcommands.`, + ); + process.exit(1); + } + break; + } case "telemetry": { const { cliMain } = await import("./analytics.js"); await cliMain(); diff --git a/cli/selftune/ingestors/claude-replay.ts b/cli/selftune/ingestors/claude-replay.ts index dfbce0b7..e072d148 100644 --- a/cli/selftune/ingestors/claude-replay.ts +++ b/cli/selftune/ingestors/claude-replay.ts @@ -140,9 +140,9 @@ export function parseSession(transcriptPath: string): ParsedSession | null { export function writeSession( session: ParsedSession, dryRun = false, - queryLogPath: string = QUERY_LOG, - telemetryLogPath: string = TELEMETRY_LOG, - skillLogPath: string = SKILL_LOG, + _queryLogPath: string = QUERY_LOG, + _telemetryLogPath: string = TELEMETRY_LOG, + _skillLogPath: string = SKILL_LOG, canonicalLogPath: string = CANONICAL_LOG, ): void { if (dryRun) { diff --git a/cli/selftune/init.ts b/cli/selftune/init.ts index 49a8bf74..81028481 100644 --- a/cli/selftune/init.ts +++ b/cli/selftune/init.ts @@ -12,11 +12,14 @@ */ import { - copyFileSync, + closeSync, existsSync, + fsyncSync, mkdirSync, + openSync, readdirSync, readFileSync, + renameSync, writeFileSync, } from "node:fs"; import { homedir } from "node:os"; @@ -24,12 +27,34 @@ import { dirname, join, resolve } from 
"node:path"; import { fileURLToPath } from "node:url"; import { parseArgs } from "node:util"; +import { getAlphaGuidance } from "./agent-guidance.js"; +import { + ALPHA_CONSENT_NOTICE, + generateUserId, + isValidApiKeyFormat, + readAlphaIdentity, +} from "./alpha-identity.js"; import { TELEMETRY_NOTICE } from "./analytics.js"; +import { pollDeviceCode, requestDeviceCode } from "./auth/device-code.js"; import { CLAUDE_CODE_HOOK_KEYS, SELFTUNE_CONFIG_DIR, SELFTUNE_CONFIG_PATH } from "./constants.js"; -import type { SelftuneConfig } from "./types.js"; +import type { AgentCommandGuidance, AlphaIdentity, SelftuneConfig } from "./types.js"; import { hookKeyHasSelftuneEntry } from "./utils/hooks.js"; import { detectAgent } from "./utils/llm-call.js"; +interface InitCliErrorPayload extends AgentCommandGuidance { + error: string; +} + +class InitCliError extends Error { + payload: InitCliErrorPayload; + + constructor(payload: InitCliErrorPayload) { + super(payload.message); + this.name = "InitCliError"; + this.payload = payload; + } +} + // --------------------------------------------------------------------------- // Agent type detection // --------------------------------------------------------------------------- @@ -116,6 +141,24 @@ export function determineCliPath(override?: string): string { return resolve(dirname(import.meta.path), "index.ts"); } +function writeSelftuneConfig(configPath: string, config: SelftuneConfig): void { + const serialized = JSON.stringify(config, null, 2); + if (!config.alpha?.api_key?.trim()) { + writeFileSync(configPath, serialized, "utf-8"); + return; + } + + const tempPath = `${configPath}.tmp`; + const fd = openSync(tempPath, "w", 0o600); + try { + writeFileSync(fd, serialized, "utf-8"); + fsyncSync(fd); + } finally { + closeSync(fd); + } + renameSync(tempPath, configPath); +} + // --------------------------------------------------------------------------- // LLM mode determination // 
--------------------------------------------------------------------------- @@ -270,41 +313,13 @@ export function installClaudeCodeHooks(options?: { // Agent file installation // --------------------------------------------------------------------------- -/** Bundled agent files directory (ships with the npm package). */ -const BUNDLED_AGENTS_DIR = resolve(dirname(import.meta.path), "..", "..", ".claude", "agents"); - /** - * Copy bundled agent markdown files to ~/.claude/agents/. - * Returns a list of file names that were copied (skips files that already exist - * unless `force` is true). + * @deprecated Agent files are now bundled in skill/agents/ and read directly + * by the consuming agent via progressive disclosure. No installation needed. + * Kept as a no-op for backwards compatibility with callers. */ -export function installAgentFiles(options?: { homeDir?: string; force?: boolean }): string[] { - const home = options?.homeDir ?? homedir(); - const force = options?.force ?? false; - const targetDir = join(home, ".claude", "agents"); - - if (!existsSync(BUNDLED_AGENTS_DIR)) return []; - - let sourceFiles: string[]; - try { - sourceFiles = readdirSync(BUNDLED_AGENTS_DIR).filter((f) => f.endsWith(".md")); - } catch { - return []; - } - - if (sourceFiles.length === 0) return []; - - mkdirSync(targetDir, { recursive: true }); - - const copied: string[] = []; - for (const file of sourceFiles) { - const dest = join(targetDir, file); - if (!force && existsSync(dest)) continue; - copyFileSync(join(BUNDLED_AGENTS_DIR, file), dest); - copied.push(file); - } - - return copied; +export function installAgentFiles(_options?: { homeDir?: string; force?: boolean }): string[] { + return []; } // --------------------------------------------------------------------------- @@ -437,6 +452,11 @@ export interface InitOptions { agentOverride?: string; cliPathOverride?: string; homeDir?: string; + alpha?: boolean; + noAlpha?: boolean; + alphaEmail?: string; + alphaName?: string; + 
alphaKey?: string; } // --------------------------------------------------------------------------- @@ -447,7 +467,7 @@ export interface InitOptions { * Run the init flow. Returns the written (or existing) config. * Extracted as a pure function for testability. */ -export function runInit(opts: InitOptions): SelftuneConfig { +export async function runInit(opts: InitOptions): Promise { const { configDir, configPath, force } = opts; // If config exists and no --force, return existing @@ -462,6 +482,9 @@ export function runInit(opts: InitOptions): SelftuneConfig { } } + // Capture existing alpha identity before overwriting config (for user_id preservation) + const existingAlphaBeforeOverwrite = readAlphaIdentity(configPath); + // Detect agent type const agentType = detectAgentType(opts.agentOverride, opts.homeDir); @@ -485,6 +508,85 @@ export function runInit(opts: InitOptions): SelftuneConfig { const settingsPath = join(home, ".claude", "settings.json"); const hooksInstalled = agentType === "claude_code" ? checkClaudeCodeHooks(settingsPath) : false; + let validatedAlphaIdentity: AlphaIdentity | null = null; + if (opts.alpha) { + if (opts.alphaKey) { + // Direct key entry path — backward compatible, requires email + if (!opts.alphaEmail) { + throw new InitCliError({ + error: "alpha_email_required", + message: + "The --alpha-email flag is required when using --alpha-key. Run: selftune init --alpha --alpha-email user@example.com --alpha-key st_live_", + next_command: "selftune init --alpha --alpha-email --alpha-key st_live_", + suggested_commands: ["selftune init --alpha", "selftune status"], + blocking: true, + code: "alpha_email_required", + }); + } + + if (!isValidApiKeyFormat(opts.alphaKey)) { + throw new InitCliError({ + error: "invalid_api_key_format", + message: "API key must start with 'st_live_' or 'st_test_'. 
Check the key and retry.", + next_command: "selftune init --alpha --alpha-email --alpha-key st_live_", + suggested_commands: ["selftune status", "selftune doctor"], + blocking: true, + code: "invalid_api_key_format", + }); + } + + validatedAlphaIdentity = { + enrolled: true, + user_id: existingAlphaBeforeOverwrite?.user_id ?? generateUserId(), + email: opts.alphaEmail, + display_name: opts.alphaName, + consent_timestamp: new Date().toISOString(), + api_key: opts.alphaKey, + }; + } else { + // Device-code flow — no key provided, authenticate via browser + process.stderr.write("[alpha] Starting device-code authentication flow...\n"); + + const grant = await requestDeviceCode(); + + // Emit structured JSON for the agent to parse + console.log( + JSON.stringify({ + level: "info", + code: "device_code_issued", + verification_url: grant.verification_url, + user_code: grant.user_code, + expires_in: grant.expires_in, + message: `Open ${grant.verification_url} and enter code: ${grant.user_code}`, + }), + ); + + // Try to open browser + try { + const url = `${grant.verification_url}?code=${grant.user_code}`; + Bun.spawn(["open", url], { stdout: "ignore", stderr: "ignore" }); + process.stderr.write(`[alpha] Browser opened. Waiting for approval...\n`); + } catch { + process.stderr.write(`[alpha] Could not open browser. Visit the URL above manually.\n`); + } + + process.stderr.write("[alpha] Polling"); + const result = await pollDeviceCode(grant.device_code, grant.interval, grant.expires_in); + process.stderr.write("\n[alpha] Approved!\n"); + + validatedAlphaIdentity = { + enrolled: true, + user_id: existingAlphaBeforeOverwrite?.user_id ?? 
generateUserId(), + cloud_user_id: result.cloud_user_id, + cloud_org_id: result.org_id, + email: opts.alphaEmail, + display_name: opts.alphaName, + consent_timestamp: new Date().toISOString(), + api_key: result.api_key, + }; + } + } + const config: SelftuneConfig = { agent_type: agentType, cli_path: cliPath, @@ -496,13 +598,10 @@ export function runInit(opts: InitOptions): SelftuneConfig { // Write config mkdirSync(configDir, { recursive: true }); - writeFileSync(configPath, JSON.stringify(config, null, 2), "utf-8"); + writeSelftuneConfig(configPath, config); - // Install agent files to ~/.claude/agents/ - const copiedAgents = installAgentFiles({ homeDir: home, force }); - if (copiedAgents.length > 0) { - console.error(`[INFO] Installed agent files: ${copiedAgents.join(", ")}`); - } + // Agent files are bundled in skill/agents/ and read directly by the + // consuming agent — no installation step needed. // Auto-install hooks into ~/.claude/settings.json (Claude Code only) if (agentType === "claude_code") { @@ -513,7 +612,7 @@ export function runInit(opts: InitOptions): SelftuneConfig { if (addedHookKeys.length > 0) { config.hooks_installed = true; // Re-write config with updated hooks_installed flag - writeFileSync(configPath, JSON.stringify(config, null, 2), "utf-8"); + writeSelftuneConfig(configPath, config); console.error( `[INFO] Installed ${addedHookKeys.length} selftune hook(s) into ${settingsPath}: ${addedHookKeys.join(", ")}`, ); @@ -521,11 +620,34 @@ export function runInit(opts: InitOptions): SelftuneConfig { // Re-check in case hooks were already present config.hooks_installed = checkClaudeCodeHooks(settingsPath); if (config.hooks_installed) { - writeFileSync(configPath, JSON.stringify(config, null, 2), "utf-8"); + writeSelftuneConfig(configPath, config); } } } + if (existingAlphaBeforeOverwrite && !opts.alpha && !opts.noAlpha) { + config.alpha = existingAlphaBeforeOverwrite; + writeSelftuneConfig(configPath, config); + } + + // Handle alpha enrollment + 
if (validatedAlphaIdentity) { + config.alpha = validatedAlphaIdentity; + writeSelftuneConfig(configPath, config); + + const readiness = checkAlphaReadiness(configPath); + console.error(JSON.stringify({ alpha_readiness: readiness })); + } else if (opts.noAlpha) { + if (existingAlphaBeforeOverwrite) { + const identity: AlphaIdentity = { + ...existingAlphaBeforeOverwrite, + enrolled: false, + }; + config.alpha = identity; + writeSelftuneConfig(configPath, config); + } + } + return config; } @@ -541,6 +663,11 @@ export async function cliMain(): Promise { force: { type: "boolean", default: false }, "enable-autonomy": { type: "boolean", default: false }, "schedule-format": { type: "string" }, + alpha: { type: "boolean", default: false }, + "no-alpha": { type: "boolean", default: false }, + "alpha-email": { type: "string" }, + "alpha-name": { type: "string" }, + "alpha-key": { type: "string" }, }, strict: true, }); @@ -551,7 +678,14 @@ export async function cliMain(): Promise { const enableAutonomy = values["enable-autonomy"] ?? false; // Check for existing config without force - if (!force && !enableAutonomy && existsSync(configPath)) { + const hasAlphaMutation = !!( + values.alpha || + values["no-alpha"] || + values["alpha-email"] || + values["alpha-name"] || + values["alpha-key"] + ); + if (!force && !enableAutonomy && !hasAlphaMutation && existsSync(configPath)) { try { const raw = readFileSync(configPath, "utf-8"); const existing = JSON.parse(raw) as SelftuneConfig; @@ -565,15 +699,57 @@ export async function cliMain(): Promise { } } - const config = runInit({ + const config = await runInit({ configDir, configPath, force, agentOverride: values.agent, cliPathOverride: values["cli-path"], + alpha: values.alpha ?? false, + noAlpha: values["no-alpha"] ?? 
false, + alphaEmail: values["alpha-email"], + alphaName: values["alpha-name"], + alphaKey: values["alpha-key"], }); - console.log(JSON.stringify(config, null, 2)); + // Redact api_key before printing to stdout + const safeConfig = structuredClone(config); + if (safeConfig.alpha?.api_key) { + safeConfig.alpha.api_key = ""; + } + console.log(JSON.stringify(safeConfig, null, 2)); + + // Alpha enrollment output + if (values.alpha) { + console.log( + JSON.stringify({ + level: "info", + code: "alpha_enrolled", + user_id: config.alpha?.user_id, + email: config.alpha?.email, + enrolled: true, + }), + ); + console.log( + JSON.stringify({ + level: "info", + code: "alpha_upload_ready", + message: + "Alpha enrollment complete. Uploads will run automatically during 'selftune orchestrate'. To enable scheduled background sync (includes evolve + watch + upload), run: selftune cron setup", + next_command: "selftune alpha upload", + optional_autonomy: "selftune cron setup", + }), + ); + console.error(ALPHA_CONSENT_NOTICE); + } else if (values["no-alpha"]) { + console.log( + JSON.stringify({ + level: "info", + code: "alpha_unenrolled", + enrolled: false, + }), + ); + } // Detect workspace type and report const workspace = detectWorkspaceType(process.cwd()); @@ -637,6 +813,28 @@ export async function cliMain(): Promise { } } +// --------------------------------------------------------------------------- +// Alpha readiness check +// --------------------------------------------------------------------------- + +export function checkAlphaReadiness(configPath: string): { + ready: boolean; + missing: string[]; + guidance: AgentCommandGuidance; +} { + const identity = readAlphaIdentity(configPath); + const missing: string[] = []; + if (!identity) { + missing.push("alpha identity not configured"); + return { ready: false, missing, guidance: getAlphaGuidance(identity) }; + } + if (!identity.enrolled) missing.push("not enrolled"); + if (!identity.api_key) missing.push("api_key not set"); + 
else if (!isValidApiKeyFormat(identity.api_key)) + missing.push("api_key has invalid format (expected st_live_* or st_test_*)"); + return { ready: missing.length === 0, missing, guidance: getAlphaGuidance(identity) }; +} + // Guard: only run when invoked directly const isMain = (import.meta as Record).main === true || @@ -644,6 +842,10 @@ const isMain = if (isMain) { cliMain().catch((err) => { + if (err instanceof InitCliError) { + console.error(JSON.stringify(err.payload)); + process.exit(1); + } console.error(`[FATAL] ${err}`); process.exit(1); }); diff --git a/cli/selftune/last.ts b/cli/selftune/last.ts index e677c10f..04523212 100644 --- a/cli/selftune/last.ts +++ b/cli/selftune/last.ts @@ -78,7 +78,7 @@ export function computeLastInsight( let recommendation: string; const unmatched = unmatchedQueries.length; if (unmatched > 0) { - recommendation = `${unmatched} queries had no skill match. Run 'selftune evals --list-skills' to investigate.`; + recommendation = `${unmatched} queries had no skill match. Run 'selftune eval generate --list-skills' to investigate.`; } else if (errors > 0) { recommendation = `${errors} errors encountered. Check logs for details.`; } else { diff --git a/cli/selftune/localdb/db.ts b/cli/selftune/localdb/db.ts index 7a9bb3cc..c4e8f0e6 100644 --- a/cli/selftune/localdb/db.ts +++ b/cli/selftune/localdb/db.ts @@ -4,8 +4,8 @@ * Uses Bun's built-in SQLite driver. The database file lives at * ~/.selftune/selftune.db. In dual-write mode (Phase 1+), hooks write * directly to SQLite alongside JSONL. The database is the primary query - * store; JSONL serves as an append-only backup that can rebuild the DB - * via `selftune rebuild-db`. + * store; JSONL serves as an append-only backup that can be exported and + * used to repopulate a fresh DB when a manual recovery is required. */ import { Database } from "bun:sqlite"; @@ -52,7 +52,9 @@ export function openDb(dbPath: string = DB_PATH): Database { } catch (err) { const msg = err instanceof Error ? 
err.message : String(err); if (msg.includes("duplicate column")) continue; // expected on subsequent runs - throw new Error(`Schema migration failed: ${msg}. Run: selftune rebuild-db`); + throw new Error( + `Schema migration failed: ${msg}. Export first with 'selftune export', then remove '${dbPath}' and rerun 'selftune sync --force' or 'selftune dashboard'.`, + ); } } @@ -63,7 +65,9 @@ export function openDb(dbPath: string = DB_PATH): Database { } catch (err) { const msg = err instanceof Error ? err.message : String(err); if (msg.includes("already exists")) continue; // expected on subsequent runs - throw new Error(`Schema index creation failed: ${msg}. Run: selftune rebuild-db`); + throw new Error( + `Schema index creation failed: ${msg}. Export first with 'selftune export', then remove '${dbPath}' and rerun 'selftune sync --force' or 'selftune dashboard'.`, + ); } } } catch (err) { diff --git a/cli/selftune/localdb/direct-write.ts b/cli/selftune/localdb/direct-write.ts index db07c1cc..e3ac87bc 100644 --- a/cli/selftune/localdb/direct-write.ts +++ b/cli/selftune/localdb/direct-write.ts @@ -175,12 +175,25 @@ export function writeSessionTelemetryToDb(record: SessionTelemetryRecord): boole db, "session-telemetry", ` - INSERT OR IGNORE INTO session_telemetry + INSERT INTO session_telemetry (session_id, timestamp, cwd, transcript_path, tool_calls_json, total_tool_calls, bash_commands_json, skills_triggered_json, skills_invoked_json, assistant_turns, errors_encountered, transcript_chars, last_user_query, source, input_tokens, output_tokens) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ ON CONFLICT(session_id) DO UPDATE SET + timestamp = excluded.timestamp, + tool_calls_json = excluded.tool_calls_json, + total_tool_calls = excluded.total_tool_calls, + bash_commands_json = excluded.bash_commands_json, + skills_triggered_json = excluded.skills_triggered_json, + skills_invoked_json = excluded.skills_invoked_json, + assistant_turns = excluded.assistant_turns, + errors_encountered = excluded.errors_encountered, + transcript_chars = excluded.transcript_chars, + last_user_query = excluded.last_user_query, + input_tokens = COALESCE(excluded.input_tokens, session_telemetry.input_tokens), + output_tokens = COALESCE(excluded.output_tokens, session_telemetry.output_tokens) `, ).run( record.session_id, @@ -231,11 +244,11 @@ export function writeEvolutionAuditToDb(record: EvolutionAuditEntry): boolean { return safeWrite("evolution-audit", (db) => { getStmt( db, - "evolution-audit", + "evolution-audit-v2", ` INSERT OR IGNORE INTO evolution_audit - (timestamp, proposal_id, skill_name, action, details, eval_snapshot_json) - VALUES (?, ?, ?, ?, ?, ?) + (timestamp, proposal_id, skill_name, action, details, eval_snapshot_json, iterations_used) + VALUES (?, ?, ?, ?, ?, ?, ?) `, ).run( record.timestamp, @@ -244,6 +257,7 @@ export function writeEvolutionAuditToDb(record: EvolutionAuditEntry): boolean { record.action, record.details, record.eval_snapshot ? JSON.stringify(record.eval_snapshot) : null, + record.iterations_used ?? null, ); }); } diff --git a/cli/selftune/localdb/materialize.ts b/cli/selftune/localdb/materialize.ts index acea0e66..d81a098c 100644 --- a/cli/selftune/localdb/materialize.ts +++ b/cli/selftune/localdb/materialize.ts @@ -10,7 +10,7 @@ // NOTE: With dual-write active (Phase 1+), hooks insert directly into SQLite. // The materializer is only needed for: // 1. Initial startup (to catch pre-existing JSONL data from before dual-write) -// 2. Manual rebuild via `selftune rebuild-db` +// 2. 
Manual recovery after exporting JSONL and recreating the DB file // 3. Backfill from batch ingestors that don't yet dual-write import type { Database } from "bun:sqlite"; @@ -41,6 +41,118 @@ import { readJsonl, readJsonlFrom } from "../utils/jsonl.js"; import { readEffectiveSkillUsageRecords } from "../utils/skill-log.js"; import { getMeta, setMeta } from "./db.js"; +/** Tables that contain SQLite-only data (written by hooks, not just materialized from JSONL). */ +const _PROTECTED_TABLES = [ + { table: "evolution_audit", tsColumn: "timestamp", jsonlLog: EVOLUTION_AUDIT_LOG }, + { table: "evolution_evidence", tsColumn: "timestamp", jsonlLog: EVOLUTION_EVIDENCE_LOG }, + { table: "orchestrate_runs", tsColumn: "timestamp", jsonlLog: ORCHESTRATE_RUN_LOG }, +] as const; + +/** + * Preflight check before full rebuild: detect tables where SQLite has rows + * newer than the corresponding JSONL file. If found and `force` is not set, + * throw an error so the user can export first. + */ +function preflightRebuildGuard(db: Database, options?: MaterializeOptions): void { + if (options?.force) return; + + const protectedTables = [ + { + table: "evolution_audit", + tsColumn: "timestamp", + jsonlLog: options?.evolutionAuditPath ?? EVOLUTION_AUDIT_LOG, + }, + { + table: "evolution_evidence", + tsColumn: "timestamp", + jsonlLog: options?.evolutionEvidencePath ?? EVOLUTION_EVIDENCE_LOG, + }, + { + table: "orchestrate_runs", + tsColumn: "timestamp", + jsonlLog: options?.orchestrateRunLogPath ?? ORCHESTRATE_RUN_LOG, + }, + ]; + + const warnings: string[] = []; + for (const { table, tsColumn, jsonlLog } of protectedTables) { + // Get newest timestamp in SQLite + let sqliteMax: string | null = null; + try { + const row = db.query(`SELECT MAX(${tsColumn}) AS max_ts FROM ${table}`).get() as { + max_ts: string | null; + } | null; + sqliteMax = row?.max_ts ?? 
null; + } catch { + continue; // table doesn't exist yet — safe to rebuild + } + + if (!sqliteMax) continue; // no rows in SQLite — safe + + // Get newest timestamp from JSONL + let jsonlMax: string | null = null; + let jsonlBoundaryCount = 0; + try { + const records = readJsonl<{ timestamp: string }>(jsonlLog); + if (records.length > 0) { + jsonlMax = records.reduce( + (max, r) => (r.timestamp > max ? r.timestamp : max), + records[0].timestamp, + ); + jsonlBoundaryCount = records.filter((record) => record.timestamp === jsonlMax).length; + } + } catch { + // JSONL file doesn't exist or is empty — SQLite has data JSONL doesn't + jsonlMax = null; + } + + let newerCount = 0; + let sqliteBoundaryCount = 0; + try { + if (!jsonlMax) { + const row = db.query(`SELECT COUNT(*) AS newer_count FROM ${table}`).get() as { + newer_count: number; + } | null; + newerCount = row?.newer_count ?? 0; + } else if (sqliteMax > jsonlMax) { + const row = db + .query(`SELECT COUNT(*) AS newer_count FROM ${table} WHERE ${tsColumn} > ?`) + .get(jsonlMax) as { + newer_count: number; + } | null; + newerCount = row?.newer_count ?? 0; + } + if (jsonlMax) { + const boundaryRow = db + .query(`SELECT COUNT(*) AS boundary_count FROM ${table} WHERE ${tsColumn} = ?`) + .get(jsonlMax) as { + boundary_count: number; + } | null; + sqliteBoundaryCount = boundaryRow?.boundary_count ?? 0; + } + } catch { + newerCount = 0; + sqliteBoundaryCount = 0; + } + + if ( + !jsonlMax || + newerCount > 0 || + (sqliteMax === jsonlMax && sqliteBoundaryCount !== jsonlBoundaryCount) + ) { + warnings.push( + ` - ${table}: ${newerCount} SQLite-only row(s), SQLite max=${sqliteMax}, JSONL max=${jsonlMax ?? 
"(empty)"}, boundary_count(SQLite=${sqliteBoundaryCount}, JSONL=${jsonlBoundaryCount})`, + ); + } + } + + if (warnings.length > 0) { + throw new Error( + `Rebuild blocked: the following tables have SQLite-only rows that would be lost:\n${warnings.join("\n")}\n\nRun \`selftune export\` first to preserve this data, then retry with --force.`, + ); + } +} + /** Meta key tracking last materialization timestamp. */ const META_LAST_MATERIALIZED = "last_materialized_at"; /** Meta key prefix for per-file byte offsets (append-only incremental reads). */ @@ -50,6 +162,8 @@ const META_OFFSET_PREFIX = "file_offset:"; * Full rebuild: drop all data tables, then re-insert everything. */ export function materializeFull(db: Database, options?: MaterializeOptions): MaterializeResult { + preflightRebuildGuard(db, options); + const tables = [ "session_telemetry", "evolution_audit", @@ -76,6 +190,8 @@ export interface MaterializeOptions { evolutionEvidencePath?: string; orchestrateRunLogPath?: string; since?: string | null; + /** Skip the preflight rebuild guard (use after `selftune export`). */ + force?: boolean; } export interface MaterializeResult { @@ -381,12 +497,29 @@ function insertExecutionFacts(db: Database, records: CanonicalRecord[]): number function insertSessionTelemetry(db: Database, records: SessionTelemetryRecord[]): number { const stmt = db.prepare(` - INSERT OR IGNORE INTO session_telemetry + INSERT INTO session_telemetry (session_id, timestamp, cwd, transcript_path, tool_calls_json, total_tool_calls, bash_commands_json, skills_triggered_json, skills_invoked_json, assistant_turns, errors_encountered, transcript_chars, last_user_query, source, input_tokens, output_tokens) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
+ ON CONFLICT(session_id) DO UPDATE SET + timestamp = excluded.timestamp, + cwd = COALESCE(excluded.cwd, session_telemetry.cwd), + transcript_path = COALESCE(excluded.transcript_path, session_telemetry.transcript_path), + source = COALESCE(excluded.source, session_telemetry.source), + tool_calls_json = excluded.tool_calls_json, + total_tool_calls = excluded.total_tool_calls, + bash_commands_json = excluded.bash_commands_json, + skills_triggered_json = COALESCE(excluded.skills_triggered_json, session_telemetry.skills_triggered_json), + skills_invoked_json = COALESCE(excluded.skills_invoked_json, session_telemetry.skills_invoked_json), + assistant_turns = excluded.assistant_turns, + errors_encountered = excluded.errors_encountered, + transcript_chars = excluded.transcript_chars, + last_user_query = excluded.last_user_query, + input_tokens = COALESCE(excluded.input_tokens, session_telemetry.input_tokens), + output_tokens = COALESCE(excluded.output_tokens, session_telemetry.output_tokens) + WHERE session_telemetry.timestamp IS NULL OR excluded.timestamp >= session_telemetry.timestamp `); let count = 0; @@ -465,8 +598,8 @@ function insertEvolutionAudit(db: Database, records: EvolutionAuditEntry[]): num // (idx_evo_audit_dedup defined in schema.ts). const stmt = db.prepare(` INSERT OR IGNORE INTO evolution_audit - (timestamp, proposal_id, skill_name, action, details, eval_snapshot_json) - VALUES (?, ?, ?, ?, ?, ?) + (timestamp, proposal_id, skill_name, action, details, eval_snapshot_json, iterations_used) + VALUES (?, ?, ?, ?, ?, ?, ?) `); let count = 0; @@ -478,6 +611,7 @@ function insertEvolutionAudit(db: Database, records: EvolutionAuditEntry[]): num r.action, r.details, r.eval_snapshot ? JSON.stringify(r.eval_snapshot) : null, + r.iterations_used ?? 
null, ); count++; } diff --git a/cli/selftune/localdb/queries.ts b/cli/selftune/localdb/queries.ts index 39ab1e44..099b503e 100644 --- a/cli/selftune/localdb/queries.ts +++ b/cli/selftune/localdb/queries.ts @@ -73,7 +73,7 @@ export function getOverviewPayload(db: Database): OverviewPayload { // Evolution audit (bounded to most recent 500) const evolution = db .query( - `SELECT timestamp, proposal_id, action, details + `SELECT timestamp, proposal_id, skill_name, action, details FROM evolution_audit ORDER BY timestamp DESC LIMIT 500`, @@ -81,6 +81,7 @@ export function getOverviewPayload(db: Database): OverviewPayload { .all() as Array<{ timestamp: string; proposal_id: string; + skill_name: string | null; action: string; details: string; }>; @@ -242,9 +243,14 @@ export function getSkillsList(db: Database): SkillSummary[] { .query( `SELECT si.skill_name, - (SELECT s2.skill_scope FROM skill_invocations s2 - WHERE s2.skill_name = si.skill_name AND s2.skill_scope IS NOT NULL - ORDER BY s2.occurred_at DESC LIMIT 1) as skill_scope, + COALESCE( + (SELECT s2.skill_scope FROM skill_invocations s2 + WHERE s2.skill_name = si.skill_name AND s2.skill_scope IS NOT NULL + ORDER BY s2.occurred_at DESC LIMIT 1), + (SELECT su.skill_scope FROM skill_usage su + WHERE su.skill_name = si.skill_name AND su.skill_scope IS NOT NULL + ORDER BY su.timestamp DESC LIMIT 1) + ) as skill_scope, COUNT(*) as total_checks, SUM(CASE WHEN si.triggered = 1 THEN 1 ELSE 0 END) as triggered_count, COUNT(DISTINCT si.session_id) as unique_sessions, @@ -469,9 +475,12 @@ export function queryEvolutionAudit( eval_snapshot?: Record; }> { const sql = skillName - ? `SELECT * FROM evolution_audit WHERE skill_name = ? ORDER BY timestamp DESC` + ? `SELECT * FROM evolution_audit + WHERE skill_name = ? + OR (skill_name IS NULL AND proposal_id LIKE 'evo-' || ? || '-%') + ORDER BY timestamp DESC` : `SELECT * FROM evolution_audit ORDER BY timestamp DESC`; - const rows = (skillName ? 
db.query(sql).all(skillName) : db.query(sql).all()) as Array< + const rows = (skillName ? db.query(sql).all(skillName, skillName) : db.query(sql).all()) as Array< Record >; return rows.map((r) => ({ @@ -569,6 +578,75 @@ export function queryImprovementSignals( })); } +// -- Alpha upload query helpers ----------------------------------------------- + +/** + * Get the most recent failed queue item's error and timestamp. + * Returns null if no failed items exist. + */ +export function getLastUploadError( + db: Database, +): { last_error: string | null; updated_at: string } | null { + try { + const row = db + .query( + `SELECT last_error, updated_at + FROM upload_queue + WHERE status = 'failed' + ORDER BY updated_at DESC + LIMIT 1`, + ) + .get() as { last_error: string | null; updated_at: string } | null; + return row ?? null; + } catch { + return null; + } +} + +/** + * Get the most recent sent queue item's timestamp. + * Returns null if no sent items exist. + */ +export function getLastUploadSuccess(db: Database): { updated_at: string } | null { + try { + const row = db + .query( + `SELECT updated_at + FROM upload_queue + WHERE status = 'sent' + ORDER BY updated_at DESC + LIMIT 1`, + ) + .get() as { updated_at: string } | null; + return row ?? null; + } catch { + return null; + } +} + +/** + * Get the age in seconds of the oldest pending queue item. + * Returns null if no pending items exist. 
+ */ +export function getOldestPendingAge(db: Database): number | null { + try { + const row = db + .query( + `SELECT created_at + FROM upload_queue + WHERE status = 'pending' + ORDER BY created_at ASC + LIMIT 1`, + ) + .get() as { created_at: string } | null; + if (!row) return null; + const ageMs = Date.now() - new Date(row.created_at).getTime(); + return Math.floor(ageMs / 1000); + } catch { + return null; + } +} + // -- Helpers ------------------------------------------------------------------ export function safeParseJsonArray(json: string | null): T[] { diff --git a/cli/selftune/localdb/schema.ts b/cli/selftune/localdb/schema.ts index 606fb7ef..323a3b63 100644 --- a/cli/selftune/localdb/schema.ts +++ b/cli/selftune/localdb/schema.ts @@ -182,6 +182,41 @@ CREATE TABLE IF NOT EXISTS improvement_signals ( consumed_by_run TEXT )`; +// -- Alpha upload queue ------------------------------------------------------- + +export const CREATE_UPLOAD_QUEUE = ` +CREATE TABLE IF NOT EXISTS upload_queue ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + payload_type TEXT NOT NULL, + payload_json TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'pending', + attempts INTEGER NOT NULL DEFAULT 0, + created_at TEXT NOT NULL, + updated_at TEXT NOT NULL, + last_error TEXT +)`; + +// -- Canonical upload staging ------------------------------------------------- + +export const CREATE_CANONICAL_UPLOAD_STAGING = ` +CREATE TABLE IF NOT EXISTS canonical_upload_staging ( + local_seq INTEGER PRIMARY KEY AUTOINCREMENT, + record_kind TEXT NOT NULL, + record_id TEXT NOT NULL, + record_json TEXT NOT NULL, + session_id TEXT, + prompt_id TEXT, + normalized_at TEXT, + staged_at TEXT NOT NULL +)`; + +export const CREATE_UPLOAD_WATERMARKS = ` +CREATE TABLE IF NOT EXISTS upload_watermarks ( + payload_type TEXT PRIMARY KEY, + last_uploaded_id INTEGER NOT NULL, + updated_at TEXT NOT NULL +)`; + // -- Metadata table ----------------------------------------------------------- export const CREATE_META = ` @@ -227,6 
+262,13 @@ export const CREATE_INDEXES = [ `CREATE INDEX IF NOT EXISTS idx_signals_consumed ON improvement_signals(consumed)`, `CREATE INDEX IF NOT EXISTS idx_signals_ts ON improvement_signals(timestamp)`, `CREATE UNIQUE INDEX IF NOT EXISTS idx_signals_dedup ON improvement_signals(session_id, query, signal_type, timestamp)`, + // -- Alpha upload queue indexes --------------------------------------------- + `CREATE INDEX IF NOT EXISTS idx_upload_queue_status ON upload_queue(status)`, + `CREATE INDEX IF NOT EXISTS idx_upload_queue_type_status ON upload_queue(payload_type, status)`, + // -- Canonical upload staging indexes --------------------------------------- + `CREATE INDEX IF NOT EXISTS idx_staging_kind ON canonical_upload_staging(record_kind)`, + `CREATE INDEX IF NOT EXISTS idx_staging_session ON canonical_upload_staging(session_id)`, + `CREATE UNIQUE INDEX IF NOT EXISTS idx_staging_dedup ON canonical_upload_staging(record_kind, record_id)`, ]; /** @@ -239,6 +281,8 @@ export const MIGRATIONS = [ `ALTER TABLE skill_invocations ADD COLUMN skill_path TEXT`, `ALTER TABLE skill_invocations ADD COLUMN skill_scope TEXT`, `ALTER TABLE skill_invocations ADD COLUMN source TEXT`, + // Track how many iteration loops each evolution run used + `ALTER TABLE evolution_audit ADD COLUMN iterations_used INTEGER`, ]; /** Indexes that depend on migration columns — must run AFTER MIGRATIONS. 
*/ @@ -261,6 +305,9 @@ export const ALL_DDL = [ CREATE_ORCHESTRATE_RUNS, CREATE_QUERIES, CREATE_IMPROVEMENT_SIGNALS, + CREATE_UPLOAD_QUEUE, + CREATE_UPLOAD_WATERMARKS, + CREATE_CANONICAL_UPLOAD_STAGING, CREATE_META, ...CREATE_INDEXES, ]; diff --git a/cli/selftune/observability.ts b/cli/selftune/observability.ts index 6c7d4137..7ff5579c 100644 --- a/cli/selftune/observability.ts +++ b/cli/selftune/observability.ts @@ -11,8 +11,18 @@ import { existsSync, readFileSync } from "node:fs"; import { homedir } from "node:os"; import { join } from "node:path"; +import { getAlphaGuidance } from "./agent-guidance.js"; +import { getAlphaLinkState, readAlphaIdentity } from "./alpha-identity.js"; import { LOG_DIR, REQUIRED_FIELDS, SELFTUNE_CONFIG_PATH } from "./constants.js"; -import type { DoctorResult, HealthCheck, HealthStatus, SelftuneConfig } from "./types.js"; +import { DB_PATH, getDb } from "./localdb/db.js"; +import type { + AlphaIdentity, + AlphaLinkState, + DoctorResult, + HealthCheck, + HealthStatus, + SelftuneConfig, +} from "./types.js"; import { missingClaudeCodeHookKeys } from "./utils/hooks.js"; const VALID_AGENT_TYPES = new Set(["claude_code", "codex", "opencode", "openclaw", "unknown"]); @@ -116,6 +126,13 @@ export function checkHookInstallation(): HealthCheck[] { if (!existsSync(settingsPath)) { settingsCheck.status = "warn"; settingsCheck.message = "Claude Code settings.json not found"; + settingsCheck.guidance = { + code: "hook_settings_missing", + message: "Claude Code settings.json is missing. 
Re-run init to install the selftune hooks.", + next_command: "selftune init --force", + suggested_commands: ["selftune doctor"], + blocking: true, + }; } else { try { const raw = readFileSync(settingsPath, "utf-8"); @@ -124,11 +141,25 @@ export function checkHookInstallation(): HealthCheck[] { if (!hooks || typeof hooks !== "object") { settingsCheck.status = "warn"; settingsCheck.message = "No hooks section in settings.json"; + settingsCheck.guidance = { + code: "hook_settings_missing", + message: "The Claude Code hooks are not configured yet.", + next_command: "selftune init --force", + suggested_commands: ["selftune doctor"], + blocking: true, + }; } else { const missing = missingClaudeCodeHookKeys(hooks as Record); if (missing.length > 0) { settingsCheck.status = "warn"; settingsCheck.message = `Selftune hooks not configured for: ${missing.join(", ")}`; + settingsCheck.guidance = { + code: "hook_settings_incomplete", + message: "Some Claude Code hooks are missing.", + next_command: "selftune init --force", + suggested_commands: ["selftune doctor"], + blocking: true, + }; } else { settingsCheck.status = "pass"; settingsCheck.message = "All selftune hooks configured in settings.json"; @@ -165,6 +196,18 @@ export function checkEvolutionHealth(): HealthCheck[] { return [check]; } +export function checkDashboardIntegrityHealth(): HealthCheck[] { + const check: HealthCheck = { + name: "dashboard_freshness_mode", + path: DB_PATH, + status: "warn", + message: + "Dashboard reads SQLite, but live refresh still relies on JSONL watcher invalidation instead of SQLite WAL. Expect freshness gaps for SQLite-only writes and export before destructive recovery.", + }; + + return [check]; +} + export function checkConfigHealth(): HealthCheck[] { const check: HealthCheck = { name: "config", @@ -176,6 +219,13 @@ export function checkConfigHealth(): HealthCheck[] { if (!existsSync(SELFTUNE_CONFIG_PATH)) { check.status = "warn"; check.message = "Config not found. 
Run 'selftune init' to bootstrap."; + check.guidance = { + code: "config_missing", + message: "selftune is not initialized yet.", + next_command: "selftune init", + suggested_commands: ["selftune doctor"], + blocking: true, + }; } else { try { const raw = readFileSync(SELFTUNE_CONFIG_PATH, "utf-8"); @@ -190,6 +240,13 @@ export function checkConfigHealth(): HealthCheck[] { if (errors.length > 0) { check.status = "fail"; check.message = errors.join("; "); + check.guidance = { + code: "config_invalid", + message: "The selftune config is invalid and needs to be regenerated.", + next_command: "selftune init --force", + suggested_commands: ["selftune doctor"], + blocking: true, + }; } else { check.status = "pass"; check.message = `agent_type=${config.agent_type}, llm_mode=${config.llm_mode}`; @@ -197,6 +254,13 @@ export function checkConfigHealth(): HealthCheck[] { } catch { check.status = "fail"; check.message = "Config file exists but is not valid JSON"; + check.guidance = { + code: "config_invalid_json", + message: "The selftune config file is corrupt JSON.", + next_command: "selftune init --force", + suggested_commands: ["selftune doctor"], + blocking: true, + }; } } @@ -249,6 +313,13 @@ export async function checkVersionHealth(): Promise { } else { check.status = "warn"; check.message = `v${currentVersion} installed, v${latestVersion} available. 
Run: npx skills add selftune-dev/selftune`; + check.guidance = { + code: "version_update_available", + message: "A newer selftune release is available.", + next_command: "npx skills add selftune-dev/selftune", + suggested_commands: ["selftune doctor"], + blocking: false, + }; } } else { check.message = `v${currentVersion} (unable to check npm registry)`; @@ -263,24 +334,199 @@ export async function checkVersionHealth(): Promise { return [check]; } +// --------------------------------------------------------------------------- +// Alpha upload queue health checks +// --------------------------------------------------------------------------- + +const ALPHA_STUCK_THRESHOLD_SECONDS = 3600; // 1 hour +const ALPHA_FAILURE_THRESHOLD = 50; + +export interface AlphaQueueCheckOptions { + stuckThresholdSeconds?: number; + failureThreshold?: number; +} + +/** + * Check alpha upload queue health. + * Returns empty array when not enrolled (checks are skipped). + */ +export async function checkAlphaQueueHealth( + db: import("bun:sqlite").Database, + enrolled: boolean, + opts?: AlphaQueueCheckOptions, +): Promise { + if (!enrolled) return []; + + const { getQueueStats } = await import("./alpha-upload/queue.js"); + const { getOldestPendingAge } = await import("./localdb/queries.js"); + + const checks: HealthCheck[] = []; + const stuckThreshold = opts?.stuckThresholdSeconds ?? ALPHA_STUCK_THRESHOLD_SECONDS; + const failureThreshold = opts?.failureThreshold ?? 
ALPHA_FAILURE_THRESHOLD; + + // Check for stuck pending items + const stuckCheck: HealthCheck = { + name: "alpha_queue_stuck", + path: "upload_queue", + status: "pass", + message: "", + }; + + const oldestAge = getOldestPendingAge(db); + if (oldestAge !== null && oldestAge > stuckThreshold) { + stuckCheck.status = "warn"; + const hours = Math.floor(oldestAge / 3600); + const minutes = Math.floor((oldestAge % 3600) / 60); + stuckCheck.message = `Oldest pending upload is ${hours}h ${minutes}m old (threshold: ${Math.floor(stuckThreshold / 3600)}h)`; + stuckCheck.guidance = { + code: "alpha_queue_stuck", + message: "The alpha upload queue has pending items that are not draining.", + next_command: "selftune alpha upload", + suggested_commands: ["selftune doctor", "selftune status"], + blocking: false, + }; + } else { + stuckCheck.message = + oldestAge !== null + ? `Oldest pending item: ${Math.floor(oldestAge / 60)}m old` + : "No pending items"; + } + checks.push(stuckCheck); + + // Check for excessive failures + const failCheck: HealthCheck = { + name: "alpha_queue_failures", + path: "upload_queue", + status: "pass", + message: "", + }; + + const stats = getQueueStats(db); + if (stats.failed > failureThreshold) { + failCheck.status = "warn"; + failCheck.message = `${stats.failed} failed uploads (threshold: ${failureThreshold})`; + failCheck.guidance = { + code: "alpha_queue_failures", + message: "The alpha upload queue has accumulated too many failures.", + next_command: "selftune alpha upload", + suggested_commands: ["selftune doctor", "selftune status"], + blocking: false, + }; + } else { + failCheck.message = `${stats.failed} failed uploads`; + } + checks.push(failCheck); + + return checks; +} + +export function checkSkillVersionSync(): HealthCheck[] { + const check: HealthCheck = { + name: "skill_version_sync", + path: "skill/SKILL.md", + status: "pass", + message: "", + }; + + try { + const pkgPath = join(import.meta.dir, "../../package.json"); + const pkgVersion: 
string = JSON.parse(readFileSync(pkgPath, "utf-8")).version; + + const skillPath = join(import.meta.dir, "../../skill/SKILL.md"); + if (!existsSync(skillPath)) { + check.status = "warn"; + check.message = "skill/SKILL.md not found (may be running from installed package)"; + return [check]; + } + + const skillContent = readFileSync(skillPath, "utf-8"); + const versionMatch = skillContent.match(/^\s*version:\s*(.+)$/m); + if (!versionMatch) { + check.status = "warn"; + check.message = "No version field found in SKILL.md frontmatter"; + return [check]; + } + + const skillVersion = versionMatch[1].trim(); + if (skillVersion === pkgVersion) { + check.message = `v${pkgVersion} (in sync)`; + } else { + check.status = "warn"; + check.message = `SKILL.md has v${skillVersion} but package.json has v${pkgVersion}. Run: bun run sync-version`; + check.guidance = { + code: "skill_version_out_of_sync", + message: "The packaged skill version does not match package.json.", + next_command: "bun run sync-version", + suggested_commands: ["selftune doctor"], + blocking: false, + }; + } + } catch { + check.status = "warn"; + check.message = "Unable to compare versions"; + } + + return [check]; +} + +// --------------------------------------------------------------------------- +// Cloud link health checks +// --------------------------------------------------------------------------- + +/** + * Check cloud link health for alpha users. + * Returns [] for non-alpha users (identity is null). 
+ */ +const CLOUD_LINK_CHECKS: Record = { + not_linked: { status: "warn", message: "Not linked to cloud account (cloud_user_id missing)" }, + linked_not_enrolled: { status: "warn", message: "Linked but not enrolled" }, + enrolled_no_credential: { + status: "warn", + message: "Enrolled but api_key missing — uploads will fail", + }, + ready: { status: "pass", message: "Cloud link ready" }, +}; + +export function checkCloudLinkHealth(identity: AlphaIdentity | null): HealthCheck[] { + if (!identity) return []; + const state = getAlphaLinkState(identity); + const { status, message } = CLOUD_LINK_CHECKS[state]; + return [ + { + name: "cloud_link", + path: SELFTUNE_CONFIG_PATH, + status, + message, + guidance: getAlphaGuidance(identity), + }, + ]; +} + export async function doctor(): Promise { + const alphaIdentity = readAlphaIdentity(SELFTUNE_CONFIG_PATH); + const db = getDb(); const allChecks = [ ...checkConfigHealth(), ...checkLogHealth(), ...checkHookInstallation(), ...checkEvolutionHealth(), + ...checkDashboardIntegrityHealth(), + ...checkSkillVersionSync(), ...(await checkVersionHealth()), + ...checkCloudLinkHealth(alphaIdentity), + ...(await checkAlphaQueueHealth(db, alphaIdentity?.enrolled === true)), ]; const passed = allChecks.filter((c) => c.status === "pass").length; const failed = allChecks.filter((c) => c.status === "fail").length; const warned = allChecks.filter((c) => c.status === "warn").length; + const hasBlockingGuidance = allChecks.some((c) => c.guidance?.blocking === true); return { command: "doctor", timestamp: new Date().toISOString(), checks: allChecks, summary: { pass: passed, fail: failed, warn: warned, total: allChecks.length }, - healthy: failed === 0, + healthy: failed === 0 && !hasBlockingGuidance, }; } diff --git a/cli/selftune/orchestrate.ts b/cli/selftune/orchestrate.ts index 56ae6481..bae01e24 100644 --- a/cli/selftune/orchestrate.ts +++ b/cli/selftune/orchestrate.ts @@ -14,7 +14,9 @@ import { homedir } from "node:os"; import { join } 
from "node:path"; import { parseArgs } from "node:util"; -import { ORCHESTRATE_LOCK, SIGNAL_LOG } from "./constants.js"; +import { readAlphaIdentity } from "./alpha-identity.js"; +import type { UploadCycleSummary } from "./alpha-upload/index.js"; +import { ORCHESTRATE_LOCK, SELFTUNE_CONFIG_PATH, SIGNAL_LOG } from "./constants.js"; import type { OrchestrateRunReport, OrchestrateRunSkillAction } from "./dashboard-contract.js"; import type { EvolveResult } from "./evolution/evolve.js"; import { readGradingResultsForSkill } from "./grading/results.js"; @@ -42,6 +44,7 @@ import type { } from "./types.js"; import { readJsonl } from "./utils/jsonl.js"; import { detectAgent } from "./utils/llm-call.js"; +import { getSelftuneVersion, readConfiguredAgentType } from "./utils/selftune-meta.js"; import { findInstalledSkillPath, findRepositoryClaudeSkillDirs, @@ -192,6 +195,7 @@ export interface OrchestrateResult { syncResult: SyncResult; statusResult: StatusResult; candidates: SkillAction[]; + uploadSummary?: UploadCycleSummary; summary: { totalSkills: number; evaluated: number; @@ -430,6 +434,77 @@ function defaultResolveSkillPath(skillName: string): string | undefined { return findInstalledSkillPath(skillName, getSkillSearchDirs()); } +// --------------------------------------------------------------------------- +// Cross-skill eval set overlap detection (internal — exported for testing only) +// --------------------------------------------------------------------------- + +/** + * Detects significant overlap between the positive eval sets of evolution + * candidates. When two skills share >30% of their positive queries, it + * suggests a routing boundary problem. Console-only — no persistence. + * + * @internal Exported solely for unit testing. 
+ */ +export async function detectCrossSkillOverlap( + candidates: Array<{ skill: string }>, + skillRecords: SkillUsageRecord[], + queryRecords: QueryLogRecord[], +): Promise< + Array<{ skill_a: string; skill_b: string; overlap_pct: number; shared_queries: string[] }> +> { + if (candidates.length < 2) return []; + + const { buildEvalSet } = await import("./eval/hooks-to-evals.js"); + + const evalSets = new Map>(); + + for (const c of candidates) { + const evalSet = buildEvalSet(skillRecords, queryRecords, c.skill); + const positives = new Set( + evalSet + .filter((e: { should_trigger: boolean }) => e.should_trigger) + .map((e: { query: string }) => e.query.toLowerCase()), + ); + evalSets.set(c.skill, positives); + } + + const overlaps: Array<{ + skill_a: string; + skill_b: string; + overlap_pct: number; + shared_queries: string[]; + }> = []; + const skillNames = [...evalSets.keys()]; + + for (let i = 0; i < skillNames.length; i++) { + for (let j = i + 1; j < skillNames.length; j++) { + const setA = evalSets.get(skillNames[i]); + const setB = evalSets.get(skillNames[j]); + if (!setA || !setB) continue; + + if (setA.size === 0 || setB.size === 0) continue; + + const shared: string[] = []; + for (const q of setA) { + if (setB.has(q)) shared.push(q); + } + + const overlapPct = shared.length / Math.min(setA.size, setB.size); + + if (overlapPct > 0.3) { + overlaps.push({ + skill_a: skillNames[i], + skill_b: skillNames[j], + overlap_pct: overlapPct, + shared_queries: shared.slice(0, 10), + }); + } + } + } + + return overlaps; +} + // --------------------------------------------------------------------------- // Candidate selection // --------------------------------------------------------------------------- @@ -722,6 +797,24 @@ export async function orchestrate( console.error(` ${c.action === "skip" ? 
"⊘" : "→"} ${c.skill}: ${c.reason}`); } + // Cross-skill overlap detection (console-only, non-critical) + if (evolveCandidates.length >= 2) { + try { + const overlap = await detectCrossSkillOverlap(evolveCandidates, skillRecords, queryRecords); + if (overlap.length > 0) { + console.error("\n[orchestrate] Cross-skill eval overlap detected:"); + for (const o of overlap) { + console.error( + ` ⚠ ${o.skill_a} ↔ ${o.skill_b}: ${(o.overlap_pct * 100).toFixed(0)}% shared queries (${o.shared_queries.length} queries)`, + ); + } + console.error(""); + } + } catch { + // fail-open: overlap detection is non-critical + } + } + // ------------------------------------------------------------------------- // Step 4: Detect agent // ------------------------------------------------------------------------- @@ -905,6 +998,33 @@ export async function orchestrate( /* fail-open */ } + // ------------------------------------------------------------------------- + // Step 9: Alpha upload (fail-open — never blocks the orchestrate loop) + // ------------------------------------------------------------------------- + const alphaIdentity = readAlphaIdentity(SELFTUNE_CONFIG_PATH); + if (alphaIdentity?.enrolled) { + try { + console.error("[orchestrate] Running alpha upload cycle..."); + const { runUploadCycle } = await import("./alpha-upload/index.js"); + const db = getDb(); + const uploadSummary = await runUploadCycle(db, { + enrolled: true, + userId: alphaIdentity.user_id, + agentType: readConfiguredAgentType(SELFTUNE_CONFIG_PATH, "unknown"), + selftuneVersion: getSelftuneVersion(), + dryRun: options.dryRun, + apiKey: alphaIdentity.api_key, + }); + result.uploadSummary = uploadSummary; + console.error( + `[orchestrate] Alpha upload: prepared=${uploadSummary.prepared}, sent=${uploadSummary.sent}, failed=${uploadSummary.failed}, skipped=${uploadSummary.skipped}`, + ); + } catch (err) { + const msg = err instanceof Error ? 
err.message : String(err); + console.error(`[orchestrate] Alpha upload failed (non-blocking): ${msg}`); + } + } + return result; } finally { releaseLock(); @@ -1039,6 +1159,7 @@ Examples: // JSON output: include per-skill decisions for machine consumption const jsonOutput = { ...result.summary, + ...(result.uploadSummary ? { upload: result.uploadSummary } : {}), decisions: result.candidates.map((c) => ({ skill: c.skill, action: c.action, diff --git a/cli/selftune/routes/overview.ts b/cli/selftune/routes/overview.ts index 000772de..b7ec55b3 100644 --- a/cli/selftune/routes/overview.ts +++ b/cli/selftune/routes/overview.ts @@ -5,7 +5,6 @@ */ import type { Database } from "bun:sqlite"; -import type { OverviewResponse } from "../dashboard-contract.js"; import { getOverviewPayload, getSkillsList } from "../localdb/queries.js"; export function handleOverview(db: Database, version: string): Response { diff --git a/cli/selftune/routes/skill-report.ts b/cli/selftune/routes/skill-report.ts index 8cdb3e8d..cb885b5a 100644 --- a/cli/selftune/routes/skill-report.ts +++ b/cli/selftune/routes/skill-report.ts @@ -15,15 +15,16 @@ export function handleSkillReport(db: Database, skillName: string): Response { // 1. Evolution audit with eval_snapshot const evolution = db .query( - `SELECT timestamp, proposal_id, action, details, eval_snapshot_json + `SELECT timestamp, proposal_id, skill_name, action, details, eval_snapshot_json FROM evolution_audit - WHERE skill_name = ? + WHERE skill_name = ? OR (skill_name IS NULL AND proposal_id LIKE 'evo-' || ? || '-%') ORDER BY timestamp DESC LIMIT 100`, ) - .all(skillName) as Array<{ + .all(skillName, skillName) as Array<{ timestamp: string; proposal_id: string; + skill_name: string | null; action: string; details: string; eval_snapshot_json: string | null; @@ -85,12 +86,15 @@ export function handleSkillReport(db: Database, skillName: string): Response { }; // 4. 
Skill invocations — single source of truth + // JOIN prompts to recover query text when si.query is null (canonical records + // don't carry query; it's only populated via the direct-write hook path). const invocationsWithConfidence = db .query( `SELECT si.occurred_at as timestamp, si.session_id, si.skill_name, si.invocation_mode, si.triggered, si.confidence, si.tool_name, - si.agent_type, si.query, si.source + si.agent_type, COALESCE(si.query, p.prompt_text) as query, si.source FROM skill_invocations si + LEFT JOIN prompts p ON si.matched_prompt_id = p.prompt_id WHERE si.skill_name = ? ORDER BY si.occurred_at DESC LIMIT 100`, diff --git a/cli/selftune/status.ts b/cli/selftune/status.ts index 4129374f..ab45ba22 100644 --- a/cli/selftune/status.ts +++ b/cli/selftune/status.ts @@ -7,8 +7,19 @@ * - cliMain() (reads logs, runs doctor, prints output) */ +import { + formatGuidanceLines, + getAlphaGuidance, + getAlphaGuidanceForState, +} from "./agent-guidance.js"; +import { getAlphaLinkState, readAlphaIdentity } from "./alpha-identity.js"; +import { getQueueStats } from "./alpha-upload/queue.js"; +import { getBaseUrl } from "./auth/device-code.js"; +import { SELFTUNE_CONFIG_PATH } from "./constants.js"; import { getDb } from "./localdb/db.js"; import { + getLastUploadError, + getLastUploadSuccess, queryEvolutionAudit, queryQueryLog, querySessionTelemetry, @@ -17,6 +28,8 @@ import { import { computeMonitoringSnapshot, MIN_MONITORING_SKILL_CHECKS } from "./monitoring/watch.js"; import { doctor } from "./observability.js"; import type { + AgentCommandGuidance, + AlphaLinkState, DoctorResult, EvolutionAuditEntry, MonitoringSnapshot, @@ -55,6 +68,29 @@ export interface StatusResult { }; } +// --------------------------------------------------------------------------- +// Alpha upload status types +// --------------------------------------------------------------------------- + +export interface CloudVerifyData { + enrolled: boolean; + last_push_at: string | null; + 
key_prefix: string; + key_created_at: string; + total_pushes: number; + last_push_status: string | null; +} + +export interface AlphaStatusInfo { + enrolled: boolean; + linkState?: AlphaLinkState; + guidance?: AgentCommandGuidance; + stats: { pending: number; sending: number; sent: number; failed: number }; + lastError: { last_error: string | null; updated_at: string } | null; + lastSuccess: { updated_at: string } | null; + cloudVerify?: CloudVerifyData | null; +} + // --------------------------------------------------------------------------- // Constants // --------------------------------------------------------------------------- @@ -62,6 +98,13 @@ export interface StatusResult { export const DEFAULT_WINDOW_SESSIONS = 20; const DEFAULT_BASELINE_PASS_RATE = 0.5; +const LINK_STATE_LABELS: Record = { + not_linked: "not linked", + linked_not_enrolled: "linked (not enrolled)", + enrolled_no_credential: "enrolled (missing credential)", + ready: "ready", +}; + // --------------------------------------------------------------------------- // computeStatus — pure function // --------------------------------------------------------------------------- @@ -324,6 +367,131 @@ function colorize(text: string, hex: string): string { return `\x1b[38;2;${r};${g};${b}m${text}\x1b[0m`; } +// --------------------------------------------------------------------------- +// Cloud verify — fail-open fetch of /api/v1/alpha/verify +// --------------------------------------------------------------------------- + +const CLOUD_VERIFY_TIMEOUT_MS = 3000; + +/** + * Fetch cloud verification data from the selftune API. + * Fail-open: returns null on any error (network, auth, timeout). + * Uses a 3-second timeout to avoid blocking the status command. 
+ */ +export async function fetchCloudVerify(apiKey: string): Promise { + try { + const baseUrl = getBaseUrl(); + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), CLOUD_VERIFY_TIMEOUT_MS); + + try { + const response = await fetch(`${baseUrl}/alpha/verify`, { + method: "GET", + headers: { + Authorization: `Bearer ${apiKey}`, + Accept: "application/json", + }, + signal: controller.signal, + }); + + if (!response.ok) return null; + + const data = (await response.json()) as CloudVerifyData; + return data; + } finally { + clearTimeout(timeout); + } + } catch { + // Fail-open: network errors, timeouts, JSON parse errors all return null + return null; + } +} + +// --------------------------------------------------------------------------- +// Alpha upload status formatting +// --------------------------------------------------------------------------- + +/** + * Format the alpha upload status section for CLI output. + * Returns a multi-line string to append to the status output. + * Pass null when user is not enrolled. + */ +export function formatAlphaStatus(info: AlphaStatusInfo | null): string { + const lines: string[] = []; + lines.push(""); + lines.push("Alpha Upload"); + lines.push("\u2500".repeat(15)); + + if (!info) { + const guidance = getAlphaGuidanceForState("not_linked"); + lines.push(" Status: not enrolled"); + lines.push(" Cloud link: not linked"); + lines.push(...formatGuidanceLines(guidance)); + return lines.join("\n"); + } + + const linkState = info.linkState ?? "not_linked"; + lines.push(` Status: ${info.enrolled ? 
"enrolled" : "not enrolled"}`); + lines.push(` Cloud link: ${LINK_STATE_LABELS[linkState]}`); + + // Cloud verification data (when available) + if (info.cloudVerify) { + const cv = info.cloudVerify; + const verifiedAt = new Date(); + const verifiedTime = verifiedAt.toLocaleDateString("en-US", { + month: "short", + day: "numeric", + }); + const verifiedClock = verifiedAt.toLocaleTimeString("en-US", { + hour: "2-digit", + minute: "2-digit", + hour12: false, + }); + lines.push(` Cloud verified: yes (last verified: ${verifiedTime}, ${verifiedClock})`); + lines.push(` Total pushes: ${cv.total_pushes}`); + if (cv.last_push_at) { + const d = new Date(cv.last_push_at); + const pushDate = d.toLocaleDateString("en-US", { month: "short", day: "numeric" }); + const pushTime = d.toLocaleTimeString("en-US", { + hour: "2-digit", + minute: "2-digit", + hour12: false, + }); + lines.push(` Last push: ${pushDate}, ${pushTime}`); + } + } + + lines.push(` Pending: ${info.stats.pending}`); + lines.push(` Sending: ${info.stats.sending}`); + lines.push(` Failed: ${info.stats.failed}`); + lines.push(` Sent: ${info.stats.sent}`); + + if (info.lastError) { + lines.push(` Last error: ${info.lastError.last_error ?? "unknown"}`); + } + + if (info.lastSuccess) { + const d = new Date(info.lastSuccess.updated_at); + const formatted = d.toLocaleDateString("en-US", { + month: "short", + day: "numeric", + }); + const time = d.toLocaleTimeString("en-US", { + hour: "2-digit", + minute: "2-digit", + hour12: false, + }); + lines.push(` Last upload: ${formatted}, ${time}`); + } + + const guidance = info.guidance ?? 
getAlphaGuidanceForState(linkState); + if (guidance.blocking) { + lines.push(...formatGuidanceLines(guidance)); + } + + return lines.join("\n"); +} + // --------------------------------------------------------------------------- // cliMain — reads logs, runs doctor, prints output // --------------------------------------------------------------------------- @@ -340,6 +508,29 @@ export async function cliMain(): Promise { const result = computeStatus(telemetry, skillRecords, queryRecords, auditEntries, doctorResult); const output = formatStatus(result); console.log(output); + + // Alpha upload status section + const alphaIdentity = readAlphaIdentity(SELFTUNE_CONFIG_PATH); + let alphaInfo: AlphaStatusInfo | null = null; + if (alphaIdentity) { + const cloudVerify = + alphaIdentity.enrolled && alphaIdentity.api_key + ? await fetchCloudVerify(alphaIdentity.api_key) + : null; + alphaInfo = { + enrolled: alphaIdentity.enrolled === true, + linkState: getAlphaLinkState(alphaIdentity), + guidance: getAlphaGuidance(alphaIdentity), + stats: alphaIdentity.enrolled + ? getQueueStats(db) + : { pending: 0, sending: 0, sent: 0, failed: 0 }, + lastError: alphaIdentity.enrolled ? getLastUploadError(db) : null, + lastSuccess: alphaIdentity.enrolled ? getLastUploadSuccess(db) : null, + cloudVerify, + }; + } + console.log(formatAlphaStatus(alphaInfo)); + process.exit(0); } catch (err) { const message = err instanceof Error ? err.message : String(err); diff --git a/cli/selftune/types.ts b/cli/selftune/types.ts index 3aa511dc..68c05475 100644 --- a/cli/selftune/types.ts +++ b/cli/selftune/types.ts @@ -6,6 +6,33 @@ // Config types (written to ~/.selftune/config.json) // --------------------------------------------------------------------------- +export interface AlphaIdentity { + enrolled: boolean; + /** Cloud-issued user ID. Primary identifier after linking. */ + cloud_user_id?: string; + /** Cloud-issued org ID. Set during device-code approval. 
*/ + cloud_org_id?: string; + /** Cached email from cloud account. Not authoritative. */ + email?: string; + /** Cached display name from cloud account. Not authoritative. */ + display_name?: string; + /** Local user_id — legacy, preserved for migration. */ + user_id: string; + consent_timestamp: string; + /** Bearer token for alpha API. Cloud-issued, cached locally. */ + api_key?: string; +} + +/** + * Derive the cloud link readiness state from an AlphaIdentity. + * Used by status.ts and observability.ts for agent-facing diagnostics. + */ +export type AlphaLinkState = + | "not_linked" + | "linked_not_enrolled" + | "enrolled_no_credential" + | "ready"; + export interface SelftuneConfig { agent_type: "claude_code" | "codex" | "opencode" | "openclaw" | "unknown"; cli_path: string; @@ -14,6 +41,7 @@ export interface SelftuneConfig { hooks_installed: boolean; initialized_at: string; analytics_disabled?: boolean; + alpha?: AlphaIdentity; } // --------------------------------------------------------------------------- @@ -250,11 +278,20 @@ export interface ExecutionMetrics { export type HealthStatus = "pass" | "fail" | "warn"; +export interface AgentCommandGuidance { + code: string; + message: string; + next_command: string; + suggested_commands: string[]; + blocking: boolean; +} + export interface HealthCheck { name: string; path: string; status: HealthStatus; message: string; + guidance?: AgentCommandGuidance; } export interface DoctorResult { @@ -311,6 +348,7 @@ export interface EvolutionAuditEntry { action: "created" | "validated" | "deployed" | "rolled_back" | "rejected"; details: string; eval_snapshot?: EvalPassRate; + iterations_used?: number; } export interface EvolutionEvidenceValidation { @@ -340,6 +378,8 @@ export interface EvolutionEvidenceEntry { proposed_text?: string; eval_set?: EvalEntry[]; validation?: EvolutionEvidenceValidation; + /** Deterministic evidence ID, generated during staging (ev_ prefix + hash). 
*/ + evidence_id?: string; } export interface EvolutionConfig { diff --git a/cli/selftune/utils/selftune-meta.ts b/cli/selftune/utils/selftune-meta.ts new file mode 100644 index 00000000..fdb626e3 --- /dev/null +++ b/cli/selftune/utils/selftune-meta.ts @@ -0,0 +1,38 @@ +import { readFileSync } from "node:fs"; +import { join } from "node:path"; + +import type { SelftuneConfig } from "../types.js"; + +let cachedVersion: string | null = null; + +export function getSelftuneVersion(fallback = "0.0.0"): string { + if (cachedVersion !== null) return cachedVersion; + + try { + const pkg = JSON.parse( + readFileSync(join(import.meta.dir, "..", "..", "..", "package.json"), "utf-8"), + ) as { version?: unknown }; + cachedVersion = + typeof pkg.version === "string" && pkg.version.trim().length > 0 ? pkg.version : fallback; + } catch { + cachedVersion = fallback; + } + + return cachedVersion; +} + +export function readConfiguredAgentType( + configPath: string, + fallback: SelftuneConfig["agent_type"] = "unknown", +): SelftuneConfig["agent_type"] { + try { + const config = JSON.parse(readFileSync(configPath, "utf-8")) as { + agent_type?: unknown; + }; + return typeof config.agent_type === "string" + ? (config.agent_type as SelftuneConfig["agent_type"]) + : fallback; + } catch { + return fallback; + } +} diff --git a/docs/design-docs/alpha-remote-data-contract.md b/docs/design-docs/alpha-remote-data-contract.md new file mode 100644 index 00000000..86148e2c --- /dev/null +++ b/docs/design-docs/alpha-remote-data-contract.md @@ -0,0 +1,330 @@ + + +# Alpha Remote Data Contract — Cloud API V2 Push, Upload Queue, Auth Model + +**Status:** Active +**Created:** 2026-03-18 +**Updated:** 2026-03-19 +**Type:** Design document + +--- + +## 1. Overview + +### What the alpha remote pipeline does + +The alpha remote pipeline enables opted-in selftune users to upload consent-based telemetry data to the selftune cloud API. 
This data powers aggregate analysis across the alpha cohort: which skills trigger reliably, which evolution proposals improve outcomes, and where the selftune feedback loop breaks down across real-world usage patterns. + +The pipeline is batch-oriented and asynchronous. Local SQLite remains the source of truth. Uploads happen periodically during `orchestrate` runs or explicit `selftune sync --upload` invocations, not in real time. + +### Why the cloud API + +Alpha uploads target the existing selftune cloud API's V2 push endpoint (`POST /api/v1/push`) rather than a standalone service. This approach was chosen over a dedicated Cloudflare Worker/D1 setup because: + +- **Shared infrastructure.** The cloud API already handles authentication, rate limiting, and data storage in Neon Postgres. No separate service to deploy and maintain. +- **Canonical schema.** The V2 push endpoint accepts canonical records (sessions, prompts, skill_invocations, execution_facts, evolution_evidence) that align with selftune's data model. No impedance mismatch between local and remote schemas. +- **Single auth model.** Users authenticate with `st_live_*` API keys via Bearer header — the same mechanism used for all cloud API interactions. +- **Low cost for alpha volume.** The existing cloud infrastructure handles the expected alpha cohort (tens of users, thousands of records per day) without additional cost. 
+ +### Relationship to the existing `contribute/` system + +The `contribute/` system and the alpha upload pipeline serve different purposes but now share the same cloud API backend: + +| Dimension | `contribute/` | Alpha upload | +|-----------|---------------|--------------| +| **Purpose** | Community sharing of anonymized eval data | Automatic telemetry for alpha cohort analysis | +| **Trigger** | Manual (`selftune contribute`) | Automatic (each `orchestrate` run) | +| **Transport** | HTTPS to cloud API | HTTPS to cloud API (`POST /api/v1/push`) | +| **Storage** | Neon Postgres (canonical tables) | Neon Postgres (canonical tables) | +| **Consent model** | Per-invocation confirmation | Enrollment flag in config (`config.alpha.enrolled`) + API key | +| **Data granularity** | Skill-level bundles with eval entries | Session-level, invocation-level, evolution-level V2 canonical records | +| **Privacy level** | Conservative or aggressive sanitization | Explicit alpha consent for raw prompt/query text plus structured telemetry | + +Both systems target the same cloud API, but alpha upload is automatic (when enrolled and an API key is configured) while contribute requires manual invocation and confirmation. + +--- + +## 2. Endpoint Configuration + +### Target endpoint + +Alpha uploads are sent to the cloud API's V2 push endpoint: + +```text +POST https://api.selftune.dev/api/v1/push +``` + +### Environment override + +The endpoint can be overridden with the `SELFTUNE_ALPHA_ENDPOINT` environment variable: + +```bash +export SELFTUNE_ALPHA_ENDPOINT="https://staging-api.selftune.dev/api/v1/push" +``` + +Default: `https://api.selftune.dev/api/v1/push` + +--- + +## 3. Authentication + +### API key model + +Each alpha user authenticates with an `st_live_*` API key: + +1. User creates a cloud account at the selftune web app +2. User generates an API key (format: `st_live_*`) +3. 
User stores the key locally via: `selftune init --alpha-key st_live_abc123...` + +### HTTP auth + +Every upload request includes the API key as a Bearer token: + +```text +Authorization: Bearer st_live_abc123... +``` + +The cloud API validates the key, identifies the user, and associates uploaded records with their account. + +### Key storage + +The API key is stored in `~/.selftune/config.json` under the `alpha` block: + +```json +{ + "alpha": { + "enrolled": true, + "user_id": "a1b2c3d4-...", + "api_key": "st_live_abc123...", + "email": "user@example.com" + } +} +``` + +--- + +## 4. V2 Canonical Payload Format + +### Schema version + +All upload payloads use `schema_version: "2.0"` and contain canonical records that map directly to the cloud API's Postgres tables. + +### Record types + +The V2 push payload contains typed canonical records: + +| Record type | Description | +|-------------|-------------| +| `sessions` | Session summaries with platform, model, timing, and skill trigger metadata | +| `prompts` | User prompt/query records with raw text (alpha consent required) | +| `skill_invocations` | Skill trigger/miss records with confidence, mode, and query context | +| `execution_facts` | Tool usage, error counts, and execution metadata (deterministic `execution_fact_id` generated during staging for records that lack one) | +| `evolution_evidence` | Evolution proposal outcomes, pass rate changes, deploy/rollback status (deterministic `evidence_id` generated during staging) | +| `orchestrate_runs` | Orchestrate run reports with sync/evolve/watch phase summaries | + +### Payload envelope + +Each HTTP request sends an envelope containing metadata and a batch of canonical records: + +```json +{ + "schema_version": "2.0", + "user_id": "a1b2c3d4-...", + "agent_type": "claude_code", + "selftune_version": "0.2.7", + "records": [ + { "type": "sessions", "data": { ... } }, + { "type": "skill_invocations", "data": { ... 
} } + ] +} +``` + +The TypeScript interfaces are defined in `cli/selftune/alpha-upload-contract.ts` (queue infrastructure types and `PushUploadResult`). The V2 payload shape is validated by `PushPayloadV2Schema` (Zod) with `min(0)` arrays. + +### Canonical upload staging + +Before payloads are built, records are staged into a local `canonical_upload_staging` SQLite table by `cli/selftune/alpha-upload/stage-canonical.ts`. This module reads canonical JSONL files, evolution evidence, and orchestrate_runs, then writes them into the staging table with deterministic IDs: + +- **`execution_fact_id`** — generated deterministically during staging for records that lack one (hash of session_id + tool + timestamp) +- **`evidence_id`** — generated deterministically during staging for evolution evidence records (hash of proposal_id + target + skill + timestamp) + +The staging table uses a single monotonic cursor, so `build-payloads.ts` reads only unstaged records on each cycle. This avoids re-scanning the full JSONL history. If a malformed staged row is encountered, payload assembly stops before that row and holds the cursor at the last valid sequence so corrupted data is not silently skipped. + +### Cloud-side lossless ingest + +The cloud API stores every push request in a `raw_pushes` table before normalizing into canonical tables. This provides: + +- **Lossless ingest** — no data is lost even if normalization logic changes +- **Partial push acceptance** — unresolved references are stored in raw_pushes and resolved later +- **Retry safety** — natural-key UNIQUE constraints with `onConflictDoNothing` make duplicate pushes idempotent + +--- + +## 5. 
Response Handling + +The cloud API returns standard HTTP status codes: + +| Status | Meaning | Client behavior | +|--------|---------|-----------------| +| `201 Created` | Records accepted and stored | Mark queue item as `sent` | +| `409 Conflict` | Duplicate records (already uploaded) | Treat as success, mark `sent` | +| `429 Too Many Requests` | Rate limited | Retryable — increment attempts, apply backoff | +| `401 Unauthorized` | Invalid or missing API key | Non-retryable — mark `failed`, log auth error | +| `403 Forbidden` | Key valid but user not authorized | Non-retryable — mark `failed`, log auth error | +| `5xx` | Server error | Retryable — increment attempts, apply backoff | + +--- + +## 6. Upload Timing + +**Recommendation: periodic batch upload, not immediate.** + +Uploads happen at two touchpoints: + +1. **On each `selftune orchestrate` run.** After sync completes and before evolution begins, the orchestrate loop checks for pending upload queue items and flushes them. This piggybacks on the existing orchestrate cadence (typically cron-scheduled every 1-4 hours). + +2. **Explicit `selftune sync --upload`.** A future `--upload` flag on the sync command triggers an immediate flush. This gives agents a way to force-upload without running a full orchestrate cycle. + +**Rationale for batch over immediate:** + +- **Alpha volume is low.** Tens of users generating hundreds of records per day. Real-time streaming adds complexity without proportional value. +- **Reduces noise.** Batching naturally deduplicates records that might be written multiple times during a session (e.g., skill_usage records appended by hooks then reconciled by sync). +- **Aligns with orchestrate cadence.** The orchestrate loop already reads local SQLite, runs evolution, and writes results. Adding an upload step is a natural extension of this pipeline. +- **Failure isolation.** If the cloud API is unreachable, the upload fails silently and retries next cycle. 
No impact on local selftune operation. + +**What NOT to do:** +- Do not upload from hooks (too latency-sensitive, runs in the critical path of user prompts). +- Do not upload from the dashboard server (it is a read-only query surface). +- Do not upload on every SQLite write (too frequent, creates thundering herd for multi-skill users). + +--- + +## 7. Queue/Retry Model + +### Local upload queue + +A local `upload_queue` table in the existing selftune SQLite database stages records for upload. This table is defined in `cli/selftune/localdb/schema.ts`. + +```sql +CREATE TABLE upload_queue ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + payload_type TEXT NOT NULL, -- 'sessions' | 'invocations' | 'evolution' + payload_json TEXT NOT NULL, -- JSON-serialized array of payload items + created_at TEXT NOT NULL, + status TEXT NOT NULL DEFAULT 'pending', -- 'pending' | 'sent' | 'failed' + attempts INTEGER NOT NULL DEFAULT 0, + last_attempt_at TEXT, + last_error TEXT, + sent_at TEXT +); + +CREATE INDEX idx_upload_queue_status ON upload_queue(status); +CREATE INDEX idx_upload_queue_created ON upload_queue(created_at); +``` + +### Enqueue flow + +1. During `orchestrate` or `sync --upload`, the upload module queries local SQLite for records not yet uploaded (tracked via a `last_upload_watermark` in `_meta`). +2. Records are batched into envelopes of up to **100 records** per payload type. +3. Each batch is inserted into `upload_queue` as a single row with `status = 'pending'`. + +### Flush flow + +1. `flushQueue()` selects rows with `status = 'pending'` ordered by `created_at ASC`. +2. For each pending item, it POSTs the stored V2 push envelope to `https://api.selftune.dev/api/v1/push` with the Bearer API key. +3. Retryable failures (`429`, `5xx`) are retried with exponential backoff inside the same `flushQueue()` run. +4. Success (`201` or `409`) is terminal: set `status = 'sent'` and `sent_at`. +5. 
Exhausted retryable failures and non-retryable auth failures (`401`, `403`) are terminal: increment `attempts`, set `last_attempt_at` / `last_error`, and leave the row at `status = 'failed'`.
+
+### Retry with exponential backoff
+
+When retrying failed items within a single flush cycle:
+
+| Attempt | Delay before retry |
+|---------|-------------------|
+| 1 | 1 second |
+| 2 | 2 seconds |
+| 3 | 4 seconds |
+| 4 | 8 seconds |
+| 5 | 16 seconds |
+
+After 5 failed attempts, the queue item stays at `status = 'failed'` and is not retried automatically. A future `selftune alpha retry` command could reset failed items.
+
+### Batch size limits
+
+- Maximum **100 records** per envelope (per payload_type).
+- If a local query returns more than 100 records for a payload type, they are split into multiple queue items.
+- This keeps individual HTTP requests small (estimated <50KB per envelope at 100 invocation records).
+
+---
+
+## 8. Consent Enforcement
+
+### Local enforcement
+
+Before any network call, the upload module performs this check:
+
+```javascript
+const config = readFreshConfig(); // NOT cached — read from disk each time
+if (config.alpha?.enrolled !== true) return; // silently skip upload
+if (!config.alpha?.api_key) return; // no API key configured, skip upload
+```
+
+Reading config fresh from disk on every upload attempt means a user (or their agent) can unenroll at any time by setting `config.alpha.enrolled = false` or removing the `alpha` key. The next upload cycle respects the change immediately.
+
+### Server-side enforcement
+
+The cloud API validates every upload:
+
+1. Extract the API key from the `Authorization: Bearer` header.
+2. Look up the associated user account.
+3. If the key is invalid or the user has been deactivated, return 401/403.
+4. On successful writes, update the user's `last_upload_at` timestamp. 
+ +### Future: data deletion + +A future `selftune alpha delete-data` command will: +- Call a cloud API endpoint that deletes all records for the user's account. +- Remove the `alpha` config block locally. +- Confirm deletion to the agent. + +This aligns with the principle that alpha enrollment is fully reversible. + +--- + +## 9. Privacy Model + +### Data minimization + +The alpha pipeline uploads only the fields needed for alpha analysis, but it does include raw query text for explicitly consented users: + +| Data category | What is uploaded | What is NOT uploaded | +|---------------|-----------------|---------------------| +| Queries | Raw query text (in `raw_source_ref.metadata`) | Full transcript bodies outside the captured prompt/query text | +| Workspace paths | Workspace path (in V2 canonical records) | N/A | +| File contents | Nothing | Nothing | +| Conversation text | Prompt/query text only | Full conversation transcripts | +| Code | Nothing | Nothing | +| File paths | Only if the user typed them into prompt/query text | Structured file-path fields | +| Session IDs | Session ID (opaque UUID) | N/A | + +### What is explicitly excluded + +- No file contents of any kind +- No transcript text beyond the captured prompt/query text +- No code snippets or diffs +- No environment variables or shell history +- No tool input/output content + +--- + +## Appendix: Design Decision — Cloud API over Standalone Worker + +The initial design direction evaluated a standalone Cloudflare Worker backed by D1 (SQLite at the edge). This was replaced with direct integration into the existing cloud API for these reasons: + +1. **Reduced operational surface.** One service to monitor, not two. +2. **Unified auth.** API keys work the same way for all cloud interactions. +3. **Schema convergence.** V2 canonical records are the shared language between local and cloud — no separate D1 schema to maintain. +4. 
**Future-proof.** As selftune moves toward a full cloud product, alpha data lives in the same Postgres tables that power the cloud dashboard. diff --git a/docs/design-docs/evolution-pipeline.md b/docs/design-docs/evolution-pipeline.md index 620ecef5..7df92049 100644 --- a/docs/design-docs/evolution-pipeline.md +++ b/docs/design-docs/evolution-pipeline.md @@ -149,6 +149,24 @@ Coordinates the full pipeline with retry logic: 5. Deploy if validation passed (unless `--dry-run`) 6. Record audit entries at every state transition +### Constitutional Pre-Validation + +Before LLM validation, description proposals pass through a deterministic +constitutional gate. This rejects obviously bad proposals before they can +consume validation budget or pollute the retry loop. + +Current checks: + +- size guard: description must stay within the configured character and word-count bounds +- XML/HTML rejection: proposals containing tags are rejected immediately +- unbounded broadening guard: bare "all", "any", "every", or "everything" must be qualified +- anchor preservation: required `USE WHEN` anchors and `$skillName` references must survive + +If the gate fails, the pipeline records a `rejected` audit entry with the +constitutional reason. For description evolution the loop can retry with a +new proposal; for body evolution the size-only constitutional rejection is a +terminal failure for that candidate. + ### CLI Flags | Flag | Default | Description | diff --git a/docs/design-docs/index.md b/docs/design-docs/index.md index 75d4d089..188f1e8e 100644 --- a/docs/design-docs/index.md +++ b/docs/design-docs/index.md @@ -1,4 +1,4 @@ - + # Design Documents Index @@ -17,6 +17,7 @@ Registry of all design documents with verification status. 
| live-dashboard-sse.md | Current | 2026-03-17 | Team | | sqlite-first-migration.md | Current | 2026-03-17 | Team | | ../integration-guide.md | Current | 2026-03-01 | Team | +| alpha-remote-data-contract.md | Current | 2026-03-18 | Team | ## Verification Schedule diff --git a/docs/design-docs/live-dashboard-sse.md b/docs/design-docs/live-dashboard-sse.md index 7e8b670d..2b30f33c 100644 --- a/docs/design-docs/live-dashboard-sse.md +++ b/docs/design-docs/live-dashboard-sse.md @@ -1,16 +1,24 @@ - + # Live Dashboard — Server-Sent Events +## Status + +This doc describes the intended end-state for live dashboard freshness, not the fully shipped runtime. + +- Current runtime: SSE exists, but invalidation still watches selected JSONL logs in `cli/selftune/dashboard-server.ts`. +- Pending cutover: SQLite WAL should become the only live invalidation signal. +- Canonical tracking plan: `docs/exec-plans/active/dashboard-data-integrity-recovery.md` + ## Problem The dashboard relied on polling (15–30s intervals per endpoint) to show new data. Combined with a 15s server-side materialization TTL and React Query's `staleTime`, new invocations could take 30+ seconds to appear — or not appear at all until a hard refresh cleared all cache layers. -## Solution +## Target Solution -Replace polling as the primary update mechanism with Server-Sent Events (SSE). The dashboard server watches the SQLite WAL file for changes and pushes update notifications to all connected browser tabs in real time. +Replace polling as the primary update mechanism with Server-Sent Events (SSE). The target design is for the dashboard server to watch the SQLite WAL file for changes and push update notifications to all connected browser tabs in real time. -## Architecture +## Target Architecture ```mermaid sequenceDiagram @@ -41,13 +49,15 @@ sequenceDiagram ### SQLite WAL Watcher -`fs.watchFile()` monitors the SQLite WAL file (`~/.selftune/selftune.db-wal`) with 500ms polling. 
When hooks write directly to SQLite, the WAL file's modification time or size changes, triggering the watcher. The old JSONL file watchers have been removed. +End-state design: `fs.watchFile()` monitors the SQLite WAL file (`~/.selftune/selftune.db-wal`) with 500ms polling. When hooks write directly to SQLite, the WAL file's modification time or size changes, triggering the watcher. + +Current runtime note: the old JSONL file watchers have not been fully removed yet. The shipped dashboard still warns when running in legacy JSONL watcher mode. A 500ms debounce timer coalesces rapid writes (e.g., a hook appending multiple records in sequence) into a single broadcast cycle. ### No Separate Materialization Step -Because hooks now write directly to SQLite, there is no separate materialization step. The data is already in the database when the WAL watcher fires. The server simply broadcasts the SSE event and the next API query reads fresh data directly from SQLite. +Target design: because hooks now write directly to SQLite, there is no separate materialization step in the hot path. The data is already in the database when the WAL watcher fires. The server simply broadcasts the SSE event and the next API query reads fresh data directly from SQLite. ### Fan-Out @@ -74,7 +84,7 @@ All React Query hooks retain `refetchInterval` but relaxed to 60s (was 15–30s) `staleTime` was reduced to 5s (was 10–30s) so that SSE-triggered invalidations result in immediate network requests rather than returning cached data. -## Latency Budget +## Target Latency Budget | Stage | Time | |-------|------| @@ -85,7 +95,7 @@ All React Query hooks retain `refetchInterval` but relaxed to 60s (was 15–30s) | React Query invalidation + fetch | ~100ms | | **Total** | **~1100ms** | -New data appears in the dashboard within ~1 second of a hook writing to SQLite (best case when the poll fires immediately after the write). 
+After the WAL cutover lands, new data should appear in the dashboard within ~1 second of a hook writing to SQLite. ## Files Changed @@ -111,8 +121,9 @@ New data appears in the dashboard within ~1 second of a hook writing to SQLite ( **Why keep polling?** SSE connections can drop. `EventSource` reconnects automatically, but during the reconnect window (up to 3s by default) no updates arrive. The 60s polling fallback ensures the dashboard never goes completely stale. -## Limitations +## Current Limitations -- `fs.watchFile()` uses stat polling (500ms interval), so there is an inherent latency floor compared to event-driven watchers. Best-case latency is ~600ms; worst-case is ~1100ms. +- The runtime is still using legacy JSONL watcher invalidation in some paths, so the WAL-only freshness model described above is not yet the sole shipped behavior. +- `fs.watchFile()` uses stat polling (500ms interval), so even after the WAL cutover there is an inherent latency floor compared to event-driven watchers. - On network filesystems, stat polling may be slower or return stale metadata. -- The debounce means writes within the same 500ms window are coalesced -- the dashboard won't show intermediate states within a burst. +- The debounce means writes within the same 500ms window are coalesced; the dashboard won't show intermediate states within a burst. diff --git a/docs/design-docs/sqlite-first-migration.md b/docs/design-docs/sqlite-first-migration.md index 95941ee8..9b9dabfd 100644 --- a/docs/design-docs/sqlite-first-migration.md +++ b/docs/design-docs/sqlite-first-migration.md @@ -1,7 +1,15 @@ - + # SQLite-First Data Architecture +## Status + +Most SQLite-first read-path work has landed, but this doc currently overstates the freshness cutover. + +- Landed: hooks/sync write to SQLite, dashboard/status/report reads are primarily SQLite-backed. +- Still open: SSE invalidation is not yet WAL-only in the shipped runtime. 
+- Treat this document as migration design plus progress notes, not as a perfect description of current live freshness behavior. + ## Problem JSONL-as-source-of-truth caused: @@ -16,7 +24,7 @@ JSONL-as-source-of-truth caused: **Phase 1: Dual-Write** — Hooks INSERT into SQLite alongside JSONL appends via `localdb/direct-write.ts`. Zero risk: additive only, fully reversible. -**Phase 2: Cut Over Reads** — Dashboard reads SQLite directly. Materializer removed from the hot path (runs once on startup for historical backfill). SSE watchers switched from JSONL file events to SQLite WAL file changes. +**Phase 2: Cut Over Reads** — Dashboard reads SQLite directly. Materializer is removed from the hot path for normal reads (runs once on startup for historical backfill). WAL-based SSE invalidation is the target end-state, but the shipped runtime still carries legacy JSONL watcher invalidation in `dashboard-server.ts`. **Phase 3: Drop JSONL Writes** — Hooks stop appending JSONL. SQLite is the sole write target. A new `selftune export` command generates JSONL from SQLite on demand for portability. 
@@ -28,7 +36,7 @@ Data flow (before): Hook → JSONL append → [15s wait] → Materializer reads JSONL → SQLite → Dashboard ``` -Data flow (after): +Target data flow (after full freshness cutover): ``` Hook → SQLite INSERT (via direct-write.ts) → WAL watcher → SSE broadcast → Dashboard @@ -66,7 +74,7 @@ Hook → SQLite INSERT (via direct-write.ts) → WAL watcher → SSE broadcast | Ingestors | All platform adapters — dual-write path | | Evolution | `evolution/*.ts` — read from SQLite, write via direct-write | | Orchestrate + Grading | `orchestrate.ts`, `grading/*.ts` — SQLite reads | -| Dashboard | `dashboard-server.ts`, SSE watchers, all route handlers | +| Dashboard | `dashboard-server.ts`, SQLite-backed routes, transitional SSE invalidation | | CI | Workflow updated for new test structure | ## Impact @@ -75,12 +83,13 @@ Hook → SQLite INSERT (via direct-write.ts) → WAL watcher → SSE broadcast |--------|--------|-------| | Dashboard load (first call) | 9.5s | 86ms | | Dashboard load (subsequent) | ~2s (TTL hit) | 15ms | -| Data latency (hook → dashboard) | 15–30s | <1s (SSE push) | +| Data latency (hook → dashboard) | 15–30s | target: <1s after WAL-only SSE cutover | | Schema change propagation | 7 files | 4 files | | Test delta | baseline | +2 passing, -2 failures | ## Limitations +- The WAL-only SSE cutover is not yet complete — legacy JSONL watcher invalidation still exists in the current runtime - Phase 3 (drop JSONL writes) is not yet complete — dual-write is still active - Historical data prior to Phase 1 requires a one-time materializer backfill on first startup - `selftune export --since DATE` is supported for date-range filtering; per-skill filtering is not yet implemented diff --git a/docs/design-docs/system-overview.md b/docs/design-docs/system-overview.md index f0157d01..520a2864 100644 --- a/docs/design-docs/system-overview.md +++ b/docs/design-docs/system-overview.md @@ -1,4 +1,4 @@ - + # System Overview @@ -123,6 +123,14 @@ sequenceDiagram Materializer->>DB: 
refresh from logs when sync/materialize runs ``` +## Local vs Cloud Dashboard + +The local dashboard and the cloud dashboard should not be treated as interchangeable copies of the same product surface. + +- **Local dashboard:** machine-scoped operator/debug UI over local SQLite. It should answer whether this machine is capturing correctly, whether local evidence looks sane, whether uploads are queued or failing, and what this workspace has observed. +- **Cloud dashboard:** org-scoped control-plane and analysis UI over cloud auth plus canonical cloud storage. It should answer who is enrolled, which credentials exist, what landed in the cloud, and what org-level trends or operator decisions exist. +- If a concept appears in both places, one surface must be clearly authoritative. Local should own machine/runtime health; cloud should own org identity, enrollment, credentials, and remote analytics. + ## Signal-Reactive Improvement Beyond scheduled runs, selftune detects improvement signals in real-time. diff --git a/docs/exec-plans/active/advanced-skill-patterns-adoption.md b/docs/exec-plans/active/advanced-skill-patterns-adoption.md new file mode 100644 index 00000000..240eb480 --- /dev/null +++ b/docs/exec-plans/active/advanced-skill-patterns-adoption.md @@ -0,0 +1,285 @@ + + +# Execution Plan: Advanced Skill Patterns Adoption + +**Status:** Planned +**Created:** 2026-03-18 +**Goal:** Adopt the highest-value advanced Claude Code skill patterns in selftune without breaking the current agent-first umbrella-skill model. 
+ +--- + +## Executive Summary + +selftune already uses advanced skill-authoring patterns at the package level: + +- progressive disclosure through `Workflows/`, `references/`, `assets/`, and `agents/` +- manual subagent escalation via bundled agent prompt files +- structured pre-flight interaction patterns for mutating workflows + +What it does **not** use yet are most of the newer platform-native skill controls described in the Claude Code docs: + +- `argument-hint` +- `disable-model-invocation` +- `user-invocable` +- `allowed-tools` +- `model` +- `context: fork` +- `agent` +- skill-frontmatter `hooks` +- runtime string substitutions like `${CLAUDE_SKILL_DIR}` + +The key architectural constraint is that selftune is currently an **umbrella skill**: one top-level skill file routes to many workflows. Most of the advanced frontmatter controls are **per-skill**, so applying them to the current monolith would be too coarse. + +This plan therefore splits the work into two tracks: + +1. **Adopt low-risk patterns now** within the current umbrella skill. +2. **Design before splitting** if we want first-class platform-native subskill execution later. 
+ +--- + +## Current State + +### Already using advanced package patterns + +- [skill/SKILL.md](skill/SKILL.md) is a routing surface, not a monolithic prompt blob +- `skill/Workflows/*.md` contains per-workflow execution playbooks +- `skill/references/*.md` contains heavy reference material loaded on demand +- `skill/assets/*.json` contains reusable setup/config templates +- `skill/agents/*.md` contains bundled subagent prompt files + +### Not yet using platform-native skill controls + +- Main [skill/SKILL.md](skill/SKILL.md#L1) only uses `name`, `description`, and `metadata` +- No `argument-hint`, `disable-model-invocation`, `user-invocable`, `allowed-tools`, `model`, `context`, `agent`, or `hooks` fields appear anywhere under `skill/` +- No use of `$ARGUMENTS`, `${CLAUDE_SESSION_ID}`, or `${CLAUDE_SKILL_DIR}` +- Subagent spawning is manual/instructional, not driven by `context: fork` + +### Constraint + +Applying `context: fork`, `allowed-tools`, `disable-model-invocation`, or `model` to the umbrella skill would affect **all** workflows, including ones that should remain inline and auto-routable. + +--- + +## Target State + +### Phase 1 target + +Improve the current umbrella skill with low-risk advanced patterns that do not require structural change: + +- add `argument-hint` to the main skill +- add bundled `examples/` supporting files and reference them explicitly +- harden skill-relative path references using `${CLAUDE_SKILL_DIR}` where appropriate + +### Phase 2 target + +Produce a design for converting selected internal roles into first-class internal/helper skills so selftune can use: + +- `context: fork` +- `agent` +- `user-invocable: false` +- `disable-model-invocation: true` +- `allowed-tools` + +### Phase 3 target + +If the design is sound, implement the split for a small set of high-value helper roles without changing the public selftune user experience. + +--- + +## Non-Goals + +- Do **not** add `context: fork` to the current umbrella skill. 
+- Do **not** add `allowed-tools` to the current umbrella skill. +- Do **not** set a single `model` for the current umbrella skill. +- Do **not** move selftune hook installation into skill-frontmatter `hooks:` in this phase. + +--- + +## Implementation + +## Phase 1: Low-Risk Adoption in the Current Skill + +**Goal:** adopt advanced patterns that improve ergonomics and portability without changing skill topology. + +### 1. Add `argument-hint` to the umbrella skill + +**Files:** + +| File | Change | +|------|--------| +| `skill/SKILL.md` | Add `argument-hint` to frontmatter | + +**Recommended value:** + +```yaml +argument-hint: "[request]" +``` + +This improves direct `/selftune ...` invocation UX while preserving auto-routing behavior. + +### 2. Add an `examples/` supporting-files layer + +**Files:** + +| File | Change | +|------|--------| +| `skill/examples/doctor-output.md` | New example of doctor output interpretation | +| `skill/examples/evolve-summary.md` | New example of evolve dry-run summary | +| `skill/examples/orchestrate-summary.md` | New example of orchestrate result interpretation | +| `skill/SKILL.md` | Add examples to resource index | +| Relevant `Workflows/*.md` | Reference examples where useful | + +**Rationale:** + +The Claude Code docs recommend supporting files for detailed examples instead of bloating `SKILL.md`. selftune already has references and templates; examples are the missing supporting-file type. + +### 3. 
Harden skill-relative file references + +**Files:** + +| File | Change | +|------|--------| +| `skill/SKILL.md` | Update any skill-local path guidance to prefer skill-dir-relative references | +| `skill/Workflows/Initialize.md` | Use `${CLAUDE_SKILL_DIR}` when referencing bundled setup files in command/snippet examples | +| `skill/references/setup-patterns.md` | Use `${CLAUDE_SKILL_DIR}` in examples that point to bundled assets | + +**Rule:** + +When a workflow tells the agent to read or use a bundled file from the installed skill package, prefer `${CLAUDE_SKILL_DIR}` over assuming the current working directory or repo layout. + +### 4. Preserve current invocation semantics + +The umbrella skill should remain: + +- auto-loadable when relevant +- user-invocable +- inline by default + +This means **do not** add `disable-model-invocation`, `user-invocable: false`, `context: fork`, `agent`, `allowed-tools`, or `model` to the main skill in Phase 1. + +--- + +## Phase 2: Design Spike for Internal Skill Extraction + +**Goal:** determine whether selftune should extract some helper roles from `skill/agents/*.md` into first-class internal/helper skills. + +### Candidate roles + +The best candidates are the roles that are already conceptually separate and expensive enough to justify their own execution context: + +- diagnosis analyst +- evolution reviewer +- pattern analyst +- integration guide + +### Questions to answer + +1. How should these internal/helper skills be packaged so they install alongside selftune without confusing users? +2. Should they remain hidden with `user-invocable: false`? +3. Which should run with `context: fork` by default? +4. Which should be manual-only via `disable-model-invocation: true`? +5. What tool restrictions would actually be useful per helper skill? 
+ +### Deliverable + +Create a short design doc that answers: + +- packaging layout +- install/update story +- routing semantics from the umbrella skill +- migration plan from `skill/agents/*.md` +- whether helper skills should remain discoverable to users + +No code changes are required to complete this phase. + +--- + +## Phase 3: Optional Rollout of Platform-Native Controls + +**Goal:** apply platform-native controls only where the design spike proves they fit. + +### Likely rollout pattern + +| Helper role | Recommended controls | +|-------------|----------------------| +| Diagnosis | `context: fork`, `agent`, `user-invocable: false` | +| Evolution review | `context: fork`, `agent`, `user-invocable: false` | +| Integration guide | `context: fork`, `agent`, maybe user-invocable if exposed intentionally | +| Destructive/manual workflows if split out | `disable-model-invocation: true` | + +### Explicit anti-patterns + +- Do not create a second top-level public interface that competes with `selftune`. +- Do not expose hidden helper skills in `/` unless that is a deliberate product decision. +- Do not overfit `allowed-tools` before the helper skill boundaries are stable. + +--- + +## Workstreams + +### Workstream A: Phase 1 implementation + +- add `argument-hint` +- add `examples/` +- harden path references with `${CLAUDE_SKILL_DIR}` +- update resource index and workflow references + +### Workstream B: Phase 2 design spike + +- evaluate helper-skill packaging options +- define visibility/invocation policy per helper role +- document recommended rollout path + +### Workstream C: Phase 3 optional implementation + +- create first-class helper skills only after Workstream B is approved +- wire umbrella-skill routing to those helpers +- add per-skill frontmatter controls where justified + +--- + +## Verification + +### Phase 1 + +1. `skill/SKILL.md` frontmatter includes `argument-hint` +2. `skill/examples/` exists and is referenced from the resource index +3. 
Bundled-file examples use `${CLAUDE_SKILL_DIR}` where path portability matters +4. The umbrella skill remains auto-routable and user-invocable + +### Phase 2 + +1. A short design doc exists for helper-skill extraction +2. The design explicitly answers packaging, visibility, and routing questions +3. The design names which roles should remain manual vs forked vs hidden + +### Phase 3 + +1. Helper skills, if added, do not change the public “use selftune” experience +2. `context: fork` and `agent` are only applied to helper skills, not the umbrella skill +3. Any `disable-model-invocation` or `user-invocable: false` usage is intentional and documented + +--- + +## Dependencies + +- Builds on the completed agent-first skill restructure work +- Should be coordinated with ongoing skill/CLI parity cleanup so docs do not drift again +- Phase 3 depends on approval of the Phase 2 design spike + +--- + +## Estimated Effort + +- Phase 1: 2 to 4 hours +- Phase 2: 2 to 3 hours +- Phase 3: variable, depends on packaging choice + +--- + +## Success Criteria + +- [ ] selftune adopts at least three high-value advanced patterns without regressing current routing behavior +- [ ] No broad frontmatter controls are applied to the umbrella skill in a way that harms existing workflows +- [ ] Supporting-file usage becomes stronger and more explicit +- [ ] The repo has a clear answer on whether platform-native helper skills are worth introducing diff --git a/docs/exec-plans/active/agent-first-alpha-onboarding.md b/docs/exec-plans/active/agent-first-alpha-onboarding.md new file mode 100644 index 00000000..c05b12e1 --- /dev/null +++ b/docs/exec-plans/active/agent-first-alpha-onboarding.md @@ -0,0 +1,193 @@ +# Agent-First Alpha Onboarding + +**Status:** Proposed +**Date:** 2026-03-19 + +## Goal + +Make the real alpha user path happen through the user's coding agent and the +local CLI, not through the cloud frontend as the primary UX. 
+ +The cloud app remains the control plane for: +- sign-in +- alpha enrollment +- upload credential issuance + +But the user's experience should be: +1. tell the agent to set up selftune +2. complete the minimum cloud auth handoff +3. return to the agent/CLI flow + +## Product Rule + +The cloud app is a dependency, not the main product surface. + +The main product surface remains: +- `skill/SKILL.md` +- `skill/Workflows/Initialize.md` +- `selftune init` + +## Ticket 1: Define the Agent-First Enrollment Flow + +**Goal:** specify the exact setup sequence the agent should follow. + +### Deliverable +- a short flow spec covering: + - user says "set up selftune" + - agent checks local config + - if not linked, agent explains the cloud enrollment step + - user signs in / enrolls / issues credential + - agent stores credential locally + - agent finishes setup and verifies upload readiness + +### Acceptance +- no ambiguity about where browser handoff happens +- no ambiguity about what the agent asks the user +- no ambiguity about when the flow returns to local CLI mode + +## Ticket 2: Replace Local Alpha Identity Assumptions + +**Goal:** stop treating alpha identity as a separate local-only user model. + +### Files +- `cli/selftune/alpha-identity.ts` +- `cli/selftune/types.ts` +- `cli/selftune/init.ts` + +### Work +- treat cloud-linked identity as authoritative +- keep local config as a cache of: + - cloud user id + - org id + - upload credential + - enrollment status metadata if needed +- remove assumptions that local email/user id are the real alpha identity source + +### Acceptance +- local config reflects linked cloud identity, not a separate parallel identity model + +## Ticket 3: Add CLI Support for Cloud Linking State + +**Goal:** make `selftune init` and related commands aware of cloud link status. 
+ +### Files +- `cli/selftune/init.ts` +- `cli/selftune/status.ts` +- `cli/selftune/observability.ts` + +### Work +- detect whether cloud identity + upload credential are present +- show clear agent-facing next steps when missing +- expose whether alpha upload is: + - not linked + - linked but not enrolled + - enrolled but missing credential + - ready + +### Acceptance +- agent can reliably diagnose why alpha upload is not active + +## Ticket 4: Add Browser Handoff UX for the Agent + +**Goal:** make the unavoidable cloud step feel intentional and small. + +### Files +- `skill/Workflows/Initialize.md` +- `skill/SKILL.md` +- `skill/references/interactive-config.md` + +### Work +- tell the agent exactly when to ask the user to sign in to the cloud app +- tell the agent exactly when to ask the user to issue an upload credential +- make the copy explicit: + - this is a one-time account/enrollment step + - afterwards the workflow returns to the local agent/CLI path + +### Acceptance +- the agent does not present the cloud app as the main way to use selftune + +## Ticket 5: Add Credential Import / Storage Path + +**Goal:** let the agent finish setup after the user gets a cloud-issued credential. + +### Files +- `cli/selftune/init.ts` +- `cli/selftune/alpha-upload/index.ts` +- local config read/write helpers + +### Work +- accept product-issued `st_live_*` credential in setup flow +- store it locally in the expected config location +- validate presence/format before marking setup complete + +### Acceptance +- after credential issuance, the agent can finish setup without manual file editing + +## Ticket 6: Add Upload Readiness Verification + +**Goal:** prove the local machine is actually ready after setup. 
+ +### Files +- `cli/selftune/init.ts` +- `cli/selftune/observability.ts` +- `skill/Workflows/Initialize.md` + +### Work +- run a small readiness check after setup: + - config present + - enrollment/credential fields present + - push endpoint configured + - upload queue can initialize +- return agent-facing confirmation or exact remediation + +### Acceptance +- setup ends with a concrete readiness result, not “probably done” + +## Ticket 7: Update Agent Docs to Match the New Truth + +**Goal:** keep the agent-first product surface aligned with the new onboarding path. + +### Files +- `skill/SKILL.md` +- `skill/Workflows/Initialize.md` +- `skill/Workflows/Doctor.md` +- `skill/Workflows/Dashboard.md` if any cloud references exist + +### Work +- make the setup workflow explicitly agent-first +- describe cloud auth as a required one-time control-plane handoff +- remove any implication that users should live in the cloud UI for normal use + +### Acceptance +- docs match the intended product story + +## Ticket 8: Add End-to-End Setup Smoke Test + +**Goal:** verify the intended user path, not just the pieces. + +### Scope +- temp local config +- simulated or staged cloud-issued credential +- `selftune init` +- readiness verification + +### Acceptance +- one passing test proves the setup can go from fresh machine to upload-ready + +## Recommended Order + +1. Ticket 1 — flow spec +2. Ticket 2 — local identity cleanup +3. Ticket 3 — cloud-link state in CLI +4. Ticket 5 — credential import/storage +5. Ticket 6 — readiness verification +6. Ticket 4 and Ticket 7 — doc/agent workflow alignment +7. 
Ticket 8 — end-to-end smoke test
+
+## Success Criteria
+
+- the primary setup story is “tell your agent to set up selftune”
+- the cloud UI is used only as a short auth/enrollment handoff
+- the agent can explain exactly what the user must do and when
+- local config reflects cloud-issued identity/credential state
+- setup ends with upload-ready verification
diff --git a/docs/exec-plans/active/alpha-rollout-data-loop-plan.md b/docs/exec-plans/active/alpha-rollout-data-loop-plan.md
new file mode 100644
index 00000000..fbb44dc0
--- /dev/null
+++ b/docs/exec-plans/active/alpha-rollout-data-loop-plan.md
@@ -0,0 +1,362 @@
+# Execution Plan: Alpha Rollout and Data Loop Activation
+
+
+
+**Status:** In Progress
+**Created:** 2026-03-18
+**Goal:** Move selftune from “mechanics built” to “confidence building” by shipping a consent-based alpha rollout and a real multi-user data loop, while only fixing the dashboard/data-integrity issues that block trustworthy testing.
+
+## Status Update — 2026-03-18
+
+This plan has been partially executed.
+ +- **Phase A:** substantially complete + - runtime identity landed in `/api/health` and the dashboard footer + - hermetic path overrides now cover config/log/Claude/OpenClaw roots + - the dev probe is stable again and no longer mutates `bun.lock` + - rebuild preflight now blocks lossy rebuilds and reports SQLite-only row counts +- **Phase B:** complete for the current onboarding slice + - alpha config/identity flow shipped + - explicit consent/email flow is documented for the agent-facing init workflow + - raw prompt/query text consent wording is now aligned with the friendly alpha cohort + - plain `selftune init --force` preserves existing alpha enrollment +- **Phase C:** complete (cloud-realigned, hardened) + - the initial D1 schema/type/doc spike landed, then fully realigned to cloud API + - standalone Worker/D1 scaffold removed; pipeline targets `POST /api/v1/push` on the cloud API + - auth model: `st_live_*` API keys via Bearer header + - lossless canonical upload staging table (`canonical_upload_staging`) with single monotonic cursor + - `stage-canonical.ts` reads canonical JSONL + evolution evidence + orchestrate_runs into staging + - deterministic `execution_fact_id` and `evidence_id` generation during staging + - `build-payloads.ts` reads from staging table, produces V2 canonical push payloads + - HTTP client with Bearer auth and fail-open behavior (never throws) + - flush engine: 409 (duplicate) treated as success, 401/403 as non-retryable auth errors + - orchestrate_runs now staged and included in V2 push payloads + - telemetry contract hardened with Zod schemas (`PushPayloadV2Schema` with `min(0)` arrays) + - cloud API stores lossless `raw_pushes` before normalizing into canonical Postgres tables + - `selftune alpha upload [--dry-run]` CLI command + - upload step wired into `selftune orchestrate` (step 5, fail-open) + - `selftune status` and `selftune doctor` show alpha queue health + - e2e integration tests for the full upload pipeline + +The next 
implementation target is **Phase D: Analysis Loop for Marginal Cases**. + +--- + +## Executive Summary + +The office-hours synthesis changes the priority order. + +The main problem is not “build more product surface.” The main problem is that selftune still lacks enough real-world data to know what good looks like across users, skills, and workflows. + +That means the next move should **not** be “start the entire dashboard-data-integrity-recovery plan end-to-end.” That plan is valid, but only part of it is a prerequisite for alpha. + +The right sequence is: + +1. Finish the **remaining trust-floor follow-ons** only where they still block alpha. +2. Treat the **consentful alpha onboarding flow** as landed for the current slice. +3. Build the **remote data pipeline** for opted-in alpha users. +4. Create a **tight operator loop** for Daniel to inspect marginal cases and learn from them. +5. Then return to the deeper dashboard/runtime cleanup that is not blocking alpha. + +--- + +## Recommendation on the Existing Recovery Plan + +**Do not start the full** [dashboard-data-integrity-recovery.md](dashboard-data-integrity-recovery.md) **first.** + +Start only the parts of it that are direct alpha prerequisites: + +- Phase 0: runtime identity and dev-server truth +- Phase 1: hermetic tests / proof harnesses +- Phase 2: lossy-rebuild guardrails and backup honesty + +Defer the rest until after alpha data collection is live: + +- WAL-based SSE freshness cleanup +- broader dashboard semantic cleanup +- deeper documentation realignment beyond what alpha needs + +Reason: Ray’s synthesis says the bottleneck is confidence from data, not more mechanics. But alpha data is only useful if the data path is trustworthy. 
+ +--- + +## Planning Inputs + +- office-hours-2026-03-18-synthesis.md (external strategy document) +- [dashboard-data-integrity-recovery.md](dashboard-data-integrity-recovery.md) +- [cloud-auth-unification-for-alpha.md](cloud-auth-unification-for-alpha.md) + +--- + +## Target State + +- Daniel can onboard 3-5 alpha users with explicit consent in minutes. +- Each alpha user has a stable local identity stored in `~/.selftune/`. +- Opted-in alpha data uploads to a shared backend with enough fidelity to analyze false positives, false negatives, and marginal cases. +- Local dashboards and stores are trustworthy enough that Daniel can validate what happened during alpha sessions. +- Tests and proofs cannot pollute the real operator store. +- Rebuild/backfill cannot silently drop recent data. + +--- + +## Execution Order + +### Phase A: Alpha Trust Floor + +**Status:** Substantially complete + +**Priority:** Critical +**Effort:** Medium +**Risk:** Low + +This phase is the minimum cut of the dashboard recovery work required before recruiting testers. + +**Scope:** + +1. Expose runtime identity in `/api/health` and the dashboard UI. Completed. +2. Fix the `bun run dev` backend-health probe and startup race baseline. Probe fixed; startup wait is still optional follow-on work. +3. Make test/proof runs hermetic with environment-overridable storage roots. Substantially complete. +4. Add rebuild preflight/guardrails so recent SQLite-only rows cannot be silently discarded. Completed. 
+ +**Why this phase exists:** + +- alpha data is useless if Daniel cannot tell which workspace/server he is looking at +- alpha data is dangerous if tests can leak into the real store +- alpha confidence collapses if rebuild can delete recent rows + +**Completion criteria:** + +- Daniel can identify workspace, DB path, log path, and watcher mode from the running dashboard +- `bun run dev` and `selftune dashboard` no longer create mystery backend mismatches +- proof/test runs leave `~/.selftune` and `~/.claude` untouched +- destructive rebuild aborts when it would be lossy + +--- + +### Phase B: Consentful Alpha Onboarding + +**Status:** Complete for current scope + +**Priority:** Critical +**Effort:** Medium +**Risk:** Medium + +**Primary outcome:** `selftune init` becomes the alpha enrollment point. + +**Files likely involved:** + +- `cli/selftune/init.ts` +- `cli/selftune/types.ts` +- `cli/selftune/constants.ts` +- `skill/Workflows/Initialize.md` +- `skill/SKILL.md` +- config/helpers under `cli/selftune/` + +**Changes:** + +1. Add an explicit alpha-consent flow during init: + - explain that this is an alpha + - explain what data is shared + - explain that the purpose is improving selftune +2. Collect: + - email + - display name or optional label + - consent timestamp + - alpha participation flag +3. Persist a stable local user identity in `~/.selftune/`. +4. Keep the flow simple and skippable: + - opted-in alpha user + - local-only user +5. Update the agent-facing init docs to reflect the exact flow. 
+ +**Non-goals:** + +- full public-launch anonymization +- enterprise-grade privacy workflows + +**Completion criteria:** + +- a new alpha user can complete init and enrollment in under 5 minutes +- identity and consent are stored locally and inspectable +- the skill docs tell the agent how to explain the alpha clearly + +--- + +### Phase C: Remote Alpha Data Pipeline + +**Status:** Complete + +**Priority:** Critical +**Effort:** Large +**Risk:** Medium + +**Primary outcome:** opted-in alpha data reaches a shared backend Daniel can analyze. + +**Current state:** fully implemented. Local queue, payload builders, HTTP transport, CLI surface, orchestrate integration, and operator diagnostics are all shipped with 80 passing tests. The standalone Cloudflare Worker/D1 scaffold was replaced with direct integration into the existing cloud API's V2 push endpoint (`POST /api/v1/push`), authenticated with `st_live_*` API keys. + +**Design direction (resolved):** + +- The initial Cloudflare/D1 direction from the synthesis was evaluated and scaffolded, but was replaced with the existing cloud API to reduce operational surface and unify authentication +- Upload from opted-in clients only, authenticated with `st_live_*` API keys via Bearer header +- Local SQLite as source-of-truth cache, cloud API (Neon Postgres) as analysis sink + +**Files likely involved:** + +- new remote sync/upload module under `cli/selftune/` +- `cli/selftune/orchestrate.ts` or a dedicated uploader command/scheduler +- `cli/selftune/contribute/` if reused +- `cli/selftune/types.ts` +- docs and init workflow + +**Changes:** + +1. Define the alpha upload contract: + - user ID + - agent/platform metadata + - skill invocation facts + - prompt/query references needed for false positive / false negative analysis + - evolution outcomes where relevant +2. Decide upload timing: + - immediate best-effort + - periodic batch + - explicit sync +3. Add local queueing / retry behavior for failed uploads. +4. 
Add a simple operator view or CLI for upload status. +5. Keep consent enforcement local and explicit. + +**Completed sub-split for this phase:** + +1. local upload queue + watermark tracking +2. canonical upload staging (`stage-canonical.ts`) + payload builders +3. cloud API V2 push integration (replaced Worker/D1 direction) +4. upload-status visibility for operators + +**Completion criteria:** + +- Daniel can query remote data by user, time window, and skill +- failed uploads are visible and retryable +- an opted-out user sends nothing upstream + +--- + +### Phase D: Analysis Loop for Marginal Cases + +**Priority:** High +**Effort:** Medium +**Risk:** Medium + +**Primary outcome:** Daniel can turn alpha data into learning, not just storage. + +Detailed spike: [phase-d-marginal-case-review-spike.md](phase-d-marginal-case-review-spike.md) + +**Changes:** + +1. Build the four-quadrant analysis view around: + - true positive + - false positive + - false negative + - true negative +2. Prioritize operator views for: + - likely false negatives + - likely false positives + - ambiguous/marginal cases +3. Add a lightweight review mechanism for marginal cases: + - thumbs up/down + - accepted/rejected label + - optional note +4. Store those labels so future eval/evolution work can use them. + +**Important note:** + +This does **not** need to be a polished end-user product first. A Daniel-only operator surface is enough for the first cohort. + +**Completion criteria:** + +- Daniel can review and label marginal cases from alpha users +- labels are stored with enough context to feed later eval/evolution improvements + +--- + +### Phase E: Alpha Cohort Operations + +**Priority:** High +**Effort:** Small +**Risk:** Low + +**Primary outcome:** the first 3-5 testers are actually live. + +**Changes:** + +1. Prepare a short alpha invite script and install script. +2. Create a tester checklist: + - install + - init + - consent + - verify upload + - run normal work +3. 
Add a simple internal tracker: + - who is active + - when they were onboarded + - whether uploads are flowing + - notable skill failures or wins +4. Respond to Ray and any other volunteers with the alpha setup flow. + +**Completion criteria:** + +- 3-5 alpha users are onboarded +- at least 2 are generating real data regularly +- Daniel can inspect their uploads without custom debugging + +--- + +### Phase F: Return to the Deferred Recovery Work + +**Priority:** Medium +**Effort:** Medium +**Risk:** Medium + +After alpha data is flowing, resume the deferred parts of the dashboard recovery plan: + +- WAL-driven SSE freshness +- broader dashboard semantic cleanup +- final documentation alignment + +This work still matters, but it should follow the data loop, not precede it. + +--- + +## Completed Agent Splits + +### Phase C (completed 2026-03-18) + +Wave 1 (parallel): +1. **Agent 1:** Queue + watermark storage (20 tests) +2. **Agent 2:** Payload builder from SQLite (19 tests) +3. **Agent 3:** HTTP client + flush engine (15 tests) +4. **Agent 4:** Cloud API integration (replaced standalone Worker scaffold) (17 tests) + +Wave 2 (after Wave 1): +5. **Agent 5:** CLI + orchestrate integration (10 tests) +6. **Agent 6:** Upload status + doctor diagnostics (17 tests) + +### Next split suggestion + +Phase D is the next active target: +1. **Agent 1:** Four-quadrant analysis view (TP/FP/FN/TN) +2. **Agent 2:** Labeling + review mechanism +3. 
**Agent 3:** Operator inspection flow (Daniel-only)
+
+---
+
+## Acceptance Criteria for Starting Alpha
+
+Alpha is ready to begin when all of the following are true:
+
+- Daniel can trust which runtime/store he is looking at
+- tests cannot contaminate real data
+- rebuild cannot silently lose fresh rows
+- init can enroll a user with explicit consent
+- opted-in data can reach the shared backend
+- Daniel can inspect marginal cases from at least one non-Daniel user
+
+Until then, the product is still in internal mechanics mode, not alpha-learning mode.
diff --git a/docs/exec-plans/active/alpha-simplification-program.md b/docs/exec-plans/active/alpha-simplification-program.md
new file mode 100644
index 00000000..75a080f8
--- /dev/null
+++ b/docs/exec-plans/active/alpha-simplification-program.md
@@ -0,0 +1,328 @@
+
+
+# Execution Plan: Alpha Simplification Program
+
+**Status:** In Progress
+**Created:** 2026-03-19
+**Goal:** Reduce coordination tax during alpha by freezing optional breadth, deleting redundant architecture, and converging on one narrow happy path that is easier to ship, debug, and maintain.
+
+## Status Update — 2026-03-20
+
+This plan has been partially executed.
+
+**Landed already:**
+
+- Claude Code is now explicitly the primary platform for active support, while Codex/OpenCode/OpenClaw are labeled experimental across the CLI, README, SKILL surface, and AGENTS docs.
+- Alpha remote ingest is concentrated on the main cloud API + Neon path; the earlier sidecar/Worker direction was replaced by the current `POST /api/v1/push` path, canonical upload staging, and `SELFTUNE_ALPHA_ENDPOINT`.
+- Major SQLite-primary read paths are already in place: architecture docs describe SQLite as the primary operational store, and core dashboard/status/report surfaces query SQLite rather than reading JSONL directly.
+- Dashboard runtime identity and freshness honesty improved materially: the runtime footer and Status page expose workspace/db/log/config paths and warn when the server is still using legacy JSONL watcher invalidation. + +**Partially landed / still mixed:** + +- SQLite-primary is not fully complete. Dashboard live invalidation still watches JSONL logs, and some modules still read JSONL directly on transitional or non-core paths. +- The “one honest dashboard story” is only partially complete. Runtime identity and watcher-mode warnings landed, but the live-freshness story is still mixed and some design docs no longer match the current implementation. +- Optional breadth is mostly frozen in labels and messaging, but not yet consistently enforced as a planning rule across all active work. + +**Still open:** + +- Cloud auth unification is not done. Local alpha identity still behaves as a first-class local model, and the repo still carries an active auth-unification plan to converge browser auth, upload auth, and local cached identity. +- Duplicate/obsolete paths and stale docs remain in the tree. +- JSONL-driven runtime invalidation remains open until the dashboard fully cuts over to SQLite WAL live updates. + +This plan should now be treated as an in-progress simplification program with some decisions already landed, some partially landed, and a smaller set of unresolved authority/auth/freshness issues still blocking the end-state. + +## Problem Statement + +selftune is moving slowly because too many changes cross too many unsettled boundaries. + +Today the project is simultaneously acting as: + +- a local agent skill +- a telemetry/normalization pipeline +- a dashboard/operator surface +- a cloud product with auth, uploads, and analysis + +That would be manageable if the boundaries were settled. They are not. 
+ +The current drag comes from unresolved duplication and partial migrations: + +- JSONL vs SQLite vs cloud as “source of truth” +- browser auth vs API auth vs alpha auth +- local product vs cloud product +- agent-facing docs vs CLI behavior +- primary platform vs experimental platforms + +The result is that a small feature often becomes: + +- CLI work +- workflow doc work +- local schema work +- dashboard contract work +- cloud ingest work +- operator semantics work + +This is why progress feels slow. + +## Principle + +For alpha, selftune should optimize for: + +- one primary platform +- one local runtime path +- one cloud auth model +- one ingest path +- one explanation of what the system is doing + +Everything else should be frozen, deferred, or explicitly downgraded to experimental. + +## Target Alpha Shape + +### Keep + +- Claude Code as the primary alpha platform +- SQLite as the local runtime/query store +- cloud upload to the main cloud API + Neon +- Neon Auth as the canonical user/session model +- product-owned upload credentials tied to cloud users +- one dashboard path that reflects the actual current system +- the agent-first skill surface for the primary workflows + +### Freeze + +- Codex/OpenCode/OpenClaw architecture work +- new dashboard surfaces that do not help alpha learning +- new eval/evolution sophistication that is not required for current alpha decisions +- new auth variants +- new ingestion backends + +### Delete or Defer + +- runtime dependence on JSONL watchers +- duplicate auth stacks +- stale workflows/docs for unsupported paths +- ambiguous “source of truth” language +- optional architecture branches that are not serving the current alpha + +## Simplification Decisions + +### Decision 1: One Primary Platform + +**Current status:** Substantially landed in the current repo surface. + +**Decision:** Claude Code is the only first-class platform during alpha. 
+ +Implications: + +- Claude Code paths get active maintenance +- Codex/OpenCode/OpenClaw remain explicitly experimental +- no new architectural work should be justified by experimental adapters during alpha + +Follow-through: + +- mark non-Claude adapters as frozen for alpha +- stop routing roadmap-critical decisions through multi-platform generality + +### Decision 2: SQLite-Primary Local Runtime + +**Current status:** Partially landed. + +**Decision:** SQLite is the only local runtime/query source of truth. + +Implications: + +- dashboard reads from SQLite +- local status/doctor/report queries read from SQLite +- JSONL remains backup/export/input material, not runtime truth + +Follow-through: + +- no runtime freshness logic should depend on JSONL watchers +- JSONL becomes archival/recovery/input material only +- docs must stop implying equal status between JSONL and SQLite + +### Decision 3: One Cloud Ingest Path + +**Current status:** Landed for the current alpha upload path. + +**Decision:** Alpha data goes to the main cloud API and Neon. No parallel worker/D1 path for alpha. + +Implications: + +- one remote store +- one auth boundary +- one operator query surface + +Follow-through: + +- remove or freeze sidecar remote-ingest experiments for alpha +- keep `telemetry-contract` authoritative, but keep ingestion concentrated in one backend + +### Decision 4: One Cloud Auth Story + +**Current status:** Not landed yet. + +**Decision:** Neon Auth owns user/session identity. Upload credentials are product-owned credentials tied to those cloud users. 
+ +Implications: + +- alpha users are cloud users +- local alpha identity becomes cached state, not source of truth +- browser auth and upload auth resolve into one user/org graph + +Follow-through: + +- do not keep a parallel direct Better Auth product auth stack +- do not assume custom Better Auth plugin paths are the right long-term boundary just because Neon Auth uses Better Auth under the hood + +### Decision 5: One Honest Dashboard Story + +**Current status:** Partially landed. + +**Decision:** The dashboard must clearly say what it is showing and what freshness model it uses. + +Implications: + +- no mixed implicit semantics +- no “recent activity” labels when the data source is actually older audit-only state +- no mystery backend/process identity + +Follow-through: + +- preserve runtime identity and watcher-mode indicators +- prefer explicit labels over ambiguous aggregation + +## Concrete Cut List + +### Cut Now + +- New platform-generalization work for non-Claude adapters +- Additional D1/worker architecture for alpha telemetry +- Auth work that preserves both Neon Auth and a second product auth stack +- Dashboard features that depend on unresolved semantics + +### Cut Soon + +- JSONL-driven runtime invalidation +- stale workflow instructions for removed or legacy paths +- duplicate contract definitions where one package should be authoritative + +### Keep Investing In + +- upload reliability +- operator review tools +- marginal-case analysis +- auth unification +- data integrity +- agent-facing workflow accuracy + +## Execution Phases + +### Phase 0: Freeze Optional Breadth + +**Status:** Partially complete. 
+ +**Priority:** Critical +**Effort:** Small +**Risk:** Low + +Actions: + +- mark non-Claude platform work as frozen for alpha +- mark sidecar remote-ingest experiments as out of scope for alpha +- stop accepting roadmap arguments that depend on multi-platform breadth + +Completion criteria: + +- active plans stop assuming equal investment across platforms +- open work is framed around the Claude Code alpha path + +### Phase 1: Remove Duplicate Authority + +**Status:** In progress. + +**Priority:** Critical +**Effort:** Medium +**Risk:** Medium + +Actions: + +- converge auth around the cloud-auth unification plan +- continue the SQLite-primary cleanup +- remove stale source-of-truth language in docs + +Completion criteria: + +- one answer for “where is local truth?” +- one answer for “who is the user?” +- one answer for “where does alpha data go?” + +### Phase 2: Delete Obsolete Paths + +**Status:** Not complete. + +**Priority:** High +**Effort:** Medium +**Risk:** Medium + +Actions: + +- remove dead or misleading commands/docs +- remove runtime dependencies on transitional code paths once replacements are proven +- archive or explicitly label experimental modules instead of pretending they are near-equal peers + +Completion criteria: + +- fewer paths to do the same thing +- fewer stale docs +- fewer “temporary” branches still in the critical path + +### Phase 3: Tighten the Alpha Kernel + +**Status:** Partially complete. + +**Priority:** Critical +**Effort:** Medium +**Risk:** Low + +Define the alpha kernel as the only thing that must feel great: + +- init/enroll +- observe +- upload +- inspect +- label marginal cases +- improve core skill behavior + +Everything else is secondary until the kernel is fast and trustworthy. 
+ +Completion criteria: + +- a new alpha user can be onboarded quickly +- uploads are trustworthy +- Daniel can inspect real data quickly +- the core improvement loop is understandable + +## Success Metrics + +- A typical alpha-facing change touches fewer subsystems than it does today. +- The team can explain local truth, cloud truth, and auth truth in one sentence each. +- The number of “experimental but still on the critical path” modules goes down. +- The time from bug discovery to confident fix gets shorter. +- The number of plan/doc/code mismatches drops materially. + +## Anti-Goals + +Do not use this plan as justification for: + +- another broad rewrite +- new generic abstractions +- new cross-platform frameworks +- more architecture before deleting old architecture + +The point is subtraction, not sophistication. + +## Related Plans + +- [alpha-rollout-data-loop-plan.md](/Users/danielpetro/conductor/workspaces/selftune/miami/docs/exec-plans/active/alpha-rollout-data-loop-plan.md) +- [cloud-auth-unification-for-alpha.md](/Users/danielpetro/conductor/workspaces/selftune/miami/docs/exec-plans/active/cloud-auth-unification-for-alpha.md) +- [dashboard-data-integrity-recovery.md](/Users/danielpetro/conductor/workspaces/selftune/miami/docs/exec-plans/active/dashboard-data-integrity-recovery.md) diff --git a/docs/exec-plans/active/cloud-auth-unification-for-alpha.md b/docs/exec-plans/active/cloud-auth-unification-for-alpha.md new file mode 100644 index 00000000..769a3674 --- /dev/null +++ b/docs/exec-plans/active/cloud-auth-unification-for-alpha.md @@ -0,0 +1,360 @@ +# Execution Plan: Cloud Auth Unification for Alpha + +**Status:** Proposed +**Created:** 2026-03-19 +**Goal:** Unify the cloud app and alpha-upload auth model so alpha users are first-class cloud users, browser and API routes share one identity boundary, and CLI uploads use cloud-issued credentials instead of a parallel local-only identity system. 
+ +## Why This Exists + +Today the auth story is split: + +- the Next.js cloud app uses Neon Auth wrappers +- the API package configures Better Auth directly +- local selftune stores a separate alpha identity block in `~/.selftune/config.json` +- alpha upload credentials are conceptually API keys, but that path is not yet the same clear product boundary as browser auth + +That split creates unnecessary product and operational complexity: + +- alpha users are not clearly the same thing as cloud users +- browser auth and upload auth are hard to reason about together +- operator support is harder because identity is duplicated across local and cloud layers +- rollout confidence is lower because the public authenticated upload path is not the same path we are exercising through the browser app + +The desired model is simpler: + +- **cloud account** is the source of truth for identity +- **org membership** is the source of truth for tenancy +- **alpha enrollment** is cloud-side state on that user or org membership +- **CLI upload credential** is minted from the signed-in cloud user and stored locally as a cache + +Not every cloud user must be an alpha user. +But every alpha user should be a cloud user. + +## Architectural Recommendation + +Use **Neon Auth as the canonical product auth boundary** for user and session identity. 
+ +That does not mean “keep two independent auth implementations because Neon Auth uses Better Auth internally.” It means: + +- the product should treat Neon Auth as the user/session authority +- the API should trust the same identity model as the web app +- CLI upload credentials should be issued by the cloud app/API for authenticated cloud users +- local selftune should stop inventing its own long-lived alpha identity as the source of truth + +### Docs Basis + +Neon’s auth docs make the intended boundary explicit: + +- Neon Auth is a **managed authentication service** for users, sessions, and auth configuration +- auth state lives in the **`neon_auth` schema** +- Neon positions it as the right choice for **production authentication**, preview environments, and branch-aware auth flows +- Neon also explicitly says **self-hosting Better Auth makes sense when you need custom plugins, hooks, and options not yet supported by Neon Auth** + +That matters here because the current API package is trying to use a separate direct Better Auth setup plus the `apiKey()` plugin as if it were the same thing as Neon Auth. The docs do not support that assumption. + +### What This Means for Upload Credentials + +Neon Auth should remain the canonical **user/session** layer. + +But alpha upload credentials should be treated as **product-owned credentials tied to Neon-authenticated users**, not as an implicit “Neon Auth supports the Better Auth API-key plugin” assumption. 
+ +Recommended long-term shape: + +- browser sign-in and cloud identity: **Neon Auth** +- upload credential issuance and revocation: **product-owned tables and endpoints in the cloud app** +- upload credential verification: **API middleware that resolves the credential back to the same Neon-authenticated user/org graph** + +### Auth Boundary After Unification + +- **Browser app** + - Neon Auth session/cookie + - user signs in once + - org membership resolved cloud-side + +- **Cloud API** + - browser requests authenticated via the same cloud user/session boundary + - CLI upload requests authenticated via cloud-issued upload keys or tokens stored in product-owned tables + - both paths resolve to the same `user_id` and `org_id` + +- **Local selftune** + - stores cached cloud identity references and upload credentials + - does not treat local email/user_id as canonical enrollment truth + +## Product Rules + +1. Alpha enrollment is a cloud feature, not a local-only feature. +2. The source of truth for alpha status lives in the cloud backend. +3. The CLI may cache enrollment and credential state locally for convenience, but the cloud backend remains authoritative. +4. Upload credentials must be revocable and attributable to a real cloud user and org. +5. Auth for browser and auth for upload may use different credential forms, but they must resolve to the same user/org graph. + +## Target State + +- A user signs into the cloud app and belongs to an org. +- That user opts into alpha inside the product or through an authenticated CLI/browser handoff. +- The cloud app mints an upload credential scoped to that user/org. +- `selftune init --alpha` stores the credential locally and records the linked `cloud_user_id` and `org_id`. +- `selftune alpha upload` authenticates with that cloud-issued credential. +- Operator tools query by the same user/org identifiers the browser app uses. 
+ +## Scope + +### In Scope + +- choose one canonical auth boundary for app + API +- make alpha users first-class cloud users +- mint upload credentials from authenticated cloud users +- change local alpha identity semantics from source of truth to cache +- align CLI onboarding with cloud sign-in +- align docs and product language around one auth story + +### Out of Scope + +- enterprise SSO +- billing/plan enforcement beyond org membership hooks +- public self-serve signup polish +- non-alpha community contribution auth + +## Repo Boundaries + +### `/Users/danielpetro/conductor/workspaces/selftune-cloud-app/gwangju-v1` + +Owns: + +- canonical user/session/org auth model +- alpha enrollment state +- upload credential issuance and revocation +- protected operator surfaces +- upload auth verification + +### `/Users/danielpetro/conductor/workspaces/selftune/miami` + +Owns: + +- local sign-in/enrollment handoff UX +- cached identity and credential storage +- upload client usage of issued credentials +- agent-facing workflow docs + +## Execution Phases + +### Phase 0: Decide the Canonical Auth Surface + +**Priority:** Critical +**Risk:** Low + +Make an explicit architectural choice: + +- treat Neon Auth as the product-level user/session authority +- stop treating the direct Better Auth setup in `packages/api` as an independent product auth stack +- stop assuming Neon Auth should also directly host the Better Auth `apiKey()` plugin path + +Deliverables: + +- short architecture note in the cloud repo +- one stated auth source of truth +- clear ownership of browser sessions vs CLI upload credentials + +Completion criteria: + +- the team can answer “how does a user authenticate?” in one sentence +- the team can answer “how does a CLI upload authenticate?” in one sentence + +### Phase 1: Cloud Enrollment Model + +**Priority:** Critical +**Risk:** Medium + +Add or normalize cloud-side enrollment state. 
+
+Recommended shape:
+
+- `alpha_enrollments`
+  - `user_id`
+  - `org_id`
+  - `status`
+  - `consented_at`
+  - `cohort`
+  - `notes`
+  - `created_at`
+  - `updated_at`
+
+Alternative:
+
+- add alpha fields directly to a user/org membership table if that is materially simpler
+
+Requirements:
+
+- enrollment is queryable by user and org
+- enrollment can be revoked without deleting the user
+- operator tools can filter to alpha-enrolled users only
+
+### Phase 2: Upload Credential Issuance
+
+**Priority:** Critical
+**Risk:** Medium
+
+Build the cloud-side flow that issues a CLI upload credential from an authenticated cloud user.
+
+Recommended issuance flow:
+
+- authenticated browser/session request
+- server creates scoped upload credential
+- credential tied to `user_id` + `org_id`
+- credential revocable and auditable
+
+The credential should be product-owned, not a side effect of a parallel Better Auth plugin stack.
+
+Recommended storage model:
+
+- `upload_credentials` or equivalent product-owned table
+- credential issued only after a Neon-authenticated user session is resolved
+- credential tied to `user_id` + `org_id`
+- credential revocable without touching the underlying user account
+- credential usage auditable (`created_by`, `last_used_at`, `revoked_at`)
+
+The concrete credential can be either:
+
+- a product-owned API key, or
+- a signed upload token with rotation metadata
+
+But it should not depend on a second hidden auth world, and it should not assume Neon Auth directly exposes the custom Better Auth plugin surface you would get from self-hosting.
+ +Requirements: + +- issue +- list +- revoke +- last used timestamp +- scope metadata (`push`, optional `read`) + +### Phase 3: API Auth Unification + +**Priority:** Critical +**Risk:** High + +Update the API so: + +- browser-authenticated requests resolve user/org via the canonical cloud auth path +- upload-authenticated requests resolve to the same user/org model using the issued credential +- push/operator routes do not rely on a parallel auth implementation with drifting tables or models + +This phase should remove the current conceptual split between: + +- browser auth in the app +- direct Better Auth auth in `packages/api` +- product-owned upload credentials vs user/session identity + +Completion criteria: + +- the auth middleware resolves both browser and CLI callers into the same `user_id` / `org_id` context +- one integration test proves browser session auth +- one integration test proves upload credential auth + +### Phase 4: Local CLI Onboarding Realignment + +**Priority:** Critical +**Risk:** Medium + +Change `miami` so local alpha identity is no longer the primary source of truth. + +New flow: + +1. agent asks user whether they want to enroll in alpha +2. if yes, CLI opens or instructs a cloud login flow +3. cloud confirms identity and enrollment +4. cloud issues upload credential +5. local config stores: + - `cloud_user_id` + - `org_id` + - cached email/display name if useful + - upload credential metadata + +Local config should stop behaving like the canonical alpha registry. + +### Phase 5: Migration and Compatibility + +**Priority:** High +**Risk:** Medium + +Provide a temporary migration path for existing local alpha users. 
+ +Recommended behavior: + +- detect legacy local-only alpha blocks +- prompt to link or migrate to a cloud account +- do not silently discard local enrollment state +- support a transitional fallback period if needed + +Completion criteria: + +- existing testers can migrate without losing upload ability +- new testers only see the unified flow + +## Testing Strategy + +### Cloud Repo + +- session-auth route tests +- upload-credential issuance tests +- upload-credential verification tests +- revoke/expired credential tests +- user/org resolution tests + +### Local Repo + +- init/enrollment tests +- migration from legacy local alpha block tests +- upload with cloud-issued credential tests +- opted-out user sends nothing tests + +### End-to-End + +1. sign in as a cloud user +2. enroll in alpha +3. mint upload credential +4. store locally +5. perform upload +6. verify rows land under the correct org/user +7. revoke credential +8. verify further uploads fail cleanly + +## Rollout Order + +1. Cloud enrollment model +2. Credential issuance + revocation +3. API auth unification +4. Local CLI onboarding realignment +5. Legacy migration +6. Remove obsolete local-only identity assumptions + +## Acceptance Criteria + +- Every alpha user is a cloud user. +- Browser auth and upload auth resolve into the same user/org graph. +- Local `~/.selftune/config.json` is a cache of cloud-linked identity state, not the source of truth. +- Upload credentials are cloud-issued, revocable, attributable, and stored in product-owned credential tables. +- One real end-to-end authenticated upload works without `DEV_AUTH=1`. +- The product can explain alpha enrollment in one consistent sentence across app, API, and CLI. 
+ +## Neon Docs Notes + +Reviewed against Neon Auth overview, last updated **March 5, 2026**: + +- Neon Auth overview: https://neon.com/docs/auth/overview + +Key constraints taken from the docs: + +- Neon Auth is managed and stores auth state in `neon_auth` +- Neon Auth is the recommended product auth layer for app users and sessions +- self-hosting Better Auth remains the path for unsupported custom plugins/hooks/options + +That is why this plan converges on: + +- **Neon Auth for user/session identity** +- **product-owned upload credentials for CLI ingestion** + +## Related Plans + +- [alpha-rollout-data-loop-plan.md](/Users/danielpetro/conductor/workspaces/selftune/miami/docs/exec-plans/active/alpha-rollout-data-loop-plan.md) +- [dashboard-data-integrity-recovery.md](/Users/danielpetro/conductor/workspaces/selftune/miami/docs/exec-plans/active/dashboard-data-integrity-recovery.md) diff --git a/docs/exec-plans/active/dashboard-data-integrity-recovery.md b/docs/exec-plans/active/dashboard-data-integrity-recovery.md new file mode 100644 index 00000000..6cb7d533 --- /dev/null +++ b/docs/exec-plans/active/dashboard-data-integrity-recovery.md @@ -0,0 +1,424 @@ +# Execution Plan: Dashboard Data Integrity Recovery + + + +**Status:** In Progress +**Created:** 2026-03-18 +**Goal:** Eliminate mixed-freshness dashboard behavior, prevent rebuild-driven data loss, isolate tests from real operator stores, and make it obvious which codebase and datastore a running dashboard is actually using. + +## Status Update — 2026-03-18 + +This recovery plan has partially executed. 
+ +**Landed already:** +- runtime identity now exposes repo-root `workspace_root`, git SHA, DB/log/config paths, watcher mode, and process mode +- the dashboard UI now shows a runtime footer +- the dashboard footer and Status page now warn explicitly when live invalidation is still in legacy JSONL watcher mode +- the dev probe uses `localhost` again and no longer rewrites `bun.lock` +- the app-local dashboard `dev` flow now waits for backend health before starting Vite, reducing startup proxy noise +- env-overridable storage roots now cover config/log/Claude/OpenClaw paths +- rebuild preflight now blocks lossy rebuilds and reports SQLite-only row counts +- doctor now includes an integrity warning about the current JSONL-backed dashboard freshness contract + +**Still open from this plan:** +- backup symmetry for `evolution_audit`, `evolution_evidence`, and `orchestrate_runs` +- WAL-driven SSE freshness instead of JSONL watcher invalidation +- clearer overview timeline semantics +- doctor/integrity diagnostics beyond the current trust-floor slice + +This plan should now be treated as a partially completed recovery plan, not as untouched future work. 
+ +--- + +## Executive Summary + +selftune is currently in an inconsistent hybrid state: + +- some streams still dual-write to SQLite + JSONL +- some streams write only to SQLite +- full rebuild still deletes tables and repopulates from JSONL +- the dashboard SSE layer still watches JSONL files, not the SQLite WAL +- tests and proof harnesses can touch the real `~/.selftune` / `~/.claude` stores +- runtime identity is too opaque, so `selftune dashboard`, `bun run dev`, and a globally linked `selftune` binary can look like “the same dashboard” while actually coming from different processes or workspaces + +That combination produces exactly the class of failures we just saw: + +- fresh telemetry with stale evolution activity +- recent rows visible in one server and not another +- rebuilds that can silently discard SQLite-only rows +- test/proof activity polluting the real local store + +This plan fixes the safety issues first, then closes the architecture/documentation gap. + +--- + +## Current Failure Modes + +### 1. Rebuild is not lossless + +- `cli/selftune/localdb/materialize.ts` deletes `evolution_audit`, `evolution_evidence`, and `orchestrate_runs` during full rebuild +- current `cli/selftune/evolution/audit.ts`, `cli/selftune/evolution/evidence.ts`, and `cli/selftune/orchestrate.ts` write to SQLite directly +- rebuild still rehydrates those tables from `~/.claude/*.jsonl` + +Result: + +- if SQLite contains newer rows than JSONL, rebuild can discard real data + +### 2. 
Dashboard freshness is split across two mental models + +- `cli/selftune/dashboard-server.ts` materializes once at startup +- `refreshV2Data()` and `refreshV2DataImmediate()` are no-ops +- SSE invalidation still watches `TELEMETRY_LOG`, `QUERY_LOG`, and `EVOLUTION_AUDIT_LOG`, not the SQLite WAL +- docs in `docs/design-docs/live-dashboard-sse.md` and `docs/design-docs/sqlite-first-migration.md` describe a more complete SQLite/WAL model than the current runtime actually implements + +Result: + +- the dashboard feels “real-time” for some flows but still depends on legacy file activity for invalidation +- operator expectations do not match the actual code path + +### 3. The homepage activity panel is narrower than it looks + +- `cli/selftune/localdb/queries.ts` builds overview timeline data from `evolution_audit` +- the right-rail activity UI in `packages/ui/src/components/ActivityTimeline.tsx` renders that audit-backed data +- recent `evolution_evidence` rows are not enough to make the overview timeline look fresh + +Result: + +- the page can show fresh session telemetry and stale “latest evolution” at the same time + +### 4. Runtime identity is too opaque + +- `selftune dashboard --port 3141` and `bun run dev` can run different backend processes +- the historical `127.0.0.1` probe mismatch created false negatives on IPv6-localhost setups; the probe is now fixed, but process clarity still matters +- `/api/health` now exposes runtime identity, but operators still need broader freshness/integrity diagnostics +- a global `npm link` can point `selftune` at a different workspace than the one the operator thinks is live + +Result: + +- operators cannot quickly tell which checkout, backend, DB, or log store they are looking at + +### 5. 
Tests and proof harnesses are not hermetic enough + +- constants resolve directly to `homedir()` paths in `cli/selftune/constants.ts` +- proof and integration tests can exercise real appenders unless they override dependencies correctly +- recent local-store pollution matched temp `selftune-blog-proof-*` paths from `tests/blog-proof/seo-audit-evolve.test.ts` + +Result: + +- test/proof data can leak into real operator dashboards + +### 6. CLI/operator guidance is inconsistent + +- the nonexistent `selftune rebuild-db` guidance was removed from code paths +- the remaining operator task is to keep docs aligned around the export-first recovery flow + +Result: + +- recovery guidance still needs active maintenance right when the operator most needs trustworthy instructions + +--- + +## Target State + +- every persisted stream has one clearly defined durability strategy +- destructive rebuild is either lossless or blocked +- tests cannot touch the real local store +- dashboard health clearly identifies runtime, workspace, DB path, log path, and watcher mode +- `selftune dashboard` and `bun run dev` expose the same backend truth when pointed at the same store +- real evolutions appear in the dashboard within one refresh cycle +- docs describe the architecture that is actually shipping + +--- + +## Execution Order + +Work in this order. Do not start with UI tweaks. + +### Phase 0: Protect Real Data and Expose Runtime Identity + +**Status:** Mostly complete + +**Priority:** Critical +**Effort:** Small +**Risk:** Low + +**Files:** + +- `cli/selftune/dashboard-server.ts` +- `cli/selftune/dashboard-contract.ts` +- `packages/ui/src/types.ts` +- `apps/local-dashboard/src/pages/Overview.tsx` +- `package.json` +- `apps/local-dashboard/vite.config.ts` + +**Changes:** + +1. Expand `/api/health` to include: + - workspace root + - git SHA + - DB path + - log directory + - watcher mode (`jsonl` vs `sqlite-wal`) + - process mode (`standalone`, `dev-server`) + - listening host/port +2. 
Surface the same runtime identity in the dashboard UI, at least in a compact debug footer or operator panel. +3. Fix the `dev` script probe to use `localhost`, not `127.0.0.1`. +4. Make the `dev` script wait for backend health before letting the frontend proxy race it. +5. Add an explicit warning in health/UI if the dashboard is still using JSONL watcher mode. + +**Acceptance Criteria:** + +- an operator can answer “which workspace/codebase is this server running?” from the UI or `/api/health` +- `bun run dev` no longer false-fails on IPv6-localhost setups +- startup race on `5199` is reduced to at most a brief initial retry, not a confusing multi-error burst + +--- + +### Phase 1: Make Tests and Proof Harnesses Hermetic + +**Status:** Substantially complete for path isolation; CI/store-touch guard still optional follow-on + +**Priority:** Critical +**Effort:** Medium +**Risk:** Low + +**Files:** + +- `cli/selftune/constants.ts` +- test helpers under `tests/` +- `tests/blog-proof/seo-audit-evolve.test.ts` +- `tests/autonomy-proof.test.ts` +- `tests/evolution/*.test.ts` +- sandbox harness scripts if needed + +**Changes:** + +1. Introduce environment-overridable storage roots, for example: + - `SELFTUNE_HOME` + - `SELFTUNE_CONFIG_DIR` + - `SELFTUNE_LOG_DIR` +2. Make all constants derive from those overrides first, then fall back to `homedir()`. +3. Update proof/integration tests to run with temp directories for both config and logs. +4. Add a shared test helper that creates and tears down isolated temp stores. +5. Add a CI/test guard that fails if any test touches the real `~/.selftune` or `~/.claude` paths. 
+ +**Acceptance Criteria:** + +- running blog-proof or autonomy-proof tests leaves the real local dashboard data unchanged +- tests can still use real appenders, but only against temp stores +- local developers can inspect a temp test DB/log dir after a failure + +--- + +### Phase 2: Make Rebuild and Backup Semantics Honest + +**Status:** Started + +**Priority:** Critical +**Effort:** Medium +**Risk:** Medium + +**Files:** + +- `cli/selftune/localdb/materialize.ts` +- `cli/selftune/localdb/db.ts` +- `cli/selftune/evolution/audit.ts` +- `cli/selftune/evolution/evidence.ts` +- `cli/selftune/orchestrate.ts` +- `cli/selftune/export.ts` +- `cli/selftune/index.ts` +- relevant tests under `tests/localdb/`, `tests/evolution/`, `tests/dashboard/` + +**Decision:** + +Short-term, restore backup symmetry for the streams that rebuild currently assumes are recoverable from JSONL: + +- `evolution_audit` +- `evolution_evidence` +- `orchestrate_runs` + +Long-term, remove that compatibility bridge only after rebuild no longer depends on JSONL for those tables. + +**Changes:** + +1. Add a rebuild preflight that compares SQLite max timestamps vs JSONL max timestamps per stream. Completed. +2. Refuse destructive rebuild when SQLite is newer for protected tables unless the operator explicitly forces it. Completed. +3. Reintroduce JSONL backup writes for audit/evidence/orchestrate rows so current backup/rebuild claims become true again. +4. Either implement a real `selftune rebuild-db` command with the safety checks, or remove every user-facing reference to it until it exists. +5. 
Add tests proving: + - rebuild aborts on lossy inputs + - backup JSONL stays in sync for protected streams + - export/rebuild round-trips preserve recent rows + +**Acceptance Criteria:** + +- rebuild cannot silently discard recent SQLite-only rows +- protected streams are recoverable from backup again +- operator-facing guidance matches the actual available command surface + +--- + +### Phase 3: Finish the Dashboard Freshness Contract + +**Priority:** High +**Effort:** Medium +**Risk:** Medium + +**Files:** + +- `cli/selftune/dashboard-server.ts` +- `cli/selftune/localdb/db.ts` +- `cli/selftune/localdb/queries.ts` +- `docs/design-docs/live-dashboard-sse.md` +- `docs/design-docs/sqlite-first-migration.md` +- dashboard route tests + +**Changes:** + +1. Replace JSONL file watchers with SQLite WAL watching in the live server. +2. Keep startup materialization only as historical backfill, not as part of “freshness.” +3. Remove no-op refresh indirection once watcher mode is coherent. +4. Add a targeted test that proves a direct SQLite write triggers SSE and a subsequent fresh overview fetch. +5. Update the design docs to match the shipped implementation exactly. + +**Acceptance Criteria:** + +- SSE invalidation is triggered by SQLite writes, not JSONL file changes +- the dashboard’s freshness path matches the architecture docs +- live updates do not depend on evolution audit JSONL specifically + +--- + +### Phase 4: Make the Overview Timeline Semantics Explicit + +**Priority:** High +**Effort:** Small +**Risk:** Low + +**Files:** + +- `cli/selftune/localdb/queries.ts` +- `packages/ui/src/components/ActivityTimeline.tsx` +- `apps/local-dashboard/src/pages/Overview.tsx` +- `cli/selftune/evolution/evolve.ts` +- `cli/selftune/evolution/evolve-body.ts` +- tests for overview queries and timeline rendering + +**Decision:** + +Do not paper over missing audit rows by automatically treating all evidence as timeline activity. 
+ +Fix the invariants first: + +- real evolution flows that should appear in the operator timeline must emit audit rows consistently +- evidence-only flows may exist, but must be explicitly labeled as such + +**Changes:** + +1. Audit the evolve/orchestrate paths to ensure `created`, `validated`, `deployed`, and rollback-worthy events always emit audit entries. +2. Add a dashboard indicator explaining whether the overview timeline is “audit activity” or a broader “evolution activity” feed. +3. Only after invariants are fixed, decide whether to add a separate evidence activity panel or merge sources intentionally. + +**Acceptance Criteria:** + +- a real autonomous evolution produces timeline-visible activity within one refresh cycle +- proof/test evidence does not masquerade as production timeline history +- operators can tell what the overview timeline is actually showing + +--- + +### Phase 5: Add Data-Integrity Diagnostics and Recovery Tools + +**Priority:** Medium +**Effort:** Medium +**Risk:** Medium + +**Files:** + +- `cli/selftune/observability.ts` +- `cli/selftune/status.ts` +- `cli/selftune/dashboard-server.ts` +- optional repair utility/command + +**Changes:** + +1. Add doctor checks for: + - DB newer than JSONL + - JSONL newer than DB + - missing protected backup streams + - test/temp skill paths in production tables + - watcher mode mismatch vs docs +2. Add a compact integrity section to the dashboard doctor view. +3. Consider an opt-in repair tool for reconstructable audit rows from evidence, but only after: + - tests are isolated + - runtime identity is visible + - repair filters out temp/test paths + +**Acceptance Criteria:** + +- operators can detect drift before data disappears +- any repair path is explicit and conservative + +--- + +## Verification Matrix + +### Runtime parity + +1. Start `selftune dashboard --port 3141 --no-open` +2. Start `bun run dev` +3. Compare: + - `/api/health` + - `/api/v2/overview` + - `/api/v2/orchestrate-runs` +4. 
Confirm both backends report the same: + - workspace root + - git SHA + - DB path + - latest telemetry timestamp + - latest evolution audit timestamp + +### Rebuild safety + +1. Seed SQLite with newer protected rows than JSONL +2. Attempt rebuild +3. Verify rebuild aborts with a clear diagnostic +4. Enable explicit force only in a controlled test and verify the warning is unmistakable + +### Test isolation + +1. Snapshot row counts in the real `~/.selftune/selftune.db` +2. Run proof/integration tests +3. Verify real counts are unchanged +4. Verify temp store contains the expected new rows instead + +### Freshness + +1. Perform a direct SQLite write to a watched table +2. Verify SSE broadcasts an update +3. Verify the overview fetch reflects the new row +4. Run a real `selftune evolve` / `selftune orchestrate` flow against a temp skill and verify the overview timeline updates + +--- + +## Scope Boundaries + +This plan is not: + +- a UI redesign +- a generalized event-sourcing rewrite +- a cloud-sync architecture change + +This plan is specifically about making the current local operator system trustworthy. + +--- + +## Recommended First PR Split + +1. Runtime identity + `dev` health-check fix +2. Test storage isolation +3. Rebuild safety + protected-stream backup restoration +4. SQLite WAL SSE cutover +5. Timeline semantics + doctor integrity checks + +That order reduces the chance of losing more operator data while the deeper cleanup is still in flight. 
diff --git a/docs/exec-plans/active/local-sqlite-materialization.md b/docs/exec-plans/active/local-sqlite-materialization.md index c4d45351..b14f7bdb 100644 --- a/docs/exec-plans/active/local-sqlite-materialization.md +++ b/docs/exec-plans/active/local-sqlite-materialization.md @@ -1,16 +1,18 @@ # Execution Plan: Local SQLite Materialization and App Data Layer - + **Status:** Active **Created:** 2026-03-12 -**Goal:** Use SQLite as a local indexed/materialized view layer on top of selftune’s raw JSONL source-of-truth logs so the local app can be fast, credible, and simple to reason about. +**Goal:** Finish the SQLite-first local runtime transition so the local app, CLI, and operator surfaces read from one operational store, while JSONL is reduced to capture, rebuild, export, and recovery roles. --- ## Executive Summary -selftune’s raw JSONL logs remain the right source of truth for: +This plan predates the current SQLite-first runtime cutover and should now be read as a migration-completion plan, not as justification for JSONL-first local reads. + +JSONL still matters for: - telemetry capture - transcript/source replay @@ -19,7 +21,7 @@ selftune’s raw JSONL logs remain the right source of truth for: They are not the right structure for serving a good local product experience directly. 
-SQLite via `bun:sqlite` is the right local materialization layer because it gives us: +SQLite via `bun:sqlite` is the right local operational store because it gives us: - fast indexed reads - a simple single-file local store @@ -27,11 +29,11 @@ SQLite via `bun:sqlite` is the right local materialization layer because it give - zero extra network services - a much cleaner foundation for overview/report queries -The architecture is now: +The target local architecture is now: -- **JSONL = truth** -- **SQLite = local indexed/materialized view** -- **SPA = local user experience** +- **SQLite = operational local runtime/query truth** +- **JSONL = append-only capture plus rebuild/export/recovery input during migration** +- **SPA = local operator experience** --- @@ -46,7 +48,7 @@ The old dashboard path showed the limits of raw-log-first serving: SQLite solves the UX/product problem without replacing the telemetry model. -This is not a move to “database-first telemetry.” It is a local query/materialization layer on top of append-only source logs. +This is not a move to “database-first cloud telemetry.” It is a move to one operational local runtime store, with JSONL retained only where capture, replay, or recovery still require it. --- @@ -88,23 +90,23 @@ Likely source domains: - evidence - optional materialized aggregates for overview/report -The exact schema can evolve, but its role should stay narrow: +The exact schema can evolve, but its role should stay clear: -- indexed cache/materialized view +- operational local runtime store - local query surface -- not the authority for telemetry capture +- rebuildable from append-only capture where legacy bridges still exist --- ## Architectural Rules -### 1. JSONL remains authoritative +### 1. Local runtime reads are SQLite-first -If a conflict exists between raw logs and SQLite materialization, the raw logs win. 
+Dashboard, status, doctor, and other operator-facing local reads should treat SQLite as the operational source of truth. -### 2. Materialization must be rebuildable +### 2. JSONL remains capture/export/recovery input until the migration is fully closed -It should always be possible to rebuild the local DB from source-truth logs. +If rebuild/export paths still depend on JSONL, keep them honest and explicit. Do not reintroduce JSONL as a first-class live query surface. ### 3. Local app queries should be explicit @@ -128,9 +130,9 @@ The local data layer should explicitly support: - overview KPI/status/skill-card payload - single-skill report payload -### 2. Move the SPA onto SQLite-backed data +### 2. Finish the SQLite-backed local app path -The React local app should stop depending primarily on the old dashboard server’s heavy data path. +The React local app already reads SQLite-backed payloads. The remaining work is to remove legacy freshness bridges and leftover JSONL-dependent dashboard helpers. ### 3. Remove remaining non-v2 dashboard paths @@ -142,9 +144,9 @@ The legacy HTML runtime is gone. The remaining follow-through is to keep migrati onto the same SQLite-backed payload semantics where appropriate. -### 4. Keep source-truth sync first +### 4. Keep sync and rebuild semantics honest -Any materialization flow must still start from fresh source-truth sync/repair data. +Any rebuild/materialization flow must make it obvious when JSONL is still being used as import/recovery input, and which streams are already SQLite-primary. 
--- diff --git a/docs/exec-plans/active/output-quality-loop-prereqs.md b/docs/exec-plans/active/output-quality-loop-prereqs.md new file mode 100644 index 00000000..8c113e4e --- /dev/null +++ b/docs/exec-plans/active/output-quality-loop-prereqs.md @@ -0,0 +1,166 @@ +# Output Quality Loop Prerequisites + +**Status:** Proposed +**Date:** 2026-03-19 +**Related:** [prd-output-quality-loop.md](/Users/danielpetro/Documents/Projects/FOSS/selftune/strategy/prd-output-quality-loop.md) + +## Purpose + +Do **not** build `selftune evolve output` yet. + +Do the minimum now so the alpha program collects the right data and does not +close off the option to build the output-quality loop well later. + +This is a prerequisite plan, not a feature plan. + +## Why Now + +The output-quality PRD is strategically right but tactically early. + +Current priority remains: +- trusted alpha onboarding +- reliable local-to-cloud upload +- operator visibility +- real-session review loops + +But if alpha telemetry omits the evidence needed for output-quality learning, +we waste the highest-value learning window. + +## Goal + +Capture enough output-side evidence during alpha that a later +`selftune evolve output` loop can be built from real data rather than guesses. + +## Non-Goals + +- no `selftune evolve output` command yet +- no automated output mutation loop yet +- no output-quality dashboard panel yet +- no output grader rollout yet + +## Required Data Prerequisites + +### 1. Final Output Capture + +For sessions where a skill fires, preserve the best available representation of +what the agent actually produced. + +Prefer, in order: +- final assistant message text +- generated file references and changed file paths +- structured artifact metadata when available +- attachment or screenshot references when available + +Minimum requirement: +- enough data to let an operator answer “the skill fired, but what did it produce?” + +### 2. 
Output Context Linkage + +Every captured output signal should be linkable back to: +- `session_id` +- `prompt_id` if available +- `skill_invocation_id` +- `skill_name` +- platform / agent type / model +- timestamp + +This is what makes later grading and mutation evidence usable. + +### 3. Artifact References, Not Just Text + +For output-quality work, text alone is often insufficient. + +Capture references to: +- changed files +- generated markdown/docs/code outputs +- image or screenshot paths when local artifacts exist +- any durable local artifact ID that can be replayed or inspected later + +Do not try to upload huge binaries blindly in the first pass. +Store references and metadata first. + +### 4. Manual Review Hook + +Add a lightweight operator review path for “triggered correctly, output looked bad.” + +Minimum viable form: +- mark a session or invocation as output-bad +- attach a short note +- preserve the linked output evidence + +This gives real labels before full automation exists. + +### 5. Cloud Queryability + +The cloud side should be able to answer: +- which skills trigger often but receive poor output feedback +- which invocations are linked to output-bad labels +- what artifacts or outputs were produced for those invocations + +This can start as operator-facing inspection, not polished UI. + +## Suggested Implementation Slices + +### Slice A: Local Evidence Capture + +In `miami`, ensure the local telemetry pipeline preserves: +- final response text when safely available +- changed file paths +- artifact metadata or attachment references + +Do not block alpha on perfect normalization. +Prefer capture over elegance. + +### Slice B: Canonical Upload Contract Extension + +Extend the alpha upload contract only where needed to preserve: +- output evidence references +- linked file paths or artifact metadata +- future operator labels for output quality + +Avoid a giant schema expansion. +Add only fields that are clearly useful for later grading or review. 
+ +### Slice C: Cloud Operator Inspection + +In the cloud app, ensure operator surfaces can inspect: +- invocation +- output evidence +- linked artifacts +- any manual output-quality label + +Start with raw/operator views, not polished product UI. + +### Slice D: Manual Label Seed + +Add a minimal label model for: +- `output_bad` +- `output_good` +- optional note + +This is enough to seed the later quality loop. + +## Acceptance Criteria + +- For a triggered skill invocation, an operator can inspect what was produced. +- Output evidence is linked to invocation/session identity. +- At least one manual label path exists for “triggered correctly, output was poor.” +- The cloud model preserves enough evidence to support later output-quality grading. +- No major alpha rollout work is blocked on this prerequisite slice. + +## Sequencing + +1. Finish current alpha/auth/upload stabilization. +2. Add output-evidence capture and linkage as a narrow telemetry enhancement. +3. Add minimal operator review/label support. +4. Reassess after the first alpha cohort produces real sessions. +5. Only then decide whether to start full `selftune evolve output`. + +## Decision + +Use the output-quality PRD to influence **what data we keep now**. + +Do **not** treat it as the next implementation milestone until: +- alpha users are active +- the current trigger/data loop is trusted +- operator review of real outputs is happening diff --git a/docs/exec-plans/active/phase-d-marginal-case-review-spike.md b/docs/exec-plans/active/phase-d-marginal-case-review-spike.md new file mode 100644 index 00000000..ee31a7fe --- /dev/null +++ b/docs/exec-plans/active/phase-d-marginal-case-review-spike.md @@ -0,0 +1,295 @@ +# Execution Plan: Phase D Marginal-Case Review Spike + + + +**Status:** Planned +**Created:** 2026-03-18 +**Goal:** Define the minimum operator loop Daniel needs to review false positives, false negatives, and ambiguous trigger decisions from alpha users once Phase C upload data is live. 
+ +--- + +## Why This Exists + +Ray’s office-hours guidance was clear: + +- the point of alpha is data back to Daniel +- the signal is in false negatives, false positives, and marginal cases +- human thumbs up/down on borderline cases is where the learning loop gets sharper + +Phase C gets the data upstream. Phase D defines how Daniel turns that data into learning instead of just storage. + +This is a **spike**, not a polished product build. + +The output of this plan is: + +1. a concrete review data model +2. a concrete candidate-generation model +3. a minimum operator workflow +4. a low-conflict implementation split for later + +--- + +## Scope + +### In scope + +- four-quadrant analysis model +- candidate-generation heuristics for likely FP/FN/marginal cases +- review-label schema +- minimum Daniel-only surface +- storage and query assumptions for reviewed cases + +### Out of scope + +- end-user-facing UI polish +- public-launch privacy redesign +- RLHF/training pipeline beyond storing labels cleanly +- automated judgment replacement for the human review step + +--- + +## Core Product Decision + +The first Phase D implementation should be **Daniel-only and review-first**. + +That means: + +- no attempt to build a general “community review product” +- no attempt to fully automate classification +- no need for a beautiful UX before the workflow is proven + +The system only needs to answer: + +1. which cases are worth Daniel’s attention? +2. how does Daniel label them quickly? +3. how do those labels feed future eval/evolution work? 
+ +--- + +## The Four-Quadrant Model + +Every reviewed case should eventually be classifiable as one of: + +| Expected | Actual | Outcome | +|---|---|---| +| should trigger | triggered | true positive | +| should trigger | not triggered | false negative | +| should not trigger | triggered | false positive | +| should not trigger | not triggered | true negative | + +In practice: + +- true negatives will dominate volume +- true positives matter, but usually need less human review +- false negatives and false positives are the main learning signal +- ambiguous cases should be explicitly modeled rather than forced into certainty + +--- + +## Candidate Types + +The review system should surface three candidate buckets first: + +### 1. Likely False Negatives + +Queries where a skill probably should have triggered but did not. + +Candidate sources: + +- unmatched queries from local/remote telemetry +- prompt text that strongly resembles existing true positives +- prompt text that later led to manual skill usage or correction +- prompts near known eval positives but absent from invocation logs + +### 2. Likely False Positives + +Queries where a skill triggered but probably should not have. + +Candidate sources: + +- triggered skills followed by poor grading, low execution value, or user correction +- triggered skills followed by explicit “wrong skill” behavior +- over-broad routing collisions between multiple skills +- triggered skills on queries later labeled irrelevant by Daniel + +### 3. Ambiguous / Marginal Cases + +Cases where heuristics disagree or confidence is low. + +These should be prioritized for manual review because they are the highest-value labeling surface. 
+ +Candidate sources: + +- medium-confidence trigger decisions +- disagreement between heuristic detectors +- novel user phrasing with sparse historical neighbors +- cross-skill overlap where multiple skills could plausibly trigger + +--- + +## Minimum Data Required From Phase C + +Phase D assumes Phase C makes these available remotely: + +- `user_id` +- `session_id` +- `occurred_at` +- `skill_name` +- `triggered` +- `invocation_mode` +- `query_text` +- `skill_scope` +- platform / agent metadata +- evolution outcome context where relevant + +Helpful but not strictly required in v1: + +- grading summary by session +- confidence scores +- active-skill overlap metrics +- operator-facing links back to local proposal/audit history + +--- + +## Review Record Schema + +The first implementation should store explicit review labels as their own record type. + +Recommended shape: + +```ts +interface MarginalCaseReview { + review_id: string + user_id: string + session_id: string + occurred_at: string + skill_name: string | null + query_text: string + candidate_type: "likely_false_negative" | "likely_false_positive" | "marginal" + predicted_quadrant: "tp" | "fp" | "fn" | "tn" | "unknown" + reviewer_label: "tp" | "fp" | "fn" | "tn" | "unsure" + reviewer_note?: string + reviewer_id: string + reviewed_at: string +} +``` + +Important choices: + +- `reviewer_label` should use the same four-quadrant vocabulary +- `unsure` is allowed +- the raw `query_text` should stay attached to the review record +- `skill_name` may be null for cross-skill review queues before Daniel chooses the intended skill + +--- + +## Minimum Operator Workflow + +The first useful loop should be: + +1. generate a ranked queue of candidate cases +2. show Daniel one case at a time with enough context to judge it +3. let Daniel mark: + - correct trigger + - missed trigger + - bad trigger + - correct skip + - unsure +4. optionally add a note +5. persist the label +6. 
feed those labels into later eval/evolution improvements + +The first surface can be either: + +- a CLI/TUI review flow, or +- a narrow dashboard operator panel + +Recommendation: + +- start with the cheapest surface that preserves context +- do not block on a polished dashboard workflow + +--- + +## Ranking Heuristics For The Queue + +The queue should not be chronological only. It should be scored. + +Recommended initial ranking formula: + +1. higher novelty first +2. higher ambiguity first +3. repeated query patterns across users first +4. cases near recent regressions first +5. cases tied to important/active skills first + +Concrete signal ideas: + +- semantic similarity to known positives with no trigger +- triggered skill followed by low-value session outcome +- repeated manual correction patterns +- low-confidence or conflicting routing outcomes +- recent deploys that changed trigger boundaries + +--- + +## Where Labels Should Feed Back + +Phase D should explicitly connect to later work: + +### Eval generation + +- reviewed false negatives become high-value positive eval examples +- reviewed false positives become high-value negative eval examples + +### Routing/body evolution + +- marginal labels help identify where descriptions are too broad or too narrow +- repeated notes can become structured failure feedback + +### Operator analytics + +- show reviewed-case volume over time +- show per-skill reviewed FP/FN patterns +- show whether review debt is growing or shrinking + +--- + +## Minimum Implementation Split When Ready + +When this spike turns into execution, split it like this: + +1. **Candidate generation** + - query/ranking logic + - likely FP/FN candidate extraction +2. **Review persistence** + - review-record schema + - write/read APIs +3. **Operator surface** + - CLI or dashboard review flow +4. **Feedback integration** + - label export into eval/evolution inputs + +Do not give one agent “the whole review loop” at once. 
+ +--- + +## Acceptance Criteria For Completing The Spike + +This spike is done when: + +- the candidate buckets are clearly defined +- the review record schema is decided +- the minimum operator workflow is chosen +- the ranking logic is concrete enough to implement +- the feedback path into future eval/evolution work is explicit + +--- + +## Recommended Next Step After This Spike + +Do **not** start full Phase D implementation until Phase C has at least one real uploaded user worth reviewing. + +Once that exists, the first implementation ticket should be: + +**“Build a Daniel-only ranked review queue for likely false negatives, likely false positives, and marginal cases, with persisted four-quadrant labels.”** diff --git a/docs/exec-plans/active/user-owned-session-data-and-org-visible-outcomes.md b/docs/exec-plans/active/user-owned-session-data-and-org-visible-outcomes.md new file mode 100644 index 00000000..2ee9e939 --- /dev/null +++ b/docs/exec-plans/active/user-owned-session-data-and-org-visible-outcomes.md @@ -0,0 +1,282 @@ +# User-Owned Session Data And Org-Visible Outcomes + +**Status:** Proposed +**Date:** 2026-03-20 + +## Goal + +Correct the cloud data model so raw session telemetry is private to the user by +default, while organization-visible data is limited to derived, reviewed, or +explicitly shared outcomes. + +This fixes the current semantic mismatch: + +- storage is org-scoped +- raw prompts and session telemetry are user-originated and sensitive +- product intent is personal learning first, shared skill outcomes second + +## Product Rule + +Raw session data belongs to a user by default. 
+ +Organization visibility should default to: + +- aggregates +- alerts +- reviewed proposals +- deployed evolution outcomes +- explicitly promoted/shared exemplars + +Organization visibility should **not** default to: + +- raw prompt text +- full per-session telemetry +- raw execution facts +- raw invocation trails + +## Why This Matters + +The current alpha model stores canonical session-level facts with `org_id` as +the main ownership key. That is acceptable for single-user alpha orgs, but it +will become incorrect as soon as multiple humans share one org. + +If left unchanged, the system will implicitly treat one user’s raw working +sessions as org-wide data. That is the wrong privacy and ownership default for +the product. + +## Target Model + +### Keep + +- `org_id` as the tenancy boundary +- org-scoped skill namespace +- org-scoped derived dashboards and operator views +- one ingest pipeline and one cloud storage path + +### Add + +- direct user ownership on raw session-level canonical records +- explicit visibility state for raw/session-derived records where needed +- a clear split between private raw telemetry and shared derived outcomes + +### Default Semantics + +- raw session layer: user-owned, private by default +- derived outcomes layer: org-visible by default +- raw sharing: explicit opt-in, never implied by org membership alone + +## Data Model Changes + +### Session-Level Tables To Reclassify As User-Owned + +These tables should keep `org_id` for tenancy partitioning, but they should no +longer be treated as org-owned data semantically: + +- `raw_pushes` +- `canonical_sessions` +- `canonical_prompts` +- `canonical_skill_invocations` +- `canonical_execution_facts` +- `normalization_runs` +- `orchestrate_runs` when they reflect an individual user’s local run + +### Tables That Can Stay Org-Visible By Default + +- skill aggregates / trend summaries +- alerts +- proposals +- proposal review state +- deployed evolution outcomes +- body/description 
evolution audit summaries intended for the shared skill + +### New Fields + +Add the following as appropriate: + +- `owner_user_id` +- `uploaded_by_user_id` +- `visibility` + +Suggested semantics: + +- `owner_user_id`: the human whose local session generated the raw data +- `uploaded_by_user_id`: the authenticated cloud user who sent the push +- `visibility`: `private`, `org_shared`, or `promoted` + +In many cases `owner_user_id` and `uploaded_by_user_id` will be the same. They +should still be modeled separately because they mean different things. + +## Access Model + +### Private By Default + +For raw/session-level endpoints: + +- only the owner user should see their raw prompt/session data by default +- org admins should not automatically see raw prompts or session trails + +### Org-Visible By Default + +For derived operator/product surfaces: + +- org members can see shared skill health +- org members can see org-level outcome metrics +- org members can see reviewed/deployed proposal outcomes + +### Explicit Sharing + +If a user wants to share raw evidence with the org: + +- sharing must be explicit +- sharing should happen at the level of a promoted exemplar, reviewed proposal, + or an intentionally shared session sample + +## API And UI Changes + +### Cloud API + +Update cloud routes so they stop assuming org scope implies raw-data access. + +Required changes: + +- raw/session endpoints must filter by `owner_user_id` +- org-visible derived endpoints remain org-scoped +- add new endpoints or query modes for promoted/shared examples if needed + +### Cloud UI + +Update product copy to match the real privacy model. + +Required changes: + +- raw activity views should clearly say “your sessions” +- org dashboards should clearly say “team/shared outcomes” +- avoid any UI copy that implies raw prompt text is org-visible by default + +## Migration Plan + +### Phase 0: Freeze Semantics + +**Goal:** stop making the current org-owned interpretation stronger. 
+ +Actions: + +- stop adding new product surfaces that expose raw session data org-wide +- stop documenting raw session telemetry as org-owned + +Completion: + +- docs and new code stop reinforcing the wrong default + +### Phase 1: Add Ownership Fields + +**Goal:** make raw ownership explicit in storage. + +Actions: + +- add `owner_user_id` to raw/session canonical tables +- add `uploaded_by_user_id` where useful for auditability +- add `visibility` only where the record may later be shared/promoted + +Backfill: + +- derive `uploaded_by_user_id` from `raw_pushes.user_id` +- derive `owner_user_id` initially from the same source for current alpha data + +Completion: + +- every raw/session canonical row can be attributed to a user + +### Phase 2: Change Read Paths + +**Goal:** enforce the new default semantics in the product. + +Actions: + +- update API routes to filter raw/session data by owner +- keep org filters for derived/aggregate routes +- add targeted joins where raw data must be traced back through `push_id` + +Completion: + +- no raw/session route leaks other users’ data by org membership alone + +### Phase 3: Separate Shared Outcomes From Raw Inputs + +**Goal:** make the shared layer explicit. + +Actions: + +- identify which current surfaces are raw-input views vs derived-outcome views +- move org-facing dashboards to derived models where needed +- add a promoted/shared exemplar path for intentionally shared evidence + +Completion: + +- org-visible surfaces are clearly derived or explicitly shared + +### Phase 4: Update Agent And Operator Docs + +**Goal:** make the product explanation honest. + +Files to update: + +- `docs/operator-guide.md` +- `docs/design-docs/system-overview.md` +- any cloud-side privacy or alpha onboarding docs + +Completion: + +- docs clearly distinguish private raw telemetry from shared outcomes + +## Implementation Notes + +### Keep `org_id` + +This plan does **not** remove `org_id`. 
+ +`org_id` is still the correct key for: + +- tenancy partitioning +- billing/workspace membership +- shared skill namespace +- org-level derived analytics + +The fix is to stop treating `org_id` as the only ownership key for raw session +telemetry. + +### Do Not Block Current Alpha Upload + +The current ingest pipeline is already working. The fix should not require a +parallel ingest system or a rewrite of the push payload contract. + +Preferred approach: + +- keep ingest as-is +- add ownership fields and backfill +- tighten read paths +- then refactor dashboards and derived tables + +## Acceptance Criteria + +- raw session-level canonical rows have explicit user ownership +- raw session/prompt data is private to the owner by default +- org-visible dashboards and outcomes continue to work +- no org-wide raw prompt access exists by default +- derived evolution outcomes remain org-visible +- current alpha ingest path remains operational throughout the migration + +## Recommended Order + +1. Freeze semantics and docs +2. Add ownership fields and backfill +3. Tighten API read paths +4. Separate derived/shared surfaces from raw views +5. Update operator and product docs + +## Non-Goals + +- changing the local CLI upload contract right now +- redesigning the alpha bootstrap/auth flow +- deleting `org_id` from canonical storage +- building a new remote ingest service diff --git a/docs/exec-plans/reference/subagent-testing-checklist.md b/docs/exec-plans/reference/subagent-testing-checklist.md new file mode 100644 index 00000000..d526c665 --- /dev/null +++ b/docs/exec-plans/reference/subagent-testing-checklist.md @@ -0,0 +1,66 @@ +# Subagent Testing Checklist + +Use this checklist when changing any bundled selftune subagent in +`skill/agents/` or the specialized-agent summary in `skill/SKILL.md`. + +## 1. Static Validation + +- Run `bun run validate:subagents`. +- Confirm the validator passes with no stale phrases or missing sections. 
+- Confirm the changed agent file still has delegation-oriented frontmatter: + `name`, `description`, `tools`, `model`, `maxTurns`. +- Confirm read-only agents still deny edits and hands-on agents expose edit + tools intentionally. + +## 2. Parent-Skill Routing Smoke Tests + +Test through the parent selftune skill, not just by reading the markdown. + +- Diagnosis prompt: `diagnose why my Research skill is failing` +- Review prompt: `review this evolution proposal before deploy` +- Integration prompt: `set up selftune in this monorepo` +- Pattern prompt: `which of my skills overlap` + +Pass criteria: +- the parent chooses the correct bundled agent +- the parent provides the required inputs +- the subagent returns a structured worker report +- the subagent does not ask the user basic setup questions the parent already + knows the answer to + +## 3. Behavior Checks + +- `diagnosis-analyst` stays read-only and cites evidence. +- `pattern-analyst` stays read-only and returns a conflict matrix or concrete + ownership recommendations. +- `evolution-reviewer` stays read-only and returns `APPROVE`, + `APPROVE WITH CONDITIONS`, or `REJECT`. +- `integration-guide` defaults to inspect-plus-plan unless explicitly told to + run in hands-on mode. + +## 4. Contract Checks + +- No subagent claims `selftune status`, `selftune last`, or + `selftune eval generate --list-skills` are JSON contracts. +- No subagent tells the parent to manually merge `settings_snippet.json` as the + default setup path. +- No subagent refers to invalid evolution targets like `routing_table` or + `full_body`. +- `skill/SKILL.md` still describes the bundled agents as worker-style + subagents and matches the updated usage guidance. + +## 5. Optional Native Subagent Test + +If you also want to verify native Claude Code compatibility: + +- copy one agent into `.claude/agents/` +- invoke it directly or let Claude auto-delegate +- verify the tool restrictions and output shape match the file contract + +## 6. 
Minimum Evidence To Record In Review + +- the exact command output from `bun run validate:subagents` +- which smoke-test prompts were tried +- whether the correct agent was chosen +- whether the return format matched the contract +- any remaining gaps or ambiguous behavior diff --git a/docs/operator-guide.md b/docs/operator-guide.md index 715ef1d8..dbbcb09f 100644 --- a/docs/operator-guide.md +++ b/docs/operator-guide.md @@ -110,12 +110,23 @@ usually happen via the scheduler or as the first step inside `orchestrate`. selftune sync ``` -Use `--force` only when you explicitly want to rebuild local state from -scratch. +Use `--force` only when you explicitly want to rescan all source-truth inputs. +It is not a substitute for the export-first DB recovery path. When autonomy is already installed, treat this as a repair/verification command, not the main product interaction. +If you hit a SQLite/schema failure, do this instead of looking for a nonexistent +`rebuild-db` command: + +```bash +selftune export +rm ~/.selftune/selftune.db +selftune sync --force +``` + +Export first so recent SQLite-backed rows are preserved before recreating the DB. + ### 2. 
Inspect health ```bash diff --git a/package.json b/package.json index 2625db8f..63639ce8 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "selftune", - "version": "0.2.7", + "version": "0.2.8", "description": "Self-improving skills CLI for AI agents", "type": "module", "license": "MIT", @@ -50,8 +50,8 @@ "CHANGELOG.md" ], "scripts": { - "dev": "sh -c 'if lsof -iTCP:7888 -sTCP:LISTEN >/dev/null 2>&1; then if curl -fsS http://127.0.0.1:7888/api/health | grep -q selftune-dashboard; then echo \"Using existing dashboard server on 7888\"; cd apps/local-dashboard && bun install && bunx vite --strictPort; else echo \"Port 7888 is occupied by a non-selftune service\"; exit 1; fi; else cd apps/local-dashboard && bun install && bun run dev; fi'", - "dev:server": "bun --watch run cli/selftune/dashboard-server.ts --port 7888", + "dev": "sh -c 'if lsof -iTCP:7888 -sTCP:LISTEN >/dev/null 2>&1; then if curl -fsS http://localhost:7888/api/health | grep -q selftune-dashboard; then echo \"Using existing dashboard server on 7888\"; cd apps/local-dashboard && bunx vite --strictPort; else echo \"Port 7888 is occupied by a non-selftune service\"; exit 1; fi; else cd apps/local-dashboard && bun run dev; fi'", + "dev:server": "bun --watch run cli/selftune/dashboard-server.ts --port 7888 --runtime-mode dev-server", "dev:dashboard": "bun run cli/selftune/index.ts dashboard --port 7888 --no-open", "lint": "bunx @biomejs/biome check .", "lint:fix": "bunx @biomejs/biome check --write .", @@ -59,8 +59,10 @@ "test": "bun test tests/ packages/telemetry-contract/", "test:fast": "bun test $(find tests -name '*.test.ts' ! -name 'evolve.test.ts' ! -name 'integration.test.ts' ! -name 'dashboard-server.test.ts' ! 
-path '*/blog-proof/*')", "test:slow": "bun test tests/evolution/evolve.test.ts tests/evolution/integration.test.ts tests/monitoring/integration.test.ts tests/dashboard/dashboard-server.test.ts", - "build:dashboard": "cd apps/local-dashboard && bun install && bunx vite build", - "prepublishOnly": "bun run build:dashboard", + "build:dashboard": "cd apps/local-dashboard && bunx vite build", + "sync-version": "bun run scripts/sync-skill-version.ts", + "validate:subagents": "bun run scripts/validate-subagent-docs.ts", + "prepublishOnly": "bun run sync-version && bun run build:dashboard", "typecheck:dashboard": "cd apps/local-dashboard && bunx tsc --noEmit", "check": "bun run lint && bun run lint:arch && bun run typecheck:dashboard && bun run test", "start": "bun run cli/selftune/index.ts --help" diff --git a/packages/telemetry-contract/fixtures/complete-push.ts b/packages/telemetry-contract/fixtures/complete-push.ts new file mode 100644 index 00000000..7e88f54d --- /dev/null +++ b/packages/telemetry-contract/fixtures/complete-push.ts @@ -0,0 +1,184 @@ +import type { PushPayloadV2 } from "../src/schemas.js"; + +/** + * A valid PushPayloadV2 with at least one of every record type. + * All fields populated. 
+ */ +export const completePush: PushPayloadV2 = { + schema_version: "2.0", + client_version: "0.9.0", + push_id: "a1b2c3d4-e5f6-7890-abcd-ef1234567890", + normalizer_version: "0.2.1", + canonical: { + sessions: [ + { + record_kind: "session", + schema_version: "2.0", + normalizer_version: "0.2.1", + normalized_at: "2026-03-19T10:00:00Z", + platform: "claude_code", + capture_mode: "hook", + raw_source_ref: { path: "/tmp/raw/session-100.jsonl" }, + source_session_kind: "interactive", + session_id: "fix-session-100", + external_session_id: "ext-100", + agent_id: "agent-abc", + agent_type: "claude", + agent_cli: "claude-code", + session_key: "sk-100", + channel: "terminal", + workspace_path: "/home/user/project", + repo_root: "/home/user/project", + repo_remote: "git@github.com:user/project.git", + branch: "main", + commit_sha: "abc123def456", + permission_mode: "default", + approval_policy: "auto", + sandbox_policy: "lenient", + provider: "anthropic", + model: "claude-sonnet-4-20250514", + started_at: "2026-03-19T09:50:00Z", + ended_at: "2026-03-19T10:05:00Z", + completion_status: "completed", + end_reason: "user_exit", + }, + ], + prompts: [ + { + record_kind: "prompt", + schema_version: "2.0", + normalizer_version: "0.2.1", + normalized_at: "2026-03-19T10:00:00Z", + platform: "claude_code", + capture_mode: "hook", + raw_source_ref: { path: "/tmp/raw/session-100.jsonl", line: 3 }, + source_session_kind: "interactive", + session_id: "fix-session-100", + prompt_id: "fix-prompt-001", + occurred_at: "2026-03-19T09:51:00Z", + prompt_text: "Fix the authentication middleware", + prompt_hash: "sha256-abc123", + prompt_kind: "user", + is_actionable: true, + prompt_index: 0, + }, + ], + skill_invocations: [ + { + record_kind: "skill_invocation", + schema_version: "2.0", + normalizer_version: "0.2.1", + normalized_at: "2026-03-19T10:00:00Z", + platform: "claude_code", + capture_mode: "hook", + raw_source_ref: { path: "/tmp/raw/session-100.jsonl", line: 7 }, + 
source_session_kind: "interactive", + session_id: "fix-session-100", + skill_invocation_id: "fix-inv-001", + occurred_at: "2026-03-19T09:52:00Z", + matched_prompt_id: "fix-prompt-001", + skill_name: "auth-debug", + skill_path: "/home/user/.claude/skills/auth-debug/SKILL.md", + skill_version_hash: "v1-hash-xyz", + invocation_mode: "explicit", + triggered: true, + confidence: 0.95, + tool_name: "Read", + tool_call_id: "tc-001", + agent_type: "claude", + }, + ], + execution_facts: [ + { + record_kind: "execution_fact", + schema_version: "2.0", + normalizer_version: "0.2.1", + normalized_at: "2026-03-19T10:00:00Z", + platform: "claude_code", + capture_mode: "hook", + raw_source_ref: { path: "/tmp/raw/session-100.jsonl", line: 15 }, + source_session_kind: "interactive", + session_id: "fix-session-100", + execution_fact_id: "fix-ef-001", + occurred_at: "2026-03-19T10:04:00Z", + prompt_id: "fix-prompt-001", + tool_calls_json: { Read: 5, Edit: 3, Bash: 2 }, + total_tool_calls: 10, + bash_commands_redacted: ["git status", "bun test"], + assistant_turns: 4, + errors_encountered: 0, + input_tokens: 12000, + output_tokens: 3500, + duration_ms: 45000, + completion_status: "completed", + end_reason: "natural", + }, + ], + normalization_runs: [ + { + record_kind: "normalization_run", + schema_version: "2.0", + normalizer_version: "0.2.1", + normalized_at: "2026-03-19T10:00:00Z", + platform: "claude_code", + capture_mode: "hook", + raw_source_ref: {}, + run_id: "fix-run-001", + run_at: "2026-03-19T10:00:00Z", + raw_records_seen: 42, + canonical_records_written: 38, + repair_applied: false, + }, + ], + evolution_evidence: [ + { + evidence_id: "ev_complete_authdebug_001", + skill_name: "auth-debug", + proposal_id: "prop-001", + target: "description", + stage: "proposed", + rationale: "Improved trigger for auth-related queries", + confidence: 0.82, + original_text: "Debug authentication issues", + proposed_text: + "Debug and fix authentication middleware, token validation, and 
session management issues", + raw_source_ref: { path: "/tmp/evolution/prop-001.json" }, + }, + ], + orchestrate_runs: [ + { + run_id: "orch-001", + timestamp: "2026-03-19T10:10:00Z", + elapsed_ms: 12000, + dry_run: false, + approval_mode: "auto", + total_skills: 5, + evaluated: 4, + evolved: 1, + deployed: 1, + watched: 2, + skipped: 1, + skill_actions: [ + { + skill: "auth-debug", + action: "evolve", + reason: "Pass rate below threshold", + deployed: true, + elapsed_ms: 8000, + llm_calls: 3, + }, + { + skill: "commit", + action: "watch", + reason: "Recently deployed, monitoring", + }, + { + skill: "test-runner", + action: "skip", + reason: "Insufficient data", + }, + ], + }, + ], + }, +}; diff --git a/packages/telemetry-contract/fixtures/evidence-only-push.ts b/packages/telemetry-contract/fixtures/evidence-only-push.ts new file mode 100644 index 00000000..217febb0 --- /dev/null +++ b/packages/telemetry-contract/fixtures/evidence-only-push.ts @@ -0,0 +1,58 @@ +import type { PushPayloadV2 } from "../src/schemas.js"; + +/** + * A valid PushPayloadV2 with only evolution_evidence entries and + * empty arrays for all other record types. 
+ */ +export const evidenceOnlyPush: PushPayloadV2 = { + schema_version: "2.0", + client_version: "0.9.0", + push_id: "d4e5f6a7-b8c9-0123-defa-234567890123", + normalizer_version: "0.2.1", + canonical: { + sessions: [], + prompts: [], + skill_invocations: [], + execution_facts: [], + normalization_runs: [], + evolution_evidence: [ + { + evidence_id: "ev_fixture_commit_001", + skill_name: "commit", + proposal_id: "evo-only-001", + target: "description", + stage: "deployed", + rationale: "Broadened trigger to catch 'save my work' patterns", + confidence: 0.91, + original_text: "Create git commits with good messages", + proposed_text: + "Create git commits with descriptive messages when asked to commit, save work, or checkpoint progress", + eval_set_json: { + positives: ["commit this", "save my work", "checkpoint"], + negatives: ["show git log", "what changed"], + }, + validation_json: { + pass_rate_before: 0.76, + pass_rate_after: 0.92, + improvement: 0.16, + }, + }, + { + evidence_id: "ev_fixture_testrunner_002", + skill_name: "test-runner", + target: "routing", + stage: "proposed", + rationale: "Missing trigger for 'run my specs'", + }, + { + evidence_id: "ev_fixture_deploy_003", + skill_name: "deploy-helper", + proposal_id: "evo-only-003", + target: "body", + stage: "validated", + confidence: 0.85, + raw_source_ref: { event_type: "evolution_evidence", raw_id: "evo-only-003" }, + }, + ], + }, +}; diff --git a/packages/telemetry-contract/fixtures/golden.json b/packages/telemetry-contract/fixtures/golden.json index 1b5fa512..3823b82c 100644 --- a/packages/telemetry-contract/fixtures/golden.json +++ b/packages/telemetry-contract/fixtures/golden.json @@ -61,6 +61,7 @@ "raw_source_ref": { "path": "/tmp/raw/session-001.jsonl", "line": 15 }, "source_session_kind": "interactive", "session_id": "golden-session-001", + "execution_fact_id": "golden-exec-fact-001", "occurred_at": "2026-01-15T12:04:00Z", "tool_calls_json": { "Read": 5, "Edit": 3, "Bash": 2 }, 
"total_tool_calls": 10, diff --git a/packages/telemetry-contract/fixtures/index.ts b/packages/telemetry-contract/fixtures/index.ts new file mode 100644 index 00000000..165f6692 --- /dev/null +++ b/packages/telemetry-contract/fixtures/index.ts @@ -0,0 +1,4 @@ +export { completePush } from "./complete-push.js"; +export { evidenceOnlyPush } from "./evidence-only-push.js"; +export { partialPushNoSessions } from "./partial-push-no-sessions.js"; +export { partialPushUnresolvedParents } from "./partial-push-unresolved-parents.js"; diff --git a/packages/telemetry-contract/fixtures/partial-push-no-sessions.ts b/packages/telemetry-contract/fixtures/partial-push-no-sessions.ts new file mode 100644 index 00000000..d876d0f2 --- /dev/null +++ b/packages/telemetry-contract/fixtures/partial-push-no-sessions.ts @@ -0,0 +1,40 @@ +import type { PushPayloadV2 } from "../src/schemas.js"; + +/** + * A valid PushPayloadV2 with zero sessions but non-empty evolution_evidence. + * Tests that partial pushes (no sessions) pass validation. 
+ */ +export const partialPushNoSessions: PushPayloadV2 = { + schema_version: "2.0", + client_version: "0.9.0", + push_id: "b2c3d4e5-f6a7-8901-bcde-f12345678901", + normalizer_version: "0.2.1", + canonical: { + sessions: [], + prompts: [], + skill_invocations: [], + execution_facts: [], + normalization_runs: [], + evolution_evidence: [ + { + evidence_id: "ev_nosess_deploy_001", + skill_name: "deploy-helper", + proposal_id: "prop-nosess-001", + target: "description", + stage: "validated", + rationale: "Expanded trigger coverage for deploy-related queries", + confidence: 0.88, + original_text: "Help with deployments", + proposed_text: + "Assist with deployment pipelines, rollbacks, and infrastructure provisioning", + }, + { + evidence_id: "ev_nosess_codereview_002", + skill_name: "code-review", + target: "body", + stage: "proposed", + rationale: "Body rewrite for clearer instructions", + }, + ], + }, +}; diff --git a/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts b/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts new file mode 100644 index 00000000..78d1d62c --- /dev/null +++ b/packages/telemetry-contract/fixtures/partial-push-unresolved-parents.ts @@ -0,0 +1,79 @@ +import type { PushPayloadV2 } from "../src/schemas.js"; + +/** + * A valid PushPayloadV2 with invocations and prompts that reference a + * session_id NOT present in the sessions array. + * + * Tests that the contract allows unresolved parent references -- the + * session may have been pushed in a prior payload or may arrive later. 
+ */ +export const partialPushUnresolvedParents: PushPayloadV2 = { + schema_version: "2.0", + client_version: "0.9.0", + push_id: "c3d4e5f6-a7b8-9012-cdef-123456789012", + normalizer_version: "0.2.1", + canonical: { + sessions: [], + prompts: [ + { + record_kind: "prompt", + schema_version: "2.0", + normalizer_version: "0.2.1", + normalized_at: "2026-03-19T11:00:00Z", + platform: "claude_code", + capture_mode: "replay", + raw_source_ref: { path: "/tmp/raw/orphan-session.jsonl", line: 2 }, + source_session_kind: "replayed", + session_id: "orphan-session-999", + prompt_id: "orphan-prompt-001", + occurred_at: "2026-03-19T10:30:00Z", + prompt_text: "Refactor the database layer", + prompt_kind: "user", + is_actionable: true, + prompt_index: 0, + }, + ], + skill_invocations: [ + { + record_kind: "skill_invocation", + schema_version: "2.0", + normalizer_version: "0.2.1", + normalized_at: "2026-03-19T11:00:00Z", + platform: "claude_code", + capture_mode: "replay", + raw_source_ref: { path: "/tmp/raw/orphan-session.jsonl", line: 5 }, + source_session_kind: "replayed", + session_id: "orphan-session-999", + skill_invocation_id: "orphan-inv-001", + occurred_at: "2026-03-19T10:31:00Z", + matched_prompt_id: "orphan-prompt-001", + skill_name: "db-refactor", + invocation_mode: "inferred", + triggered: true, + confidence: 0.72, + }, + ], + execution_facts: [ + { + record_kind: "execution_fact", + schema_version: "2.0", + normalizer_version: "0.2.1", + normalized_at: "2026-03-19T11:00:00Z", + platform: "claude_code", + capture_mode: "replay", + raw_source_ref: { path: "/tmp/raw/orphan-session.jsonl", line: 12 }, + source_session_kind: "replayed", + session_id: "orphan-session-999", + execution_fact_id: "orphan-ef-001", + occurred_at: "2026-03-19T10:45:00Z", + tool_calls_json: { Read: 8, Edit: 6, Bash: 4 }, + total_tool_calls: 18, + assistant_turns: 7, + errors_encountered: 1, + duration_ms: 90000, + completion_status: "completed", + }, + ], + normalization_runs: [], + }, +}; diff 
--git a/packages/telemetry-contract/package.json b/packages/telemetry-contract/package.json index d913ffc6..bc7aec69 100644 --- a/packages/telemetry-contract/package.json +++ b/packages/telemetry-contract/package.json @@ -14,6 +14,11 @@ "exports": { ".": "./index.ts", "./types": "./src/types.ts", - "./validators": "./src/validators.ts" + "./validators": "./src/validators.ts", + "./schemas": "./src/schemas.ts", + "./fixtures": "./fixtures/index.ts" + }, + "dependencies": { + "zod": "^3.24.0" } } diff --git a/packages/telemetry-contract/src/index.ts b/packages/telemetry-contract/src/index.ts index 3937d199..613aade1 100644 --- a/packages/telemetry-contract/src/index.ts +++ b/packages/telemetry-contract/src/index.ts @@ -1,2 +1,3 @@ +export * from "./schemas.js"; export * from "./types.js"; export * from "./validators.js"; diff --git a/packages/telemetry-contract/src/schemas.ts b/packages/telemetry-contract/src/schemas.ts new file mode 100644 index 00000000..71f9b881 --- /dev/null +++ b/packages/telemetry-contract/src/schemas.ts @@ -0,0 +1,215 @@ +/** + * Zod validation schemas for all canonical telemetry record types + * and the PushPayloadV2 envelope. + * + * This is the single source of truth -- cloud consumers should import + * from @selftune/telemetry-contract/schemas instead of maintaining + * their own copies. 
+ */ + +import { z } from "zod"; +import { + CANONICAL_CAPTURE_MODES, + CANONICAL_COMPLETION_STATUSES, + CANONICAL_INVOCATION_MODES, + CANONICAL_PLATFORMS, + CANONICAL_PROMPT_KINDS, + CANONICAL_RECORD_KINDS, + CANONICAL_SCHEMA_VERSION, + CANONICAL_SOURCE_SESSION_KINDS, +} from "./types.js"; + +// ---------- Shared enum schemas ---------- + +export const canonicalPlatformSchema = z.enum(CANONICAL_PLATFORMS); +export const captureModeSchema = z.enum(CANONICAL_CAPTURE_MODES); +export const sourceSessionKindSchema = z.enum(CANONICAL_SOURCE_SESSION_KINDS); +export const promptKindSchema = z.enum(CANONICAL_PROMPT_KINDS); +export const invocationModeSchema = z.enum(CANONICAL_INVOCATION_MODES); +export const completionStatusSchema = z.enum(CANONICAL_COMPLETION_STATUSES); +export const recordKindSchema = z.enum(CANONICAL_RECORD_KINDS); + +// ---------- Shared structural schemas ---------- + +export const rawSourceRefSchema = z.object({ + path: z.string().optional(), + line: z.number().int().nonnegative().optional(), + event_type: z.string().optional(), + raw_id: z.string().optional(), + metadata: z.record(z.unknown()).optional(), +}); + +export const canonicalRecordBaseSchema = z.object({ + record_kind: recordKindSchema, + schema_version: z.literal(CANONICAL_SCHEMA_VERSION), + normalizer_version: z.string().min(1), + normalized_at: z.string().datetime(), + platform: canonicalPlatformSchema, + capture_mode: captureModeSchema, + raw_source_ref: rawSourceRefSchema, +}); + +export const canonicalSessionRecordBaseSchema = canonicalRecordBaseSchema.extend({ + source_session_kind: sourceSessionKindSchema, + session_id: z.string().min(1), +}); + +// ---------- Canonical record schemas ---------- + +export const CanonicalSessionRecordSchema = canonicalSessionRecordBaseSchema.extend({ + record_kind: z.literal("session"), + external_session_id: z.string().optional(), + parent_session_id: z.string().optional(), + agent_id: z.string().optional(), + agent_type: z.string().optional(), + 
agent_cli: z.string().optional(), + session_key: z.string().optional(), + channel: z.string().optional(), + workspace_path: z.string().optional(), + repo_root: z.string().optional(), + repo_remote: z.string().optional(), + branch: z.string().optional(), + commit_sha: z.string().optional(), + permission_mode: z.string().optional(), + approval_policy: z.string().optional(), + sandbox_policy: z.string().optional(), + provider: z.string().optional(), + model: z.string().optional(), + started_at: z.string().datetime().optional(), + ended_at: z.string().datetime().optional(), + completion_status: completionStatusSchema.optional(), + end_reason: z.string().optional(), +}); + +export const CanonicalPromptRecordSchema = canonicalSessionRecordBaseSchema.extend({ + record_kind: z.literal("prompt"), + prompt_id: z.string().min(1), + occurred_at: z.string().datetime(), + prompt_text: z.string().min(1), + prompt_hash: z.string().optional(), + prompt_kind: promptKindSchema, + is_actionable: z.boolean(), + prompt_index: z.number().int().nonnegative().optional(), + parent_prompt_id: z.string().optional(), + source_message_id: z.string().optional(), +}); + +export const CanonicalSkillInvocationRecordSchema = canonicalSessionRecordBaseSchema.extend({ + record_kind: z.literal("skill_invocation"), + skill_invocation_id: z.string().min(1), + occurred_at: z.string().datetime(), + matched_prompt_id: z.string().min(1).optional(), + skill_name: z.string().min(1), + skill_path: z.string().optional(), + skill_version_hash: z.string().optional(), + invocation_mode: invocationModeSchema, + triggered: z.boolean(), + confidence: z.number().min(0).max(1), + tool_name: z.string().optional(), + tool_call_id: z.string().optional(), + agent_type: z.string().optional(), +}); + +export const CanonicalExecutionFactRecordSchema = canonicalSessionRecordBaseSchema.extend({ + record_kind: z.literal("execution_fact"), + execution_fact_id: z.string().min(1), + occurred_at: z.string().datetime(), + prompt_id: 
z.string().optional(), + tool_calls_json: z.record(z.number().finite()), + total_tool_calls: z.number().int().nonnegative(), + bash_commands_redacted: z.array(z.string()).optional(), + assistant_turns: z.number().int().nonnegative(), + errors_encountered: z.number().int().nonnegative(), + input_tokens: z.number().int().nonnegative().optional(), + output_tokens: z.number().int().nonnegative().optional(), + duration_ms: z.number().nonnegative().optional(), + completion_status: completionStatusSchema.optional(), + end_reason: z.string().optional(), +}); + +export const CanonicalNormalizationRunRecordSchema = canonicalRecordBaseSchema.extend({ + record_kind: z.literal("normalization_run"), + run_id: z.string().min(1), + run_at: z.string().datetime(), + raw_records_seen: z.number().int().nonnegative(), + canonical_records_written: z.number().int().nonnegative(), + repair_applied: z.boolean(), +}); + +export const CanonicalEvolutionEvidenceRecordSchema = z.object({ + evidence_id: z.string().min(1), + skill_name: z.string().min(1), + proposal_id: z.string().optional(), + target: z.string().min(1), + stage: z.string().min(1), + rationale: z.string().optional(), + confidence: z.number().min(0).max(1).optional(), + original_text: z.string().optional(), + proposed_text: z.string().optional(), + eval_set_json: z.unknown().optional(), + validation_json: z.unknown().optional(), + raw_source_ref: rawSourceRefSchema.optional(), +}); + +// ---------- Orchestrate run schemas ---------- + +export const OrchestrateRunSkillActionSchema = z.object({ + skill: z.string().min(1), + action: z.enum(["evolve", "watch", "skip"]), + reason: z.string(), + deployed: z.boolean().optional(), + rolledBack: z.boolean().optional(), + alert: z.string().nullable().optional(), + elapsed_ms: z.number().nonnegative().optional(), + llm_calls: z.number().int().nonnegative().optional(), +}); + +export const PushOrchestrateRunRecordSchema = z.object({ + run_id: z.string().min(1), + timestamp: 
z.string().datetime(), + elapsed_ms: z.number().int().nonnegative(), + dry_run: z.boolean(), + approval_mode: z.enum(["auto", "review"]), + total_skills: z.number().int().nonnegative(), + evaluated: z.number().int().nonnegative(), + evolved: z.number().int().nonnegative(), + deployed: z.number().int().nonnegative(), + watched: z.number().int().nonnegative(), + skipped: z.number().int().nonnegative(), + skill_actions: z.array(OrchestrateRunSkillActionSchema), +}); + +// ---------- Push V2 envelope ---------- + +export const PushPayloadV2Schema = z.object({ + schema_version: z.literal("2.0"), + client_version: z.string().min(1), + push_id: z.string().uuid(), + normalizer_version: z.string().min(1), + canonical: z.object({ + sessions: z.array(CanonicalSessionRecordSchema).min(0), + prompts: z.array(CanonicalPromptRecordSchema).min(0), + skill_invocations: z.array(CanonicalSkillInvocationRecordSchema).min(0), + execution_facts: z.array(CanonicalExecutionFactRecordSchema).min(0), + normalization_runs: z.array(CanonicalNormalizationRunRecordSchema).min(0), + evolution_evidence: z.array(CanonicalEvolutionEvidenceRecordSchema).optional(), + orchestrate_runs: z.array(PushOrchestrateRunRecordSchema).optional(), + }), +}); + +// ---------- Inferred types from Zod schemas ---------- + +export type PushPayloadV2 = z.infer<typeof PushPayloadV2Schema>; +export type ZodCanonicalSessionRecord = z.infer<typeof CanonicalSessionRecordSchema>; +export type ZodCanonicalPromptRecord = z.infer<typeof CanonicalPromptRecordSchema>; +export type ZodCanonicalSkillInvocationRecord = z.infer< + typeof CanonicalSkillInvocationRecordSchema +>; +export type ZodCanonicalExecutionFactRecord = z.infer<typeof CanonicalExecutionFactRecordSchema>; +export type ZodCanonicalNormalizationRunRecord = z.infer< + typeof CanonicalNormalizationRunRecordSchema +>; +export type ZodCanonicalEvolutionEvidenceRecord = z.infer< + typeof CanonicalEvolutionEvidenceRecordSchema +>; +export type ZodPushOrchestrateRunRecord = z.infer<typeof PushOrchestrateRunRecordSchema>; diff --git a/packages/telemetry-contract/src/types.ts b/packages/telemetry-contract/src/types.ts index a2792557..1e0e733d 100644
--- a/packages/telemetry-contract/src/types.ts +++ b/packages/telemetry-contract/src/types.ts @@ -133,11 +133,12 @@ export interface CanonicalSkillInvocationRecord extends CanonicalSessionRecordBa export interface CanonicalExecutionFactRecord extends CanonicalSessionRecordBase { record_kind: "execution_fact"; + execution_fact_id: string; occurred_at: string; prompt_id?: string; tool_calls_json: Record<string, number>; total_tool_calls: number; - bash_commands_redacted: string[]; + bash_commands_redacted?: string[]; assistant_turns: number; errors_encountered: number; input_tokens?: number; diff --git a/packages/telemetry-contract/src/validators.ts b/packages/telemetry-contract/src/validators.ts index daad6e53..ae2d61fe 100644 --- a/packages/telemetry-contract/src/validators.ts +++ b/packages/telemetry-contract/src/validators.ts @@ -86,10 +86,12 @@ export function isCanonicalRecord(value: unknown): value is CanonicalRecord { case "execution_fact": return ( hasSessionScope(value) && + hasString(value, "execution_fact_id") && hasString(value, "occurred_at") && isNumberRecord(value.tool_calls_json) && isFiniteNumber(value.total_tool_calls) && - isStringArray(value.bash_commands_redacted) && + (value.bash_commands_redacted === undefined || + isStringArray(value.bash_commands_redacted)) && isFiniteNumber(value.assistant_turns) && isFiniteNumber(value.errors_encountered) && (value.completion_status === undefined || diff --git a/packages/telemetry-contract/tests/compatibility.test.ts b/packages/telemetry-contract/tests/compatibility.test.ts new file mode 100644 index 00000000..b31e74ba --- /dev/null +++ b/packages/telemetry-contract/tests/compatibility.test.ts @@ -0,0 +1,144 @@ +import { describe, expect, test } from "bun:test"; +import { completePush } from "../fixtures/complete-push.js"; +import { evidenceOnlyPush } from "../fixtures/evidence-only-push.js"; +import { partialPushNoSessions } from "../fixtures/partial-push-no-sessions.js"; +import { partialPushUnresolvedParents } from
"../fixtures/partial-push-unresolved-parents.js"; +import { PushPayloadV2Schema } from "../src/schemas.js"; + +describe("PushPayloadV2Schema compatibility", () => { + // ---- Fixture validation ---- + + test("complete-push fixture passes validation", () => { + const result = PushPayloadV2Schema.safeParse(completePush); + if (!result.success) { + throw new Error(`Validation failed: ${JSON.stringify(result.error.issues, null, 2)}`); + } + expect(result.success).toBe(true); + }); + + test("partial-push-no-sessions fixture passes validation", () => { + const result = PushPayloadV2Schema.safeParse(partialPushNoSessions); + if (!result.success) { + throw new Error(`Validation failed: ${JSON.stringify(result.error.issues, null, 2)}`); + } + expect(result.success).toBe(true); + }); + + test("partial-push-unresolved-parents fixture passes validation", () => { + const result = PushPayloadV2Schema.safeParse(partialPushUnresolvedParents); + if (!result.success) { + throw new Error(`Validation failed: ${JSON.stringify(result.error.issues, null, 2)}`); + } + expect(result.success).toBe(true); + }); + + test("evidence-only-push fixture passes validation", () => { + const result = PushPayloadV2Schema.safeParse(evidenceOnlyPush); + if (!result.success) { + throw new Error(`Validation failed: ${JSON.stringify(result.error.issues, null, 2)}`); + } + expect(result.success).toBe(true); + }); + + // ---- execution_fact_id is required ---- + + test("execution_fact_id is required on execution facts", () => { + const badPayload = structuredClone(completePush); + delete (badPayload.canonical.execution_facts[0] as Record<string, unknown>).execution_fact_id; + const result = PushPayloadV2Schema.safeParse(badPayload); + expect(result.success).toBe(false); + if (!result.success) { + const paths = result.error.issues.map((i) => i.path.join(".")); + expect(paths).toContain("canonical.execution_facts.0.execution_fact_id"); + } + }); + + test("execution_fact_id rejects empty string", () => { + const badPayload =
structuredClone(completePush); + (badPayload.canonical.execution_facts[0] as Record<string, unknown>).execution_fact_id = ""; + const result = PushPayloadV2Schema.safeParse(badPayload); + expect(result.success).toBe(false); + if (!result.success) { + const paths = result.error.issues.map((i) => i.path.join(".")); + expect(paths).toContain("canonical.execution_facts.0.execution_fact_id"); + } + }); + + // ---- bash_commands_redacted is optional ---- + + test("bash_commands_redacted is optional (omitting it passes)", () => { + // The unresolved-parents fixture already omits bash_commands_redacted + const ef = partialPushUnresolvedParents.canonical.execution_facts[0]; + expect(ef.bash_commands_redacted).toBeUndefined(); + + const result = PushPayloadV2Schema.safeParse(partialPushUnresolvedParents); + expect(result.success).toBe(true); + }); + + test("bash_commands_redacted accepts an array when present", () => { + const ef = completePush.canonical.execution_facts[0]; + expect(Array.isArray(ef.bash_commands_redacted)).toBe(true); + + const result = PushPayloadV2Schema.safeParse(completePush); + expect(result.success).toBe(true); + }); + + // ---- Zero-session pushes ---- + + test("zero-session pushes pass validation", () => { + expect(partialPushNoSessions.canonical.sessions).toHaveLength(0); + const result = PushPayloadV2Schema.safeParse(partialPushNoSessions); + expect(result.success).toBe(true); + }); + + test("evidence-only push with all empty arrays passes", () => { + expect(evidenceOnlyPush.canonical.sessions).toHaveLength(0); + expect(evidenceOnlyPush.canonical.prompts).toHaveLength(0); + expect(evidenceOnlyPush.canonical.skill_invocations).toHaveLength(0); + expect(evidenceOnlyPush.canonical.execution_facts).toHaveLength(0); + expect(evidenceOnlyPush.canonical.normalization_runs).toHaveLength(0); + const result = PushPayloadV2Schema.safeParse(evidenceOnlyPush); + expect(result.success).toBe(true); + }); + + // ---- Unresolved parent references ---- + + test("unresolved parent
references pass (invocation references session_id not in sessions)", () => { + const sessionIds = new Set( + partialPushUnresolvedParents.canonical.sessions.map((s) => s.session_id), + ); + const invSessionIds = partialPushUnresolvedParents.canonical.skill_invocations.map( + (i) => i.session_id, + ); + + // Precondition: arrays must be non-empty for the test to be meaningful + expect(invSessionIds.length).toBeGreaterThan(0); + + // Confirm the invocation references a session not in the sessions array + for (const sid of invSessionIds) { + expect(sessionIds.has(sid)).toBe(false); + } + + const result = PushPayloadV2Schema.safeParse(partialPushUnresolvedParents); + expect(result.success).toBe(true); + }); + + test("prompts with unresolved session_id pass validation", () => { + const sessionIds = new Set( + partialPushUnresolvedParents.canonical.sessions.map((s) => s.session_id), + ); + const promptSessionIds = partialPushUnresolvedParents.canonical.prompts.map( + (p) => p.session_id, + ); + + // Precondition: arrays must be non-empty for the test to be meaningful + expect(promptSessionIds.length).toBeGreaterThan(0); + + for (const sid of promptSessionIds) { + expect(sessionIds.has(sid)).toBe(false); + } + + const result = PushPayloadV2Schema.safeParse(partialPushUnresolvedParents); + expect(result.success).toBe(true); + }); +}); diff --git a/packages/ui/src/components/ActivityTimeline.tsx b/packages/ui/src/components/ActivityTimeline.tsx index b4115ee6..f726ef42 100644 --- a/packages/ui/src/components/ActivityTimeline.tsx +++ b/packages/ui/src/components/ActivityTimeline.tsx @@ -7,6 +7,7 @@ import { CardTitle, } from "../primitives/card" import { Tabs, TabsContent, TabsList, TabsTrigger } from "../primitives/tabs" +import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from "../primitives/tooltip" import type { EvolutionEntry, PendingProposal, UnmatchedQuery } from "../types" import { timeAgo } from "../lib/format" import { @@ -29,10 +30,12 @@ export 
function ActivityPanel({ evolution, pendingProposals, unmatchedQueries, + onSelectProposal, }: { evolution: EvolutionEntry[] pendingProposals: PendingProposal[] unmatchedQueries: UnmatchedQuery[] + onSelectProposal?: (skillName: string, proposalId: string) => void }) { const hasActivity = evolution.length > 0 || pendingProposals.length > 0 || unmatchedQueries.length > 0 @@ -73,35 +76,51 @@ export function ActivityPanel({ : "unmatched" } > - - {pendingProposals.length > 0 && ( - - - Pending - - {pendingProposals.length} - - - )} - - - Timeline - - {unmatchedQueries.length > 0 && ( - - - Unmatched - - {unmatchedQueries.length} - - - )} - + + + {pendingProposals.length > 0 && ( + + }> + + + {pendingProposals.length} + + + Pending proposals + + )} + + }> + + + Timeline + + {unmatchedQueries.length > 0 && ( + + }> + + + {unmatchedQueries.length} + + + Unmatched queries + + )} + + {pendingProposals.length > 0 && ( {pendingProposals.slice(0, 10).map((p) => ( -
+ ))} )} {evolution.slice(0, 30).map((entry, i) => ( -
+
+ ))} {evolution.length === 0 && (

No timeline events

diff --git a/packages/ui/src/types.ts b/packages/ui/src/types.ts index a851c63f..1fe33767 100644 --- a/packages/ui/src/types.ts +++ b/packages/ui/src/types.ts @@ -27,6 +27,7 @@ export interface EvalSnapshot { export interface EvolutionEntry { timestamp: string; proposal_id: string; + skill_name?: string; action: string; details: string; eval_snapshot?: EvalSnapshot | null; diff --git a/scripts/sync-skill-version.ts b/scripts/sync-skill-version.ts new file mode 100644 index 00000000..bab385b5 --- /dev/null +++ b/scripts/sync-skill-version.ts @@ -0,0 +1,44 @@ +#!/usr/bin/env bun +/** + * Stamps skill/SKILL.md frontmatter version to match package.json. + * Run automatically via `bun run sync-version` or during prepublishOnly. + */ +import { readFileSync, writeFileSync } from "node:fs"; +import { join } from "node:path"; + +const root = join(import.meta.dir, ".."); +const pkg = JSON.parse(readFileSync(join(root, "package.json"), "utf-8")) as { + version?: unknown; +}; +if (typeof pkg.version !== "string" || pkg.version.trim() === "") { + console.error("ERROR: package.json `version` must be a non-empty string"); + process.exit(1); +} +const pkgVersion = pkg.version; + +const skillPath = join(root, "skill", "SKILL.md"); +const content = readFileSync(skillPath, "utf-8"); + +const frontmatterRegex = /^---\r?\n([\s\S]*?)\r?\n---/; +const frontmatterMatch = content.match(frontmatterRegex); +if (!frontmatterMatch) { + console.error(`ERROR: No YAML frontmatter found in ${skillPath}`); + process.exit(1); +} + +const frontmatter = frontmatterMatch[1]; +const versionRegex = /^(\s*version:\s*).+$/m; +if (!versionRegex.test(frontmatter)) { + console.error(`ERROR: No version frontmatter found in ${skillPath}`); + process.exit(1); +} + +const updatedFrontmatter = frontmatter.replace(versionRegex, `$1${pkgVersion}`); +const updated = content.replace(frontmatterRegex, `---\n${updatedFrontmatter}\n---`); + +if (content === updated) { + console.log(`skill/SKILL.md already at 
v${pkgVersion}`); +} else { + writeFileSync(skillPath, updated); + console.log(`skill/SKILL.md version updated to v${pkgVersion}`); +} diff --git a/scripts/validate-subagent-docs.ts b/scripts/validate-subagent-docs.ts new file mode 100644 index 00000000..7bc50aee --- /dev/null +++ b/scripts/validate-subagent-docs.ts @@ -0,0 +1,276 @@ +#!/usr/bin/env bun + +import { readFileSync } from "node:fs"; +import { join } from "node:path"; + +type AgentSpec = { + file: string; + name: string; + mode: "read-only" | "hands-on"; + requiredSections: string[]; + requiredPhrases: string[]; + forbiddenPhrases: string[]; +}; + +type ValidationFailure = { + file: string; + message: string; +}; + +const repoRoot = join(import.meta.dir, ".."); + +const sharedRequiredSections = [ + "## Required Inputs From Parent", + "## Operating Rules", + "## Stop Conditions", + "## Return Format", +]; + +const sharedForbiddenPhrases = [ + "Ask the user", + "Parse JSON", + "parse JSON", + "settings_snippet.json", + "routing_table", + "full_body", +]; + +const agents: AgentSpec[] = [ + { + file: "skill/agents/diagnosis-analyst.md", + name: "diagnosis-analyst", + mode: "read-only", + requiredSections: [...sharedRequiredSections, "## Investigation Workflow"], + requiredPhrases: [ + "Use when", + "Do not ask the user directly unless the parent explicitly told you to.", + "selftune status", + "selftune last", + "selftune doctor", + ], + forbiddenPhrases: sharedForbiddenPhrases, + }, + { + file: "skill/agents/evolution-reviewer.md", + name: "evolution-reviewer", + mode: "read-only", + requiredSections: [...sharedRequiredSections, "## Review Workflow"], + requiredPhrases: [ + "Use when", + "Do not ask the user directly unless the parent explicitly told you to.", + "selftune evolve --skill --skill-path --dry-run", + "routing|body", + ], + forbiddenPhrases: sharedForbiddenPhrases, + }, + { + file: "skill/agents/integration-guide.md", + name: "integration-guide", + mode: "hands-on", + requiredSections: 
[...sharedRequiredSections, "## Setup Workflow"], + requiredPhrases: [ + "Use when", + "Do not ask the user directly unless the parent explicitly told you to.", + "`requestedMode`: `plan-only` or `hands-on`", + "`selftune init` is the source of truth for config bootstrap and automatic", + ], + forbiddenPhrases: sharedForbiddenPhrases, + }, + { + file: "skill/agents/pattern-analyst.md", + name: "pattern-analyst", + mode: "read-only", + requiredSections: [...sharedRequiredSections, "## Analysis Workflow"], + requiredPhrases: [ + "Use when", + "Do not ask the user directly unless the parent explicitly told", + "selftune eval composability", + "selftune eval generate --list-skills", + ], + forbiddenPhrases: sharedForbiddenPhrases, + }, +]; + +function getFrontmatterBlock(content: string): string | null { + const lines = content.split("\n"); + if (lines[0]?.trim() !== "---") return null; + + for (let i = 1; i < lines.length; i++) { + if (lines[i].trim() === "---") { + return lines.slice(1, i).join("\n"); + } + } + + return null; +} + +function getFrontmatterValue(frontmatter: string, key: string): string { + const match = frontmatter.match(new RegExp(`^${key}:\\s*(.+)$`, "m")); + return match?.[1]?.trim() ?? 
""; +} + +function requireIncludes( + failures: ValidationFailure[], + file: string, + content: string, + needle: string, + label = needle, +): void { + if (!content.includes(needle)) { + failures.push({ file, message: `Missing required content: ${label}` }); + } +} + +function requireExcludes( + failures: ValidationFailure[], + file: string, + content: string, + needle: string, +): void { + if (content.includes(needle)) { + failures.push({ file, message: `Contains forbidden stale content: ${needle}` }); + } +} + +function validateAgent(spec: AgentSpec, failures: ValidationFailure[]): void { + const filePath = join(repoRoot, spec.file); + let content: string; + try { + content = readFileSync(filePath, "utf8"); + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + failures.push({ file: spec.file, message: `Failed to read file: ${msg}` }); + return; + } + const frontmatter = getFrontmatterBlock(content); + + if (!frontmatter) { + failures.push({ file: spec.file, message: "Missing YAML frontmatter block" }); + return; + } + + const name = getFrontmatterValue(frontmatter, "name"); + const description = getFrontmatterValue(frontmatter, "description"); + const tools = getFrontmatterValue(frontmatter, "tools"); + const disallowedTools = getFrontmatterValue(frontmatter, "disallowedTools"); + const model = getFrontmatterValue(frontmatter, "model"); + const maxTurns = getFrontmatterValue(frontmatter, "maxTurns"); + + if (name !== spec.name) { + failures.push({ + file: spec.file, + message: `Expected frontmatter name '${spec.name}', found '${name || "(missing)"}'`, + }); + } + + if (!description.startsWith("Use when")) { + failures.push({ + file: spec.file, + message: "Description must be delegation-oriented and start with 'Use when'", + }); + } + + if (!model) { + failures.push({ file: spec.file, message: "Missing frontmatter field: model" }); + } + + if (!maxTurns) { + failures.push({ file: spec.file, message: "Missing frontmatter 
field: maxTurns" }); + } + + if (!tools) { + failures.push({ file: spec.file, message: "Missing frontmatter field: tools" }); + } + + if (spec.mode === "read-only") { + if (disallowedTools !== "Write, Edit") { + failures.push({ + file: spec.file, + message: "Read-only subagents must set 'disallowedTools: Write, Edit'", + }); + } + } else { + if (!tools.includes("Write") || !tools.includes("Edit")) { + failures.push({ + file: spec.file, + message: "Hands-on subagents must expose Write and Edit in tools", + }); + } + } + + for (const section of spec.requiredSections) { + requireIncludes(failures, spec.file, content, section); + } + + for (const phrase of spec.requiredPhrases) { + requireIncludes(failures, spec.file, content, phrase); + } + + for (const phrase of spec.forbiddenPhrases) { + requireExcludes(failures, spec.file, content, phrase); + } +} + +function validateSkillSummary(failures: ValidationFailure[]): void { + const file = "skill/SKILL.md"; + let content: string; + try { + content = readFileSync(join(repoRoot, file), "utf8"); + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + failures.push({ file, message: `Failed to read file: ${msg}` }); + return; + } + + requireIncludes(failures, file, content, "Treat these as worker-style subagents:"); + const specializedAgentsSection = + content.match( + /## Specialized Agents[\s\S]*?\n\| Trigger keywords \| Agent file \| When to use \|\n([\s\S]*?)\n## /, + )?.[1] ?? 
""; + + if (!specializedAgentsSection) { + failures.push({ + file, + message: "Missing or malformed Specialized Agents table in SKILL.md", + }); + return; + } + + const agentRows = specializedAgentsSection + .split("\n") + .map((line) => line.trim()) + .filter((line) => line.startsWith("|") && !line.includes("---")); + + for (const agent of agents) { + const agentPath = `\`${agent.file.replace("skill/", "")}\``; + if (!agentRows.some((row) => row.includes(agentPath))) { + failures.push({ + file, + message: `Specialized Agents table is missing row for ${agentPath}`, + }); + } + } +} + +function main(): void { + const failures: ValidationFailure[] = []; + + for (const agent of agents) { + validateAgent(agent, failures); + } + validateSkillSummary(failures); + + if (failures.length > 0) { + console.error("Subagent doc validation failed:\n"); + for (const failure of failures) { + console.error(`- ${failure.file}: ${failure.message}`); + } + process.exit(1); + } + + console.log( + `Validated ${agents.length} bundled subagent docs and the SKILL.md specialized-agent summary.`, + ); +} + +main(); diff --git a/skill/SKILL.md b/skill/SKILL.md index 4eea0c04..4b41c889 100644 --- a/skill/SKILL.md +++ b/skill/SKILL.md @@ -1,17 +1,19 @@ --- name: selftune description: > - Self-improving skills toolkit. 
Use when the user wants to: - grade a session, generate evals, check undertriggering, evolve a skill - description or full body, evolve routing tables, rollback an evolution, - monitor post-deploy performance, check skill health status, view last - session insight, open the dashboard, serve the live dashboard, run health - checks, manage activation rules, ingest sessions from Codex/OpenCode/OpenClaw, - replay Claude Code transcripts, contribute anonymized data to the community, - set up autonomous cron jobs, manage evolution memory, configure auto-activation - suggestions, diagnose underperforming skills, analyze cross-skill patterns, - review evolution proposals, measure baseline lift, run skill unit tests, - analyze skill composability, or import SkillsBench evaluation corpora. + Self-improving skills toolkit that watches real agent sessions, detects missed + triggers, grades execution quality, and evolves skill descriptions to match how + users actually talk. Use when grading sessions, generating evals, evolving skill + descriptions or routing tables, checking skill health, viewing the dashboard, + ingesting sessions from other platforms, or running autonomous improvement loops. + Make sure to use this skill whenever the user mentions skill improvement, skill + performance, skill triggers, skill evolution, skill health, undertriggering, + overtriggering, session grading, or wants to know how their skills are doing — + even if they don't say "selftune" explicitly. +metadata: + author: selftune-dev + version: 0.2.7 + category: developer-tools --- # selftune @@ -24,6 +26,25 @@ skill health autonomously. They will say things like "set up selftune", "improve my skills", or "how are my skills doing?" — and you route to the correct workflow below. The user does not run CLI commands directly; you do. +## Why this matters + +Skills are only useful when they trigger at the right time with the right +instructions. 
But user language drifts — the phrases people use to ask for help +rarely match the trigger keywords a skill author imagined. selftune closes this +gap by observing real sessions, finding where skills fail to activate or +execute poorly, and rewriting descriptions to match actual usage patterns. The +result: skills that get better over time without manual tuning. + +## Communicating with the user + +Users range from experienced developers who'll say "evolve the pptx description +using the latest eval set" to non-technical users who'll say "make my skills +better". Pay attention to context cues: + +- If they use terms like "eval set", "routing table", "JSONL" — match their precision +- If they say "improve my skills" or "how's it going" — explain what you're doing in plain language, summarize results, and suggest next steps +- When in doubt, briefly explain what a command does before running it + ## Bootstrap If `~/.selftune/config.json` does not exist, read `Workflows/Initialize.md` @@ -36,9 +57,13 @@ will work. Do not proceed with other commands until initialization is complete. selftune [options] ``` -Most commands output deterministic JSON. Parse JSON output for machine-readable commands. -`selftune dashboard` is an exception: `--export` generates an HTML artifact, while -`--serve` starts a local server; both may print informational progress lines. +Commands vary in output format. `selftune orchestrate`, `selftune watch`, and +`selftune evolve --dry-run` emit structured JSON on stdout. `selftune status`, +`selftune last`, and `selftune doctor` print human-readable text or structured +JSON depending on the command. For alpha/bootstrap and health remediation, prefer +machine-readable `guidance.next_command` or top-level `next_command` when present +instead of inferring the next step from prose. `selftune dashboard` starts a +local SPA server — it does not emit data. 
## Quick Reference @@ -56,11 +81,11 @@ selftune grade baseline --skill --skill-path [--eval-set ] # Evolve group selftune evolve --skill --skill-path [--dry-run] -selftune evolve body --skill --skill-path --target [--dry-run] +selftune evolve body --skill --skill-path --target [--dry-run] selftune evolve rollback --skill --skill-path [--proposal-id ] # Eval group -selftune eval generate --skill [--list-skills] [--stats] [--max N] +selftune eval generate --skill [--list-skills] [--stats] [--max N] [--seed N] [--output PATH] selftune eval unit-test --skill --tests [--run-agent] [--generate] selftune eval import --dir --skill --output [--match-strategy exact|fuzzy] selftune eval composability --skill [--window N] [--telemetry-log ] @@ -70,8 +95,7 @@ selftune watch --skill --skill-path [--auto-rollback] selftune status selftune last selftune doctor -selftune dashboard [--export] [--out FILE] [--serve] -selftune dashboard --serve [--port ] +selftune dashboard [--port ] [--no-open] selftune contribute [--skill NAME] [--preview] [--sanitize LEVEL] [--submit] selftune cron setup [--dry-run] # auto-detect platform (cron/launchd/systemd) selftune cron setup --platform openclaw [--dry-run] [--tz ] # OpenClaw-specific @@ -79,201 +103,213 @@ selftune cron list selftune cron remove [--dry-run] selftune telemetry [status|enable|disable] selftune export [TABLE...] 
[--output/-o DIR] [--since DATE] + +# Alpha enrollment (cloud app is control-plane only, not the main UX) +selftune init --alpha --alpha-email --alpha-key +selftune alpha upload [--dry-run] +selftune status # shows cloud link state + upload readiness ``` ## Workflow Routing | Trigger keywords | Workflow | File | |------------------|----------|------| -| grade, score, evaluate, assess session, auto-grade | Grade † | Workflows/Grade.md | +| grade, score, evaluate, assess session, auto-grade | Grade | Workflows/Grade.md | | evals, eval set, undertriggering, skill stats, eval generate | Evals | Workflows/Evals.md | -| evolve, improve, optimize skills, make skills better, triggers, catch more queries | Evolve † | Workflows/Evolve.md | +| evolve, improve, optimize skills, make skills better, triggers, catch more queries | Evolve | Workflows/Evolve.md | +| evolve body, evolve routing, full body evolution, rewrite skill, teacher student | EvolveBody | Workflows/EvolveBody.md | | evolve rollback, undo, restore, revert evolution, go back, undo last change | Rollback | Workflows/Rollback.md | -| watch, monitor, regression, post-deploy, performing, keep an eye on | Watch † | Workflows/Watch.md | +| watch, monitor, regression, post-deploy, keep an eye on | Watch | Workflows/Watch.md | | doctor, health, hooks, broken, diagnose, not working, something wrong | Doctor | Workflows/Doctor.md | -| ingest, import, codex logs, opencode, openclaw, wrap codex, ingest claude | Ingest † | Workflows/Ingest.md | -| ingest claude, backfill, claude transcripts, historical sessions | Replay | Workflows/Replay.md | -| contribute, share, community, export data, anonymized, give back, help others | Contribute | Workflows/Contribute.md | -| init, setup, set up, bootstrap, first time, install, configure selftune | Initialize | Workflows/Initialize.md | -| cron, schedule, autonomous, automate evolution, run automatically, run on its own | Cron | Workflows/Cron.md | +| ingest, import, codex logs, 
opencode, openclaw, wrap codex | Ingest | Workflows/Ingest.md | +| replay, backfill, claude transcripts, historical sessions | Replay | Workflows/Replay.md | +| contribute, share, community, export data, anonymized, give back | Contribute | Workflows/Contribute.md | +| init, setup, set up, bootstrap, first time, install, configure selftune, alpha, enroll, alpha enrollment, cloud link, upload credential | Initialize | Workflows/Initialize.md | +| cron, schedule, automate evolution, run automatically | Cron | Workflows/Cron.md | | auto-activate, suggestions, activation rules, nag, why suggest | AutoActivation | Workflows/AutoActivation.md | -| dashboard, visual, open dashboard, show dashboard, skill grid, serve dashboard, live dashboard | Dashboard | Workflows/Dashboard.md | -| evolution memory, context memory, session continuity, what happened last | EvolutionMemory | Workflows/EvolutionMemory.md | -| evolve body, evolve routing, full body evolution, rewrite skill, teacher student | EvolveBody | Workflows/EvolveBody.md | +| dashboard, visual, open dashboard, show dashboard, serve dashboard, live dashboard | Dashboard | Workflows/Dashboard.md | +| evolution memory, session continuity, what happened last | EvolutionMemory | Workflows/EvolutionMemory.md | | grade baseline, baseline lift, adds value, skill value, no-skill comparison | Baseline | Workflows/Baseline.md | -| eval unit-test, skill test, test skill, generate tests, run tests, assertions | UnitTest | Workflows/UnitTest.md | -| eval composability, co-occurrence, skill conflicts, skills together, conflict score | Composability | Workflows/Composability.md | -| eval import, skillsbench, external evals, benchmark tasks, import corpus | ImportSkillsBench | Workflows/ImportSkillsBench.md | -| telemetry, analytics, disable analytics, opt out, usage data, tracking, privacy | Telemetry | Workflows/Telemetry.md | -| export, dump, jsonl, export sqlite, export data, debug export | Export | *(direct command -- no workflow 
file)* | -| status, health summary, skill health, pass rates, how are skills, skills working, skills doing, run selftune, start selftune | Status | *(direct command — no workflow file)* | -| last, last session, recent session, what happened, what changed, what did selftune do | Last | *(direct command — no workflow file)* | - -Workflows marked with † also run autonomously via `selftune orchestrate` without user interaction. +| eval unit-test, skill test, test skill, generate tests, run tests | UnitTest | Workflows/UnitTest.md | +| eval composability, co-occurrence, skill conflicts, skills together | Composability | Workflows/Composability.md | +| eval import, skillsbench, external evals, benchmark tasks | ImportSkillsBench | Workflows/ImportSkillsBench.md | +| telemetry, analytics, disable analytics, opt out, tracking, privacy | Telemetry | Workflows/Telemetry.md | +| orchestrate, autonomous, full loop, improve all skills, run selftune loop | Orchestrate | Workflows/Orchestrate.md | +| sync, refresh, source truth, rescan sessions | Sync | Workflows/Sync.md | +| badge, readme badge, skill badge, health badge | Badge | Workflows/Badge.md | +| workflows, discover workflows, list workflows, multi-skill workflows | Workflows | Workflows/Workflows.md | +| alpha upload, upload data, send alpha data, manual upload, dry run upload | AlphaUpload | Workflows/AlphaUpload.md | +| export, dump, jsonl, export sqlite, debug export | Export | *(direct command — no workflow file)* | +| status, health summary, skill health, how are skills, skills doing, run selftune | Status | *(direct command — no workflow file)* | +| last, last session, recent session, what happened, what changed | Last | *(direct command — no workflow file)* | + +Workflows Grade, Evolve, Watch, and Ingest also run autonomously via `selftune orchestrate`. ## Interactive Configuration -Before running mutating workflows (evolve, evolve-body, evals, baseline), present -a pre-flight configuration prompt to the user. 
This gives them control over -execution mode, model selection, and key parameters. +Before running mutating workflows (evolve, evolve-body, evals, baseline), consult +`references/interactive-config.md` for the pre-flight configuration pattern, model +tier reference, and quick-path rules. -### Pre-Flight Pattern +## The Feedback Loop -Each mutating workflow has a **Pre-Flight Configuration** step. Follow this pattern: +The core idea: observe how users actually talk, find where skills miss, propose +better descriptions, validate them, and deploy — with automatic rollback if things +get worse. Every step produces evidence so you can explain *why* a change was made. -1. Present a brief summary of what the command will do -2. Use the `AskUserQuestion` tool to present structured options (max 4 questions per call — split into multiple calls if needed). Mark recommended defaults in option text with `(recommended)`. -3. Parse the user's selections from the tool response -4. Show a confirmation summary of selected options before executing +```text +Observe --> Detect --> Diagnose --> Propose --> Validate --> Audit --> Deploy --> Watch --> Rollback + | | + +--------------------------------------------------------------------+ +``` -**IMPORTANT:** Always use `AskUserQuestion` for pre-flight — never present options as inline numbered text. The tool provides a structured UI that is easier for users to interact with. If `AskUserQuestion` is not available, fall back to inline numbered options. +1. **Observe** — Hooks capture every session (queries, triggers, metrics) +2. **Detect** — `selftune eval generate` extracts missed-trigger patterns +3. **Diagnose** — `selftune grade` evaluates session quality with evidence +4. **Propose** — `selftune evolve` generates description improvements +5. **Validate** — Evolution is tested against the eval set before deploying +6. **Audit** — Persist proposal, evidence, and decision metadata for traceability +7. 
**Deploy** — Updated description replaces the original (backup kept) +8. **Watch** — `selftune watch` monitors for regressions post-deploy +9. **Rollback** — `selftune evolve rollback` restores previous version if needed -### Model Tier Reference +## Specialized Agents -When presenting model choices, use this table: +selftune bundles focused agents in `agents/`. When you need deeper analysis, +read the relevant agent file and follow its instructions — either inline or +by spawning a subagent with those instructions as its prompt. -| Tier | Model | Speed | Cost | Quality | Best for | -|------|-------|-------|------|---------|----------| -| Fast | `haiku` | ~2s/call | $ | Good | Iteration loops, bulk validation | -| Balanced | `sonnet` | ~5s/call | $$ | Great | Single-pass proposals, gate checks | -| Best | `opus` | ~10s/call | $$$ | Excellent | High-stakes final validation | +Treat these as worker-style subagents: +- pass the required inputs from the parent agent +- expect a structured report back +- do not have them question the user directly unless you explicitly want that -### Quick Path +| Trigger keywords | Agent file | When to use | +|------------------|-----------|-------------| +| diagnose, root cause, why failing, debug performance | `agents/diagnosis-analyst.md` | When one skill has recurring low grades, regressions, or unclear failures after basic doctor/status review | +| patterns, conflicts, cross-skill, overlap, optimize skills | `agents/pattern-analyst.md` | When multiple skills may overlap, misroute, or interfere, especially after composability flags conflict | +| review evolution, check proposal, safe to deploy | `agents/evolution-reviewer.md` | Before deploying a dry-run or pending proposal, especially for high-stakes skills or marginal improvements | +| set up selftune, integrate, configure project | `agents/integration-guide.md` | For complex setup and verification work in monorepos, multi-skill repos, or mixed-platform environments | -If the 
user says "use defaults", "just do it", or similar — skip the pre-flight -and run with recommended defaults. The pre-flight is for users who want control, -not a mandatory gate. +## Examples -### Workflows That Skip Pre-Flight +### Scenario 1: First-time setup -These read-only or simple workflows run immediately without prompting: -`status`, `last`, `doctor`, `dashboard`, `watch`, `evolve rollback`, -`grade auto`, `ingest *`, `contribute`, `cron`, `eval composability`, -`eval unit-test`, `eval import`. +User says: "Set up selftune" or "Install selftune" -## The Feedback Loop +Actions: +1. Read `Workflows/Initialize.md` +2. Run `selftune init` to bootstrap config (hooks are installed automatically) +3. Run `selftune doctor` to verify -```text -Observe --> Detect --> Diagnose --> Propose --> Validate --> Audit --> Deploy --> Watch --> Rollback - | | - +--------------------------------------------------------------------+ -``` +Result: Config at `~/.selftune/config.json`, hooks active, ready for session capture. -1. **Observe** -- Hooks capture every session (queries, triggers, metrics) -2. **Detect** -- `selftune eval generate` extracts missed-trigger patterns across invocation types -3. **Diagnose** -- `selftune grade` evaluates session quality with evidence -4. **Propose** -- `selftune evolve` generates description improvements -5. **Validate** -- Evolution is tested against the eval set -6. **Audit** -- Persist proposal, evidence, and decision metadata for traceability -7. **Deploy** -- Updated description replaces the original (with backup) -8. **Watch** -- `selftune watch` monitors for regressions post-deploy -9. 
**Rollback** -- `selftune evolve rollback` restores the previous version when regressions are detected +### Scenario 2: Improve a skill -## Resource Index +User says: "Make the pptx skill catch more queries" or "Evolve the Research skill" -| Resource | Purpose | -|----------|---------| -| `SKILL.md` | This file -- routing, triggers, quick reference | -| `references/logs.md` | Log file formats (telemetry, usage, queries, audit) | -| `references/grading-methodology.md` | 3-tier grading model, evidence standards, grading.json schema | -| `references/invocation-taxonomy.md` | 4 invocation types, coverage analysis, evolution connection | -| `settings_snippet.json` | Claude Code hook configuration template | -| `Workflows/Initialize.md` | First-time setup and config bootstrap | -| `Workflows/Grade.md` | Grade a session with expectations and evidence | -| `Workflows/Evals.md` | Generate eval sets, list skills, show stats | -| `Workflows/Evolve.md` | Evolve a skill description from failure patterns | -| `Workflows/Rollback.md` | Undo an evolution, restore previous description | -| `Workflows/Watch.md` | Post-deploy regression monitoring | -| `Workflows/Doctor.md` | Health checks on logs, hooks, schema | -| `Workflows/Ingest.md` | Import sessions from Codex, OpenCode, and OpenClaw | -| `Workflows/Replay.md` | Backfill logs from Claude Code transcripts | -| `Workflows/Contribute.md` | Export anonymized data for community contribution | -| `Workflows/Cron.md` | Scheduling & automation (cron/launchd/systemd/OpenClaw) | -| `Workflows/AutoActivation.md` | Auto-activation hook behavior and rules | -| `Workflows/Dashboard.md` | Dashboard modes: static, export, live server | -| `Workflows/EvolutionMemory.md` | Evolution memory system for session continuity | -| `Workflows/EvolveBody.md` | Full body and routing table evolution | -| `Workflows/Baseline.md` | No-skill baseline comparison and lift measurement | -| `Workflows/UnitTest.md` | Skill-level unit test runner and generator | 
-| `Workflows/Composability.md` | Multi-skill co-occurrence conflict analysis | -| `Workflows/ImportSkillsBench.md` | SkillsBench task corpus importer | -| `Workflows/Telemetry.md` | Telemetry status, opt-in/opt-out, and privacy | +Actions: +1. `selftune eval generate --skill pptx` to find missed triggers +2. `selftune evolve --skill pptx --skill-path ` to propose changes +3. `selftune watch --skill pptx --skill-path ` to monitor post-deploy -## Specialized Agents +Result: Skill description updated to match real user language, with rollback available. -selftune provides focused agents for deeper analysis. These live in -`.claude/agents/` and can be spawned as subagents for specialized tasks. +### Scenario 3: Check skill health -| Trigger keywords | Agent | Purpose | When to spawn | -|------------------|-------|---------|---------------| -| diagnose, root cause, why failing, skill failure, debug performance | diagnosis-analyst | Deep-dive analysis of underperforming skills | After doctor finds persistent issues, grades are consistently low, or status shows CRITICAL/WARNING | -| patterns, conflicts, cross-skill, overlap, trigger conflicts, optimize skills | pattern-analyst | Cross-skill pattern analysis and conflict detection | When user asks about cross-skill conflicts or composability scores indicate moderate-to-severe conflicts | -| review evolution, check proposal, safe to deploy, approve evolution | evolution-reviewer | Safety gate review of pending evolution proposals | Before deploying an evolution in interactive mode, especially for high-stakes or low-confidence proposals | -| set up selftune, integrate, configure project, install selftune | integration-guide | Guided interactive setup for specific project types | For complex project structures (monorepo, multi-skill, mixed agent platforms) | +User says: "How are my skills doing?" or "Run selftune" -## Examples +Actions: +1. `selftune status` for overall health summary +2. 
`selftune last` for most recent session insight +3. `selftune doctor` if issues detected + +Result: Pass rates, trend data, and actionable recommendations. + +### Scenario 4: Autonomous operation + +User says: "Set up cron jobs" or "Run selftune automatically" + +Actions: +1. `selftune cron setup` to install OS-level scheduling +2. Orchestrate loop runs: ingest → grade → evolve → watch + +Result: Skills improve continuously without manual intervention. + +## Troubleshooting + +### CLI not found + +Error: `command not found: selftune` + +Cause: CLI not installed or not on PATH. + +Solution: +1. Run `npm install -g selftune` or check `bin/selftune.cjs` exists +2. Verify with `which selftune` +3. If using bun: `bun link` in the repo root -- "Grade my last pptx session" -- "What skills are undertriggering?" -- "Generate evals for the pptx skill" -- "Evolve the pptx skill to catch more queries" -- "Rollback the last evolution" -- "Is the skill performing well after the change?" -- "Check selftune health" -- "Ingest my codex logs" -- "Show me skill stats" -- "How are my skills performing?" -- "What happened in my last session?" -- "Open the selftune dashboard" -- "Serve the dashboard at http://localhost:3141" -- "Show skill health status" -- "Replay my Claude Code transcripts" -- "Backfill logs from historical sessions" -- "Contribute my selftune data to the community" -- "Share anonymized skill data" -- "Set up cron jobs for autonomous evolution" -- "Schedule selftune to run automatically" -- "Ingest my OpenClaw sessions" -- "Why is selftune suggesting things?" -- "Customize activation rules" -- "Start the live dashboard" -- "Serve the dashboard on port 8080" -- "What happened in the last evolution?" -- "Read the evolution memory" -- "Why is this skill underperforming?" -- "Are there conflicts between my skills?" 
-- "Review this evolution before deploying" -- "Set up selftune for my project" -- "Evolve the full body of the Research skill" -- "Rewrite the routing table for pptx" -- "Does this skill add value over no-skill baseline?" -- "Measure baseline lift for the Research skill" -- "Generate unit tests for the pptx skill" -- "Run skill unit tests" -- "Which skills conflict with each other?" -- "Analyze composability for the Research skill" -- "Import SkillsBench tasks for my skill" -- "Install selftune" -- "Configure selftune for this project" -- "Make my skills better" -- "Optimize my skills" -- "Are my skills working?" -- "Show me the dashboard" -- "What changed since last time?" -- "What did selftune do?" -- "Run selftune" -- "Start selftune" -- "Go back to the previous version" -- "Undo the last change" +### No sessions to grade + +Error: `selftune grade` returns empty results. + +Cause: Hooks not capturing sessions, or no sessions since last ingest. + +Solution: +1. Run `selftune doctor` to verify hook installation +2. Run `selftune ingest claude --force` to re-ingest +3. Check `~/.claude/` for telemetry JSONL files + +### Evolution proposes no changes + +Cause: Eval set too small or skill already well-tuned. + +Solution: +1. Run `selftune eval generate --skill --max 50` for a larger eval set +2. Check `selftune status` — if pass rate is >90%, evolution may not be needed +3. Try `selftune evolve body` for deeper structural changes + +### Dashboard won't serve + +Error: Port already in use or blank page. + +Solution: +1. Try a different port: `selftune dashboard --port 3142` +2. Check if another process holds the port: `lsof -i :3141` +3. 
Use `--no-open` to start the server without opening a browser ## Negative Examples -These should NOT trigger selftune: +These should NOT trigger selftune — note that several are near-misses that +share keywords but need different solutions: -- "Fix this React hydration bug" -- "Create a PowerPoint about Q3 results" (this is pptx, not selftune) -- "Run my unit tests" -- "What does this error mean?" +- "Fix this React hydration bug" — general debugging, not skill improvement +- "Create a PowerPoint about Q3 results" — this is pptx skill, not selftune +- "Run my unit tests" — project tests, not skill eval tests (even though selftune has "eval unit-test", this is about *project* tests) +- "How do I use the Research skill?" — skill *usage*, not skill *improvement* (route to the Research skill itself) +- "Generate a report from this data" — content generation, not skill evolution +- "My build is failing" — project issue, not selftune health issue (even though "failing" overlaps with skill diagnostics language) +- "Evaluate this code for security issues" — "evaluate" here means code review, not session grading +- "Improve this function's performance" — code optimization, not skill optimization (even though "improve" and "performance" are selftune keywords) + +The key distinction: selftune is about improving *skills themselves* (their +descriptions, triggers, and execution quality). If the user is trying to +accomplish a task *using* a skill, route to that skill instead. + +## Resource Index -Route to other skills or general workflows unless the user explicitly -asks about grading, evals, evolution, monitoring, or skill observability. 
+| Resource | Purpose | When to read | +|----------|---------|--------------| +| `SKILL.md` | This file — routing, triggers, quick reference | Always loaded | +| `Workflows/*.md` | Step-by-step instructions for each workflow | When routing to a workflow | +| `agents/diagnosis-analyst.md` | Deep-dive skill failure analysis | Spawn when doctor/grades show persistent issues | +| `agents/pattern-analyst.md` | Cross-skill conflict detection | Spawn when composability flags conflicts | +| `agents/evolution-reviewer.md` | Safety gate for evolution proposals | Spawn before deploying high-stakes evolutions | +| `agents/integration-guide.md` | Guided setup for complex projects | Spawn for monorepos, multi-skill setups | +| `references/logs.md` | Log file formats (telemetry, usage, queries, audit) | When parsing or debugging log files | +| `references/grading-methodology.md` | 3-tier grading model, evidence standards | When grading sessions or interpreting grades | +| `references/invocation-taxonomy.md` | 4 invocation types, coverage analysis | When analyzing trigger coverage | +| `references/interactive-config.md` | Pre-flight config pattern, model tiers | Before running mutating workflows | +| `references/setup-patterns.md` | Platform-specific setup patterns | During complex setup scenarios | +| `settings_snippet.json` | Claude Code hook configuration template | During initialization | +| `assets/*.json` | Config templates (activation rules, settings) | During initialization | diff --git a/skill/Workflows/AlphaUpload.md b/skill/Workflows/AlphaUpload.md new file mode 100644 index 00000000..30e533ea --- /dev/null +++ b/skill/Workflows/AlphaUpload.md @@ -0,0 +1,45 @@ +# AlphaUpload Workflow + +Use this workflow when the user or parent agent wants to manually push alpha data +to the selftune cloud or preview what would be sent. 
+ +## Command + +```bash +selftune alpha upload [--dry-run] +``` + +## Flags + +| Flag | Meaning | Default | +|------|---------|---------| +| `--dry-run` | Stage and summarize the upload without sending the HTTP request | Off | +| `-h`, `--help` | Show command help | Off | + +## Behavior + +1. Read the local alpha identity from `~/.selftune/config.json` +2. Fail with guidance if alpha is not enrolled or the API key is missing +3. Stage new canonical records from local SQLite into `canonical_upload_staging` +4. Build V2 push envelopes and flush them to the cloud API +5. Print a JSON summary with `enrolled`, `prepared`, `sent`, `failed`, `skipped`, and optional `guidance` + +## Examples + +Preview the upload: + +```bash +selftune alpha upload --dry-run +``` + +Run the upload now: + +```bash +selftune alpha upload +``` + +## When To Use + +- The user wants to manually push data before waiting for `orchestrate` +- `selftune status` or `selftune doctor` shows failed or pending alpha uploads +- You want to confirm what will be uploaded without sending data yet diff --git a/skill/Workflows/Composability.md b/skill/Workflows/Composability.md index 17507fc0..848724f9 100644 --- a/skill/Workflows/Composability.md +++ b/skill/Workflows/Composability.md @@ -88,9 +88,9 @@ When conflict candidates are identified, present them to the user with recommend ## Subagent Escalation For deep cross-skill analysis beyond what the composability command provides, -spawn the `pattern-analyst` agent as a subagent. This is useful when conflict -scores are high (> 0.3) and you need a full resolution plan with trigger -ownership recommendations. +read `skill/agents/pattern-analyst.md` and spawn a subagent with those instructions. +This is useful when conflict scores are high (> 0.3) and you need a full +resolution plan with trigger ownership recommendations. 
## Common Patterns diff --git a/skill/Workflows/Dashboard.md b/skill/Workflows/Dashboard.md index 4f77d1ed..60ef4fa9 100644 --- a/skill/Workflows/Dashboard.md +++ b/skill/Workflows/Dashboard.md @@ -1,8 +1,8 @@ # selftune Dashboard Workflow Visual dashboard for selftune telemetry, skill performance, evolution -audit, and monitoring data. Supports static HTML export, file output, -and a live server with SSE-based real-time updates and action buttons. +audit, and monitoring data. Starts a local SPA server with SSE-based +real-time updates and action buttons. ## Default Command @@ -10,60 +10,26 @@ and a live server with SSE-based real-time updates and action buttons. selftune dashboard ``` -Opens a standalone HTML dashboard in the default browser with embedded -data from all selftune log files. +Starts a Bun HTTP server with a React SPA dashboard and opens it in the +default browser. The dashboard reads SQLite directly, but the current +live-update invalidation path still watches JSONL logs and pushes +updates via Server-Sent Events (SSE). That means the dashboard usually +refreshes quickly, but SQLite-only writes can still lag until the WAL +cutover lands. TanStack Query polling (60s) acts as a fallback. Action +buttons trigger selftune commands directly from the dashboard. Use +`selftune export` to generate JSONL from SQLite for debugging or +offline analysis. 
## Options | Flag | Description | Default | |------|-------------|---------| -| `--export` | Export data-embedded HTML to stdout | Off | -| `--out FILE` | Write data-embedded HTML to FILE | None | -| `--serve` | Start live dashboard server | Off | -| `--port ` | Custom port for live server (requires `--serve`) | 3141 | +| `--port ` | Custom port for the server | 3141 | +| `--no-open` | Start server without opening browser | Off | +| `--serve` | *(Deprecated)* Alias for default behavior | — | -## Modes - -### Static (Default) - -Builds an HTML file with all telemetry data embedded as JSON, saves it -to `~/.selftune/dashboard.html`, and opens it in the default browser. -The data is a point-in-time snapshot -- refresh by re-running the command. - -```bash -selftune dashboard -``` - -### Export - -Writes the same data-embedded HTML to stdout. Useful for piping to other -tools or capturing output programmatically. - -```bash -selftune dashboard --export > dashboard.html -``` - -### File - -Writes the data-embedded HTML to a specific file path. - -```bash -selftune dashboard --out /tmp/report.html -``` - -### Live Server - -Starts a Bun HTTP server with a React SPA dashboard. The server watches -SQLite WAL file changes and pushes updates via Server-Sent Events (SSE), -so new invocations and session data appear within ~1 second. TanStack -Query polling (60s) acts as a fallback. Action buttons trigger selftune -commands directly from the dashboard. Use `selftune export` to generate -JSONL from SQLite for debugging or offline analysis. - -```bash -selftune dashboard --serve -selftune dashboard --serve --port 8080 -``` +Note: `--export` and `--out` were removed. The CLI will error if used, +suggesting `selftune dashboard` instead. ## Live Server @@ -90,9 +56,11 @@ override. ### Live Updates (SSE) The dashboard connects to `/api/v2/events` via Server-Sent Events. 
-When the SQLite WAL file changes on disk, the server broadcasts an +When watched JSONL log files change on disk, the server broadcasts an `update` event. The SPA invalidates all cached queries, triggering -immediate refetches. New data appears within ~1s. +immediate refetches. New data usually appears quickly, but the runtime +footer and Status page will warn when the server is still in this +legacy JSONL watcher mode. TanStack Query polling (60s) acts as a fallback safety net in case the SSE connection drops. Data also refreshes on window focus. @@ -149,45 +117,32 @@ The dashboard displays data from these sources: | Data | Source | Description | |------|--------|-------------| -| Telemetry | `session_telemetry_log.jsonl` | Session-level telemetry records | -| Skills | `skill_usage_log.jsonl` | Skill activation and usage events | -| Queries | `all_queries_log.jsonl` | All user queries across sessions | -| Evolution | `evolution_audit_log.jsonl` | Evolution audit trail (create, deploy, rollback) | +| Telemetry | SQLite (`~/.selftune/selftune.db`) | Session-level telemetry records | +| Skills | SQLite (`~/.selftune/selftune.db`) | Skill activation and usage events | +| Queries | SQLite (`~/.selftune/selftune.db`) | All user queries across sessions | +| Evolution | SQLite (`~/.selftune/selftune.db`) | Evolution audit trail (create, deploy, rollback) | | Decisions | `~/.selftune/memory/` | Evolution decision records | | Snapshots | Computed | Per-skill monitoring snapshots (pass rate, regression status) | | Unmatched | Computed | Queries that did not trigger any skill | | Pending | Computed | Evolution proposals not yet deployed, rejected, or rolled back | -If no log data is found, the static modes exit with an error message -listing the checked file paths. +If no log data is found, the server reports an error listing the +checked file paths. ## Steps -### 1. 
Choose Mode - -| Goal | Command | -|------|---------| -| Quick visual check | `selftune dashboard` | -| Save report to file | `selftune dashboard --out report.html` | -| Pipe to another tool | `selftune dashboard --export` | -| Live monitoring | `selftune dashboard --serve` | - -### 2. Run Command +### 1. Run Dashboard ```bash -# Static (opens browser) selftune dashboard - -# Live server -selftune dashboard --serve +selftune dashboard --port 8080 +selftune dashboard --no-open ``` -### 3. Interact with Dashboard +### 2. Interact with Dashboard -- **Static mode**: View the snapshot. Re-run to refresh. -- **Live mode**: Data refreshes in real time via SSE (~1s latency). - Use action buttons to trigger watch, evolve, or rollback directly from - the dashboard. +Data refreshes in real time via SSE (~1s latency). Use action buttons +to trigger watch, evolve, or rollback directly from the dashboard. ## Common Patterns @@ -196,12 +151,8 @@ selftune dashboard --serve > Report to the user that the dashboard is open. **User wants live monitoring** -> Run `selftune dashboard --serve`. Inform the user that data updates -> in real time via SSE (~1 second latency). - -**User wants a shareable report** -> Run `selftune dashboard --out report.html`. Report the file path to the -> user. The HTML file is self-contained with all data embedded. +> Run `selftune dashboard`. The server provides real-time updates via SSE +> (~1 second latency). **Dashboard shows no data** > Run `selftune doctor` to verify hooks are installed. If hooks are missing, @@ -209,8 +160,8 @@ selftune dashboard --serve > have run, inform the user that sessions must generate telemetry first. **User wants a different port** -> Run `selftune dashboard --serve --port `. Port must be 1-65535. +> Run `selftune dashboard --port `. Port must be 1-65535. **User wants to trigger actions from the dashboard** -> Run `selftune dashboard --serve` for live mode. 
The dashboard provides -> action buttons for watch, evolve, and rollback per skill via POST endpoints. +> Run `selftune dashboard`. The dashboard provides action buttons for +> watch, evolve, and rollback per skill via POST endpoints. diff --git a/skill/Workflows/Doctor.md b/skill/Workflows/Doctor.md index e7a1d76e..2db855d0 100644 --- a/skill/Workflows/Doctor.md +++ b/skill/Workflows/Doctor.md @@ -17,34 +17,57 @@ None. Doctor runs all checks unconditionally. ```json { - "healthy": true, + "command": "doctor", + "timestamp": "2026-02-28T10:00:00Z", "checks": [ { - "name": "session_telemetry_log exists", + "name": "config", + "path": "/Users/you/.selftune/config.json", "status": "pass", - "detail": "Found 142 entries" + "message": "Valid config with agent_type and llm_mode" }, { - "name": "skill_usage_log parseable", + "name": "log_session_telemetry", + "path": "/Users/you/.claude/session_telemetry_log.jsonl", "status": "pass", - "detail": "All 89 entries valid JSON" + "message": "Found 142 entries" }, { - "name": "hooks installed", + "name": "hook_settings", + "path": "/Users/you/.claude/settings.json", "status": "fail", - "detail": "PostToolUse hook not found in ~/.claude/settings.json" + "message": "PostToolUse hook not found in ~/.claude/settings.json" + }, + { + "name": "dashboard_freshness_mode", + "status": "warn", + "message": "Dashboard still uses legacy JSONL watcher invalidation" } ], "summary": { - "passed": 5, - "failed": 1, - "total": 6 - } + "pass": 8, + "fail": 1, + "warn": 1, + "total": 10 + }, + "healthy": false } ``` The process exits with code 0 if `healthy: true`, code 1 otherwise. 
+Failed or warning checks may include a machine-readable `guidance` object: + +```json +{ + "code": "config_missing", + "message": "selftune is not initialized yet.", + "next_command": "selftune init", + "suggested_commands": ["selftune doctor"], + "blocking": true +} +``` + ## Parsing Instructions ### Check Overall Health @@ -57,69 +80,64 @@ The process exits with code 0 if `healthy: true`, code 1 otherwise. ### Find Failed Checks ```bash -# Parse: .checks[] | select(.status == "fail") | { name, detail } +# Parse: .checks[] | select(.status == "fail") | { name, message } ``` ### Get Summary Counts ```bash -# Parse: .summary.passed, .summary.failed, .summary.total +# Parse: .summary.pass, .summary.fail, .summary.warn, .summary.total ``` ## Health Checks -Doctor validates these areas: +Doctor validates these baseline areas (10 checks total), and adds alpha cloud-link +or queue checks when alpha is configured: -### Log File Checks +### Config Check -| Check | What it validates | -|-------|-------------------| -| Log files exist | `session_telemetry_log.jsonl`, `skill_usage_log.jsonl`, `all_queries_log.jsonl` exist in `~/.claude/` | -| Logs are parseable | Every line in each log file is valid JSON | -| Schema conformance | Required fields present per log type (see `references/logs.md`) | +| Check name | What it validates | +|------------|-------------------| +| `config` | `~/.selftune/config.json` exists, is valid JSON, contains `agent_type` and `llm_mode` fields | -### Hook Checks +### Log Checks (4 checks) -| Check | What it validates | -|-------|-------------------| -| Hooks installed | `UserPromptSubmit`, `PreToolUse`, `PostToolUse`, and `Stop` hooks are configured in `~/.claude/settings.json` | -| Hook scripts exist | The script files referenced by hooks exist on disk | -| Auto-activate hook | `hooks/auto-activate.ts` is registered in `UserPromptSubmit` and the file is executable | -| Evolution guard hook | `hooks/evolution-guard.ts` is registered in `PreToolUse` 
and the file exists | +| Check name | What it validates | +|------------|-------------------| +| `log_session_telemetry` | `session_telemetry_log.jsonl` exists and is parseable | +| `log_skill_usage` | `skill_usage_log.jsonl` exists and is parseable | +| `log_all_queries` | `all_queries_log.jsonl` exists and is parseable | +| `log_evolution_audit` | `evolution_audit_log.jsonl` exists and is parseable | -### Memory Checks +### Hook Check -| Check | What it validates | -|-------|-------------------| -| Memory directory exists | `~/.selftune/memory/` directory is present | -| Memory files valid | `context.md`, `decisions.md`, `plan.md` exist and are non-empty (if previously written) | +| Check name | What it validates | +|------------|-------------------| +| `hook_settings` | `~/.claude/settings.json` has selftune hooks configured | -### Activation Rules Checks +### Evolution Check -| Check | What it validates | -|-------|-------------------| -| Rules file exists | `~/.selftune/activation-rules.json` is present | -| Rules file valid | The file contains valid JSON conforming to the activation rules schema | +| Check name | What it validates | +|------------|-------------------| +| `evolution_audit` | Evolution audit log entries have valid structure | -### Agent Checks +### Integrity Check -| Check | What it validates | -|-------|-------------------| -| Optional agent directory exists | If `.claude/agents/` is present, it is readable | -| Optional agent files present | If the repo bundles helper agents, the expected files are present | +| Check name | What it validates | +|------------|-------------------| +| `dashboard_freshness_mode` | Warns when the dashboard still relies on legacy JSONL watcher invalidation instead of SQLite WAL live refresh | -### Dashboard Checks (optional) +### Skill Version Sync Check -| Check | What it validates | -|-------|-------------------| -| Dashboard server accessible | `dashboard-server.ts` exists in the CLI directory | +| Check name | 
What it validates | +|------------|-------------------| +| `skill_version_sync` | SKILL.md frontmatter version matches package.json version | -### Evolution Audit Checks +### Version Check -| Check | What it validates | -|-------|-------------------| -| Audit log integrity | `evolution_audit_log.jsonl` entries have required fields (`timestamp`, `proposal_id`, `action`) | -| Valid action values | All entries use known action types: `created`, `validated`, `deployed`, `rolled_back` | +| Check name | What it validates | +|------------|-------------------| +| `version_up_to_date` | Installed version matches latest on npm registry | ## Steps @@ -139,18 +157,13 @@ For each failed check, take the appropriate action: | Failed check | Fix | |-------------|-----| -| Log files missing | Run a session to generate initial log entries. Check hook installation. | -| Logs not parseable | Inspect the corrupted log file. Remove or fix invalid lines. | -| Hooks not installed | Merge `skill/settings_snippet.json` into `~/.claude/settings.json`. Update paths. | -| Hook scripts missing | Verify the selftune repo path. Re-run `init` if the repo was moved. | -| Auto-activate missing | Add `hooks/auto-activate.ts` to `UserPromptSubmit` in settings. | -| Evolution guard missing | Add `hooks/evolution-guard.ts` to `PreToolUse` in settings. | -| Memory directory missing | Run `mkdir -p ~/.selftune/memory`. | -| Memory files invalid | Delete and let the memory writer recreate them on next evolve/watch. | -| Activation rules missing | Copy `assets/activation-rules-default.json` to `~/.selftune/activation-rules.json`. | -| Activation rules invalid | Validate JSON syntax. Re-copy from template if corrupted. | -| Agent files missing | If your repo uses optional helper agents, restore them in `.claude/agents/`. Otherwise ignore this advisory. | -| Audit log invalid | Remove corrupted entries. Future operations will append clean entries. 
| +| `config` | Run `selftune init` (or `selftune init --force` to regenerate). | +| `log_*` | Run a session to generate initial log entries. Check hook installation with `selftune init`. | +| `hook_settings` | Run `selftune init` to install hooks into `~/.claude/settings.json`. | +| `evolution_audit` | Remove corrupted entries. Future operations will append clean entries. | +| `dashboard_freshness_mode` | This is an operator warning, not a broken install. Expect possible freshness gaps for SQLite-only writes and export before destructive recovery. | +| `skill_version_sync` | Run `bun run sync-version` to stamp SKILL.md from package.json. | +| `version_up_to_date` | Run `npm install -g selftune` to update. | ### 4. Re-run Doctor @@ -159,14 +172,28 @@ After fixes, run doctor again to verify all checks pass. ## Subagent Escalation If doctor reveals persistent issues with a specific skill — especially -recurring failures that basic fixes do not resolve — spawn the -`diagnosis-analyst` agent as a subagent for root cause analysis. +recurring failures that basic fixes do not resolve — read +`skill/agents/diagnosis-analyst.md` and spawn a subagent with those instructions +for root cause analysis. + +### Alpha Upload Not Active + +**Symptoms:** `selftune status` shows alpha upload as "not enrolled" or "enrolled (missing credential)" + +**Diagnostic steps:** +1. Check `selftune status` — look at "Alpha Upload" and "Cloud link" lines +2. If `doctor` includes a `cloud_link` or alpha queue warning, prefer `.checks[].guidance.next_command` +3. If "not enrolled" or "not linked": run `selftune init --alpha --alpha-email --alpha-key ` +4. If "enrolled (missing credential)": re-run `selftune init --alpha --alpha-email --alpha-key --force` +5. If "api_key has invalid format": credential must start with `st_live_` or `st_test_` + +**Resolution:** Follow the setup sequence in Initialize workflow → Alpha Enrollment section. 
## Common Patterns **User reports something seems broken** > Run `selftune doctor`. Parse the JSON output for failed checks. Report -> each failure's `name` and `detail` to the user with the recommended fix. +> each failure's `name` and `message` to the user with the recommended fix. **User asks if hooks are working** > Run `selftune doctor`. Parse `.checks[]` for hook-related entries. If diff --git a/skill/Workflows/Evals.md b/skill/Workflows/Evals.md index ac657a99..5fe072a6 100644 --- a/skill/Workflows/Evals.md +++ b/skill/Workflows/Evals.md @@ -26,9 +26,14 @@ selftune eval generate --skill [options] | `--skill ` | Skill to generate evals for | Required (unless `--list-skills`) | | `--list-skills` | List all logged skills with query counts | Off | | `--stats` | Show aggregate telemetry stats for the skill | Off | -| `--max ` | Maximum eval entries to generate | 50 | -| `--seed ` | Random seed for negative sampling | Random | -| `--out ` | Output file path | `evals-.json` | +| `--max ` | Maximum eval entries per side | 50 | +| `--seed ` | Seed for deterministic shuffling | 42 | +| `--output ` / `--out ` | Output file path | `{skillName}_trigger_eval.json` | +| `--no-negatives` | Exclude negative examples from output | Off | +| `--no-taxonomy` | Skip invocation_type classification | Off | +| `--skill-log ` | Path to skill_usage_log.jsonl | Default log path | +| `--query-log ` | Path to all_queries_log.jsonl | Default log path | +| `--telemetry-log ` | Path to session_telemetry_log.jsonl | Default log path | | `--synthetic` | Generate evals from SKILL.md via LLM (no logs needed) | Off | | `--skill-path ` | Path to SKILL.md (required with `--synthetic`) | — | | `--model ` | LLM model to use for synthetic generation | Agent default | @@ -40,24 +45,20 @@ selftune eval generate --skill [options] ```json [ { - "id": 1, "query": "Make me a slide deck for the Q3 board meeting", - "expected": true, - "invocation_type": "contextual", - "skill_name": "pptx", - 
"source_session": "abc123" + "should_trigger": true, + "invocation_type": "contextual" }, { - "id": 2, "query": "What format should I use for a presentation?", - "expected": false, - "invocation_type": "negative", - "skill_name": "pptx", - "source_session": null + "should_trigger": false } ] ``` +Each entry has `query` (string, max 500 chars), `should_trigger` (boolean), +and optional `invocation_type` (omitted when `--no-taxonomy` is set). + ### List Skills ```json @@ -93,14 +94,14 @@ selftune eval generate --skill [options] ### Find Missed Queries (False Negatives) ```bash -# Parse: .[] | select(.expected == true and .invocation_type != "explicit") +# Parse: .[] | select(.should_trigger == true and .invocation_type != "explicit") # These are queries that should trigger but might be missed ``` ### Get Negative Examples ```bash -# Parse: .[] | select(.expected == false) +# Parse: .[] | select(.should_trigger == false) ``` ## Sub-Workflows @@ -126,10 +127,16 @@ selftune eval generate --skill pptx --synthetic --skill-path /path/to/skills/ppt The command: 1. Reads the SKILL.md file content -2. Sends it to an LLM with a prompt requesting realistic test queries -3. Parses the response into eval entries with invocation type annotations -4. Classifies each positive query using the deterministic `classifyInvocation()` heuristic -5. Writes the eval set to the output file +2. Loads real user queries from the database (if available) as few-shot style examples so synthetic queries match real phrasing patterns +3. Sends skill content and real examples to an LLM with a prompt requesting realistic test queries +4. Parses the response into eval entries with invocation type annotations +5. Classifies each positive query using the deterministic `classifyInvocation()` heuristic +6. 
Writes the eval set to the output file + +**Note:** When real query data exists in the database, synthetic generation +automatically includes high-confidence positive triggers and general queries as +phrasing references. This produces more natural-sounding eval queries. If no +database is available, generation proceeds without real examples (fail-open). Use `--model` to override the default LLM model: @@ -144,7 +151,7 @@ Cross-reference `skill_usage_log.jsonl` (positive triggers) against an eval set annotated with invocation types. ```bash -selftune eval generate --skill pptx --max 50 --out evals-pptx.json +selftune eval generate --skill pptx --max 50 --output evals-pptx.json ``` The command: diff --git a/skill/Workflows/Evolve.md b/skill/Workflows/Evolve.md index a5721c7a..4666463a 100644 --- a/skill/Workflows/Evolve.md +++ b/skill/Workflows/Evolve.md @@ -30,7 +30,13 @@ selftune evolve --skill --skill-path [options] | `--confidence ` | Minimum confidence threshold (0-1) | 0.6 | | `--max-iterations ` | Maximum retry iterations | 3 | | `--validation-model ` | Model for trigger-check validation LLM calls | `haiku` | -| `--cheap-loop` | Use cheap models for loop, expensive for final gate | Off | +| `--pareto` | Generate multiple candidates per iteration | Off | +| `--candidates ` | Number of candidates per iteration (with `--pareto`) | 3 | +| `--token-efficiency` | Optimize for token efficiency in proposals | Off | +| `--with-baseline` | Include a no-skill baseline comparison | Off | +| `--cheap-loop` | Use cheap models for loop, expensive for final gate | On | +| `--full-model` | Use full-cost model throughout (disables cheap-loop) | Off | +| `--verbose` | Print detailed progress during evolution | Off | | `--gate-model ` | Model for final gate validation | `sonnet` (when `--cheap-loop`) | | `--proposal-model ` | Model for proposal generation LLM calls | None | | `--sync-first` | Refresh source-truth telemetry before generating evals/failure patterns | Off | @@ 
-197,6 +203,26 @@ The command groups missed queries by invocation type: See `references/invocation-taxonomy.md` for the taxonomy. +### 4b. Constitutional Pre-Validation Gate + +Before any LLM-based validation, each proposal passes through a +deterministic constitutional check that rejects obviously bad proposals +at zero cost. Four principles are enforced: + +1. **Size constraint** — description must be ≤1024 characters and within + 0.3x–3.0x word count of the original. +2. **No XML injection** — reject proposals containing XML/HTML tags. +3. **No unbounded broadening** — reject bare "all", "any", "every", + "everything" unless qualified by enumeration markers ("including", + "such as", "like", "e.g.", or a comma-separated list). +4. **Anchor preservation** — if the original contains `USE WHEN` trigger + phrases or `$skillName` references, those must appear in the proposal. + +If a proposal fails any principle, it is rejected with a descriptive +violation message and the pipeline retries (if iterations remain). + +For body evolution (`evolve body`), only the size constraint applies. + ### 5. Propose Description Changes An LLM generates a candidate description that would catch the missed @@ -215,6 +241,23 @@ The candidate is tested against the full eval set: If validation fails, the command retries up to `--max-iterations` times with adjusted proposals. 
+### Aggregate Metrics To Report + +When summarizing an evolution run, include these aggregate metrics rather +than only saying "passed" or "failed": + +| Metric | Meaning | +|--------|---------| +| `original_pass_rate` | Baseline pass rate before the proposal | +| `proposed_pass_rate` | Pass rate after applying the proposal | +| `regression_count` | Eval entries that passed before and failed after | +| `net_change` | Total passes gained minus regressions introduced | +| `iteration` / `iterations_used` | Which retry produced the current candidate | +| `baseline_lift` | Additional lift over the no-skill baseline when `--with-baseline` is enabled | + +These metrics explain whether the proposal is genuinely better, merely +different, or too risky to deploy. + ### 7. Deploy (or Preview) If `--dry-run`, the proposal is printed but not deployed. The audit log @@ -296,10 +339,11 @@ Use `--agent ` to override (claude, codex, opencode). ## Subagent Escalation -For high-stakes evolutions, consider spawning the `evolution-reviewer` agent -as a subagent to review the proposal before deploying. This is especially -valuable when the skill has a history of regressions, the evolution touches -many trigger phrases, or the confidence score is near the threshold. +For high-stakes evolutions, read `skill/agents/evolution-reviewer.md` and spawn a +subagent with those instructions to review the proposal before deploying. +This is especially valuable when the skill has a history of regressions, +the evolution touches many trigger phrases, or the confidence score is near +the threshold. 
## Autonomous Mode diff --git a/skill/Workflows/EvolveBody.md b/skill/Workflows/EvolveBody.md index 40e591ba..983d20e9 100644 --- a/skill/Workflows/EvolveBody.md +++ b/skill/Workflows/EvolveBody.md @@ -16,7 +16,7 @@ selftune evolve body --skill --skill-path --target [optio |------|-------------|---------| | `--skill ` | Skill name | Required | | `--skill-path ` | Path to the skill's SKILL.md | Required | -| `--target ` | Evolution target: `routing_table` or `full_body` | Required | +| `--target ` | Evolution target: `routing` or `body` | Required | | `--teacher-agent ` | Agent CLI for proposal generation | Auto-detected | | `--student-agent ` | Agent CLI for validation | Same as teacher | | `--teacher-model ` | Model flag for teacher (e.g. `opus`) | Agent default | @@ -30,13 +30,13 @@ selftune evolve body --skill --skill-path --target [optio ## Evolution Targets -### `routing_table` +### `routing` Optimizes the `## Workflow Routing` markdown table in SKILL.md. The teacher LLM analyzes missed triggers and proposes new routing entries that map trigger keywords to the correct workflow files. -### `full_body` +### `body` Rewrites the entire SKILL.md body below the frontmatter. This includes the description, routing table, examples, and all other sections. The @@ -93,7 +93,7 @@ After the user responds, show a confirmation summary: ``` Configuration Summary: - Target: routing_table + Target: routing Mode: dry-run Teacher model: sonnet Student model: haiku @@ -125,8 +125,8 @@ pipeline. See `references/invocation-taxonomy.md`. ### 4. Generate Proposal (Teacher) The teacher LLM generates a proposal based on the target: -- **routing_table**: Optimized `## Workflow Routing` markdown table -- **full_body**: Complete SKILL.md body replacement +- **routing**: Optimized `## Workflow Routing` markdown table +- **body**: Complete SKILL.md body replacement Few-shot examples from `--few-shot` paths provide structural guidance. 
@@ -139,20 +139,20 @@ failure details and generates a refined proposal. If `--dry-run`, prints the proposal without deploying. Otherwise: 1. Creates a timestamped backup of the current SKILL.md -2. Applies the change: `replaceSection()` for routing, `replaceBody()` for full_body +2. Applies the change: `replaceSection()` for routing, `replaceBody()` for body 3. Records audit entries 4. Updates evolution memory ## Common Patterns **"Evolve the routing table for the Research skill"** -> `selftune evolve body --skill Research --skill-path ~/.claude/skills/Research/SKILL.md --target routing_table` +> `selftune evolve body --skill Research --skill-path ~/.claude/skills/Research/SKILL.md --target routing` **"Rewrite the entire skill body"** -> `selftune evolve body --skill Research --skill-path ~/.claude/skills/Research/SKILL.md --target full_body --dry-run` +> `selftune evolve body --skill Research --skill-path ~/.claude/skills/Research/SKILL.md --target body --dry-run` **"Use a stronger model for generation"** -> `selftune evolve body --skill pptx --skill-path /path/SKILL.md --target full_body --teacher-model opus --student-model haiku` +> `selftune evolve body --skill pptx --skill-path /path/SKILL.md --target body --teacher-model opus --student-model haiku` **"Preview what would change"** > Always start with `--dry-run` to review the proposal before deploying. diff --git a/skill/Workflows/Initialize.md b/skill/Workflows/Initialize.md index 7ee2ee18..0194dfa7 100644 --- a/skill/Workflows/Initialize.md +++ b/skill/Workflows/Initialize.md @@ -12,15 +12,24 @@ Bootstrap selftune for first-time use or after changing environments. 
```bash selftune init [--agent ] [--cli-path ] [--force] +selftune init --alpha --alpha-email [--alpha-name "Name"] [--force] +selftune init --no-alpha [--force] ``` ## Options | Flag | Description | Default | |------|-------------|---------| -| `--agent ` | Agent platform: `claude`, `codex`, `opencode` | Auto-detected | +| `--agent ` | Agent platform: `claude_code`, `codex`, `opencode`, `openclaw` | Auto-detected | | `--cli-path ` | Override auto-detected CLI entry-point path | Auto-detected | | `--force` | Reinitialize even if config already exists | Off | +| `--enable-autonomy` | Enable autonomous scheduling during init | Off | +| `--schedule-format ` | Schedule format: `cron`, `launchd`, `systemd` | Auto-detected | +| `--alpha` | Enroll in the selftune alpha program | Off | +| `--no-alpha` | Unenroll from the alpha program (preserves user_id) | Off | +| `--alpha-email ` | Email for alpha enrollment (required with `--alpha`) | - | +| `--alpha-name ` | Display name for alpha enrollment | - | +| `--alpha-key ` | API key for cloud uploads (`st_live_*` format) | - | ## Output Format @@ -28,12 +37,19 @@ Creates `~/.selftune/config.json`: ```json { - "agent_type": "claude", + "agent_type": "claude_code", "cli_path": "/Users/you/selftune/cli/selftune/index.ts", "llm_mode": "agent", "agent_cli": "claude", "hooks_installed": true, - "initialized_at": "2026-02-28T10:00:00Z" + "initialized_at": "2026-02-28T10:00:00Z", + "alpha": { + "enrolled": true, + "user_id": "a1b2c3d4-e5f6-4a7b-8c9d-0e1f2a3b4c5d", + "email": "user@example.com", + "display_name": "User Name", + "consent_timestamp": "2026-02-28T10:00:00Z" + } } ``` @@ -47,6 +63,12 @@ Creates `~/.selftune/config.json`: | `agent_cli` | string | CLI binary name for the detected agent | | `hooks_installed` | boolean | Whether telemetry hooks are installed | | `initialized_at` | string | ISO 8601 timestamp | +| `alpha` | object? 
| Alpha program enrollment (present only if enrolled) | +| `alpha.enrolled` | boolean | Whether the user is currently enrolled | +| `alpha.user_id` | string | Stable UUID, generated once, preserved across reinits | +| `alpha.email` | string? | Email provided at enrollment | +| `alpha.display_name` | string? | Optional display name | +| `alpha.consent_timestamp` | string | ISO 8601 timestamp of consent | ## Steps @@ -135,21 +157,7 @@ The activation rules file configures auto-activation behavior -- which skills get suggested and under what conditions. Edit `~/.selftune/activation-rules.json` to customize thresholds and skill mappings for your project. -### 7. Verify Agent Availability - -`selftune init` installs the specialized agent files to `~/.claude/agents/` -automatically. Verify they are present: - -```bash -ls ~/.claude/agents/ -``` - -Expected agents: `diagnosis-analyst.md`, `pattern-analyst.md`, -`evolution-reviewer.md`, `integration-guide.md`. These are used by evolve -and doctor workflows for deeper analysis. If missing, run `selftune init --force` -to reinstall them. - -### 8. Verify with Doctor +### 7. Verify with Doctor ```bash selftune doctor @@ -163,17 +171,150 @@ reported issues before proceeding. For project-type-specific setup (single-skill, multi-skill, monorepo, Codex, OpenCode, mixed agents), see [docs/integration-guide.md](../../docs/integration-guide.md). 
-Templates for each project type are in the `templates/` directory:
-- `templates/single-skill-settings.json` — hooks for single-skill projects
-- `templates/multi-skill-settings.json` — hooks for multi-skill projects with activation rules
-- `templates/activation-rules-default.json` — default auto-activation rule configuration
+Templates for each project type are bundled with the skill:
+- `skill/settings_snippet.json` — hooks for Claude Code projects
+- `skill/assets/activation-rules-default.json` — default auto-activation rule configuration
 
 ## Subagent Escalation
 
 For complex project structures (monorepos, multi-skill repos, mixed agent
-platforms), spawn the `integration-guide` agent as a subagent for guided
-setup. This agent handles project-type detection, per-package configuration,
-and verification steps that go beyond what the basic init workflow covers.
+platforms), read `skill/agents/integration-guide.md` and spawn a subagent with
+those instructions. That agent handles project-type detection, per-package
+configuration, and verification steps that go beyond what the basic init
+workflow covers.
+
+## Alpha Enrollment
+
+Enroll the user in the selftune alpha program for early access features.
+
+Before running the alpha command:
+1. Ask whether the user wants to opt into the selftune alpha data-sharing program
+2. If they opt in, ask for their email and optional display name
+3. If they decline, skip alpha enrollment and continue with plain `selftune init`
+
+The CLI stays non-interactive. The agent is responsible for collecting consent
+and the required `--alpha-email` value before invoking the command.
+
+## Alpha Enrollment (Agent-First Flow)
+
+The alpha program sends canonical telemetry to the selftune cloud for analysis.
+Setup is agent-first — the cloud app is a one-time control-plane handoff, not the main UX.
+
+### Setup Sequence
+
+1. **Check local config**: Run `selftune status` — look for the "Alpha Upload" section
+2. 
**If not linked**: Tell the user: + > To join the selftune alpha program, you need to create an account at https://app.selftune.dev and issue an upload credential. This is a one-time step — afterwards everything runs locally through the CLI. +3. **User completes cloud enrollment**: Signs in, enrolls, copies the `st_live_*` credential +4. **Store credential locally**: + + ```bash + selftune init --alpha --alpha-email --alpha-key + ``` + +5. **Verify readiness**: The init command prints a readiness check. If all checks pass, alpha upload is active. + The readiness JSON now includes a `guidance` object with: + - `message` + - `next_command` + - `suggested_commands[]` + - `blocking` +6. **If readiness fails**: Run `selftune doctor` to diagnose. Common issues: + - `api_key not set` → re-run init with `--alpha-key` + - `api_key has invalid format` → credential must start with `st_live_` or `st_test_` + - `not enrolled` → re-run init with `--alpha --alpha-email --alpha-key ` + +### Key Principle + +The cloud app is used **only** for: +- Sign-in +- Alpha enrollment +- Upload credential issuance + +All other selftune operations happen through the local CLI and this agent. + +### Enroll + +```bash +selftune init --alpha --alpha-email user@example.com --alpha-name "User Name" --force +selftune init --alpha-key st_live_abc123... # after enrollment, store the API key +``` + +The `--alpha-email` flag is required. The command will: +1. Generate a stable UUID (preserved across reinits) +2. Write the alpha block to `~/.selftune/config.json` +3. Print an `alpha_enrolled` JSON message to stdout +4. Print the consent notice to stderr +5. If an `--alpha-key` is provided, chmod `~/.selftune/config.json` to `0600` + +The consent notice explicitly states that the friendly alpha cohort shares raw +prompt/query text in addition to skill/session/evolution metadata. + +### API Key Provisioning + +After enrollment, users need to configure an API key for cloud uploads: + +1. 
Create a cloud account at the selftune web app
+2. Generate an API key (format: `st_live_*`)
+3. Store the key locally:
+
+```bash
+selftune init --alpha --alpha-email <email> --alpha-key st_live_abc123... --force
+```
+
+Without an API key, alpha enrollment is recorded locally but no uploads are attempted. When a key is stored, selftune tightens the local config file permissions to `0600`.
+
+### Upload Behavior
+
+Once enrolled and an API key is configured, `selftune orchestrate` automatically
+uploads new session, invocation, and evolution data to the cloud API at the end of
+each run. This upload step is fail-open -- errors never block the orchestrate loop.
+Use `selftune alpha upload` for manual uploads or `selftune alpha upload --dry-run`
+to preview what would be sent.
+
+The upload endpoint is `https://api.selftune.dev/api/v1/push`, authenticated with
+the stored API key via `Authorization: Bearer` header. The endpoint can be
+overridden with the `SELFTUNE_ALPHA_ENDPOINT` environment variable.
+
+### Unenroll
+
+```bash
+selftune init --no-alpha --force
+```
+
+Sets `enrolled: false` in the alpha block but preserves the `user_id` so re-enrollment does not create a new identity. 
+
+### Error Handling
+
+If `--alpha` is passed without `--alpha-email`, the CLI throws a JSON error:
+
+```json
+{
+ "code": "alpha_email_required",
+ "error": "alpha_email_required",
+ "message": "The --alpha-email flag is required for alpha enrollment.",
+ "next_command": "selftune init --alpha --alpha-email <email>",
+ "suggested_commands": ["selftune status", "selftune doctor"],
+ "blocking": true
+}
+```
+
+When alpha readiness is evaluated after `selftune init --alpha`, the CLI emits:
+
+```json
+{
+ "alpha_readiness": {
+ "ready": false,
+ "missing": ["api_key not set"],
+ "guidance": {
+ "code": "alpha_credential_required",
+ "message": "Alpha enrollment exists, but the local upload credential is missing or invalid.",
+ "next_command": "selftune init --alpha --alpha-email user@example.com --alpha-key <key> --force",
+ "suggested_commands": ["selftune status", "selftune doctor"],
+ "blocking": true
+ }
+ }
+}
+```
 
 ## Common Patterns
 
@@ -182,6 +323,11 @@ and verification steps that go beyond what the basic init workflow covers.
 > `npm install -g selftune`. Run `selftune init`, then verify with
 > `selftune doctor`. Report results to the user.
 
+**User wants alpha enrollment**
+> Ask whether they want to opt into alpha data sharing. If yes, collect email
+> and optional display name, then run `selftune init --alpha --alpha-email ...`.
+> If no, continue with plain `selftune init`.
+
+**Hooks not capturing data**
+> Run `selftune doctor` to check hook installation. Parse the JSON output
+> for failed hook checks. 
If paths are wrong, update diff --git a/skill/Workflows/Orchestrate.md b/skill/Workflows/Orchestrate.md index 475772c7..66b88620 100644 --- a/skill/Workflows/Orchestrate.md +++ b/skill/Workflows/Orchestrate.md @@ -26,10 +26,13 @@ selftune orchestrate |------|-------------|---------| | `--dry-run` | Plan and validate without deploying changes | Off | | `--review-required` | Keep validated changes in review mode instead of deploying | Off | +| `--auto-approve` | *(Deprecated)* Autonomous mode is now the default | — | | `--skill ` | Limit the loop to one skill | All skills | -| `--max-skills ` | Cap how many candidates are processed in one run | `3` | -| `--recent-window ` | Window for post-deploy watch/rollback checks | `24` | +| `--max-skills ` | Cap how many candidates are processed in one run | `5` | +| `--recent-window ` | Window for post-deploy watch/rollback checks | `48` | | `--sync-force` | Force a full source replay before candidate selection | Off | +| `--loop` | Run as a long-lived process that cycles continuously | Off | +| `--loop-interval ` | Pause between cycles (minimum 60) | `3600` | ## Default Behavior @@ -133,6 +136,12 @@ In autonomous mode, orchestrate calls sub-workflows in this fixed order: 2. **Status** — compute skill health using existing grade results (reads `grading.json` outputs from previous sessions) 3. **Evolve** — run evolution on selected candidates (pre-flight is skipped, cheap-loop mode enabled, defaults used) 4. **Watch** — monitor recently evolved skills (auto-rollback enabled by default, `--recent-window` hours lookback) +5. **Alpha Upload** — if enrolled in the alpha program (`config.alpha.enrolled === true`) and an API key is configured, stage new canonical records (sessions, invocations, evolution evidence, orchestrate runs) into `canonical_upload_staging`, build V2 push payloads, and flush to the cloud API (`POST /api/v1/push`) with Bearer auth. Fail-open: upload errors never block the orchestrate loop. Respects `--dry-run`. 
+ +Between candidate selection and evolution, orchestrate checks for +**cross-skill eval set overlap**. When two or more evolution candidates +share >30% of their positive eval queries, a warning is logged to stderr. +This is an informational diagnostic only — it does not block evolution. All sub-workflows run with defaults and no user interaction. The safety model relies on regression thresholds, automatic rollback, and SKILL.md diff --git a/skill/Workflows/Sync.md b/skill/Workflows/Sync.md index 4576eabd..03d01372 100644 --- a/skill/Workflows/Sync.md +++ b/skill/Workflows/Sync.md @@ -29,6 +29,7 @@ selftune sync | `--no-opencode` | Skip OpenCode ingest | | `--no-openclaw` | Skip OpenClaw ingest | | `--no-repair` | Skip rebuilding `skill_usage_repaired.jsonl` | +| `--json` | Output results as JSON | ## Output @@ -66,6 +67,28 @@ After sync completes, proceed with the user's intended workflow: `selftune status`, `selftune dashboard`, `selftune watch --sync-first`, or `selftune evolve --sync-first`. +## `--json` Usage + +```bash +selftune sync --json +``` + +Sample output: + +```json +{ + "sources": { + "claude": { "scanned": 12, "synced": 3, "skipped": 9 }, + "codex": { "scanned": 0, "synced": 0, "skipped": 0 } + }, + "repaired": { "total": 42 }, + "errors": [] +} +``` + +Use `--json` when the agent needs to parse sync results programmatically +(e.g., to decide whether to proceed with evolution or surface counts to the user). 
+ ## Common Patterns **User wants to refresh telemetry data** diff --git a/skill/Workflows/Watch.md b/skill/Workflows/Watch.md index 8ebdc91b..2129d84b 100644 --- a/skill/Workflows/Watch.md +++ b/skill/Workflows/Watch.md @@ -17,8 +17,9 @@ selftune watch --skill --skill-path [options] | `--skill-path ` | Path to the skill's SKILL.md | Required | | `--window ` | Sliding window size (number of sessions) | 20 | | `--threshold ` | Regression threshold (drop from baseline) | 0.1 | -| `--baseline ` | Explicit baseline pass rate (0-1) | Auto-detected from last deploy | | `--auto-rollback` | Automatically rollback on detected regression | Off | +| `--sync-first` | Refresh source-truth telemetry before evaluating | Off | +| `--sync-force` | Force a full source rescan during `--sync-first` | Off | ## Output Format @@ -138,10 +139,6 @@ context window resets before the user acts on the results. > Use `--auto-rollback`. The command will restore the previous description > automatically if pass rate drops below baseline minus threshold. -**"Set a custom baseline"** -> Use `--baseline 0.85` to override auto-detection. Useful when the -> auto-detected baseline is from an older evolution. - ## Autonomous Mode When called by `selftune orchestrate`, watch runs automatically on recently diff --git a/skill/agents/diagnosis-analyst.md b/skill/agents/diagnosis-analyst.md new file mode 100644 index 00000000..555ac08e --- /dev/null +++ b/skill/agents/diagnosis-analyst.md @@ -0,0 +1,163 @@ +--- +name: diagnosis-analyst +description: Use when a specific skill has recurring low grades, warning or critical status, regressions, or unclear failures after basic doctor/status review. Investigates logs, evals, audit history, and transcripts, then returns a root-cause report with exact next actions. +tools: Read, Grep, Glob, Bash +disallowedTools: Write, Edit +model: sonnet +maxTurns: 8 +--- + +# Diagnosis Analyst + +Read-only specialist for explaining why one skill is underperforming. 
+ +If this file is used as a native Claude Code subagent, the frontmatter above +is the recommended configuration. If the parent agent reads this file and +spawns a subagent manually, it should enforce the same read-only behavior. + +## Required Inputs From Parent + +- `skill`: canonical skill name +- `skillPath`: path to the skill's `SKILL.md` when known +- `reasonForEscalation`: why this diagnosis is needed now +- Optional: `sessionIds`, `proposalId`, `window`, `knownSymptoms` + +If a required input is missing, stop and return a blocking-input request to the +parent. Do not ask the user directly unless the parent explicitly told you to. + +## Operating Rules + +- Stay read-only. Do not edit skills, configs, logs, or settings. +- Use `selftune status` and `selftune last` for orientation only. They are + human-readable summaries, not stable machine contracts. +- Use `selftune doctor` when you need structured system-health data. +- Prefer direct evidence from log files, transcripts, workflow docs, and audit + history over guesses. +- Cite concrete evidence: log path, query text, session ID, proposal ID, or + timestamp. +- Classify the dominant problem as one of: + - `TRIGGER`: skill did not fire when it should have + - `PROCESS`: skill fired but the workflow was followed incorrectly + - `QUALITY`: workflow executed but the output quality was weak + - `INFRASTRUCTURE`: hooks, logs, config, or installation are broken + +## Evidence Sources + +- `~/.claude/session_telemetry_log.jsonl` +- `~/.claude/skill_usage_log.jsonl` +- `~/.claude/all_queries_log.jsonl` +- `~/.claude/evolution_audit_log.jsonl` +- The target skill's `SKILL.md` +- Session transcripts referenced from telemetry or grading evidence +- Relevant workflow docs: + - `skill/Workflows/Doctor.md` + - `skill/Workflows/Evals.md` + - `skill/Workflows/Evolve.md` + - `skill/references/grading-methodology.md` + - `skill/references/invocation-taxonomy.md` + +## Investigation Workflow + +### 1. 
Confirm scope and health context + +Start with a quick snapshot: + +```bash +selftune status +selftune last +selftune doctor +``` + +Use these to identify whether the issue is system-wide, skill-specific, or +just a noisy single session. + +### 2. Read the current skill contract + +Read the target `SKILL.md` and the workflow doc that the skill should have +used. Check whether the problem looks like bad triggering, bad workflow +instructions, or bad execution despite good instructions. + +### 3. Inspect trigger coverage + +Use eval generation as a diagnostic aid: + +```bash +selftune eval generate --skill --stats +selftune eval generate --skill --max 50 +``` + +Treat these outputs as exploratory summaries. Verify important claims against +the underlying logs: +- `~/.claude/skill_usage_log.jsonl` +- `~/.claude/all_queries_log.jsonl` +- `~/.claude/session_telemetry_log.jsonl` + +### 4. Review recent evolution history + +Read `~/.claude/evolution_audit_log.jsonl` for entries affecting the target +skill. Look for: +- recent deploys followed by regressions +- repeated dry-runs or validated proposals with no deploy +- rollbacks +- plateaus where descriptions keep changing without meaningful lift + +### 5. Inspect transcripts for failing sessions + +Prefer the specific sessions passed by the parent. Otherwise, select recent +sessions that show errors, unmatched queries, or clear misses. + +Look for: +- the skill never being read or invoked +- the wrong workflow being chosen +- steps performed out of order +- repeated retries or Bash thrashing +- missing tool use that the workflow clearly expected + +### 6. Synthesize the root cause + +State the dominant failure class, the strongest supporting evidence, and the +smallest credible next action. 
+ +## Stop Conditions + +Stop and return to the parent if: +- the target skill is ambiguous +- the required logs or transcripts are unavailable +- the evidence is limited to one isolated session +- the problem is clearly installation health, not skill behavior + +## Return Format + +Return a compact report with these sections: + +```markdown +## Diagnosis Report: + +### Summary +[2-4 sentence explanation of what is going wrong] + +### Root Cause +[TRIGGER / PROCESS / QUALITY / INFRASTRUCTURE] + +### Findings +- [Finding 1] +- [Finding 2] +- [Finding 3] + +### Evidence +- [path or command result] +- [session ID / query / timestamp] +- [audit or transcript evidence] + +### Recommended Next Actions +1. [Highest-leverage next step] +2. [Second step] +3. [Optional follow-up] + +### Suggested Commands +- `...` +- `...` + +### Confidence +[high / medium / low] +``` diff --git a/skill/agents/evolution-reviewer.md b/skill/agents/evolution-reviewer.md new file mode 100644 index 00000000..f92bc4b0 --- /dev/null +++ b/skill/agents/evolution-reviewer.md @@ -0,0 +1,149 @@ +--- +name: evolution-reviewer +description: Use when reviewing a dry-run or pending evolution proposal before deployment, especially for high-stakes skills, marginal improvements, or recent regressions. Compares old vs new content, checks evidence quality, and returns an approve or reject verdict with conditions. +tools: Read, Grep, Glob, Bash +disallowedTools: Write, Edit +model: sonnet +maxTurns: 8 +--- + +# Evolution Reviewer + +Read-only safety reviewer for selftune proposals. + +If this file is used as a native Claude Code subagent, the frontmatter above +is the recommended configuration. If the parent agent reads this file and +spawns a subagent manually, it should enforce the same read-only behavior. 
+ +## Required Inputs From Parent + +- `skill`: canonical skill name +- `skillPath`: path to the target `SKILL.md` +- `target`: `description`, `routing`, or `body` when known +- Optional: `proposalId`, `evalSetPath`, `proposalOutput`, `reasonForReview` + +If a required input is missing, stop and return a blocking-input request to the +parent. Do not ask the user directly unless the parent explicitly told you to. + +## Operating Rules + +- Stay read-only. Do not deploy, rollback, or edit files. +- If no proposal is available to review, do not create one yourself. Return + the exact dry-run command the parent should execute next. +- Use the current workflow contracts: + - `selftune evolve ...` for description proposals + - `selftune evolve body --target routing|body ...` for routing/body proposals +- Treat `selftune watch` as supporting context, not a substitute for proposal + validation. +- Reject proposals that broaden scope without evidence, remove important + anchors, or introduce obvious regressions. + +## Evidence Sources + +- Parent-supplied proposal output or diff +- `evolution_audit_log.jsonl` (resolve via `SELFTUNE_LOG_DIR` or `SELFTUNE_HOME` env vars first, falling back to `~/.claude/`) +- The current `SKILL.md` +- Existing backup files if present +- Eval set used for validation +- `skill/Workflows/Evolve.md` +- `skill/Workflows/EvolveBody.md` +- `skill/Workflows/Watch.md` +- `skill/references/invocation-taxonomy.md` + +## Review Workflow + +### 1. Locate the exact proposal + +Use the parent-supplied proposal or audit-log entry if available. If not, +inspect `evolution_audit_log.jsonl` using `SELFTUNE_LOG_DIR` or +`SELFTUNE_HOME` first, falling back to `~/.claude/`, for the latest +non-terminal proposal affecting the target skill. + +If there is nothing concrete to review, stop and return the next command the +parent should run, for example: + +```bash +selftune evolve --skill --skill-path --dry-run +``` + +### 2. 
Compare original vs proposed content + +For description proposals, compare: +- preserved working anchors +- added language for missed queries +- scope creep or vague broadening +- tone and style continuity + +For routing/body proposals, compare: +- workflow routing ownership changes +- added or removed operational steps +- whether the body still matches current CLI behavior +- whether the rewrite makes the skill easier or harder to trigger correctly + +### 3. Assess eval and evidence quality + +Check: +- eval size is meaningful for the change being proposed +- negatives exist for overtriggering protection +- explicit queries are protected +- examples look representative of real usage, not mostly synthetic edge cases + +### 4. Check metrics and history + +Review proposal metrics and recent history: +- pass-rate delta +- regression count or obvious explicit regressions +- confidence +- recent churn, rollbacks, or repeated low-lift proposals + +### 5. Render a safety verdict + +Issue one of: +- `APPROVE` +- `APPROVE WITH CONDITIONS` +- `REJECT` + +## Stop Conditions + +Stop and return to the parent if: +- there is no concrete proposal or diff to review +- the target skill or proposal is ambiguous +- the eval source is missing +- the review would require creating or deploying a proposal + +## Return Format + +Return a compact verdict with these sections: + +```markdown +## Evolution Review: + +### Proposal ID +[proposal ID or "not provided"] + +### Verdict +[APPROVE / APPROVE WITH CONDITIONS / REJECT] + +### Summary +[2-4 sentence explanation] + +### Findings +- [Finding 1] +- [Finding 2] +- [Finding 3] + +### Evidence +- [audit entry / eval fact / diff observation] +- [audit entry / eval fact / diff observation] + +### Required Changes +1. [Only if not approved] +2. 
[Only if not approved]
+
+### Post-Deploy Conditions
+- [watch requirement or monitoring threshold]
+- [follow-up check]
+
+### Confidence
+[high / medium / low]
+```
diff --git a/skill/agents/integration-guide.md b/skill/agents/integration-guide.md
new file mode 100644
index 00000000..087c5a26
--- /dev/null
+++ b/skill/agents/integration-guide.md
@@ -0,0 +1,154 @@
+---
+name: integration-guide
+description: "Use when setting up selftune in a complex repo: monorepo, multi-skill workspace, mixed agent platforms, unclear hook state, or install problems that basic init/doctor does not resolve. Detects project structure, validates configuration, and returns or applies a verified setup plan."
+tools: Read, Grep, Glob, Bash, Write, Edit
+model: sonnet
+maxTurns: 12
+---
+
+# Integration Guide
+
+Setup specialist for selftune integration in non-trivial environments.
+
+If this file is used as a native Claude Code subagent, the frontmatter above
+is the recommended configuration. If the parent agent reads this file and
+spawns a subagent manually, it should preserve the same operating rules.
+
+## Required Inputs From Parent
+
+- `projectRoot`: repo root to inspect
+- `requestedMode`: `plan-only` or `hands-on`
+- Optional: `agentPlatform`, `knownSkillPaths`, `knownSymptoms`
+
+If a required input is missing, stop and return a blocking-input request to the
+parent. Do not ask the user directly unless the parent explicitly told you to.
+
+## Operating Rules
+
+- Default to inspect plus plan. Only modify repo files or user config if the
+ parent explicitly requested hands-on setup.
+- `selftune init` is the source of truth for config bootstrap and automatic
+ hook installation. Manual `settings.json` edits are a troubleshooting
+ fallback, not the default path.
+- `selftune doctor` returns structured health data. Use it after each material
+ setup change. 
+- Use current workflow docs, especially: + - `skill/Workflows/Initialize.md` + - `skill/Workflows/Doctor.md` + - `skill/Workflows/Ingest.md` + - `skill/references/setup-patterns.md` +- Respect platform boundaries: + - Claude Code prefers hooks installed by `selftune init` + - Codex, OpenCode, and OpenClaw rely on ingest workflows + +## Setup Workflow + +### 1. Detect project structure + +Inspect the workspace and classify it as one of: +- single-skill project +- multi-skill repo +- monorepo with shared tooling +- no existing skills yet + +Identify the likely skills, agent platforms, and any path or workspace issues +that could affect hook or CLI behavior. + +### 2. Check current install health + +Use: + +```bash +which selftune +selftune doctor +``` + +Check: +- whether the CLI exists +- whether `config.json` exists and looks current (resolve via `SELFTUNE_CONFIG_DIR` or `SELFTUNE_HOME` env vars first, falling back to `~/.selftune/`; run `selftune doctor` to confirm the resolved path) +- whether hooks or ingest paths are healthy +- whether logs already exist + +### 3. Choose the correct setup path + +For Claude Code, prefer: + +```bash +selftune init [--agent claude_code] [--cli-path ] [--force] +``` + +For other platforms, route to the appropriate ingest workflow after init. + +If the repo layout is complex, decide whether the user needs: +- one shared setup at the repo root +- per-package setup guidance +- absolute paths to avoid cwd-dependent failures + +### 4. Apply changes only when authorized + +If `requestedMode` is `plan-only`, stop at a verified setup plan. + +If `requestedMode` is `hands-on`, you may: +- run `selftune init` +- create or refresh local activation-rules files +- repair obvious path or config issues +- re-run doctor after each meaningful change + +### 5. 
Verify end to end + +After setup, verify with: + +```bash +selftune doctor +selftune status +selftune last +selftune eval generate --list-skills +``` + +Treat `status`, `last`, and `eval generate --list-skills` as human-readable +smoke tests, not strict machine contracts. + +### 6. Hand back next steps + +Return the smallest useful next actions for the parent: inspect health, +run evals, improve a skill, or set up autonomous orchestration. + +## Stop Conditions + +Stop and return to the parent if: +- the project root is ambiguous +- the CLI is missing and installation is not allowed +- the repo has no skills and the task is really skill creation, not setup +- setup would require changing user-home files without explicit approval from + the parent + +## Return Format + +Return a setup report with these sections: + +```markdown +## selftune Setup Complete + +### Environment +- Agent platform: +- Project type: +- Skills detected: + +### Configuration +- Config: [created / verified / missing] +- Init path: [command used or recommended] +- Hooks or ingest: [healthy / needs work / not applicable] +- Doctor: [healthy / unhealthy with blockers] + +### Verification +- Telemetry capture: [working / not verified] +- Skill tracking: [working / not verified] + +### Next Steps +1. [Primary recommended action] +2. [Secondary action] +3. [Optional action] + +### Confidence +[high / medium / low] +``` diff --git a/skill/agents/pattern-analyst.md b/skill/agents/pattern-analyst.md new file mode 100644 index 00000000..7db9fe40 --- /dev/null +++ b/skill/agents/pattern-analyst.md @@ -0,0 +1,149 @@ +--- +name: pattern-analyst +description: Use when multiple skills may overlap, misroute, or interfere with each other, or when composability results suggest moderate or severe conflict. Analyzes trigger ownership, query overlap, and cross-skill health, then returns a conflict matrix and routing recommendations. 
+tools: Read, Grep, Glob, Bash +disallowedTools: Write, Edit +model: sonnet +maxTurns: 8 +--- + +# Pattern Analyst + +Read-only specialist for cross-skill overlap and ownership analysis. + +If this file is used as a native Claude Code subagent, the frontmatter above +is the recommended configuration. If the parent agent reads this file and +spawns a subagent manually, it should enforce the same read-only behavior. + +## Required Inputs From Parent + +- `scope`: target skill set or `"all-skills"` +- `question`: what conflict or overlap needs explanation +- Optional: `window`, `prioritySkills`, `knownConflictPairs` + +If a required input is missing, stop and return a blocking-input request to +the parent. Do not ask the user directly unless the parent explicitly told +you to. + +## Operating Rules + +- Stay read-only. Do not edit skill files or deploy routing changes. +- Use `selftune eval composability` as a starting signal when available, then + verify conclusions against actual skill docs and logs. +- Treat `selftune eval generate --list-skills` and `selftune status` as + human-readable summaries, not strict JSON contracts. +- Distinguish: + - trigger overlap + - misroutes + - negative-example gaps + - systemic infrastructure issues +- Prefer concrete ownership recommendations over abstract observations. + +## Evidence Sources + +- `~/.claude/skill_usage_log.jsonl` +- `~/.claude/all_queries_log.jsonl` +- `~/.claude/session_telemetry_log.jsonl` +- `~/.claude/evolution_audit_log.jsonl` +- Relevant `SKILL.md` files in the workspace +- `skill/Workflows/Composability.md` +- `skill/Workflows/Evals.md` +- `skill/references/invocation-taxonomy.md` + +## Analysis Workflow + +### 1. Inventory the relevant skills + +Use lightweight summaries first: + +```bash +selftune eval generate --list-skills +selftune status +``` + +Then read the actual `SKILL.md` files for the skills in scope. + +### 2. 
Extract each skill's ownership contract + +For each skill, capture: +- frontmatter description +- workflow-routing triggers +- explicit exclusions or negative examples +- any recent evolution that changed ownership or wording + +### 3. Detect conflicts and gaps + +Compare trigger keywords and description phrases across all skills. Flag: +- direct conflicts +- semantic overlaps +- negative-example gaps +- routing-table contradictions +- ambiguous ownership where two skills could both claim the same query + +### 4. Analyze real query behavior + +Read the logs and look for: +- queries that triggered multiple skills +- queries that triggered no skills despite matching one or more descriptions +- queries that appear to have been routed to the wrong skill +- sessions where co-occurring skills correlate with more errors or retries + +### 5. Check composability and history + +When useful, run: + +```bash +selftune eval composability --skill +``` + +Use the results to confirm or refute overlap hypotheses. Then inspect +`~/.claude/evolution_audit_log.jsonl` for recent changes that may have +shifted ownership or introduced churn. + +### 6. 
Recommend ownership changes + +For each important conflict, state: +- which skill should own the query family +- which skill should back off +- whether the fix is a description change, routing-table change, negative + examples, or simply leaving the current state alone + +## Stop Conditions + +Stop and return to the parent if: +- the skills in scope are not identifiable +- there is not enough log data to say anything useful +- the question is really about one underperforming skill rather than + cross-skill behavior + +## Return Format + +Return a compact report with these sections: + +```markdown +## Cross-Skill Pattern Analysis + +### Summary +[2-4 sentence overview] + +### Findings +- [Finding 1] +- [Finding 2] +- [Finding 3] + +### Conflict Matrix +| Skill A | Skill B | Problem | Evidence | Recommended Owner | +|---------|---------|---------|----------|-------------------| +| ... | ... | ... | ... | ... | + +### Coverage Gaps +- [query family or sample] + +### Recommended Changes +1. [Highest-priority change] +2. [Second change] +3. [Optional follow-up] + +### Confidence +[high / medium / low] +``` diff --git a/skill/references/interactive-config.md b/skill/references/interactive-config.md new file mode 100644 index 00000000..530e27c1 --- /dev/null +++ b/skill/references/interactive-config.md @@ -0,0 +1,39 @@ +# Interactive Configuration + +Before running mutating workflows (evolve, evolve-body, eval generate, baseline), present +a pre-flight configuration prompt to the user. This gives them control over +execution mode, model selection, and key parameters. + +## Pre-Flight Pattern + +Each mutating workflow has a **Pre-Flight Configuration** step. Follow this pattern: + +1. Present a brief summary of what the command will do +2. Use the `AskUserQuestion` tool to present structured options (max 4 questions per call — split into multiple calls if needed). Mark recommended defaults in option text with `(recommended)`. +3. 
Parse the user's selections from the tool response +4. Show a confirmation summary of selected options before executing + +**IMPORTANT:** Always use `AskUserQuestion` for pre-flight — never present options as inline numbered text. The tool provides a structured UI that is easier for users to interact with. If `AskUserQuestion` is not available, fall back to inline numbered options. + +## Model Tier Reference + +When presenting model choices, use this table: + +| Tier | Model | Speed | Cost | Quality | Best for | +|------|-------|-------|------|---------|----------| +| Fast | `haiku` | ~2s/call | $ | Good | Iteration loops, bulk validation | +| Balanced | `sonnet` | ~5s/call | $$ | Great | Single-pass proposals, gate checks | +| Best | `opus` | ~10s/call | $$$ | Excellent | High-stakes final validation | + +## Quick Path + +If the user says "use defaults", "just do it", or similar — skip the pre-flight +and run with recommended defaults. The pre-flight is for users who want control, +not a mandatory gate. + +## Workflows That Skip Pre-Flight + +These read-only or simple workflows run immediately without prompting: +`status`, `last`, `doctor`, `dashboard`, `watch`, `evolve rollback`, +`grade auto`, `ingest *`, `contribute`, `cron`, `eval composability`, +`eval unit-test`, `eval import`. diff --git a/skill/references/logs.md b/skill/references/logs.md index a0ca2bf4..7d6c412f 100644 --- a/skill/references/logs.md +++ b/skill/references/logs.md @@ -272,7 +272,9 @@ One record per evolution action. Written by the evolution and rollback modules. **Required fields:** `timestamp`, `proposal_id`, `action` -**Optional fields:** `details`, `eval_snapshot` +**Optional fields:** `details`, `eval_snapshot`, `iterations_used` + +- `iterations_used` (integer, nullable) — How many iteration loops the evolution run used before reaching a result. Present on `deployed` audit entries; null for legacy records or actions that don't track iterations. 
--- diff --git a/skill/references/setup-patterns.md b/skill/references/setup-patterns.md index 7010e426..f759ff10 100644 --- a/skill/references/setup-patterns.md +++ b/skill/references/setup-patterns.md @@ -60,6 +60,6 @@ combined. ## Optional Repository Extensions -Some repositories also bundle Claude-specific helper agents in `.claude/agents/` -for diagnosis, evolution review, or setup help. These are optional extensions, -not part of the core skill package installed by `npx skills add`. +selftune bundles specialized agent instruction files in `skill/agents/` for +diagnosis, evolution review, pattern analysis, and setup help. These ship with +the skill package and are read directly when needed — no installation step required. diff --git a/tests/alpha-identity/alpha-identity.test.ts b/tests/alpha-identity/alpha-identity.test.ts new file mode 100644 index 00000000..a7ef0904 --- /dev/null +++ b/tests/alpha-identity/alpha-identity.test.ts @@ -0,0 +1,293 @@ +/** + * Tests for alpha identity management — cached cloud identity model. + * + * Tests the AlphaIdentity interface, getAlphaLinkState() logic, + * migrateLocalIdentity() detection, and config read/write helpers. 
+ */ + +import { afterEach, beforeEach, describe, expect, test } from "bun:test"; +import { mkdtempSync, readFileSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; + +import { + generateUserId, + getAlphaLinkState, + isValidApiKeyFormat, + migrateLocalIdentity, + readAlphaIdentity, + writeAlphaIdentity, +} from "../../cli/selftune/alpha-identity.js"; +import type { AlphaIdentity } from "../../cli/selftune/types.js"; + +// --------------------------------------------------------------------------- +// Test fixtures +// --------------------------------------------------------------------------- + +function makeIdentity(overrides: Partial = {}): AlphaIdentity { + return { + enrolled: true, + user_id: "local-uuid-123", + consent_timestamp: "2026-03-20T00:00:00.000Z", + ...overrides, + }; +} + +// --------------------------------------------------------------------------- +// generateUserId +// --------------------------------------------------------------------------- + +describe("generateUserId", () => { + test("returns a valid UUID v4 string", () => { + const id = generateUserId(); + expect(id).toMatch(/^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$/i); + }); + + test("returns unique values on each call", () => { + const a = generateUserId(); + const b = generateUserId(); + expect(a).not.toBe(b); + }); +}); + +// --------------------------------------------------------------------------- +// isValidApiKeyFormat +// --------------------------------------------------------------------------- + +describe("isValidApiKeyFormat", () => { + test("accepts st_live_ prefix", () => { + expect(isValidApiKeyFormat("st_live_abc123")).toBe(true); + }); + + test("accepts st_test_ prefix", () => { + expect(isValidApiKeyFormat("st_test_abc123")).toBe(true); + }); + + test("rejects arbitrary strings", () => { + expect(isValidApiKeyFormat("sk_live_abc123")).toBe(false); + 
expect(isValidApiKeyFormat("random-key")).toBe(false); + expect(isValidApiKeyFormat("")).toBe(false); + }); +}); + +// --------------------------------------------------------------------------- +// getAlphaLinkState — cloud-first model +// --------------------------------------------------------------------------- + +describe("getAlphaLinkState", () => { + test("returns not_linked when identity is null", () => { + expect(getAlphaLinkState(null)).toBe("not_linked"); + }); + + test("returns not_linked when not enrolled and no cloud_user_id", () => { + const identity = makeIdentity({ enrolled: false }); + expect(getAlphaLinkState(identity)).toBe("not_linked"); + }); + + test("returns linked_not_enrolled when has cloud_user_id but not enrolled", () => { + const identity = makeIdentity({ + enrolled: false, + cloud_user_id: "cloud-123", + }); + expect(getAlphaLinkState(identity)).toBe("linked_not_enrolled"); + }); + + test("returns enrolled_no_credential when enrolled but no cloud_user_id", () => { + const identity = makeIdentity({ + enrolled: true, + // no cloud_user_id + }); + expect(getAlphaLinkState(identity)).toBe("enrolled_no_credential"); + }); + + test("returns enrolled_no_credential when enrolled with cloud_user_id but no api_key", () => { + const identity = makeIdentity({ + enrolled: true, + cloud_user_id: "cloud-123", + // no api_key + }); + expect(getAlphaLinkState(identity)).toBe("enrolled_no_credential"); + }); + + test("returns enrolled_no_credential when api_key has invalid format", () => { + const identity = makeIdentity({ + enrolled: true, + cloud_user_id: "cloud-123", + api_key: "invalid_key", + }); + expect(getAlphaLinkState(identity)).toBe("enrolled_no_credential"); + }); + + test("returns ready when enrolled with cloud_user_id and valid api_key", () => { + const identity = makeIdentity({ + enrolled: true, + cloud_user_id: "cloud-123", + api_key: "st_live_abc123", + }); + expect(getAlphaLinkState(identity)).toBe("ready"); + }); + + test("returns 
ready with st_test_ api_key", () => { + const identity = makeIdentity({ + enrolled: true, + cloud_user_id: "cloud-123", + api_key: "st_test_abc123", + }); + expect(getAlphaLinkState(identity)).toBe("ready"); + }); +}); + +// --------------------------------------------------------------------------- +// migrateLocalIdentity +// --------------------------------------------------------------------------- + +describe("migrateLocalIdentity", () => { + test("detects legacy identity needing cloud link", () => { + const identity = makeIdentity({ + user_id: "local-uuid", + email: "user@example.com", + // no cloud_user_id + }); + const result = migrateLocalIdentity(identity); + expect(result.needsCloudLink).toBe(true); + expect(result.identity).toBe(identity); + }); + + test("recognizes already-linked identity", () => { + const identity = makeIdentity({ + user_id: "local-uuid", + cloud_user_id: "cloud-abc", + }); + const result = migrateLocalIdentity(identity); + expect(result.needsCloudLink).toBe(false); + expect(result.identity).toBe(identity); + }); +}); + +// --------------------------------------------------------------------------- +// Config read/write helpers +// --------------------------------------------------------------------------- + +describe("readAlphaIdentity / writeAlphaIdentity", () => { + let tempDir: string; + let configPath: string; + + beforeEach(() => { + tempDir = mkdtempSync(join(tmpdir(), "selftune-alpha-test-")); + configPath = join(tempDir, "config.json"); + }); + + afterEach(() => { + rmSync(tempDir, { recursive: true, force: true }); + }); + + test("returns null when config does not exist", () => { + expect(readAlphaIdentity(configPath)).toBeNull(); + }); + + test("returns null when config has no alpha block", () => { + writeFileSync(configPath, JSON.stringify({ agent_type: "claude_code" })); + expect(readAlphaIdentity(configPath)).toBeNull(); + }); + + test("returns null on invalid JSON", () => { + writeFileSync(configPath, "not-json"); + 
expect(readAlphaIdentity(configPath)).toBeNull(); + }); + + test("reads back identity after write", () => { + const identity = makeIdentity({ + cloud_user_id: "cloud-xyz", + cloud_org_id: "org-abc", + email: "user@example.com", + display_name: "Test User", + api_key: "st_live_key123", + }); + + writeAlphaIdentity(configPath, identity); + + const result = readAlphaIdentity(configPath); + expect(result).not.toBeNull(); + if (!result) { + throw new Error("expected alpha identity to be written"); + } + expect(result.enrolled).toBe(true); + expect(result.user_id).toBe("local-uuid-123"); + expect(result.cloud_user_id).toBe("cloud-xyz"); + expect(result.cloud_org_id).toBe("org-abc"); + expect(result.email).toBe("user@example.com"); + expect(result.api_key).toBe("st_live_key123"); + }); + + test("preserves existing config fields when writing alpha", () => { + writeFileSync( + configPath, + JSON.stringify({ + agent_type: "claude_code", + cli_path: "/some/path", + }), + ); + + writeAlphaIdentity(configPath, makeIdentity()); + + const raw = JSON.parse(readFileSync(configPath, "utf-8")); + expect(raw.agent_type).toBe("claude_code"); + expect(raw.cli_path).toBe("/some/path"); + expect(raw.alpha).toBeDefined(); + }); + + test("throws on corrupt existing config", () => { + writeFileSync(configPath, "not-json"); + expect(() => writeAlphaIdentity(configPath, makeIdentity())).toThrow(/not valid JSON/); + }); + + test("creates parent directories", () => { + const nestedPath = join(tempDir, "nested", "dir", "config.json"); + writeAlphaIdentity(nestedPath, makeIdentity()); + const result = readAlphaIdentity(nestedPath); + expect(result).not.toBeNull(); + }); +}); + +// --------------------------------------------------------------------------- +// AlphaIdentity interface shape (compile-time checks via usage) +// --------------------------------------------------------------------------- + +describe("AlphaIdentity interface", () => { + test("supports all expected fields", () => { + 
const identity: AlphaIdentity = { + enrolled: true, + cloud_user_id: "cloud-123", + cloud_org_id: "org-456", + email: "user@example.com", + display_name: "Test User", + user_id: "local-uuid", + consent_timestamp: "2026-03-20T00:00:00.000Z", + api_key: "st_live_abc", + }; + + // All fields are accessible + expect(identity.enrolled).toBe(true); + expect(identity.cloud_user_id).toBe("cloud-123"); + expect(identity.cloud_org_id).toBe("org-456"); + expect(identity.email).toBe("user@example.com"); + expect(identity.display_name).toBe("Test User"); + expect(identity.user_id).toBe("local-uuid"); + expect(identity.consent_timestamp).toBe("2026-03-20T00:00:00.000Z"); + expect(identity.api_key).toBe("st_live_abc"); + }); + + test("optional fields can be undefined", () => { + const identity: AlphaIdentity = { + enrolled: false, + user_id: "local-uuid", + consent_timestamp: "2026-03-20T00:00:00.000Z", + }; + + expect(identity.cloud_user_id).toBeUndefined(); + expect(identity.cloud_org_id).toBeUndefined(); + expect(identity.email).toBeUndefined(); + expect(identity.display_name).toBeUndefined(); + expect(identity.api_key).toBeUndefined(); + }); +}); diff --git a/tests/alpha-upload/build-payloads.test.ts b/tests/alpha-upload/build-payloads.test.ts new file mode 100644 index 00000000..193c411c --- /dev/null +++ b/tests/alpha-upload/build-payloads.test.ts @@ -0,0 +1,523 @@ +/** + * Tests for V2 canonical push payload builder (staging-based). + * + * Validates that buildV2PushPayload correctly reads from the + * canonical_upload_staging table using a single monotonic cursor + * and assembles records into a V2 push payload. 
+ */ + +import { Database } from "bun:sqlite"; +import { afterEach, beforeEach, describe, expect, test } from "bun:test"; +import { buildV2PushPayload } from "../../cli/selftune/alpha-upload/build-payloads.js"; +import { ALL_DDL, MIGRATIONS, POST_MIGRATION_INDEXES } from "../../cli/selftune/localdb/schema.js"; + +// -- Test helpers ------------------------------------------------------------- + +function createTestDb(): Database { + const db = new Database(":memory:"); + for (const ddl of ALL_DDL) db.run(ddl); + for (const m of MIGRATIONS) { + try { + db.run(m); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + if (!message.includes("duplicate column")) { + throw error; + } + } + } + for (const idx of POST_MIGRATION_INDEXES) { + db.run(idx); + } + return db; +} + +function stageRecord( + db: Database, + opts: { + record_kind: string; + record_id: string; + record_json: unknown; + session_id?: string; + prompt_id?: string; + normalized_at?: string; + }, +): void { + db.run( + `INSERT OR IGNORE INTO canonical_upload_staging + (record_kind, record_id, record_json, session_id, prompt_id, normalized_at, staged_at) + VALUES (?, ?, ?, ?, ?, ?, ?)`, + [ + opts.record_kind, + opts.record_id, + typeof opts.record_json === "string" ? opts.record_json : JSON.stringify(opts.record_json), + opts.session_id ?? null, + opts.prompt_id ?? null, + opts.normalized_at ?? 
new Date().toISOString(), + new Date().toISOString(), + ], + ); +} + +function makeSessionJson(sessionId: string) { + return { + record_kind: "session", + schema_version: "2.0", + normalizer_version: "1.0.0", + normalized_at: "2026-03-18T10:00:00.000Z", + platform: "claude_code", + capture_mode: "replay", + source_session_kind: "interactive", + raw_source_ref: { path: "/some/transcript.jsonl" }, + session_id: sessionId, + started_at: "2026-03-18T09:00:00.000Z", + ended_at: "2026-03-18T09:30:00.000Z", + model: "opus", + completion_status: "completed", + }; +} + +function makePromptJson(promptId: string, sessionId: string) { + return { + record_kind: "prompt", + schema_version: "2.0", + normalizer_version: "1.0.0", + normalized_at: "2026-03-18T10:00:00.000Z", + platform: "claude_code", + capture_mode: "replay", + source_session_kind: "interactive", + raw_source_ref: {}, + session_id: sessionId, + prompt_id: promptId, + occurred_at: "2026-03-18T09:01:00.000Z", + prompt_text: "improve my skills", + prompt_kind: "user", + is_actionable: true, + prompt_index: 0, + }; +} + +function makeInvocationJson(invId: string, sessionId: string) { + return { + record_kind: "skill_invocation", + schema_version: "2.0", + normalizer_version: "1.0.0", + normalized_at: "2026-03-18T10:00:00.000Z", + platform: "claude_code", + capture_mode: "replay", + source_session_kind: "interactive", + raw_source_ref: {}, + session_id: sessionId, + skill_invocation_id: invId, + occurred_at: "2026-03-18T09:02:00.000Z", + skill_name: "selftune", + invocation_mode: "implicit", + triggered: true, + confidence: 0.95, + }; +} + +function makeExecutionFactJson(sessionId: string) { + return { + record_kind: "execution_fact", + schema_version: "2.0", + normalizer_version: "1.0.0", + normalized_at: "2026-03-18T10:00:00.000Z", + platform: "claude_code", + capture_mode: "replay", + source_session_kind: "interactive", + raw_source_ref: {}, + session_id: sessionId, + execution_fact_id: `ef-${sessionId}`, + 
occurred_at: "2026-03-18T09:03:00.000Z", + tool_calls_json: { Read: 3, Edit: 2 }, + total_tool_calls: 5, + assistant_turns: 3, + errors_encountered: 0, + }; +} + +function makeEvolutionEvidenceJson(proposalId: string) { + return { + timestamp: "2026-03-18T10:10:00.000Z", + skill_name: "selftune", + proposal_id: proposalId, + target: "description", + stage: "deployed", + rationale: "improved routing accuracy", + confidence: 0.85, + original_text: "old description", + proposed_text: "new description", + }; +} + +// -- Tests -------------------------------------------------------------------- + +describe("buildV2PushPayload (staging-based)", () => { + let db: Database; + + beforeEach(() => { + db = createTestDb(); + }); + afterEach(() => { + db.close(); + }); + + test("returns null when staging table is empty", () => { + const result = buildV2PushPayload(db); + expect(result).toBeNull(); + }); + + test("returns null when all records are past cursor", () => { + stageRecord(db, { + record_kind: "session", + record_id: "sess-1", + record_json: makeSessionJson("sess-1"), + session_id: "sess-1", + }); + const result = buildV2PushPayload(db, 999999); + expect(result).toBeNull(); + }); + + test("builds V2 payload with correct schema_version", () => { + stageRecord(db, { + record_kind: "session", + record_id: "sess-1", + record_json: makeSessionJson("sess-1"), + session_id: "sess-1", + }); + + const result = buildV2PushPayload(db); + expect(result).not.toBeNull(); + + const payload = result?.payload; + expect(payload.schema_version).toBe("2.0"); + expect(payload.push_id).toBeDefined(); + expect(typeof payload.push_id).toBe("string"); + }); + + test("includes sessions in canonical.sessions", () => { + stageRecord(db, { + record_kind: "session", + record_id: "sess-map", + record_json: makeSessionJson("sess-map"), + session_id: "sess-map", + }); + + const result = buildV2PushPayload(db); + const canonical = result?.payload.canonical as Record; + const sessions = 
canonical.sessions; + + expect(sessions).toHaveLength(1); + const s = sessions[0] as Record; + expect(s.record_kind).toBe("session"); + expect(s.schema_version).toBe("2.0"); + expect(s.session_id).toBe("sess-map"); + expect(s.platform).toBe("claude_code"); + expect(s.model).toBe("opus"); + expect(s.started_at).toBe("2026-03-18T09:00:00.000Z"); + expect(s.ended_at).toBe("2026-03-18T09:30:00.000Z"); + }); + + test("includes prompts in canonical.prompts", () => { + stageRecord(db, { + record_kind: "prompt", + record_id: "p-1", + record_json: makePromptJson("p-1", "sess-1"), + session_id: "sess-1", + prompt_id: "p-1", + }); + + const result = buildV2PushPayload(db); + const canonical = result?.payload.canonical as Record; + const prompts = canonical.prompts; + + expect(prompts).toHaveLength(1); + const p = prompts[0] as Record; + expect(p.record_kind).toBe("prompt"); + expect(p.prompt_id).toBe("p-1"); + expect(p.prompt_text).toBe("improve my skills"); + }); + + test("includes skill_invocations in canonical.skill_invocations", () => { + stageRecord(db, { + record_kind: "skill_invocation", + record_id: "inv-1", + record_json: makeInvocationJson("inv-1", "sess-1"), + session_id: "sess-1", + }); + + const result = buildV2PushPayload(db); + const canonical = result?.payload.canonical as Record; + const invocations = canonical.skill_invocations; + + expect(invocations).toHaveLength(1); + const inv = invocations[0] as Record; + expect(inv.record_kind).toBe("skill_invocation"); + expect(inv.skill_name).toBe("selftune"); + expect(inv.triggered).toBe(true); + expect(inv.confidence).toBe(0.95); + }); + + test("includes execution_facts in canonical.execution_facts", () => { + stageRecord(db, { + record_kind: "execution_fact", + record_id: "ef-1", + record_json: makeExecutionFactJson("sess-1"), + session_id: "sess-1", + }); + + const result = buildV2PushPayload(db); + const canonical = result?.payload.canonical as Record; + const facts = canonical.execution_facts; + + 
expect(facts).toHaveLength(1); + const f = facts[0] as Record; + expect(f.record_kind).toBe("execution_fact"); + expect(f.total_tool_calls).toBe(5); + expect(f.assistant_turns).toBe(3); + expect(f.errors_encountered).toBe(0); + }); + + test("includes evolution_evidence in canonical.evolution_evidence", () => { + stageRecord(db, { + record_kind: "evolution_evidence", + record_id: "prop-1:deployed:2026-03-18T10:10:00Z", + record_json: makeEvolutionEvidenceJson("prop-1"), + }); + + const result = buildV2PushPayload(db); + const canonical = result?.payload.canonical as Record; + const evidence = canonical.evolution_evidence; + + expect(evidence).toHaveLength(1); + const e = evidence[0] as Record; + expect(e.skill_name).toBe("selftune"); + expect(e.proposal_id).toBe("prop-1"); + expect(e.original_text).toBe("old description"); + expect(e.proposed_text).toBe("new description"); + }); + + test("returns correct lastSeq for cursor advancement", () => { + stageRecord(db, { + record_kind: "session", + record_id: "sess-1", + record_json: makeSessionJson("sess-1"), + session_id: "sess-1", + }); + stageRecord(db, { + record_kind: "session", + record_id: "sess-2", + record_json: makeSessionJson("sess-2"), + session_id: "sess-2", + }); + + const first = buildV2PushPayload(db); + expect(first).not.toBeNull(); + expect(first?.lastSeq).toBeGreaterThan(0); + + // Second call with cursor from first should get nothing + const second = buildV2PushPayload(db, first?.lastSeq); + expect(second).toBeNull(); + }); + + test("handles mixed data -- some record types present, others not", () => { + stageRecord(db, { + record_kind: "session", + record_id: "sess-1", + record_json: makeSessionJson("sess-1"), + session_id: "sess-1", + }); + stageRecord(db, { + record_kind: "skill_invocation", + record_id: "inv-1", + record_json: makeInvocationJson("inv-1", "sess-1"), + session_id: "sess-1", + }); + // No prompts, execution_facts, or evolution_evidence + + const result = buildV2PushPayload(db); + 
expect(result).not.toBeNull(); + + const canonical = result?.payload.canonical as Record; + expect(canonical.sessions).toHaveLength(1); + expect(canonical.skill_invocations).toHaveLength(1); + expect(canonical.prompts).toHaveLength(0); + expect(canonical.execution_facts).toHaveLength(0); + }); + + test("canonical records have preserved base fields (no hardcoding)", () => { + const sessionJson = makeSessionJson("sess-fields"); + // Override with non-default values to prove they aren't hardcoded + sessionJson.capture_mode = "hook"; + sessionJson.normalizer_version = "3.5.0"; + sessionJson.raw_source_ref = { path: "/custom.jsonl", raw_id: "xyz" }; + + stageRecord(db, { + record_kind: "session", + record_id: "sess-fields", + record_json: sessionJson, + session_id: "sess-fields", + }); + + const result = buildV2PushPayload(db); + const canonical = result?.payload.canonical as Record; + const session = canonical.sessions[0] as Record; + + expect(session.record_kind).toBe("session"); + expect(session.schema_version).toBe("2.0"); + expect(session.capture_mode).toBe("hook"); + expect(session.normalizer_version).toBe("3.5.0"); + expect(session.raw_source_ref).toEqual({ path: "/custom.jsonl", raw_id: "xyz" }); + }); + + test("includes orchestrate_runs in canonical.orchestrate_runs", () => { + const orchestrateRunJson = { + run_id: "orch-bp-1", + timestamp: "2026-03-18T11:00:00.000Z", + elapsed_ms: 12000, + dry_run: false, + approval_mode: "auto", + total_skills: 5, + evaluated: 4, + evolved: 1, + deployed: 1, + watched: 2, + skipped: 1, + skill_actions: [ + { skill: "selftune", action: "evolve", reason: "low pass rate", deployed: true }, + { skill: "commit", action: "watch", reason: "recently deployed" }, + { skill: "test-runner", action: "skip", reason: "insufficient data" }, + ], + }; + + stageRecord(db, { + record_kind: "orchestrate_run", + record_id: "orch-bp-1", + record_json: orchestrateRunJson, + }); + + const result = buildV2PushPayload(db); + 
expect(result).not.toBeNull(); + + const canonical = result?.payload.canonical as Record; + expect(canonical.orchestrate_runs).toBeDefined(); + expect(canonical.orchestrate_runs).toHaveLength(1); + + const run = canonical.orchestrate_runs[0] as Record; + expect(run.run_id).toBe("orch-bp-1"); + expect(run.dry_run).toBe(false); + expect(run.approval_mode).toBe("auto"); + expect(run.total_skills).toBe(5); + expect(run.elapsed_ms).toBe(12000); + const actions = run.skill_actions as unknown[]; + expect(actions).toHaveLength(3); + }); + + test("returns payload with only orchestrate_runs (no canonical records)", () => { + stageRecord(db, { + record_kind: "orchestrate_run", + record_id: "orch-only-1", + record_json: { + run_id: "orch-only-1", + timestamp: "2026-03-18T11:00:00.000Z", + elapsed_ms: 1000, + dry_run: true, + approval_mode: "review", + total_skills: 1, + evaluated: 1, + evolved: 0, + deployed: 0, + watched: 0, + skipped: 1, + skill_actions: [{ skill: "test", action: "skip", reason: "dry run" }], + }, + }); + + const result = buildV2PushPayload(db); + expect(result).not.toBeNull(); + + const canonical = result?.payload.canonical as Record; + expect(canonical.sessions).toHaveLength(0); + expect(canonical.orchestrate_runs).toHaveLength(1); + }); + + test("respects limit parameter", () => { + for (let i = 0; i < 10; i++) { + stageRecord(db, { + record_kind: "session", + record_id: `sess-limit-${i}`, + record_json: makeSessionJson(`sess-limit-${i}`), + session_id: `sess-limit-${i}`, + }); + } + + const result = buildV2PushPayload(db, undefined, 3); + expect(result).not.toBeNull(); + + const canonical = result?.payload.canonical as Record; + expect(canonical.sessions).toHaveLength(3); + }); + + test("returns null when all staged rows have malformed record_json", () => { + stageRecord(db, { + record_kind: "session", + record_id: "bad-json-1", + record_json: "{not valid json", + session_id: "bad-json-1", + }); + + const result = buildV2PushPayload(db); + 
expect(result).toBeNull(); + }); + + test("returns null when a malformed staged row blocks the front of the batch", () => { + stageRecord(db, { + record_kind: "session", + record_id: "bad-json-2", + record_json: "{not valid json", + session_id: "bad-json-2", + }); + stageRecord(db, { + record_kind: "session", + record_id: "sess-valid-1", + record_json: makeSessionJson("sess-valid-1"), + session_id: "sess-valid-1", + }); + + const result = buildV2PushPayload(db); + expect(result).toBeNull(); + }); + + test("does not advance the cursor past malformed staged rows", () => { + stageRecord(db, { + record_kind: "session", + record_id: "sess-valid-before-bad", + record_json: makeSessionJson("sess-valid-before-bad"), + session_id: "sess-valid-before-bad", + }); + stageRecord(db, { + record_kind: "session", + record_id: "bad-json-3", + record_json: "{not valid json", + session_id: "bad-json-3", + }); + stageRecord(db, { + record_kind: "session", + record_id: "sess-valid-after-bad", + record_json: makeSessionJson("sess-valid-after-bad"), + session_id: "sess-valid-after-bad", + }); + + const result = buildV2PushPayload(db); + expect(result).not.toBeNull(); + + const canonical = result?.payload.canonical as Record; + expect(canonical.sessions).toHaveLength(1); + const session = canonical.sessions[0] as Record; + expect(session.session_id).toBe("sess-valid-before-bad"); + + const second = buildV2PushPayload(db, result?.lastSeq); + expect(second).toBeNull(); + }); +}); diff --git a/tests/alpha-upload/e2e.test.ts b/tests/alpha-upload/e2e.test.ts new file mode 100644 index 00000000..2930e845 --- /dev/null +++ b/tests/alpha-upload/e2e.test.ts @@ -0,0 +1,730 @@ +/** + * End-to-end integration tests for the alpha upload pipeline. + * + * Tests the full flow: staging -> enqueue -> flush -> status verification. + * Uses an in-memory SQLite database and a mock HTTP endpoint via globalThis.fetch. 
+ */ + +import { Database } from "bun:sqlite"; +import { afterEach, beforeEach, describe, expect, it, mock } from "bun:test"; +import { flushQueue } from "../../cli/selftune/alpha-upload/flush.js"; +import { prepareUploads, runUploadCycle } from "../../cli/selftune/alpha-upload/index.js"; +import { + getPendingUploads, + getQueueStats, + readWatermark, +} from "../../cli/selftune/alpha-upload/queue.js"; +import type { QueueItem, QueueOperations } from "../../cli/selftune/alpha-upload-contract.js"; +import { getLastUploadError, getLastUploadSuccess } from "../../cli/selftune/localdb/queries.js"; +import { ALL_DDL, MIGRATIONS, POST_MIGRATION_INDEXES } from "../../cli/selftune/localdb/schema.js"; +import { checkAlphaQueueHealth } from "../../cli/selftune/observability.js"; +import { type AlphaStatusInfo, formatAlphaStatus } from "../../cli/selftune/status.js"; + +// --------------------------------------------------------------------------- +// Test helpers +// --------------------------------------------------------------------------- + +function createTestDb(): Database { + const db = new Database(":memory:"); + db.exec("PRAGMA journal_mode = WAL"); + for (const ddl of ALL_DDL) { + db.exec(ddl); + } + for (const migration of MIGRATIONS) { + try { + db.exec(migration); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + if (!message.includes("duplicate column")) { + throw error; + } + } + } + for (const idx of POST_MIGRATION_INDEXES) { + db.exec(idx); + } + return db; +} + +/** Stage canonical session records directly into the staging table. 
*/ +function stageSessions(db: Database, count: number): void { + for (let i = 0; i < count; i++) { + const sid = `e2e-session-${i}`; + const record = { + record_kind: "session", + schema_version: "2.0", + normalizer_version: "1.0.0", + normalized_at: "2026-01-01T00:00:00.000Z", + platform: "claude_code", + capture_mode: "replay", + source_session_kind: "interactive", + raw_source_ref: {}, + session_id: sid, + started_at: "2026-01-01T00:00:00.000Z", + ended_at: "2026-01-01T01:00:00.000Z", + model: "opus", + completion_status: "completed", + }; + db.run( + `INSERT OR IGNORE INTO canonical_upload_staging + (record_kind, record_id, record_json, session_id, staged_at) + VALUES (?, ?, ?, ?, ?)`, + ["session", sid, JSON.stringify(record), sid, new Date().toISOString()], + ); + } +} + +/** Stage canonical prompt records directly. */ +function stagePrompts(db: Database, count: number): void { + for (let i = 0; i < count; i++) { + const pid = `e2e-prompt-${i}`; + const record = { + record_kind: "prompt", + schema_version: "2.0", + normalizer_version: "1.0.0", + normalized_at: "2026-01-01T00:00:00.000Z", + platform: "claude_code", + capture_mode: "replay", + source_session_kind: "interactive", + raw_source_ref: {}, + session_id: "e2e-session-0", + prompt_id: pid, + occurred_at: "2026-01-01T00:00:00.000Z", + prompt_text: "test prompt", + prompt_kind: "user", + is_actionable: true, + prompt_index: i, + }; + db.run( + `INSERT OR IGNORE INTO canonical_upload_staging + (record_kind, record_id, record_json, session_id, prompt_id, staged_at) + VALUES (?, ?, ?, ?, ?, ?)`, + ["prompt", pid, JSON.stringify(record), "e2e-session-0", pid, new Date().toISOString()], + ); + } +} + +/** Stage evolution evidence records directly using V2 deterministic shape. 
*/ +function stageEvolutionEvidence(db: Database, count: number): void { + for (let i = 0; i < count; i++) { + const evidenceId = `ev_e2e-prop-${i}_deployed_${i}`; + const recordId = `evidence-${evidenceId}:deployed:2026-01-01T00:00:00Z`; + const record = { + evidence_id: evidenceId, + skill_name: "Research", + proposal_id: `e2e-prop-${i}`, + target: "description", + stage: "deployed", + rationale: "improved accuracy", + confidence: 0.85, + timestamp: "2026-01-01T00:00:00.000Z", + original_text: "old", + proposed_text: "new", + }; + db.run( + `INSERT OR IGNORE INTO canonical_upload_staging + (record_kind, record_id, record_json, staged_at) + VALUES (?, ?, ?, ?)`, + ["evolution_evidence", recordId, JSON.stringify(record), new Date().toISOString()], + ); + } +} + +/** Build QueueOperations adapter from a db for flush engine. */ +async function buildQueueOps(db: Database): Promise { + const { markSending, markSent, markFailed } = await import( + "../../cli/selftune/alpha-upload/queue.js" + ); + return { + getPending: (limit: number) => getPendingUploads(db, limit) as QueueItem[], + markSending: (id: number) => markSending(db, [id]), + markSent: (id: number) => markSent(db, [id]), + markFailed: (id: number, error?: string) => markFailed(db, id, error ?? 
"unknown"), + }; +} + +// --------------------------------------------------------------------------- +// E2E: Full pipeline flow +// --------------------------------------------------------------------------- + +describe("e2e: full upload pipeline", () => { + let db: Database; + let originalFetch: typeof globalThis.fetch; + + beforeEach(() => { + db = createTestDb(); + originalFetch = globalThis.fetch; + }); + + afterEach(() => { + globalThis.fetch = originalFetch; + db.close(); + }); + + it("stages records, enqueues, flushes to mock endpoint, and updates queue status", async () => { + // Step 1: Stage sample records + stageSessions(db, 3); + stagePrompts(db, 2); + stageEvolutionEvidence(db, 1); + + // Step 2: Prepare uploads (builds V2 payload and enqueues) + const prepared = prepareUploads( + db, + "e2e-user", + "claude_code", + "0.2.7", + "/nonexistent/canonical.jsonl", + ); + expect(prepared.enqueued).toBe(1); + expect(prepared.types).toContain("canonical"); + + // Verify queue state after prepare + const statsAfterPrepare = getQueueStats(db); + expect(statsAfterPrepare.pending).toBe(1); + expect(statsAfterPrepare.sent).toBe(0); + + // Step 3: Mock the HTTP endpoint to return success + let postedPayload: Record | null = null; + let capturedHeaders: Record = {}; + + globalThis.fetch = mock(async (_input: RequestInfo | URL, init?: RequestInit) => { + capturedHeaders = Object.fromEntries(new Headers(init?.headers).entries()); + postedPayload = JSON.parse(init?.body as string); + return new Response(JSON.stringify({ success: true, push_id: "test-push-id", errors: [] }), { + status: 200, + headers: { "Content-Type": "application/json" }, + }); + }); + + // Step 4: Flush the queue + const queueOps = await buildQueueOps(db); + const flush = await flushQueue(queueOps, "https://mock.selftune.dev/api/v1/push", { + apiKey: "test-api-key-123", + }); + + expect(flush.sent).toBe(1); + expect(flush.failed).toBe(0); + + // Step 5: Verify the HTTP request was correct + 
expect(postedPayload).not.toBeNull(); + expect((postedPayload as Record).schema_version).toBe("2.0"); + expect((postedPayload as Record).push_id).toBeDefined(); + expect((postedPayload as Record).canonical).toBeDefined(); + expect(capturedHeaders.authorization).toBe("Bearer test-api-key-123"); + expect(capturedHeaders["content-type"]).toBe("application/json"); + + // Step 6: Verify queue status updated to sent + const statsAfterFlush = getQueueStats(db); + expect(statsAfterFlush.pending).toBe(0); + expect(statsAfterFlush.sent).toBe(1); + + // Step 7: Verify watermark advanced + const watermark = readWatermark(db, "canonical"); + expect(watermark).not.toBeNull(); + expect(watermark ?? 0).toBeGreaterThan(0); + + // Step 8: Running again with no new records produces no new uploads + const secondPrepare = prepareUploads( + db, + "e2e-user", + "claude_code", + "0.2.7", + "/nonexistent/canonical.jsonl", + ); + expect(secondPrepare.enqueued).toBe(0); + }); + + it("runUploadCycle handles the full cycle end-to-end", async () => { + // Stage records first + stageSessions(db, 2); + + // Mock successful endpoint + globalThis.fetch = mock(async () => { + return new Response(JSON.stringify({ success: true, push_id: "cycle-push-id", errors: [] }), { + status: 200, + }); + }); + + // Run the full cycle + const result = await runUploadCycle(db, { + enrolled: true, + userId: "e2e-user", + agentType: "claude_code", + selftuneVersion: "0.2.7", + endpoint: "https://mock.selftune.dev/api/v1/push", + apiKey: "cycle-key-abc", + canonicalLogPath: "/nonexistent/canonical.jsonl", + }); + + expect(result.enrolled).toBe(true); + expect(result.prepared).toBe(1); + expect(result.sent).toBe(1); + expect(result.failed).toBe(0); + expect(result.skipped).toBe(0); + + // Verify queue is clean + const stats = getQueueStats(db); + expect(stats.pending).toBe(0); + expect(stats.sent).toBe(1); + + // Running again produces no new uploads + const secondRun = await runUploadCycle(db, { + enrolled: true, + 
userId: "e2e-user", + agentType: "claude_code", + selftuneVersion: "0.2.7", + endpoint: "https://mock.selftune.dev/api/v1/push", + apiKey: "cycle-key-abc", + canonicalLogPath: "/nonexistent/canonical.jsonl", + }); + + expect(secondRun.prepared).toBe(0); + expect(secondRun.sent).toBe(0); + }); + + it("dry-run mode does not send HTTP requests", async () => { + stageSessions(db, 2); + + let fetchCalled = false; + globalThis.fetch = mock(async () => { + fetchCalled = true; + return new Response("should not be called", { status: 500 }); + }); + + const result = await runUploadCycle(db, { + enrolled: true, + userId: "e2e-user", + agentType: "claude_code", + selftuneVersion: "0.2.7", + endpoint: "https://mock.selftune.dev/api/v1/push", + dryRun: true, + canonicalLogPath: "/nonexistent/canonical.jsonl", + }); + + expect(result.enrolled).toBe(true); + expect(result.prepared).toBe(1); + expect(result.sent).toBe(0); + expect(fetchCalled).toBe(false); + }); +}); + +// --------------------------------------------------------------------------- +// E2E: Failure scenarios +// --------------------------------------------------------------------------- + +describe("e2e: failure scenarios", () => { + let db: Database; + let originalFetch: typeof globalThis.fetch; + + beforeEach(() => { + db = createTestDb(); + originalFetch = globalThis.fetch; + }); + + afterEach(() => { + globalThis.fetch = originalFetch; + db.close(); + }); + + it("auth failure (401) marks items as failed with descriptive message", async () => { + stageSessions(db, 1); + + globalThis.fetch = mock(async () => { + return new Response("Unauthorized", { status: 401 }); + }); + + const result = await runUploadCycle(db, { + enrolled: true, + userId: "e2e-user", + agentType: "claude_code", + selftuneVersion: "0.2.7", + endpoint: "https://mock.selftune.dev/api/v1/push", + apiKey: "bad-key", + canonicalLogPath: "/nonexistent/canonical.jsonl", + }); + + expect(result.enrolled).toBe(true); + expect(result.prepared).toBe(1); 
+ expect(result.failed).toBe(1); + expect(result.sent).toBe(0); + + // Check error message recorded + const lastError = getLastUploadError(db); + expect(lastError).not.toBeNull(); + expect(lastError?.last_error).toContain("Authentication failed"); + }); + + it("auth failure (403) marks items as failed with permission message", async () => { + stageSessions(db, 1); + + globalThis.fetch = mock(async () => { + return new Response("Forbidden", { status: 403 }); + }); + + const result = await runUploadCycle(db, { + enrolled: true, + userId: "e2e-user", + agentType: "claude_code", + selftuneVersion: "0.2.7", + endpoint: "https://mock.selftune.dev/api/v1/push", + apiKey: "forbidden-key", + canonicalLogPath: "/nonexistent/canonical.jsonl", + }); + + expect(result.failed).toBe(1); + const lastError = getLastUploadError(db); + expect(lastError?.last_error).toContain("Authorization denied"); + }); + + it("network-unreachable endpoint keeps records in queue with failure status", async () => { + stageSessions(db, 1); + + globalThis.fetch = mock(async () => { + throw new Error("connect ECONNREFUSED 127.0.0.1:1"); + }); + + // Prepare manually so we can control flush options (maxRetries=1 to skip backoff) + const prepared = prepareUploads( + db, + "e2e-user", + "claude_code", + "0.2.7", + "/nonexistent/canonical.jsonl", + ); + expect(prepared.enqueued).toBe(1); + + // Flush with maxRetries=1 to avoid exponential backoff timeout + const queueOps = await buildQueueOps(db); + const flush = await flushQueue(queueOps, "http://localhost:1/nonexistent", { + apiKey: "test-key", + maxRetries: 1, + }); + + expect(flush.failed).toBe(1); + expect(flush.sent).toBe(0); + + // Error recorded in queue + const lastError = getLastUploadError(db); + expect(lastError).not.toBeNull(); + expect(lastError?.last_error).toContain("exhausted retries"); + }); + + it("409 conflict is treated as success (duplicate push_id)", async () => { + stageSessions(db, 1); + + globalThis.fetch = mock(async () => { + 
return new Response("Conflict: duplicate push_id", { status: 409 }); + }); + + const result = await runUploadCycle(db, { + enrolled: true, + userId: "e2e-user", + agentType: "claude_code", + selftuneVersion: "0.2.7", + endpoint: "https://mock.selftune.dev/api/v1/push", + apiKey: "test-key", + canonicalLogPath: "/nonexistent/canonical.jsonl", + }); + + expect(result.sent).toBe(1); + expect(result.failed).toBe(0); + + const stats = getQueueStats(db); + expect(stats.sent).toBe(1); + expect(stats.failed).toBe(0); + }); + + it("second run picks up where first left off (watermark persistence)", async () => { + // Stage 3 sessions + stageSessions(db, 3); + + // First run: mock success + globalThis.fetch = mock(async () => { + return new Response(JSON.stringify({ success: true, push_id: "run1", errors: [] }), { + status: 200, + }); + }); + + const firstRun = await runUploadCycle(db, { + enrolled: true, + userId: "e2e-user", + agentType: "claude_code", + selftuneVersion: "0.2.7", + endpoint: "https://mock.selftune.dev/api/v1/push", + apiKey: "test-key", + canonicalLogPath: "/nonexistent/canonical.jsonl", + }); + + expect(firstRun.prepared).toBe(1); + expect(firstRun.sent).toBe(1); + const watermarkAfterFirst = readWatermark(db, "canonical"); + + // Add more records AFTER the first run + for (let i = 100; i < 103; i++) { + const sid = `e2e-session-${i}`; + const record = { + record_kind: "session", + schema_version: "2.0", + normalizer_version: "1.0.0", + normalized_at: "2026-01-02T00:00:00.000Z", + platform: "claude_code", + capture_mode: "replay", + source_session_kind: "interactive", + raw_source_ref: {}, + session_id: sid, + started_at: "2026-01-02T00:00:00.000Z", + ended_at: "2026-01-02T01:00:00.000Z", + model: "opus", + completion_status: "completed", + }; + db.run( + `INSERT OR IGNORE INTO canonical_upload_staging + (record_kind, record_id, record_json, session_id, staged_at) + VALUES (?, ?, ?, ?, ?)`, + ["session", sid, JSON.stringify(record), sid, new 
Date().toISOString()], + ); + } + + // Second run: should only pick up the new records + const secondRun = await runUploadCycle(db, { + enrolled: true, + userId: "e2e-user", + agentType: "claude_code", + selftuneVersion: "0.2.7", + endpoint: "https://mock.selftune.dev/api/v1/push", + apiKey: "test-key", + canonicalLogPath: "/nonexistent/canonical.jsonl", + }); + + expect(secondRun.prepared).toBe(1); + expect(secondRun.sent).toBe(1); + + // Watermark should have advanced further + const watermarkAfterSecond = readWatermark(db, "canonical"); + expect(watermarkAfterSecond).not.toBeNull(); + expect(watermarkAfterSecond ?? 0).toBeGreaterThan(watermarkAfterFirst ?? 0); + + // Queue should show 2 sent total + const stats = getQueueStats(db); + expect(stats.sent).toBe(2); + }); + + it("missing API key still enqueues but flush fails with auth error", async () => { + stageSessions(db, 1); + + globalThis.fetch = mock(async () => { + return new Response("Unauthorized", { status: 401 }); + }); + + // Run without API key + const result = await runUploadCycle(db, { + enrolled: true, + userId: "e2e-user", + agentType: "claude_code", + selftuneVersion: "0.2.7", + endpoint: "https://mock.selftune.dev/api/v1/push", + // no apiKey + canonicalLogPath: "/nonexistent/canonical.jsonl", + }); + + // Records were prepared/enqueued + expect(result.prepared).toBe(1); + // But flush failed due to 401 + expect(result.failed).toBe(1); + }); + + it("unenrolled user gets empty summary without any network calls", async () => { + stageSessions(db, 5); + + let fetchCalled = false; + globalThis.fetch = mock(async () => { + fetchCalled = true; + return new Response("should not be called", { status: 500 }); + }); + + const result = await runUploadCycle(db, { + enrolled: false, + canonicalLogPath: "/nonexistent/canonical.jsonl", + }); + + expect(result.enrolled).toBe(false); + expect(result.prepared).toBe(0); + expect(result.sent).toBe(0); + expect(result.failed).toBe(0); + 
expect(fetchCalled).toBe(false); + }); +}); + +// --------------------------------------------------------------------------- +// E2E: Observability and status visibility +// --------------------------------------------------------------------------- + +describe("e2e: status visibility after uploads", () => { + let db: Database; + let originalFetch: typeof globalThis.fetch; + + beforeEach(() => { + db = createTestDb(); + originalFetch = globalThis.fetch; + }); + + afterEach(() => { + globalThis.fetch = originalFetch; + db.close(); + }); + + it("queue stats reflect accurate counts after mixed success/failure uploads", async () => { + // Stage and run a successful upload + stageSessions(db, 1); + + globalThis.fetch = mock(async () => { + return new Response(JSON.stringify({ success: true, push_id: "ok", errors: [] }), { + status: 200, + }); + }); + + await runUploadCycle(db, { + enrolled: true, + userId: "e2e-user", + agentType: "claude_code", + selftuneVersion: "0.2.7", + endpoint: "https://mock.selftune.dev/api/v1/push", + apiKey: "key", + canonicalLogPath: "/nonexistent/canonical.jsonl", + }); + + // Now stage and run a failed upload + for (let i = 10; i < 11; i++) { + const sid = `e2e-session-${i}`; + db.run( + `INSERT OR IGNORE INTO canonical_upload_staging + (record_kind, record_id, record_json, session_id, staged_at) + VALUES (?, ?, ?, ?, ?)`, + [ + "session", + sid, + JSON.stringify({ + record_kind: "session", + schema_version: "2.0", + normalizer_version: "1.0.0", + normalized_at: "2026-01-01T00:00:00.000Z", + platform: "claude_code", + capture_mode: "replay", + source_session_kind: "interactive", + raw_source_ref: {}, + session_id: sid, + started_at: "2026-01-01T00:00:00.000Z", + ended_at: "2026-01-01T01:00:00.000Z", + model: "opus", + completion_status: "completed", + }), + sid, + new Date().toISOString(), + ], + ); + } + + globalThis.fetch = mock(async () => { + return new Response("Unauthorized", { status: 401 }); + }); + + await runUploadCycle(db, { + 
enrolled: true, + userId: "e2e-user", + agentType: "claude_code", + selftuneVersion: "0.2.7", + endpoint: "https://mock.selftune.dev/api/v1/push", + apiKey: "bad-key", + canonicalLogPath: "/nonexistent/canonical.jsonl", + }); + + // Verify stats + const stats = getQueueStats(db); + expect(stats.sent).toBe(1); + expect(stats.failed).toBe(1); + expect(stats.pending).toBe(0); + + // Verify last error/success queries + const lastError = getLastUploadError(db); + expect(lastError).not.toBeNull(); + expect(lastError?.last_error).toContain("Authentication failed"); + + const lastSuccess = getLastUploadSuccess(db); + expect(lastSuccess).not.toBeNull(); + }); + + it("formatAlphaStatus renders correctly with live queue data", async () => { + // Populate queue with mixed statuses + stageSessions(db, 2); + + globalThis.fetch = mock(async () => { + return new Response(JSON.stringify({ success: true, push_id: "ok", errors: [] }), { + status: 200, + }); + }); + + await runUploadCycle(db, { + enrolled: true, + userId: "e2e-user", + agentType: "claude_code", + selftuneVersion: "0.2.7", + endpoint: "https://mock.selftune.dev/api/v1/push", + apiKey: "key", + canonicalLogPath: "/nonexistent/canonical.jsonl", + }); + + // Build status info from real queue data + const info: AlphaStatusInfo = { + enrolled: true, + stats: getQueueStats(db), + lastError: getLastUploadError(db), + lastSuccess: getLastUploadSuccess(db), + }; + + const output = formatAlphaStatus(info); + expect(output).toContain("enrolled"); + expect(output).toContain("Sent:"); + + // Check sent count appears in output + expect(info.stats.sent).toBe(1); + }); + + it("doctor checks detect stuck items after failed upload", async () => { + // Insert an old pending item to simulate a stuck upload + const twoHoursAgo = new Date(Date.now() - 2 * 3600 * 1000).toISOString(); + db.run( + `INSERT INTO upload_queue (payload_type, payload_json, status, attempts, created_at, updated_at) + VALUES (?, '{}', 'pending', 0, ?, ?)`, + ["push", 
twoHoursAgo, twoHoursAgo], + ); + + const checks = await checkAlphaQueueHealth(db, true); + const stuckCheck = checks.find((c) => c.name === "alpha_queue_stuck"); + expect(stuckCheck).toBeDefined(); + expect(stuckCheck?.status).toBe("warn"); + expect(stuckCheck?.message).toContain("old"); + }); + + it("doctor checks pass when queue is healthy after successful upload", async () => { + stageSessions(db, 1); + + globalThis.fetch = mock(async () => { + return new Response(JSON.stringify({ success: true, push_id: "ok", errors: [] }), { + status: 200, + }); + }); + + await runUploadCycle(db, { + enrolled: true, + userId: "e2e-user", + agentType: "claude_code", + selftuneVersion: "0.2.7", + endpoint: "https://mock.selftune.dev/api/v1/push", + apiKey: "key", + canonicalLogPath: "/nonexistent/canonical.jsonl", + }); + + const checks = await checkAlphaQueueHealth(db, true); + expect(checks.every((c) => c.status === "pass")).toBe(true); + }); +}); diff --git a/tests/alpha-upload/flush.test.ts b/tests/alpha-upload/flush.test.ts new file mode 100644 index 00000000..522090d9 --- /dev/null +++ b/tests/alpha-upload/flush.test.ts @@ -0,0 +1,474 @@ +import { afterEach, describe, expect, mock, test } from "bun:test"; +import { uploadPushPayload } from "../../cli/selftune/alpha-upload/client.js"; +import { flushQueue } from "../../cli/selftune/alpha-upload/flush.js"; +import type { QueueItem, QueueOperations } from "../../cli/selftune/alpha-upload-contract.js"; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +function makePayload(overrides?: Record): Record { + return { + schema_version: "2.0", + push_id: "test-push-id", + client_version: "0.2.7", + normalizer_version: "1.0.0", + canonical: { + sessions: [], + prompts: [], + skill_invocations: [], + execution_facts: [], + normalization_runs: [], + evolution_evidence: [], + }, + ...overrides, + }; +} + 
+function makeQueueItem(id: number, overrides?: Partial): QueueItem { + const payload = makePayload(); + return { + id, + payload_type: "push", + status: "pending", + attempts: 0, + created_at: new Date().toISOString(), + updated_at: new Date().toISOString(), + last_error: null, + payload_json: JSON.stringify(payload), + ...overrides, + }; +} + +function createMockQueue( + items: QueueItem[], +): QueueOperations & { calls: Record } { + const calls: Record = { + getPending: [], + markSending: [], + markSent: [], + markFailed: [], + }; + + let pendingItems = [...items]; + + return { + calls, + getPending(limit: number): QueueItem[] { + calls.getPending.push([limit]); + const result = pendingItems.filter((i) => i.status === "pending").slice(0, limit); + pendingItems = pendingItems.filter((i) => !result.some((r) => r.id === i.id)); + return result; + }, + markSending(id: number): boolean { + calls.markSending.push([id]); + return true; + }, + markSent(id: number): boolean { + calls.markSent.push([id]); + return true; + }, + markFailed(id: number, error?: string): boolean { + calls.markFailed.push([id, error]); + return true; + }, + }; +} + +// --------------------------------------------------------------------------- +// uploadPushPayload tests +// --------------------------------------------------------------------------- + +describe("uploadPushPayload", () => { + const originalFetch = globalThis.fetch; + + afterEach(() => { + globalThis.fetch = originalFetch; + }); + + test("returns success result on 200 response", async () => { + const payload = makePayload(); + globalThis.fetch = mock( + async () => + new Response(JSON.stringify({ success: true, push_id: "test-push-id", errors: [] }), { + status: 200, + }), + ); + + const result = await uploadPushPayload(payload, "https://api.example.com/api/v1/push"); + expect(result.success).toBe(true); + expect(result.errors).toEqual([]); + }); + + test("treats 201 accepted cloud response as success", async () => { + const 
payload = makePayload(); + globalThis.fetch = mock( + async () => + new Response( + JSON.stringify({ + status: "accepted", + push_id: "accepted-push-id", + canonical_sessions_written: 1, + }), + { status: 201 }, + ), + ); + + const result = await uploadPushPayload(payload, "https://api.example.com/api/v1/push"); + expect(result.success).toBe(true); + expect(result.push_id).toBe("accepted-push-id"); + expect(result.errors).toEqual([]); + expect(result._status).toBe(201); + }); + + test("sends correct headers without API key", async () => { + const payload = makePayload(); + let capturedHeaders: Headers | null = null; + + globalThis.fetch = mock(async (_input: RequestInfo | URL, init?: RequestInit) => { + capturedHeaders = new Headers(init?.headers); + return new Response(JSON.stringify({ success: true, push_id: "id", errors: [] }), { + status: 200, + }); + }); + + await uploadPushPayload(payload, "https://api.example.com/api/v1/push"); + + expect(capturedHeaders).not.toBeNull(); + if (capturedHeaders === null) { + throw new Error("fetch was not called - capturedHeaders is null"); + } + expect(capturedHeaders.get("Content-Type")).toBe("application/json"); + expect(capturedHeaders.get("User-Agent")).toMatch(/^selftune\//); + expect(capturedHeaders.get("Authorization")).toBeNull(); + }); + + test("sends Authorization header when API key is provided", async () => { + const payload = makePayload(); + let capturedHeaders: Headers | null = null; + + globalThis.fetch = mock(async (_input: RequestInfo | URL, init?: RequestInit) => { + capturedHeaders = new Headers(init?.headers); + return new Response(JSON.stringify({ success: true, push_id: "id", errors: [] }), { + status: 200, + }); + }); + + await uploadPushPayload(payload, "https://api.example.com/api/v1/push", "my-secret-key"); + + expect(capturedHeaders).not.toBeNull(); + if (capturedHeaders === null) { + throw new Error("fetch was not called - capturedHeaders is null"); + } + 
expect(capturedHeaders.get("Authorization")).toBe("Bearer my-secret-key"); + }); + + test("sends POST with JSON body containing schema_version 2.0", async () => { + const payload = makePayload(); + let capturedMethod: string | undefined; + let capturedBody: string | undefined; + + globalThis.fetch = mock(async (_input: RequestInfo | URL, init?: RequestInit) => { + capturedMethod = init?.method; + capturedBody = init?.body as string; + return new Response(JSON.stringify({ success: true, push_id: "id", errors: [] }), { + status: 200, + }); + }); + + await uploadPushPayload(payload, "https://api.example.com/api/v1/push"); + + expect(capturedMethod).toBe("POST"); + if (capturedBody === undefined) { + throw new Error("fetch was not called - capturedBody is undefined"); + } + const parsed = JSON.parse(capturedBody); + expect(parsed.schema_version).toBe("2.0"); + expect(parsed.canonical).toBeDefined(); + }); + + test("returns error result on invalid JSON response shape", async () => { + const payload = makePayload(); + globalThis.fetch = mock( + async () => + new Response(JSON.stringify({ success: "yes", errors: "nope" }), { + status: 200, + }), + ); + + const result = await uploadPushPayload(payload, "https://api.example.com/api/v1/push"); + expect(result.success).toBe(false); + expect(result.errors[0]).toContain("Invalid JSON response shape"); + }); + + test("returns error result on 4xx response", async () => { + const payload = makePayload(); + globalThis.fetch = mock(async () => new Response("Bad Request", { status: 400 })); + + const result = await uploadPushPayload(payload, "https://api.example.com/api/v1/push"); + expect(result.success).toBe(false); + expect(result.errors.length).toBeGreaterThan(0); + }); + + test("returns error result on 5xx response", async () => { + const payload = makePayload(); + globalThis.fetch = mock(async () => new Response("Internal Server Error", { status: 500 })); + + const result = await uploadPushPayload(payload, 
"https://api.example.com/api/v1/push"); + expect(result.success).toBe(false); + expect(result.errors.length).toBeGreaterThan(0); + }); + + test("returns error result on network failure without throwing", async () => { + const payload = makePayload(); + globalThis.fetch = mock(async () => { + throw new Error("Network unreachable"); + }); + + const result = await uploadPushPayload(payload, "https://api.example.com/api/v1/push"); + expect(result.success).toBe(false); + expect(result.errors[0]).toContain("Network unreachable"); + }); +}); + +// --------------------------------------------------------------------------- +// flushQueue tests +// --------------------------------------------------------------------------- + +describe("flushQueue", () => { + const originalFetch = globalThis.fetch; + + afterEach(() => { + globalThis.fetch = originalFetch; + }); + + test("returns zero summary when queue is empty", async () => { + const queue = createMockQueue([]); + const summary = await flushQueue(queue, "https://api.example.com/api/v1/push"); + expect(summary).toEqual({ sent: 0, failed: 0, skipped: 0 }); + }); + + test("uploads all pending items on success", async () => { + const items = [makeQueueItem(1), makeQueueItem(2), makeQueueItem(3)]; + const queue = createMockQueue(items); + + globalThis.fetch = mock( + async () => + new Response(JSON.stringify({ success: true, push_id: "id", errors: [] }), { status: 200 }), + ); + + const summary = await flushQueue(queue, "https://api.example.com/api/v1/push"); + + expect(summary.sent).toBe(3); + expect(summary.failed).toBe(0); + expect(summary.skipped).toBe(0); + expect(queue.calls.markSending.length).toBe(3); + expect(queue.calls.markSent.length).toBe(3); + }); + + test("treats 409 (duplicate push_id) as success", async () => { + const items = [makeQueueItem(1)]; + const queue = createMockQueue(items); + + globalThis.fetch = mock( + async () => new Response("Conflict: duplicate push_id", { status: 409 }), + ); + + const summary 
= await flushQueue(queue, "https://api.example.com/api/v1/push", { + maxRetries: 3, + }); + + expect(summary.sent).toBe(1); + expect(summary.failed).toBe(0); + expect(queue.calls.markSent.length).toBe(1); + }); + + test("treats 401 as non-retryable auth error", async () => { + const items = [makeQueueItem(1)]; + const queue = createMockQueue(items); + let callCount = 0; + + globalThis.fetch = mock(async () => { + callCount++; + return new Response("Unauthorized", { status: 401 }); + }); + + const summary = await flushQueue(queue, "https://api.example.com/api/v1/push", { + maxRetries: 3, + }); + + expect(summary.failed).toBe(1); + expect(callCount).toBe(1); // No retries + expect(queue.calls.markFailed.length).toBe(1); + const firstFailure = queue.calls.markFailed[0]; + expect(firstFailure).toBeDefined(); + if (!firstFailure) { + throw new Error("queue.markFailed was not called"); + } + const errorMsg = firstFailure[1] as string; + expect(errorMsg).toContain("Authentication failed"); + expect(errorMsg).toContain("API key"); + }); + + test("treats 403 as non-retryable auth error", async () => { + const items = [makeQueueItem(1)]; + const queue = createMockQueue(items); + let callCount = 0; + + globalThis.fetch = mock(async () => { + callCount++; + return new Response("Forbidden", { status: 403 }); + }); + + const summary = await flushQueue(queue, "https://api.example.com/api/v1/push", { + maxRetries: 3, + }); + + expect(summary.failed).toBe(1); + expect(callCount).toBe(1); // No retries + const firstFailure = queue.calls.markFailed[0]; + expect(firstFailure).toBeDefined(); + if (!firstFailure) { + throw new Error("queue.markFailed was not called"); + } + const errorMsg = firstFailure[1] as string; + expect(errorMsg).toContain("Authorization denied"); + expect(errorMsg).toContain("selftune doctor"); + }); + + test("passes API key through to uploadPushPayload", async () => { + const items = [makeQueueItem(1)]; + const queue = createMockQueue(items); + let 
capturedHeaders: Headers | null = null; + + globalThis.fetch = mock(async (_input: RequestInfo | URL, init?: RequestInit) => { + capturedHeaders = new Headers(init?.headers); + return new Response(JSON.stringify({ success: true, push_id: "id", errors: [] }), { + status: 200, + }); + }); + + await flushQueue(queue, "https://api.example.com/api/v1/push", { + apiKey: "test-api-key", + }); + + expect(capturedHeaders).not.toBeNull(); + if (capturedHeaders === null) { + throw new Error("fetch was not called - capturedHeaders is null"); + } + expect(capturedHeaders.get("Authorization")).toBe("Bearer test-api-key"); + }); + + test("marks items as failed when upload fails", async () => { + const items = [makeQueueItem(1)]; + const queue = createMockQueue(items); + + globalThis.fetch = mock(async () => new Response("Server Error", { status: 500 })); + + const summary = await flushQueue(queue, "https://api.example.com/api/v1/push", { + maxRetries: 1, + }); + + expect(summary.failed).toBe(1); + expect(summary.sent).toBe(0); + expect(queue.calls.markFailed.length).toBeGreaterThanOrEqual(1); + }); + + test("skips items that already exceeded max attempts", async () => { + const items = [makeQueueItem(1, { attempts: 5 })]; + const queue = createMockQueue(items); + + globalThis.fetch = mock( + async () => + new Response(JSON.stringify({ success: true, push_id: "id", errors: [] }), { status: 200 }), + ); + + const summary = await flushQueue(queue, "https://api.example.com/api/v1/push", { + maxRetries: 5, + }); + + expect(summary.failed).toBe(1); + expect(summary.sent).toBe(0); + expect(queue.calls.markSending.length).toBe(0); + expect(queue.calls.markFailed.length).toBe(1); + }); + + test("respects batchSize option", async () => { + const items = [makeQueueItem(1), makeQueueItem(2), makeQueueItem(3)]; + const queue = createMockQueue(items); + + globalThis.fetch = mock( + async () => + new Response(JSON.stringify({ success: true, push_id: "id", errors: [] }), { status: 200 }), + ); + 
+ await flushQueue(queue, "https://api.example.com/api/v1/push", { batchSize: 2 }); + + expect(queue.calls.getPending[0]?.[0]).toBe(2); + }); + + test("dry-run mode does not make HTTP calls", async () => { + const items = [makeQueueItem(1), makeQueueItem(2)]; + const queue = createMockQueue(items); + let fetchCallCount = 0; + + globalThis.fetch = mock(async () => { + fetchCallCount++; + return new Response(JSON.stringify({ success: true, push_id: "id", errors: [] }), { + status: 200, + }); + }); + + const summary = await flushQueue(queue, "https://api.example.com/api/v1/push", { + dryRun: true, + }); + + expect(fetchCallCount).toBe(0); + expect(summary.sent).toBe(0); + expect(summary.skipped).toBe(2); + expect(queue.calls.markSent.length).toBe(0); + expect(queue.calls.markFailed.length).toBe(0); + }); + + test("retries with backoff on transient failure then succeeds", async () => { + const items = [makeQueueItem(1)]; + const queue = createMockQueue(items); + let callCount = 0; + + globalThis.fetch = mock(async () => { + callCount++; + if (callCount === 1) { + return new Response("Server Error", { status: 500 }); + } + return new Response(JSON.stringify({ success: true, push_id: "id", errors: [] }), { + status: 200, + }); + }); + + const summary = await flushQueue(queue, "https://api.example.com/api/v1/push", { + maxRetries: 3, + }); + + expect(summary.sent).toBe(1); + expect(summary.failed).toBe(0); + expect(callCount).toBe(2); + }); + + test("does not retry on 4xx client errors (except 401/403/409)", async () => { + const items = [makeQueueItem(1)]; + const queue = createMockQueue(items); + let callCount = 0; + + globalThis.fetch = mock(async () => { + callCount++; + return new Response("Bad Request", { status: 400 }); + }); + + const summary = await flushQueue(queue, "https://api.example.com/api/v1/push", { + maxRetries: 3, + }); + + expect(summary.failed).toBe(1); + expect(callCount).toBe(1); + }); +}); diff --git a/tests/alpha-upload/integration.test.ts 
b/tests/alpha-upload/integration.test.ts new file mode 100644 index 00000000..6e4cdf91 --- /dev/null +++ b/tests/alpha-upload/integration.test.ts @@ -0,0 +1,468 @@ +/** + * Integration tests for the alpha upload orchestration module (V2). + * + * Tests prepareUploads, runUploadCycle, API key flow, and fail-open contract. + * Uses an in-memory SQLite database with the full schema applied. + * + * The upload pipeline now uses a staging-based approach: + * 1. stageCanonicalRecords() stages from JSONL + evolution evidence + * 2. buildV2PushPayload() reads staged records via single monotonic cursor + * 3. prepareUploads() enqueues the resulting payload + * + * Since integration tests seed data directly into SQLite tables (not JSONL), + * we must also stage them into canonical_upload_staging before prepareUploads + * can build a payload from them. + */ + +import { Database } from "bun:sqlite"; +import { afterEach, beforeEach, describe, expect, it, mock } from "bun:test"; +import { enqueueUpload, getQueueStats } from "../../cli/selftune/alpha-upload/queue.js"; +import { ALL_DDL, MIGRATIONS, POST_MIGRATION_INDEXES } from "../../cli/selftune/localdb/schema.js"; + +// --------------------------------------------------------------------------- +// Test helpers +// --------------------------------------------------------------------------- + +function createTestDb(): Database { + const db = new Database(":memory:"); + db.exec("PRAGMA journal_mode = WAL"); + for (const ddl of ALL_DDL) { + db.exec(ddl); + } + for (const migration of MIGRATIONS) { + try { + db.exec(migration); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + if (!message.includes("duplicate column")) { + throw error; + } + } + } + for (const idx of POST_MIGRATION_INDEXES) { + db.exec(idx); + } + return db; +} + +/** Stage a canonical session record directly into the staging table. 
*/ +function stageSessions(db: Database, count: number): void { + for (let i = 0; i < count; i++) { + const sid = `session-${i}`; + const record = { + record_kind: "session", + schema_version: "2.0", + normalizer_version: "1.0.0", + normalized_at: "2026-01-01T00:00:00.000Z", + platform: "claude_code", + capture_mode: "replay", + source_session_kind: "interactive", + raw_source_ref: {}, + session_id: sid, + started_at: "2026-01-01T00:00:00.000Z", + ended_at: "2026-01-01T01:00:00.000Z", + model: "opus", + completion_status: "completed", + }; + db.run( + `INSERT OR IGNORE INTO canonical_upload_staging + (record_kind, record_id, record_json, session_id, staged_at) + VALUES (?, ?, ?, ?, ?)`, + ["session", sid, JSON.stringify(record), sid, new Date().toISOString()], + ); + } +} + +/** Stage canonical prompt records directly. */ +function stagePrompts(db: Database, count: number): void { + for (let i = 0; i < count; i++) { + const pid = `prompt-${i}`; + const record = { + record_kind: "prompt", + schema_version: "2.0", + normalizer_version: "1.0.0", + normalized_at: "2026-01-01T00:00:00.000Z", + platform: "claude_code", + capture_mode: "replay", + source_session_kind: "interactive", + raw_source_ref: {}, + session_id: "session-0", + prompt_id: pid, + occurred_at: "2026-01-01T00:00:00.000Z", + prompt_text: "test prompt", + prompt_kind: "user", + is_actionable: true, + prompt_index: i, + }; + db.run( + `INSERT OR IGNORE INTO canonical_upload_staging + (record_kind, record_id, record_json, session_id, prompt_id, staged_at) + VALUES (?, ?, ?, ?, ?, ?)`, + ["prompt", pid, JSON.stringify(record), "session-0", pid, new Date().toISOString()], + ); + } +} + +/** Stage canonical invocation records directly. 
*/ +function stageInvocations(db: Database, count: number): void { + for (let i = 0; i < count; i++) { + const invId = `inv-${i}`; + const record = { + record_kind: "skill_invocation", + schema_version: "2.0", + normalizer_version: "1.0.0", + normalized_at: "2026-01-01T00:00:00.000Z", + platform: "claude_code", + capture_mode: "replay", + source_session_kind: "interactive", + raw_source_ref: {}, + session_id: "session-0", + skill_invocation_id: invId, + occurred_at: "2026-01-01T00:00:00.000Z", + skill_name: "Research", + invocation_mode: "implicit", + triggered: true, + confidence: 0.9, + }; + db.run( + `INSERT OR IGNORE INTO canonical_upload_staging + (record_kind, record_id, record_json, session_id, staged_at) + VALUES (?, ?, ?, ?, ?)`, + ["skill_invocation", invId, JSON.stringify(record), "session-0", new Date().toISOString()], + ); + } +} + +/** Stage canonical execution fact records directly. */ +function stageExecutionFacts(db: Database, count: number): void { + for (let i = 0; i < count; i++) { + const efId = `ef-${i}`; + const record = { + record_kind: "execution_fact", + schema_version: "2.0", + normalizer_version: "1.0.0", + normalized_at: "2026-01-01T00:00:00.000Z", + platform: "claude_code", + capture_mode: "replay", + source_session_kind: "interactive", + raw_source_ref: {}, + session_id: "session-0", + execution_fact_id: efId, + occurred_at: "2026-01-01T00:00:00.000Z", + tool_calls_json: { Read: 3 }, + total_tool_calls: 3, + assistant_turns: 2, + errors_encountered: 0, + }; + db.run( + `INSERT OR IGNORE INTO canonical_upload_staging + (record_kind, record_id, record_json, session_id, staged_at) + VALUES (?, ?, ?, ?, ?)`, + ["execution_fact", efId, JSON.stringify(record), "session-0", new Date().toISOString()], + ); + } +} + +/** Stage evolution evidence records directly. 
*/ +function stageEvolutionEvidence(db: Database, count: number): void { + for (let i = 0; i < count; i++) { + const recordId = `ev-stage-${i}`; + const record = { + evidence_id: recordId, + timestamp: "2026-01-01T00:00:00.000Z", + skill_name: "Research", + skill_path: "/tmp/skills/Research/SKILL.md", + proposal_id: `prop-${i}`, + target: "description", + stage: "deployed", + rationale: "improved accuracy", + confidence: 0.85, + details: "pass rate improved", + original_text: "old", + proposed_text: "new", + }; + db.run( + `INSERT OR IGNORE INTO canonical_upload_staging + (record_kind, record_id, record_json, staged_at) + VALUES (?, ?, ?, ?)`, + ["evolution_evidence", recordId, JSON.stringify(record), new Date().toISOString()], + ); + } +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +describe("alpha-upload/index -- prepareUploads (V2 staging)", () => { + let db: Database; + + beforeEach(() => { + db = createTestDb(); + }); + + afterEach(() => { + db.close(); + }); + + it("returns empty summary when no staged rows exist", async () => { + const { prepareUploads } = await import("../../cli/selftune/alpha-upload/index.js"); + const result = prepareUploads( + db, + "test-user", + "claude_code", + "0.2.7", + "/nonexistent/canonical.jsonl", + ); + expect(result.enqueued).toBe(0); + expect(result.types).toEqual([]); + }); + + it("enqueues a single V2 push payload from staged sessions", async () => { + stageSessions(db, 3); + const { prepareUploads } = await import("../../cli/selftune/alpha-upload/index.js"); + const result = prepareUploads( + db, + "test-user", + "claude_code", + "0.2.7", + "/nonexistent/canonical.jsonl", + ); + expect(result.enqueued).toBe(1); + expect(result.types).toContain("canonical"); + + const stats = getQueueStats(db); + expect(stats.pending).toBe(1); + }); + + it("enqueues payload including staged invocations", async () 
=> { + stageSessions(db, 1); + stageInvocations(db, 5); + const { prepareUploads } = await import("../../cli/selftune/alpha-upload/index.js"); + const result = prepareUploads( + db, + "test-user", + "claude_code", + "0.2.7", + "/nonexistent/canonical.jsonl", + ); + expect(result.enqueued).toBe(1); + expect(result.types).toContain("canonical"); + }); + + it("enqueues payload including staged evolution_evidence", async () => { + stageEvolutionEvidence(db, 2); + const { prepareUploads } = await import("../../cli/selftune/alpha-upload/index.js"); + const result = prepareUploads( + db, + "test-user", + "claude_code", + "0.2.7", + "/nonexistent/canonical.jsonl", + ); + expect(result.enqueued).toBe(1); + expect(result.types).toContain("canonical"); + }); + + it("enqueues payload including all record types", async () => { + stageSessions(db, 1); + stagePrompts(db, 2); + stageInvocations(db, 3); + stageExecutionFacts(db, 1); + stageEvolutionEvidence(db, 1); + const { prepareUploads } = await import("../../cli/selftune/alpha-upload/index.js"); + const result = prepareUploads( + db, + "test-user", + "claude_code", + "0.2.7", + "/nonexistent/canonical.jsonl", + ); + expect(result.enqueued).toBe(1); + expect(result.types).toContain("canonical"); + }); + + it("respects watermarks -- does not re-enqueue already-uploaded rows", async () => { + stageSessions(db, 3); + const { prepareUploads } = await import("../../cli/selftune/alpha-upload/index.js"); + + // First call enqueues + const first = prepareUploads( + db, + "test-user", + "claude_code", + "0.2.7", + "/nonexistent/canonical.jsonl", + ); + expect(first.enqueued).toBe(1); + + // Second call finds no new rows (watermark advanced) + const second = prepareUploads( + db, + "test-user", + "claude_code", + "0.2.7", + "/nonexistent/canonical.jsonl", + ); + expect(second.enqueued).toBe(0); + }); + + it("produces V2 payload with schema_version 2.0", async () => { + stageSessions(db, 1); + const { prepareUploads } = await 
import("../../cli/selftune/alpha-upload/index.js"); + prepareUploads(db, "test-user", "claude_code", "0.2.7", "/nonexistent/canonical.jsonl"); + + // Read the queued payload + const row = db + .query("SELECT payload_json FROM upload_queue WHERE status = 'pending' LIMIT 1") + .get() as { payload_json: string }; + const payload = JSON.parse(row.payload_json); + expect(payload.schema_version).toBe("2.0"); + expect(payload.push_id).toBeDefined(); + expect(payload.canonical).toBeDefined(); + expect(payload.canonical.sessions).toBeDefined(); + }); +}); + +describe("alpha-upload/index -- runUploadCycle (V2 staging)", () => { + let db: Database; + + beforeEach(() => { + db = createTestDb(); + }); + + afterEach(() => { + db.close(); + }); + + it("returns empty summary when unenrolled", async () => { + const { runUploadCycle } = await import("../../cli/selftune/alpha-upload/index.js"); + const result = await runUploadCycle(db, { + enrolled: false, + endpoint: "https://api.selftune.dev/api/v1/push", + canonicalLogPath: "/nonexistent/canonical.jsonl", + }); + expect(result.enrolled).toBe(false); + expect(result.prepared).toBe(0); + expect(result.sent).toBe(0); + expect(result.failed).toBe(0); + expect(result.skipped).toBe(0); + }); + + it("prepares and flushes when enrolled (dry-run)", async () => { + stageSessions(db, 2); + + const { runUploadCycle } = await import("../../cli/selftune/alpha-upload/index.js"); + const result = await runUploadCycle(db, { + enrolled: true, + userId: "test-user", + agentType: "claude_code", + selftuneVersion: "0.2.7", + endpoint: "https://api.selftune.dev/api/v1/push", + dryRun: true, + canonicalLogPath: "/nonexistent/canonical.jsonl", + }); + + expect(result.enrolled).toBe(true); + expect(result.prepared).toBe(1); + // In dry-run mode, nothing is actually sent + expect(result.sent).toBe(0); + }); + + it("passes apiKey through to flush", async () => { + stageSessions(db, 1); + const originalFetch = globalThis.fetch; + let capturedHeaders: Headers 
| null = null; + + globalThis.fetch = mock(async (_input: RequestInfo | URL, init?: RequestInit) => { + capturedHeaders = new Headers(init?.headers); + return new Response(JSON.stringify({ success: true, push_id: "id", errors: [] }), { + status: 200, + }); + }); + + try { + const { runUploadCycle } = await import("../../cli/selftune/alpha-upload/index.js"); + await runUploadCycle(db, { + enrolled: true, + userId: "test-user", + agentType: "claude_code", + selftuneVersion: "0.2.7", + endpoint: "https://api.selftune.dev/api/v1/push", + apiKey: "test-secret-key", + canonicalLogPath: "/nonexistent/canonical.jsonl", + }); + + expect(capturedHeaders).not.toBeNull(); + if (capturedHeaders === null) { + throw new Error("fetch was not called - capturedHeaders is null"); + } + expect(capturedHeaders.get("Authorization")).toBe("Bearer test-secret-key"); + } finally { + globalThis.fetch = originalFetch; + } + }); + + it("does not throw on upload errors", async () => { + const { runUploadCycle } = await import("../../cli/selftune/alpha-upload/index.js"); + + // Pre-enqueue an item with corrupt JSON to force the fail-open parse path. 
+ enqueueUpload(db, "push", "not-valid-json"); + + const result = await runUploadCycle(db, { + enrolled: true, + userId: "test-user", + agentType: "claude_code", + selftuneVersion: "0.2.7", + endpoint: "https://api.selftune.dev/api/v1/push", + canonicalLogPath: "/nonexistent/canonical.jsonl", + }); + + // Should not throw -- fail open + expect(result.enrolled).toBe(true); + expect(typeof result.prepared).toBe("number"); + expect(typeof result.sent).toBe("number"); + expect(typeof result.failed).toBe("number"); + }); +}); + +describe("alpha-upload/index -- fail-open guarantees (V2 staging)", () => { + it("prepareUploads never throws even with a broken database", async () => { + const { prepareUploads } = await import("../../cli/selftune/alpha-upload/index.js"); + const db = new Database(":memory:"); + try { + // No schema applied -- all queries will fail + const result = prepareUploads( + db, + "test-user", + "claude_code", + "0.2.7", + "/nonexistent/canonical.jsonl", + ); + expect(result.enqueued).toBe(0); + expect(result.types).toEqual([]); + } finally { + db.close(); + } + }); + + it("runUploadCycle never throws even with a broken database", async () => { + const { runUploadCycle } = await import("../../cli/selftune/alpha-upload/index.js"); + const db = new Database(":memory:"); + try { + // No schema applied + const result = await runUploadCycle(db, { + enrolled: true, + userId: "test-user", + agentType: "claude_code", + selftuneVersion: "0.2.7", + endpoint: "https://api.selftune.dev/api/v1/push", + canonicalLogPath: "/nonexistent/canonical.jsonl", + }); + expect(result.enrolled).toBe(true); + expect(result.prepared).toBe(0); + } finally { + db.close(); + } + }); +}); diff --git a/tests/alpha-upload/queue.test.ts b/tests/alpha-upload/queue.test.ts new file mode 100644 index 00000000..ee5567ad --- /dev/null +++ b/tests/alpha-upload/queue.test.ts @@ -0,0 +1,294 @@ +/** + * Tests for alpha upload queue and watermark storage layer. 
+ * + * Uses in-memory SQLite via openDb(":memory:") for isolation. + */ + +import type { Database } from "bun:sqlite"; +import { afterEach, beforeEach, describe, expect, test } from "bun:test"; +import { + enqueueUpload, + getPendingUploads, + getQueueStats, + markFailed, + markSending, + markSent, + readWatermark, + writeWatermark, +} from "../../cli/selftune/alpha-upload/queue.js"; +import { openDb } from "../../cli/selftune/localdb/db.js"; + +let db: Database; + +beforeEach(() => { + db = openDb(":memory:"); +}); + +afterEach(() => { + db.close(); +}); + +// -- enqueueUpload ------------------------------------------------------------ + +describe("enqueueUpload", () => { + test("inserts a pending item with correct fields", () => { + const payload = JSON.stringify({ session_id: "s1", platform: "claude" }); + const ok = enqueueUpload(db, "session", payload); + expect(ok).toBe(true); + + const row = db.query("SELECT * FROM upload_queue WHERE id = 1").get() as Record< + string, + unknown + >; + expect(row).toBeTruthy(); + expect(row.payload_type).toBe("session"); + expect(row.payload_json).toBe(payload); + expect(row.status).toBe("pending"); + expect(row.attempts).toBe(0); + expect(row.last_error).toBeNull(); + expect(typeof row.created_at).toBe("string"); + expect(typeof row.updated_at).toBe("string"); + }); + + test("auto-increments id across multiple inserts", () => { + enqueueUpload(db, "session", "{}"); + enqueueUpload(db, "invocation", "{}"); + enqueueUpload(db, "evolution", "{}"); + + const rows = db.query("SELECT id FROM upload_queue ORDER BY id").all() as Array<{ id: number }>; + expect(rows.map((r) => r.id)).toEqual([1, 2, 3]); + }); +}); + +// -- getPendingUploads -------------------------------------------------------- + +describe("getPendingUploads", () => { + test("returns only pending items, oldest first", () => { + enqueueUpload(db, "session", '{"a":1}'); + enqueueUpload(db, "session", '{"a":2}'); + enqueueUpload(db, "invocation", '{"a":3}'); + + // 
Mark first as sending so it's no longer pending + markSending(db, [1]); + + const pending = getPendingUploads(db); + expect(pending.length).toBe(2); + expect(pending[0].id).toBe(2); + expect(pending[1].id).toBe(3); + }); + + test("respects limit parameter", () => { + for (let i = 0; i < 10; i++) { + enqueueUpload(db, "session", `{"i":${i}}`); + } + const pending = getPendingUploads(db, 3); + expect(pending.length).toBe(3); + expect(pending[0].id).toBe(1); + }); + + test("returns empty array when no pending items", () => { + const pending = getPendingUploads(db); + expect(pending).toEqual([]); + }); +}); + +// -- markSending -------------------------------------------------------------- + +describe("markSending", () => { + test("transitions pending items to sending", () => { + enqueueUpload(db, "session", "{}"); + enqueueUpload(db, "session", "{}"); + + const ok = markSending(db, [1, 2]); + expect(ok).toBe(true); + + const rows = db.query("SELECT status FROM upload_queue ORDER BY id").all() as Array<{ + status: string; + }>; + expect(rows.every((r) => r.status === "sending")).toBe(true); + }); + + test("does not transition non-pending items", () => { + enqueueUpload(db, "session", "{}"); + markSending(db, [1]); + // Try to transition again (already sending) + markSending(db, [1]); + + const row = db.query("SELECT status FROM upload_queue WHERE id = 1").get() as { + status: string; + }; + expect(row.status).toBe("sending"); + }); +}); + +// -- markSent ----------------------------------------------------------------- + +describe("markSent", () => { + test("transitions sending items to sent", () => { + enqueueUpload(db, "session", "{}"); + markSending(db, [1]); + + const ok = markSent(db, [1]); + expect(ok).toBe(true); + + const row = db.query("SELECT status FROM upload_queue WHERE id = 1").get() as { + status: string; + }; + expect(row.status).toBe("sent"); + }); + + test("updates watermark to max id per payload_type", () => { + enqueueUpload(db, "session", "{}"); + 
enqueueUpload(db, "session", "{}"); + enqueueUpload(db, "invocation", "{}"); + markSending(db, [1, 2, 3]); + markSent(db, [1, 2, 3]); + + const sessionWm = readWatermark(db, "session"); + expect(sessionWm).toBe(2); + + const invocationWm = readWatermark(db, "invocation"); + expect(invocationWm).toBe(3); + }); + + test("does not advance the watermark for rows that were never sending", () => { + enqueueUpload(db, "session", "{}"); + + expect(markSent(db, [1])).toBe(true); + expect(readWatermark(db, "session")).toBeNull(); + }); +}); + +// -- markFailed --------------------------------------------------------------- + +describe("markFailed", () => { + test("transitions sending item to failed and records error", () => { + enqueueUpload(db, "session", "{}"); + markSending(db, [1]); + + const ok = markFailed(db, 1, "network timeout"); + expect(ok).toBe(true); + + const row = db + .query("SELECT status, attempts, last_error FROM upload_queue WHERE id = 1") + .get() as { status: string; attempts: number; last_error: string }; + expect(row.status).toBe("failed"); + expect(row.attempts).toBe(1); + expect(row.last_error).toBe("network timeout"); + }); + + test("increments attempts on repeated failures", () => { + enqueueUpload(db, "session", "{}"); + + // First failure cycle + markSending(db, [1]); + markFailed(db, 1, "error 1"); + + // Reset to pending for retry, then fail again + db.run("UPDATE upload_queue SET status = 'pending' WHERE id = 1"); + markSending(db, [1]); + markFailed(db, 1, "error 2"); + + const row = db.query("SELECT attempts, last_error FROM upload_queue WHERE id = 1").get() as { + attempts: number; + last_error: string; + }; + expect(row.attempts).toBe(2); + expect(row.last_error).toBe("error 2"); + }); +}); + +// -- getQueueStats ------------------------------------------------------------ + +describe("getQueueStats", () => { + test("returns counts by status", () => { + enqueueUpload(db, "session", "{}"); + enqueueUpload(db, "session", "{}"); + 
enqueueUpload(db, "invocation", "{}"); + markSending(db, [1]); + markSent(db, [1]); + markSending(db, [2]); + markFailed(db, 2, "err"); + + const stats = getQueueStats(db); + expect(stats.pending).toBe(1); + expect(stats.sending).toBe(0); + expect(stats.sent).toBe(1); + expect(stats.failed).toBe(1); + }); + + test("returns all zeros for empty queue", () => { + const stats = getQueueStats(db); + expect(stats).toEqual({ pending: 0, sending: 0, sent: 0, failed: 0 }); + }); +}); + +// -- readWatermark / writeWatermark ------------------------------------------- + +describe("watermarks", () => { + test("readWatermark returns null for unknown payload type", () => { + const wm = readWatermark(db, "session"); + expect(wm).toBeNull(); + }); + + test("writeWatermark inserts new watermark", () => { + writeWatermark(db, "session", 42); + const wm = readWatermark(db, "session"); + expect(wm).toBe(42); + }); + + test("writeWatermark upserts existing watermark", () => { + writeWatermark(db, "session", 10); + writeWatermark(db, "session", 50); + const wm = readWatermark(db, "session"); + expect(wm).toBe(50); + }); + + test("watermarks are independent per payload_type", () => { + writeWatermark(db, "session", 100); + writeWatermark(db, "invocation", 200); + writeWatermark(db, "evolution", 300); + + expect(readWatermark(db, "session")).toBe(100); + expect(readWatermark(db, "invocation")).toBe(200); + expect(readWatermark(db, "evolution")).toBe(300); + }); +}); + +// -- Schema validation -------------------------------------------------------- + +describe("schema", () => { + test("upload_queue table exists with correct columns", () => { + const cols = db.query("PRAGMA table_info(upload_queue)").all() as Array<{ + name: string; + type: string; + }>; + const colNames = cols.map((c) => c.name); + expect(colNames).toContain("id"); + expect(colNames).toContain("payload_type"); + expect(colNames).toContain("payload_json"); + expect(colNames).toContain("status"); + 
expect(colNames).toContain("attempts"); + expect(colNames).toContain("created_at"); + expect(colNames).toContain("updated_at"); + expect(colNames).toContain("last_error"); + }); + + test("upload_watermarks table exists with correct columns", () => { + const cols = db.query("PRAGMA table_info(upload_watermarks)").all() as Array<{ + name: string; + type: string; + }>; + const colNames = cols.map((c) => c.name); + expect(colNames).toContain("payload_type"); + expect(colNames).toContain("last_uploaded_id"); + expect(colNames).toContain("updated_at"); + }); + + test("indexes exist on upload_queue", () => { + const indexes = db.query("PRAGMA index_list(upload_queue)").all() as Array<{ name: string }>; + const indexNames = indexes.map((i) => i.name); + expect(indexNames).toContain("idx_upload_queue_status"); + expect(indexNames).toContain("idx_upload_queue_type_status"); + }); +}); diff --git a/tests/alpha-upload/staging.test.ts b/tests/alpha-upload/staging.test.ts new file mode 100644 index 00000000..e503df31 --- /dev/null +++ b/tests/alpha-upload/staging.test.ts @@ -0,0 +1,785 @@ +/** + * Tests for the canonical upload staging pipeline. 
+ * + * Covers: + * - stageCanonicalRecords() inserting from JSONL + * - Dedup behavior (staging same records twice) + * - buildV2PushPayload() reading from staging with cursor + * - Evolution evidence staged alongside canonical records + * - Output passing PushPayloadV2Schema validation + */ + +import { Database } from "bun:sqlite"; +import { afterEach, beforeEach, describe, expect, test } from "bun:test"; +import { mkdtempSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { PushPayloadV2Schema } from "@selftune/telemetry-contract/schemas"; +import { buildV2PushPayload } from "../../cli/selftune/alpha-upload/build-payloads.js"; +import { + generateEvidenceId, + stageCanonicalRecords, +} from "../../cli/selftune/alpha-upload/stage-canonical.js"; +import { ALL_DDL, MIGRATIONS, POST_MIGRATION_INDEXES } from "../../cli/selftune/localdb/schema.js"; + +// -- Test helpers ------------------------------------------------------------- + +function createTestDb(): Database { + const db = new Database(":memory:"); + for (const ddl of ALL_DDL) db.run(ddl); + for (const m of MIGRATIONS) { + try { + db.run(m); + } catch (e: unknown) { + const msg = e instanceof Error ? 
e.message : String(e); + if (!msg.includes("duplicate column")) throw e; + } + } + for (const idx of POST_MIGRATION_INDEXES) { + db.run(idx); + } + return db; +} + +function createTempDir(): string { + return mkdtempSync(join(tmpdir(), "staging-test-")); +} + +function makeCanonicalSessionRecord(sessionId: string, overrides: Record = {}) { + return { + record_kind: "session", + schema_version: "2.0", + normalizer_version: "1.0.0", + normalized_at: "2026-03-18T10:00:00.000Z", + platform: "claude_code", + capture_mode: "replay", + source_session_kind: "interactive", + raw_source_ref: { path: "/some/transcript.jsonl" }, + session_id: sessionId, + started_at: "2026-03-18T09:00:00.000Z", + ended_at: "2026-03-18T09:30:00.000Z", + model: "opus", + completion_status: "completed", + ...overrides, + }; +} + +function makeCanonicalPromptRecord( + promptId: string, + sessionId: string, + overrides: Record = {}, +) { + return { + record_kind: "prompt", + schema_version: "2.0", + normalizer_version: "1.0.0", + normalized_at: "2026-03-18T10:00:00.000Z", + platform: "claude_code", + capture_mode: "replay", + source_session_kind: "interactive", + raw_source_ref: {}, + session_id: sessionId, + prompt_id: promptId, + occurred_at: "2026-03-18T09:01:00.000Z", + prompt_text: "improve my skills", + prompt_kind: "user", + is_actionable: true, + prompt_index: 0, + ...overrides, + }; +} + +function makeCanonicalInvocationRecord( + invId: string, + sessionId: string, + overrides: Record = {}, +) { + return { + record_kind: "skill_invocation", + schema_version: "2.0", + normalizer_version: "1.0.0", + normalized_at: "2026-03-18T10:00:00.000Z", + platform: "claude_code", + capture_mode: "replay", + source_session_kind: "interactive", + raw_source_ref: {}, + session_id: sessionId, + skill_invocation_id: invId, + occurred_at: "2026-03-18T09:02:00.000Z", + skill_name: "selftune", + invocation_mode: "implicit", + triggered: true, + confidence: 0.95, + ...overrides, + }; +} + +function 
makeCanonicalExecutionFactRecord( + sessionId: string, + overrides: Record = {}, +) { + return { + record_kind: "execution_fact", + schema_version: "2.0", + normalizer_version: "1.0.0", + normalized_at: "2026-03-18T10:00:00.000Z", + platform: "claude_code", + capture_mode: "replay", + source_session_kind: "interactive", + raw_source_ref: {}, + session_id: sessionId, + execution_fact_id: + overrides.execution_fact_id ?? `${sessionId}:2026-03-18T09:03:00.000Z:no-prompt`, + occurred_at: "2026-03-18T09:03:00.000Z", + tool_calls_json: { Read: 3, Edit: 2 }, + total_tool_calls: 5, + assistant_turns: 3, + errors_encountered: 0, + ...overrides, + }; +} + +function writeCanonicalJsonl(dir: string, records: unknown[]): string { + const logPath = join(dir, "canonical_telemetry_log.jsonl"); + const content = + records.map((r) => JSON.stringify(r)).join("\n") + (records.length > 0 ? "\n" : ""); + writeFileSync(logPath, content, "utf-8"); + return logPath; +} + +function insertEvolutionEvidence( + db: Database, + overrides: Partial<{ + timestamp: string; + proposal_id: string; + skill_name: string; + skill_path: string; + target: string; + stage: string; + rationale: string; + confidence: number; + details: string; + original_text: string; + proposed_text: string; + }> = {}, +): void { + const e = { + timestamp: overrides.timestamp ?? "2026-03-18T10:10:00Z", + proposal_id: overrides.proposal_id ?? `prop-${Math.random().toString(36).slice(2)}`, + skill_name: overrides.skill_name ?? "selftune", + skill_path: overrides.skill_path ?? "/path/to/SKILL.md", + target: overrides.target ?? "description", + stage: overrides.stage ?? "deployed", + rationale: overrides.rationale ?? "improved routing accuracy", + confidence: overrides.confidence ?? 0.85, + details: overrides.details ?? "pass rate improved", + original_text: overrides.original_text ?? "old description", + proposed_text: overrides.proposed_text ?? 
"new description", + }; + db.run( + `INSERT INTO evolution_evidence (timestamp, proposal_id, skill_name, skill_path, target, stage, rationale, confidence, details, original_text, proposed_text) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + [ + e.timestamp, + e.proposal_id, + e.skill_name, + e.skill_path, + e.target, + e.stage, + e.rationale, + e.confidence, + e.details, + e.original_text, + e.proposed_text, + ], + ); +} + +// -- Tests -------------------------------------------------------------------- + +describe("stageCanonicalRecords", () => { + let db: Database; + let tempDir: string; + + beforeEach(() => { + db = createTestDb(); + tempDir = createTempDir(); + }); + + afterEach(() => { + db.close(); + rmSync(tempDir, { recursive: true, force: true }); + }); + + test("stages canonical records from JSONL into staging table", () => { + const logPath = writeCanonicalJsonl(tempDir, [ + makeCanonicalSessionRecord("sess-1"), + makeCanonicalPromptRecord("p-1", "sess-1"), + makeCanonicalInvocationRecord("inv-1", "sess-1"), + makeCanonicalExecutionFactRecord("sess-1"), + ]); + + const count = stageCanonicalRecords(db, logPath); + expect(count).toBe(4); + + // Verify they're in the staging table + const rows = db + .query("SELECT * FROM canonical_upload_staging ORDER BY local_seq") + .all() as Array<{ + local_seq: number; + record_kind: string; + record_id: string; + record_json: string; + session_id: string | null; + }>; + expect(rows).toHaveLength(4); + expect(rows[0].record_kind).toBe("session"); + expect(rows[0].record_id).toBe("sess-1"); + expect(rows[1].record_kind).toBe("prompt"); + expect(rows[1].record_id).toBe("p-1"); + expect(rows[2].record_kind).toBe("skill_invocation"); + expect(rows[2].record_id).toBe("inv-1"); + expect(rows[3].record_kind).toBe("execution_fact"); + }); + + test("dedup -- staging same records twice does not create duplicates", () => { + const logPath = writeCanonicalJsonl(tempDir, [ + makeCanonicalSessionRecord("sess-1"), + 
makeCanonicalPromptRecord("p-1", "sess-1"), + ]); + + const first = stageCanonicalRecords(db, logPath); + expect(first).toBe(2); + + const second = stageCanonicalRecords(db, logPath); + expect(second).toBe(0); // no new records + + const total = db.query("SELECT COUNT(*) as cnt FROM canonical_upload_staging").get() as { + cnt: number; + }; + expect(total.cnt).toBe(2); + }); + + test("stages evolution evidence from SQLite", () => { + // No canonical JSONL records + const logPath = writeCanonicalJsonl(tempDir, []); + + // Insert evolution evidence into SQLite + insertEvolutionEvidence(db, { + proposal_id: "prop-1", + skill_name: "selftune", + stage: "deployed", + timestamp: "2026-03-18T10:10:00Z", + }); + + const count = stageCanonicalRecords(db, logPath); + expect(count).toBe(1); + + const rows = db.query("SELECT * FROM canonical_upload_staging").all() as Array<{ + record_kind: string; + record_id: string; + }>; + expect(rows).toHaveLength(1); + expect(rows[0].record_kind).toBe("evolution_evidence"); + // record_id is now the deterministic evidence_id (ev_ prefix + hash) + expect(rows[0].record_id).toStartWith("ev_"); + }); + + test("stages evolution evidence with deterministic evidence_id", () => { + const logPath = writeCanonicalJsonl(tempDir, []); + + insertEvolutionEvidence(db, { + proposal_id: "prop-ev-id", + skill_name: "selftune", + stage: "deployed", + timestamp: "2026-03-18T10:10:00Z", + }); + + stageCanonicalRecords(db, logPath); + + const rows = db + .query( + "SELECT record_json, record_id FROM canonical_upload_staging WHERE record_kind = 'evolution_evidence'", + ) + .all() as Array<{ + record_json: string; + record_id: string; + }>; + expect(rows).toHaveLength(1); + + const parsed = JSON.parse(rows[0].record_json); + // evidence_id must be present and start with ev_ + expect(parsed.evidence_id).toBeDefined(); + expect(typeof parsed.evidence_id).toBe("string"); + expect(parsed.evidence_id).toStartWith("ev_"); + + // record_id in staging table should be 
the evidence_id + expect(rows[0].record_id).toBe(parsed.evidence_id); + }); + + test("evidence_id is deterministic -- same evidence produces same ID", () => { + const record1 = { + proposal_id: "prop-det", + stage: "validated", + skill_name: "Research", + timestamp: "2026-03-18T10:15:00Z", + }; + + const id1 = generateEvidenceId(record1); + const id2 = generateEvidenceId(record1); + + expect(id1).toBe(id2); + expect(id1).toStartWith("ev_"); + }); + + test("evidence_id differs for same proposal+stage at different timestamps", () => { + const record1 = { + proposal_id: "prop-multi", + stage: "validated", + skill_name: "Research", + timestamp: "2026-03-18T10:15:00Z", + }; + const record2 = { + ...record1, + timestamp: "2026-03-18T11:00:00Z", + }; + + const id1 = generateEvidenceId(record1); + const id2 = generateEvidenceId(record2); + + expect(id1).not.toBe(id2); + }); + + test("evidence_id handles null proposal_id gracefully", () => { + const record = { + proposal_id: null, + stage: "proposed", + skill_name: "selftune", + timestamp: "2026-03-18T10:00:00Z", + }; + + const id = generateEvidenceId(record); + expect(id).toStartWith("ev_"); + expect(id.length).toBeGreaterThan(3); + }); + + test("preserves full canonical record JSON losslessly", () => { + const session = makeCanonicalSessionRecord("sess-lossless", { + raw_source_ref: { path: "/transcripts/abc.jsonl", line: 42 }, + capture_mode: "hook", + normalizer_version: "2.5.0", + }); + const logPath = writeCanonicalJsonl(tempDir, [session]); + + stageCanonicalRecords(db, logPath); + + const row = db + .query("SELECT record_json FROM canonical_upload_staging WHERE record_id = 'sess-lossless'") + .get() as { record_json: string }; + const parsed = JSON.parse(row.record_json); + + // These fields should be preserved exactly as-is from the canonical log + expect(parsed.raw_source_ref).toEqual({ path: "/transcripts/abc.jsonl", line: 42 }); + expect(parsed.capture_mode).toBe("hook"); + 
expect(parsed.normalizer_version).toBe("2.5.0"); + expect(parsed.schema_version).toBe("2.0"); + }); + + test("uses execution_fact_id as record_id for execution facts", () => { + const fact = makeCanonicalExecutionFactRecord("sess-efid", { + execution_fact_id: "ef-custom-123", + }); + + const logPath = writeCanonicalJsonl(tempDir, [fact]); + stageCanonicalRecords(db, logPath); + + const row = db + .query("SELECT record_id FROM canonical_upload_staging WHERE record_kind = 'execution_fact'") + .get() as { record_id: string }; + expect(row.record_id).toBe("ef-custom-123"); + }); + + test("uses execution_fact_id directly as record_id (no fallback format)", () => { + const fact = makeCanonicalExecutionFactRecord("sess-det", { + execution_fact_id: "ef-explicit-id", + }); + + const logPath = writeCanonicalJsonl(tempDir, [fact]); + stageCanonicalRecords(db, logPath); + + const row = db + .query("SELECT record_id FROM canonical_upload_staging WHERE record_kind = 'execution_fact'") + .get() as { record_id: string }; + expect(row.record_id).toBe("ef-explicit-id"); + }); + + test("injects deterministic execution_fact_id when missing from record", () => { + // Create a record WITHOUT execution_fact_id to simulate older canonical logs + const factWithoutId = { + record_kind: "execution_fact", + schema_version: "2.0", + normalizer_version: "1.0.0", + normalized_at: "2026-03-18T10:00:00.000Z", + platform: "claude_code", + capture_mode: "replay", + source_session_kind: "interactive", + raw_source_ref: {}, + session_id: "sess-no-efid", + occurred_at: "2026-03-18T09:03:00.000Z", + tool_calls_json: { Read: 1 }, + total_tool_calls: 1, + assistant_turns: 1, + errors_encountered: 0, + // NOTE: no execution_fact_id field at all + }; + + const logPath = writeCanonicalJsonl(tempDir, [factWithoutId]); + stageCanonicalRecords(db, logPath); + + const row = db + .query( + "SELECT record_json FROM canonical_upload_staging WHERE record_kind = 'execution_fact'", + ) + .get() as { record_json: 
string }; + const parsed = JSON.parse(row.record_json); + + // Must have execution_fact_id injected + expect(parsed.execution_fact_id).toBeDefined(); + expect(typeof parsed.execution_fact_id).toBe("string"); + expect(parsed.execution_fact_id).toStartWith("ef_"); + }); + + test("generated execution_fact_id is deterministic (same inputs produce same ID)", () => { + // Two identical records should produce the same execution_fact_id + const factWithoutId = { + record_kind: "execution_fact", + schema_version: "2.0", + normalizer_version: "1.0.0", + normalized_at: "2026-03-18T10:00:00.000Z", + platform: "claude_code", + capture_mode: "replay", + source_session_kind: "interactive", + raw_source_ref: {}, + session_id: "sess-deterministic", + occurred_at: "2026-03-18T09:05:00.000Z", + prompt_id: "p-det-1", + tool_calls_json: { Read: 2 }, + total_tool_calls: 2, + assistant_turns: 1, + errors_encountered: 0, + }; + + // Stage once + const logPath1 = writeCanonicalJsonl(tempDir, [factWithoutId]); + stageCanonicalRecords(db, logPath1); + + const row1 = db + .query( + "SELECT record_json FROM canonical_upload_staging WHERE record_kind = 'execution_fact'", + ) + .get() as { record_json: string }; + const id1 = JSON.parse(row1.record_json).execution_fact_id; + + // Stage again with a fresh DB -- same record should produce same ID + const db2 = createTestDb(); + stageCanonicalRecords(db2, logPath1); + + const row2 = db2 + .query( + "SELECT record_json FROM canonical_upload_staging WHERE record_kind = 'execution_fact'", + ) + .get() as { record_json: string }; + const id2 = JSON.parse(row2.record_json).execution_fact_id; + db2.close(); + + expect(id1).toBe(id2); + expect(id1).toStartWith("ef_"); + }); + + test("execution facts WITH execution_fact_id are left unchanged", () => { + const factWithId = makeCanonicalExecutionFactRecord("sess-has-id", { + execution_fact_id: "ef-already-set-999", + }); + + const logPath = writeCanonicalJsonl(tempDir, [factWithId]); + 
stageCanonicalRecords(db, logPath); + + const row = db + .query( + "SELECT record_json FROM canonical_upload_staging WHERE record_kind = 'execution_fact'", + ) + .get() as { record_json: string }; + const parsed = JSON.parse(row.record_json); + + // Must preserve the original execution_fact_id exactly + expect(parsed.execution_fact_id).toBe("ef-already-set-999"); + }); + + test("returns 0 when JSONL file does not exist", () => { + const count = stageCanonicalRecords(db, "/nonexistent/file.jsonl"); + expect(count).toBe(0); + }); + + test("stages orchestrate_runs from SQLite", () => { + const logPath = writeCanonicalJsonl(tempDir, []); + + // Insert an orchestrate run into SQLite + db.run( + `INSERT INTO orchestrate_runs (run_id, timestamp, elapsed_ms, dry_run, approval_mode, total_skills, evaluated, evolved, deployed, watched, skipped, skill_actions_json) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + [ + "orch-run-1", + "2026-03-18T11:00:00.000Z", + 5000, + 0, + "auto", + 3, + 2, + 1, + 1, + 1, + 0, + JSON.stringify([ + { skill: "selftune", action: "evolve", reason: "low pass rate", deployed: true }, + { skill: "commit", action: "watch", reason: "recently deployed" }, + ]), + ], + ); + + const count = stageCanonicalRecords(db, logPath); + expect(count).toBe(1); + + const rows = db.query("SELECT * FROM canonical_upload_staging").all() as Array<{ + record_kind: string; + record_id: string; + record_json: string; + }>; + expect(rows).toHaveLength(1); + expect(rows[0].record_kind).toBe("orchestrate_run"); + expect(rows[0].record_id).toBe("orch-run-1"); + + // Verify the staged JSON has correct types + const parsed = JSON.parse(rows[0].record_json); + expect(parsed.dry_run).toBe(false); // boolean, not integer + expect(parsed.skill_actions).toBeArray(); + expect(parsed.skill_actions).toHaveLength(2); + expect(parsed.skill_actions[0].skill).toBe("selftune"); + }); + + test("orchestrate_run dedup by run_id", () => { + const logPath = writeCanonicalJsonl(tempDir, []); + 
+ db.run( + `INSERT INTO orchestrate_runs (run_id, timestamp, elapsed_ms, dry_run, approval_mode, total_skills, evaluated, evolved, deployed, watched, skipped, skill_actions_json) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + ["orch-dup", "2026-03-18T11:00:00.000Z", 1000, 1, "review", 1, 1, 0, 0, 0, 1, "[]"], + ); + + const first = stageCanonicalRecords(db, logPath); + expect(first).toBe(1); + + const second = stageCanonicalRecords(db, logPath); + expect(second).toBe(0); + }); +}); + +describe("buildV2PushPayload (staging-based)", () => { + let db: Database; + let tempDir: string; + + beforeEach(() => { + db = createTestDb(); + tempDir = createTempDir(); + }); + + afterEach(() => { + db.close(); + rmSync(tempDir, { recursive: true, force: true }); + }); + + test("returns null when staging table is empty", () => { + const result = buildV2PushPayload(db); + expect(result).toBeNull(); + }); + + test("returns null when all records are past cursor", () => { + const logPath = writeCanonicalJsonl(tempDir, [makeCanonicalSessionRecord("sess-1")]); + stageCanonicalRecords(db, logPath); + + const result = buildV2PushPayload(db, 999999); + expect(result).toBeNull(); + }); + + test("builds payload from staged records", () => { + const logPath = writeCanonicalJsonl(tempDir, [ + makeCanonicalSessionRecord("sess-1"), + makeCanonicalPromptRecord("p-1", "sess-1"), + makeCanonicalInvocationRecord("inv-1", "sess-1"), + makeCanonicalExecutionFactRecord("sess-1"), + ]); + stageCanonicalRecords(db, logPath); + + const result = buildV2PushPayload(db); + expect(result).not.toBeNull(); + + expect(result).toBeDefined(); + const payload = result?.payload; + expect(payload.schema_version).toBe("2.0"); + expect(payload.push_id).toBeDefined(); + + const canonical = payload?.canonical as Record; + expect(canonical.sessions).toHaveLength(1); + expect(canonical.prompts).toHaveLength(1); + expect(canonical.skill_invocations).toHaveLength(1); + expect(canonical.execution_facts).toHaveLength(1); + 
}); + + test("returns correct lastSeq for cursor advancement", () => { + const logPath = writeCanonicalJsonl(tempDir, [ + makeCanonicalSessionRecord("sess-1"), + makeCanonicalSessionRecord("sess-2"), + ]); + stageCanonicalRecords(db, logPath); + + const first = buildV2PushPayload(db); + expect(first).not.toBeNull(); + expect(first).toBeDefined(); + expect(first?.lastSeq).toBeGreaterThan(0); + + // Second call with cursor from first should return null + const second = buildV2PushPayload(db, first?.lastSeq); + expect(second).toBeNull(); + }); + + test("respects limit parameter", () => { + const records = Array.from({ length: 10 }, (_, i) => + makeCanonicalSessionRecord(`sess-limit-${i}`), + ); + const logPath = writeCanonicalJsonl(tempDir, records); + stageCanonicalRecords(db, logPath); + + const result = buildV2PushPayload(db, undefined, 3); + expect(result).not.toBeNull(); + expect(result).toBeDefined(); + + const canonical = result?.payload.canonical as Record; + expect(canonical.sessions).toHaveLength(3); + }); + + test("includes evolution evidence in payload", () => { + const logPath = writeCanonicalJsonl(tempDir, []); + insertEvolutionEvidence(db, { + proposal_id: "prop-evo", + skill_name: "selftune", + stage: "deployed", + timestamp: "2026-03-18T10:10:00Z", + }); + stageCanonicalRecords(db, logPath); + + const result = buildV2PushPayload(db); + expect(result).not.toBeNull(); + expect(result).toBeDefined(); + + const canonical = result?.payload.canonical as Record; + expect(canonical.evolution_evidence).toHaveLength(1); + const ev = canonical.evolution_evidence[0] as Record; + expect(ev.skill_name).toBe("selftune"); + expect(ev.proposal_id).toBe("prop-evo"); + }); + + test("payload passes PushPayloadV2Schema validation", () => { + const logPath = writeCanonicalJsonl(tempDir, [ + makeCanonicalSessionRecord("sess-v"), + makeCanonicalPromptRecord("p-v", "sess-v"), + makeCanonicalInvocationRecord("inv-v", "sess-v"), + makeCanonicalExecutionFactRecord("sess-v", { 
execution_fact_id: "ef-v" }), + ]); + stageCanonicalRecords(db, logPath); + + insertEvolutionEvidence(db, { + proposal_id: "prop-v", + skill_name: "selftune", + stage: "deployed", + timestamp: "2026-03-18T10:10:00.000Z", + }); + // Re-stage to pick up evolution evidence + stageCanonicalRecords(db, logPath); + + const result = buildV2PushPayload(db); + expect(result).not.toBeNull(); + expect(result).toBeDefined(); + + const parsed = PushPayloadV2Schema.safeParse(result?.payload); + if (!parsed.success) { + console.error("Zod validation errors:", JSON.stringify(parsed.error.issues, null, 2)); + } + expect(parsed.success).toBe(true); + }); + + test("includes orchestrate_runs in payload from staging", () => { + const logPath = writeCanonicalJsonl(tempDir, [makeCanonicalSessionRecord("sess-orch")]); + + // Insert orchestrate run + db.run( + `INSERT INTO orchestrate_runs (run_id, timestamp, elapsed_ms, dry_run, approval_mode, total_skills, evaluated, evolved, deployed, watched, skipped, skill_actions_json) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)`, + [ + "orch-payload-1", + "2026-03-18T11:00:00.000Z", + 8000, + 0, + "auto", + 5, + 4, + 1, + 1, + 2, + 1, + JSON.stringify([ + { + skill: "selftune", + action: "evolve", + reason: "pass rate below threshold", + deployed: true, + }, + ]), + ], + ); + + stageCanonicalRecords(db, logPath); + const result = buildV2PushPayload(db); + expect(result).not.toBeNull(); + expect(result).toBeDefined(); + + const canonical = result?.payload.canonical as Record; + expect(canonical.orchestrate_runs).toBeDefined(); + expect(canonical.orchestrate_runs).toHaveLength(1); + + const run = canonical.orchestrate_runs[0] as Record; + expect(run.run_id).toBe("orch-payload-1"); + expect(run.dry_run).toBe(false); + expect(run.approval_mode).toBe("auto"); + expect(run.total_skills).toBe(5); + expect((run.skill_actions as unknown[]).length).toBe(1); + }); + + test("no hardcoded provenance fields -- canonical fields preserved from source", () => { + 
const session = makeCanonicalSessionRecord("sess-prov", { + capture_mode: "hook", + normalizer_version: "3.0.0", + raw_source_ref: { path: "/custom/path.jsonl", raw_id: "abc-123" }, + }); + const logPath = writeCanonicalJsonl(tempDir, [session]); + stageCanonicalRecords(db, logPath); + + const result = buildV2PushPayload(db); + expect(result).toBeDefined(); + const canonical = result?.payload.canonical as Record; + const s = canonical.sessions[0] as Record; + + // These should come from the original record, NOT be hardcoded + expect(s.capture_mode).toBe("hook"); + expect(s.normalizer_version).toBe("3.0.0"); + expect(s.raw_source_ref).toEqual({ path: "/custom/path.jsonl", raw_id: "abc-123" }); + }); +}); diff --git a/tests/alpha-upload/status.test.ts b/tests/alpha-upload/status.test.ts new file mode 100644 index 00000000..212b4801 --- /dev/null +++ b/tests/alpha-upload/status.test.ts @@ -0,0 +1,418 @@ +/** + * Tests for alpha upload status integration in `selftune status` + * and alpha-related doctor checks in observability. 
+ */ + +import { Database } from "bun:sqlite"; +import { afterEach, beforeEach, describe, expect, test } from "bun:test"; +import { + getLastUploadError, + getLastUploadSuccess, + getOldestPendingAge, +} from "../../cli/selftune/localdb/queries.js"; +import { ALL_DDL } from "../../cli/selftune/localdb/schema.js"; +import { checkAlphaQueueHealth } from "../../cli/selftune/observability.js"; +import { + type AlphaStatusInfo, + type CloudVerifyData, + fetchCloudVerify, + formatAlphaStatus, +} from "../../cli/selftune/status.js"; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +function createTestDb(): Database { + const db = new Database(":memory:"); + for (const ddl of ALL_DDL) { + db.run(ddl); + } + return db; +} + +function insertQueueItem( + db: Database, + opts: { + payload_type?: string; + status?: string; + created_at?: string; + updated_at?: string; + last_error?: string | null; + attempts?: number; + } = {}, +): void { + const now = new Date().toISOString(); + db.run( + `INSERT INTO upload_queue (payload_type, payload_json, status, attempts, created_at, updated_at, last_error) + VALUES (?, '{}', ?, ?, ?, ?, ?)`, + [ + opts.payload_type ?? "sessions", + opts.status ?? "pending", + opts.attempts ?? 0, + opts.created_at ?? now, + opts.updated_at ?? now, + opts.last_error ?? 
null, + ], + ); +} + +// --------------------------------------------------------------------------- +// Query helper tests +// --------------------------------------------------------------------------- + +describe("getLastUploadError", () => { + let db: Database; + beforeEach(() => { + db = createTestDb(); + }); + afterEach(() => { + db.close(); + }); + + test("returns null when no failed items exist", () => { + const result = getLastUploadError(db); + expect(result).toBeNull(); + }); + + test("returns most recent failed item error and timestamp", () => { + insertQueueItem(db, { + status: "failed", + last_error: "old error", + updated_at: "2025-01-01T00:00:00Z", + }); + insertQueueItem(db, { + status: "failed", + last_error: "newest error", + updated_at: "2025-01-02T00:00:00Z", + }); + insertQueueItem(db, { + status: "sent", + updated_at: "2025-01-03T00:00:00Z", + }); + + const result = getLastUploadError(db); + expect(result).not.toBeNull(); + expect(result?.last_error).toBe("newest error"); + expect(result?.updated_at).toBe("2025-01-02T00:00:00Z"); + }); +}); + +describe("getLastUploadSuccess", () => { + let db: Database; + beforeEach(() => { + db = createTestDb(); + }); + afterEach(() => { + db.close(); + }); + + test("returns null when no sent items exist", () => { + const result = getLastUploadSuccess(db); + expect(result).toBeNull(); + }); + + test("returns most recent sent item timestamp", () => { + insertQueueItem(db, { + status: "sent", + updated_at: "2025-01-01T00:00:00Z", + }); + insertQueueItem(db, { + status: "sent", + updated_at: "2025-01-02T00:00:00Z", + }); + + const result = getLastUploadSuccess(db); + expect(result).not.toBeNull(); + expect(result?.updated_at).toBe("2025-01-02T00:00:00Z"); + }); +}); + +describe("getOldestPendingAge", () => { + let db: Database; + beforeEach(() => { + db = createTestDb(); + }); + afterEach(() => { + db.close(); + }); + + test("returns null when no pending items exist", () => { + const result = 
getOldestPendingAge(db); + expect(result).toBeNull(); + }); + + test("returns age in seconds of oldest pending item", () => { + const twoHoursAgo = new Date(Date.now() - 2 * 3600 * 1000).toISOString(); + const oneHourAgo = new Date(Date.now() - 1 * 3600 * 1000).toISOString(); + + insertQueueItem(db, { status: "pending", created_at: twoHoursAgo }); + insertQueueItem(db, { status: "pending", created_at: oneHourAgo }); + + const age = getOldestPendingAge(db); + expect(age).not.toBeNull(); + // Should be approximately 7200 seconds (2 hours), allow some tolerance + expect(age).toBeGreaterThan(7100); + expect(age).toBeLessThan(7300); + }); + + test("ignores non-pending items", () => { + const longAgo = new Date(Date.now() - 24 * 3600 * 1000).toISOString(); + insertQueueItem(db, { status: "sent", created_at: longAgo }); + insertQueueItem(db, { status: "failed", created_at: longAgo }); + + const result = getOldestPendingAge(db); + expect(result).toBeNull(); + }); +}); + +// --------------------------------------------------------------------------- +// Doctor check tests +// --------------------------------------------------------------------------- + +describe("checkAlphaQueueHealth", () => { + let db: Database; + beforeEach(() => { + db = createTestDb(); + }); + afterEach(() => { + db.close(); + }); + + test("returns empty array when not enrolled", async () => { + const checks = await checkAlphaQueueHealth(db, false); + expect(checks).toHaveLength(0); + }); + + test("returns pass checks when queue is healthy", async () => { + const checks = await checkAlphaQueueHealth(db, true); + expect(checks.length).toBeGreaterThan(0); + expect(checks.every((c) => c.status === "pass")).toBe(true); + }); + + test("warns when pending items older than 1 hour (alpha_queue_stuck)", async () => { + const twoHoursAgo = new Date(Date.now() - 2 * 3600 * 1000).toISOString(); + insertQueueItem(db, { status: "pending", created_at: twoHoursAgo }); + + const checks = await checkAlphaQueueHealth(db, 
true); + const stuckCheck = checks.find((c) => c.name === "alpha_queue_stuck"); + expect(stuckCheck).toBeDefined(); + expect(stuckCheck?.status).toBe("warn"); + }); + + test("passes when pending items are recent", async () => { + const fiveMinutesAgo = new Date(Date.now() - 5 * 60 * 1000).toISOString(); + insertQueueItem(db, { status: "pending", created_at: fiveMinutesAgo }); + + const checks = await checkAlphaQueueHealth(db, true); + const stuckCheck = checks.find((c) => c.name === "alpha_queue_stuck"); + expect(stuckCheck).toBeDefined(); + expect(stuckCheck?.status).toBe("pass"); + }); + + test("warns when failed count exceeds 50 (alpha_queue_failures)", async () => { + for (let i = 0; i < 51; i++) { + insertQueueItem(db, { status: "failed", last_error: `error ${i}` }); + } + + const checks = await checkAlphaQueueHealth(db, true); + const failCheck = checks.find((c) => c.name === "alpha_queue_failures"); + expect(failCheck).toBeDefined(); + expect(failCheck?.status).toBe("warn"); + }); + + test("passes when failed count is under threshold", async () => { + for (let i = 0; i < 10; i++) { + insertQueueItem(db, { status: "failed", last_error: `error ${i}` }); + } + + const checks = await checkAlphaQueueHealth(db, true); + const failCheck = checks.find((c) => c.name === "alpha_queue_failures"); + expect(failCheck).toBeDefined(); + expect(failCheck?.status).toBe("pass"); + }); +}); + +// --------------------------------------------------------------------------- +// Status formatting tests +// --------------------------------------------------------------------------- + +describe("formatAlphaStatus", () => { + test("returns 'not enrolled' line when not enrolled", () => { + const output = formatAlphaStatus(null); + expect(output).toContain("not enrolled"); + expect(output).toContain("Next command"); + expect(output).toContain( + "selftune init --alpha --alpha-email --alpha-key ", + ); + }); + + test("shows enrolled status with queue stats", () => { + const info: 
AlphaStatusInfo = { + enrolled: true, + linkState: "ready", + stats: { pending: 5, sending: 1, sent: 100, failed: 2 }, + lastError: { last_error: "network timeout", updated_at: "2025-01-15T10:00:00Z" }, + lastSuccess: { updated_at: "2025-01-15T09:00:00Z" }, + }; + const output = formatAlphaStatus(info); + expect(output).toContain("enrolled"); + expect(output).toContain("5"); // pending + expect(output).toContain("2"); // failed + expect(output).toContain("100"); // sent + expect(output).toContain("network timeout"); + }); + + test("shows enrolled status with no errors", () => { + const info: AlphaStatusInfo = { + enrolled: true, + linkState: "ready", + stats: { pending: 0, sending: 0, sent: 50, failed: 0 }, + lastError: null, + lastSuccess: { updated_at: "2025-01-15T09:00:00Z" }, + }; + const output = formatAlphaStatus(info); + expect(output).toContain("enrolled"); + expect(output).not.toContain("error"); + }); + + test("shows enrolled status with no successful uploads yet", () => { + const info: AlphaStatusInfo = { + enrolled: true, + linkState: "ready", + stats: { pending: 3, sending: 0, sent: 0, failed: 0 }, + lastError: null, + lastSuccess: null, + }; + const output = formatAlphaStatus(info); + expect(output).toContain("enrolled"); + expect(output).toContain("3"); // pending + }); + + test("shows next command when enrollment is missing a credential", () => { + const info: AlphaStatusInfo = { + enrolled: true, + linkState: "enrolled_no_credential", + stats: { pending: 0, sending: 0, sent: 0, failed: 0 }, + lastError: null, + lastSuccess: null, + }; + + const output = formatAlphaStatus(info); + expect(output).toContain("Next command"); + expect(output).toContain("--alpha-key "); + }); + + test("shows linked but not enrolled state when cloud identity exists", () => { + const info: AlphaStatusInfo = { + enrolled: false, + linkState: "linked_not_enrolled", + stats: { pending: 0, sending: 0, sent: 0, failed: 0 }, + lastError: null, + lastSuccess: null, + }; + + const 
output = formatAlphaStatus(info); + expect(output).toContain("Status: not enrolled"); + expect(output).toContain("Cloud link: linked (not enrolled)"); + expect(output).toContain("Next command"); + }); + + test("shows cloud verification data when available", () => { + const cloudVerify: CloudVerifyData = { + enrolled: true, + last_push_at: "2025-03-20T14:25:00Z", + key_prefix: "st_live_abc", + key_created_at: "2025-01-01T00:00:00Z", + total_pushes: 12, + last_push_status: "success", + }; + const info: AlphaStatusInfo = { + enrolled: true, + linkState: "ready", + stats: { pending: 0, sending: 0, sent: 47, failed: 0 }, + lastError: null, + lastSuccess: { updated_at: "2025-03-20T14:25:00Z" }, + cloudVerify, + }; + const output = formatAlphaStatus(info); + expect(output).toContain("Cloud verified:"); + expect(output).toContain("yes"); + expect(output).toContain("Total pushes:"); + expect(output).toContain("12"); + expect(output).toContain("Last push:"); + }); + + test("omits cloud verification lines when cloudVerify is null", () => { + const info: AlphaStatusInfo = { + enrolled: true, + linkState: "ready", + stats: { pending: 0, sending: 0, sent: 50, failed: 0 }, + lastError: null, + lastSuccess: { updated_at: "2025-01-15T09:00:00Z" }, + cloudVerify: null, + }; + const output = formatAlphaStatus(info); + expect(output).toContain("enrolled"); + expect(output).not.toContain("Cloud verified:"); + expect(output).not.toContain("Total pushes:"); + }); + + test("omits cloud verification lines when cloudVerify is undefined", () => { + const info: AlphaStatusInfo = { + enrolled: true, + linkState: "ready", + stats: { pending: 0, sending: 0, sent: 50, failed: 0 }, + lastError: null, + lastSuccess: { updated_at: "2025-01-15T09:00:00Z" }, + }; + const output = formatAlphaStatus(info); + expect(output).not.toContain("Cloud verified:"); + }); + + test("shows cloud verification without last_push_at when null", () => { + const cloudVerify: CloudVerifyData = { + enrolled: true, + 
last_push_at: null, + key_prefix: "st_live_abc", + key_created_at: "2025-01-01T00:00:00Z", + total_pushes: 0, + last_push_status: null, + }; + const info: AlphaStatusInfo = { + enrolled: true, + linkState: "ready", + stats: { pending: 0, sending: 0, sent: 0, failed: 0 }, + lastError: null, + lastSuccess: null, + cloudVerify, + }; + const output = formatAlphaStatus(info); + expect(output).toContain("Cloud verified:"); + expect(output).toContain("Total pushes:"); + expect(output).toContain("0"); + expect(output).not.toContain("Last push:"); + }); +}); + +// --------------------------------------------------------------------------- +// fetchCloudVerify tests +// --------------------------------------------------------------------------- + +describe("fetchCloudVerify", () => { + test("returns null when endpoint is unreachable", async () => { + // Point to a non-existent local server to simulate network failure + const originalEnv = process.env.SELFTUNE_ALPHA_ENDPOINT; + process.env.SELFTUNE_ALPHA_ENDPOINT = "http://127.0.0.1:19999/api/v1/push"; + try { + const result = await fetchCloudVerify("st_live_test_key"); + expect(result).toBeNull(); + } finally { + if (originalEnv === undefined) { + delete process.env.SELFTUNE_ALPHA_ENDPOINT; + } else { + process.env.SELFTUNE_ALPHA_ENDPOINT = originalEnv; + } + } + }); +}); diff --git a/tests/autonomy-proof.test.ts b/tests/autonomy-proof.test.ts index 1da52c44..03c78708 100644 --- a/tests/autonomy-proof.test.ts +++ b/tests/autonomy-proof.test.ts @@ -42,14 +42,12 @@ import type { SyncResult, SyncStepResult } from "../cli/selftune/sync.js"; import type { DoctorResult, EvalEntry, - EvolutionAuditEntry, EvolutionProposal, FailurePattern, QueryLogRecord, SessionTelemetryRecord, SkillUsageRecord, } from "../cli/selftune/types.js"; -import { readJsonl } from "../cli/selftune/utils/jsonl.js"; // --------------------------------------------------------------------------- // Shared fixtures @@ -264,7 +262,6 @@ describe("autonomy proof: 
autonomous deploy end-to-end", () => { const skillPath = join(skillDir, "SKILL.md"); writeFileSync(skillPath, SKILL_MD_ORIGINAL, "utf-8"); - const auditLogPath = join(tmpDir, "evolution_audit_log.jsonl"); const proposal = makeProposal(skillPath); const validation = makeValidation(); diff --git a/tests/e2e/alpha-smoke.test.ts b/tests/e2e/alpha-smoke.test.ts new file mode 100644 index 00000000..1058735f --- /dev/null +++ b/tests/e2e/alpha-smoke.test.ts @@ -0,0 +1,288 @@ +/** + * End-to-end alpha smoke test. + * + * Requires: gwangju-v1 running locally with DEV_AUTH=1 + * Run: SELFTUNE_ALPHA_ENDPOINT=http://localhost:8080/api/v1/push bun test tests/e2e/alpha-smoke.test.ts + * + * This test exercises the real device-code bootstrap path: + * 1. Spawn `selftune init --alpha` in a subprocess + * 2. Race: approve the device code via test-only internal endpoint + * 3. CLI poll resolves -> config written with cloud_user_id, api_key + * 4. Run `selftune alpha upload` with the received credentials + * 5. Verify data landed in the cloud + */ + +import { afterEach, beforeEach, describe, expect, test } from "bun:test"; +import { randomUUID } from "node:crypto"; +import { existsSync, mkdtempSync, readFileSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { dirname, join, resolve } from "node:path"; + +// --------------------------------------------------------------------------- +// Gate: skip when cloud is not available +// --------------------------------------------------------------------------- + +const CLOUD_AVAILABLE = !!process.env.SELFTUNE_ALPHA_ENDPOINT; +const describeE2E = CLOUD_AVAILABLE ? describe : describe.skip; + +// Resolve CLI entry point (relative to this test file) +const CLI_ENTRY = resolve(dirname(import.meta.path), "..", "..", "cli", "selftune", "index.ts"); + +describeE2E("Alpha E2E Smoke", () => { + const baseUrl = (process.env.SELFTUNE_ALPHA_ENDPOINT ?? 
"").replace(/\/push$/, ""); + let tempHome: string; + let configDir: string; + let configPath: string; + + beforeEach(() => { + tempHome = mkdtempSync(join(tmpdir(), "selftune-e2e-")); + configDir = join(tempHome, ".selftune"); + configPath = join(configDir, "config.json"); + }); + + afterEach(() => { + rmSync(tempHome, { recursive: true, force: true }); + }); + + test( + "full flow: init --alpha device-code path -> push -> verify", + async () => { + // ----------------------------------------------------------------- + // Step 1: Spawn `selftune init --alpha` in a subprocess + // ----------------------------------------------------------------- + const initProc = Bun.spawn( + [ + "bun", + "run", + CLI_ENTRY, + "init", + "--alpha", + "--agent", + "claude_code", + "--cli-path", + CLI_ENTRY, + "--force", + ], + { + env: { + ...process.env, + HOME: tempHome, + SELFTUNE_CONFIG_DIR: configDir, + SELFTUNE_ALPHA_ENDPOINT: process.env.SELFTUNE_ALPHA_ENDPOINT, + // Prevent auto-update check from interfering + SELFTUNE_SKIP_AUTO_UPDATE: "1", + }, + stdout: "pipe", + stderr: "pipe", + }, + ); + + // ----------------------------------------------------------------- + // Step 2: Parse the user_code from subprocess stdout + // + // The init process emits JSON lines to stdout. We need the one with + // code: "device_code_issued" which contains the user_code. 
+ // ----------------------------------------------------------------- + let userCode: string | null = null; + const stdoutChunks: string[] = []; + const stderrChunks: string[] = []; + + // Read stdout in a streaming fashion to catch the device code early + const stdoutReader = initProc.stdout.getReader(); + const decoder = new TextDecoder(); + + const readUntilUserCode = async (): Promise => { + const deadline = Date.now() + 15_000; // 15s to get the device code + let accumulated = ""; + + while (Date.now() < deadline) { + const { done, value } = await Promise.race([ + stdoutReader.read(), + new Promise<{ done: true; value: undefined }>((resolve) => + setTimeout(() => resolve({ done: true, value: undefined }), 1000), + ), + ]); + + if (value) { + const chunk = decoder.decode(value, { stream: true }); + accumulated += chunk; + stdoutChunks.push(chunk); + + // Try to parse JSON lines from accumulated output + const lines = accumulated.split("\n"); + for (const line of lines) { + const trimmed = line.trim(); + if (!trimmed) continue; + try { + const parsed = JSON.parse(trimmed); + if (parsed.code === "device_code_issued" && parsed.user_code) { + return parsed.user_code; + } + } catch { + // Not valid JSON, skip + } + } + } + + if (done) break; + } + + throw new Error(`Timed out waiting for device_code_issued. 
Stdout so far: ${accumulated}`); + }; + + userCode = await readUntilUserCode(); + expect(userCode).toBeTruthy(); + expect(typeof userCode).toBe("string"); + + // ----------------------------------------------------------------- + // Step 3: Approve via test-only internal endpoint + // ----------------------------------------------------------------- + const approveResponse = await fetch(`${baseUrl}/internal/test/approve-device`, { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ user_code: userCode }), + }); + + expect(approveResponse.ok).toBe(true); + + // ----------------------------------------------------------------- + // Step 4: Wait for init to complete (poll should resolve) + // ----------------------------------------------------------------- + // Continue reading stdout until process exits + const drainStdout = async () => { + try { + while (true) { + const { done, value } = await stdoutReader.read(); + if (value) stdoutChunks.push(decoder.decode(value, { stream: true })); + if (done) break; + } + } catch { + // Reader closed + } + }; + + // Also capture stderr + const drainStderr = async () => { + try { + const reader = initProc.stderr.getReader(); + while (true) { + const { done, value } = await reader.read(); + if (value) stderrChunks.push(decoder.decode(value, { stream: true })); + if (done) break; + } + } catch { + // Reader closed + } + }; + + await Promise.all([drainStdout(), drainStderr()]); + + const exitCode = await initProc.exited; + const fullStdout = stdoutChunks.join(""); + const fullStderr = stderrChunks.join(""); + + if (exitCode !== 0) { + console.error("init stdout:", fullStdout); + console.error("init stderr:", fullStderr); + } + expect(exitCode).toBe(0); + + // ----------------------------------------------------------------- + // Step 5: Verify config file was written correctly + // ----------------------------------------------------------------- + 
expect(existsSync(configPath)).toBe(true); + + const configRaw = readFileSync(configPath, "utf-8"); + const config = JSON.parse(configRaw); + + expect(config.alpha).toBeDefined(); + expect(config.alpha.enrolled).toBe(true); + expect(config.alpha.cloud_user_id).toBeTruthy(); + expect(typeof config.alpha.cloud_user_id).toBe("string"); + expect(config.alpha.api_key).toBeTruthy(); + expect( + config.alpha.api_key.startsWith("st_live_") || config.alpha.api_key.startsWith("st_test_"), + ).toBe(true); + + const apiKey = config.alpha.api_key as string; + // ----------------------------------------------------------------- + // Step 6: Verify data can be pushed using the received credentials + // ----------------------------------------------------------------- + const pushPayload = { + schema_version: "2.0", + client_version: "0.0.0-e2e", + push_id: randomUUID(), + normalizer_version: "0.0.0-e2e", + canonical: { + sessions: [ + { + record_kind: "session", + schema_version: "2.0", + normalizer_version: "0.0.0-e2e", + normalized_at: new Date().toISOString(), + platform: "claude_code", + capture_mode: "hook", + raw_source_ref: { path: "/tmp/e2e/session.jsonl" }, + source_session_kind: "interactive", + session_id: randomUUID(), + agent_type: "claude", + agent_cli: "claude-code", + started_at: new Date().toISOString(), + ended_at: new Date().toISOString(), + completion_status: "completed", + }, + ], + prompts: [], + skill_invocations: [], + execution_facts: [], + normalization_runs: [ + { + record_kind: "normalization_run", + schema_version: "2.0", + normalizer_version: "0.0.0-e2e", + normalized_at: new Date().toISOString(), + platform: "claude_code", + capture_mode: "hook", + raw_source_ref: {}, + run_id: randomUUID(), + run_at: new Date().toISOString(), + raw_records_seen: 1, + canonical_records_written: 1, + repair_applied: false, + }, + ], + evolution_evidence: [], + orchestrate_runs: [], + }, + }; + + const pushResponse = await fetch(`${baseUrl}/push`, { + method: "POST", 
+ headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${apiKey}`, + }, + body: JSON.stringify(pushPayload), + }); + + // Accept 200 or 201 as success + expect(pushResponse.status).toBeGreaterThanOrEqual(200); + expect(pushResponse.status).toBeLessThan(300); + + // ----------------------------------------------------------------- + // Step 7: Verify enrollment via API + // ----------------------------------------------------------------- + const verifyResponse = await fetch(`${baseUrl}/alpha/verify`, { + method: "GET", + headers: { + Authorization: `Bearer ${apiKey}`, + }, + }); + + expect(verifyResponse.ok).toBe(true); + const verifyBody = (await verifyResponse.json()) as Record; + expect(verifyBody.enrolled).toBe(true); + }, + { timeout: 30_000 }, + ); +}); diff --git a/tests/eval/synthetic-evals.test.ts b/tests/eval/synthetic-evals.test.ts index 22076bd3..13f5d89a 100644 --- a/tests/eval/synthetic-evals.test.ts +++ b/tests/eval/synthetic-evals.test.ts @@ -32,6 +32,43 @@ describe("buildSyntheticPrompt", () => { expect(system).toContain("Implicit"); expect(system).toContain("Contextual"); }); + + test("includes real examples when provided", () => { + const realExamples = { + positive: ["make me a slide deck", "create presentation for Q4"], + negative: ["what is the weather?", "fix the login bug"], + }; + const { user } = buildSyntheticPrompt("content", "pptx", 10, 5, realExamples); + expect(user).toContain("Real user queries for style and phrasing reference:"); + expect(user).toContain("Queries that triggered this skill:"); + expect(user).toContain('"make me a slide deck"'); + expect(user).toContain('"create presentation for Q4"'); + expect(user).toContain("Queries that did NOT trigger (general queries):"); + expect(user).toContain('"what is the weather?"'); + expect(user).toContain("Generate queries that match this natural phrasing style."); + }); + + test("omits real examples section when not provided", () => { + const { user } = 
buildSyntheticPrompt("content", "pptx", 10, 5); + expect(user).not.toContain("Real user queries"); + }); + + test("omits real examples section when arrays are empty", () => { + const { user } = buildSyntheticPrompt("content", "pptx", 10, 5, { + positive: [], + negative: [], + }); + expect(user).not.toContain("Real user queries"); + }); + + test("includes only positive section when negatives are empty", () => { + const { user } = buildSyntheticPrompt("content", "pptx", 10, 5, { + positive: ["make slides"], + negative: [], + }); + expect(user).toContain("Queries that triggered this skill:"); + expect(user).not.toContain("Queries that did NOT trigger"); + }); }); // --------------------------------------------------------------------------- diff --git a/tests/evolution/constitutional.test.ts b/tests/evolution/constitutional.test.ts new file mode 100644 index 00000000..166b25a5 --- /dev/null +++ b/tests/evolution/constitutional.test.ts @@ -0,0 +1,173 @@ +import { describe, expect, test } from "bun:test"; +import { checkConstitution } from "../../cli/selftune/evolution/constitutional.js"; + +// --------------------------------------------------------------------------- +// Principle 1: Size constraint +// --------------------------------------------------------------------------- + +describe("Principle 1 — Size constraint", () => { + const original = "A skill that helps with testing and validation of code quality"; + + test("passes when within limits", () => { + const proposed = "A skill that helps with testing, validation, and code review"; + const result = checkConstitution(proposed, original, "test-skill"); + expect(result.passed).toBe(true); + expect(result.violations).toHaveLength(0); + }); + + test("fails when >8192 chars", () => { + const proposed = "A".repeat(8193); + const result = checkConstitution(proposed, original, "test-skill"); + expect(result.passed).toBe(false); + expect(result.violations.some((v) => v.includes("8192"))).toBe(true); + }); + + 
test("fails when >3x word count of original", () => { + // Original has ~10 words, so >30 words should fail + const words = Array.from({ length: 35 }, (_, i) => `word${i}`).join(" "); + const result = checkConstitution(words, original, "test-skill"); + expect(result.passed).toBe(false); + expect(result.violations.some((v) => v.includes("3.0x"))).toBe(true); + }); + + test("fails when <0.3x word count of original", () => { + // Original has ~10 words, so <3 words should fail + const proposed = "Testing skill"; + const result = checkConstitution(proposed, original, "test-skill"); + expect(result.passed).toBe(false); + expect(result.violations.some((v) => v.includes("0.3x"))).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// Principle 2: No XML injection +// --------------------------------------------------------------------------- + +describe("Principle 2 — No XML injection", () => { + const original = "A skill for building presentations"; + + test("passes clean text", () => { + const proposed = "A skill for building presentations and slide decks"; + const result = checkConstitution(proposed, original, "test-skill"); + expect(result.passed).toBe(true); + }); + + test("fails with script tag", () => { + const proposed = "A skill for presentations"; + const result = checkConstitution(proposed, original, "test-skill"); + expect(result.passed).toBe(false); + expect(result.violations.some((v) => v.includes("XML"))).toBe(true); + }); + + test("passes with less-than in normal text like A < B", () => { + const proposed = "A skill where quality < perfection is the norm for presentations"; + const result = checkConstitution(proposed, original, "test-skill"); + expect(result.passed).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// Principle 3: No unbounded broadening +// --------------------------------------------------------------------------- + 
+describe("Principle 3 — No unbounded broadening", () => { + const original = "A skill for building presentations"; + + test("passes qualified broadening with enumeration", () => { + const proposed = "Supports all formats including PDF, DOCX, and PPTX for presentations"; + const result = checkConstitution(proposed, original, "test-skill"); + expect(result.passed).toBe(true); + }); + + test("fails bare 'all requests'", () => { + const proposed = "Handles all requests for presentations"; + const result = checkConstitution(proposed, original, "test-skill"); + expect(result.passed).toBe(false); + expect(result.violations.some((v) => v.includes("broadening"))).toBe(true); + }); + + test("fails bare 'everything'", () => { + const proposed = "Works with everything for presentations"; + const result = checkConstitution(proposed, original, "test-skill"); + expect(result.passed).toBe(false); + expect(result.violations.some((v) => v.includes("broadening"))).toBe(true); + }); + + test("passes 'any' followed by 'such as'", () => { + const proposed = "Handles any format such as PDF or DOCX for presentations"; + const result = checkConstitution(proposed, original, "test-skill"); + expect(result.passed).toBe(true); + }); + + test("passes 'every' followed by 'e.g.'", () => { + const proposed = "Covers every presentation type, e.g., slides and handouts"; + const result = checkConstitution(proposed, original, "test-skill"); + expect(result.passed).toBe(true); + }); + + test("passes 'all' followed by comma-separated list", () => { + const proposed = "Supports all output types, PDF, DOCX, HTML for presentations"; + const result = checkConstitution(proposed, original, "test-skill"); + expect(result.passed).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// Principle 4: Anchor preservation +// --------------------------------------------------------------------------- + +describe("Principle 4 — Anchor preservation", () => { + 
test("passes when USE WHEN is preserved", () => { + const original = "A skill for testing. USE WHEN the user asks about tests"; + const proposed = + "An improved skill for testing. USE WHEN the user asks about tests or validation"; + const result = checkConstitution(proposed, original, "test-skill"); + expect(result.passed).toBe(true); + }); + + test("fails when USE WHEN is dropped", () => { + const original = "A skill for testing. USE WHEN the user asks about tests"; + const proposed = "An improved skill for testing and validation"; + const result = checkConstitution(proposed, original, "test-skill"); + expect(result.passed).toBe(false); + expect(result.violations.some((v) => v.includes("Anchor"))).toBe(true); + }); + + test("passes when no USE WHEN in original", () => { + const original = "A skill for testing things"; + const proposed = "An improved skill for testing things and code review"; + const result = checkConstitution(proposed, original, "test-skill"); + expect(result.passed).toBe(true); + }); + + test("fails when $skillName reference is dropped", () => { + const original = "A skill for $test-skill slash command usage"; + const proposed = "A skill for running tests"; + const result = checkConstitution(proposed, original, "test-skill"); + expect(result.passed).toBe(false); + expect(result.violations.some((v) => v.includes("Anchor"))).toBe(true); + }); +}); + +// --------------------------------------------------------------------------- +// Combined +// --------------------------------------------------------------------------- + +describe("Combined checks", () => { + test("passes a good proposal", () => { + const original = "A skill for building presentations and slide decks"; + const proposed = "A skill for building presentations, slide decks, and visual reports"; + const result = checkConstitution(proposed, original, "test-skill"); + expect(result.passed).toBe(true); + expect(result.violations).toHaveLength(0); + }); + + test("fails a bad proposal with 
multiple violations", () => { + const original = "A skill for testing. USE WHEN user asks about tests"; + // has XML, has unbounded broadening, drops USE WHEN + const proposed = "
<script>x</script>
everything"; + const result = checkConstitution(proposed, original, "test-skill"); + expect(result.passed).toBe(false); + expect(result.violations.length).toBeGreaterThanOrEqual(2); + }); +}); diff --git a/tests/evolution/evolve.test.ts b/tests/evolution/evolve.test.ts index 565afa68..b0f211ba 100644 --- a/tests/evolution/evolve.test.ts +++ b/tests/evolution/evolve.test.ts @@ -603,6 +603,19 @@ describe("evolve orchestrator", () => { expect(result.reason).toContain("sonnet"); expect(result.gateValidation).toBeDefined(); expect(result.gateValidation?.improved).toBe(false); + + const rejectedCalls = mockAppendAuditEntry.mock.calls.filter( + (call: unknown[]) => (call[0] as EvolutionAuditEntry).action === "rejected", + ); + expect(rejectedCalls.length).toBeGreaterThanOrEqual(1); + expect((rejectedCalls[rejectedCalls.length - 1]?.[0] as EvolutionAuditEntry).details).toContain( + "Gate validation failed", + ); + + const rejectedEvidence = mockAppendEvidenceEntry.mock.calls.filter( + (call: unknown[]) => (call[0] as EvolutionEvidenceEntry).stage === "rejected", + ); + expect(rejectedEvidence.length).toBeGreaterThanOrEqual(1); }); // 14. 
No gate validation when gateModel is not set diff --git a/tests/evolution/propose-body.test.ts b/tests/evolution/propose-body.test.ts index 45970674..2e09be66 100644 --- a/tests/evolution/propose-body.test.ts +++ b/tests/evolution/propose-body.test.ts @@ -94,6 +94,57 @@ describe("buildBodyGenerationPrompt", () => { expect(prompt).not.toContain("Reference Examples"); }); + test("includes execution context when provided", () => { + const execCtx = { + avgToolCalls: 12.5, + avgErrors: 1.3, + avgTurns: 8.0, + commonTools: ["Read", "Edit", "Bash"], + failureTools: ["Bash"], + }; + const prompt = buildBodyGenerationPrompt( + currentContent, + patterns, + missedQueries, + skillName, + undefined, + execCtx, + ); + expect(prompt).toContain("Execution Profile"); + expect(prompt).toContain("Average tool calls per session: 12.5"); + expect(prompt).toContain("Average errors per session: 1.3"); + expect(prompt).toContain("Average assistant turns: 8.0"); + expect(prompt).toContain("Read, Edit, Bash"); + expect(prompt).toContain("Tools correlated with failures: Bash"); + }); + + test("omits execution context when not provided", () => { + const prompt = buildBodyGenerationPrompt(currentContent, patterns, missedQueries, skillName); + expect(prompt).not.toContain("Execution Profile"); + expect(prompt).not.toContain("Average tool calls"); + }); + + test("handles execution context with empty tool lists", () => { + const execCtx = { + avgToolCalls: 0, + avgErrors: 0, + avgTurns: 0, + commonTools: [], + failureTools: [], + }; + const prompt = buildBodyGenerationPrompt( + currentContent, + patterns, + missedQueries, + skillName, + undefined, + execCtx, + ); + expect(prompt).toContain("Execution Profile"); + expect(prompt).toContain("Most-used tools in successful sessions: none"); + expect(prompt).toContain("Tools correlated with failures: none"); + }); + test("includes failure feedback when present", () => { const patternsWithFeedback: FailurePattern[] = [ { diff --git 
a/tests/evolution/propose-description.test.ts b/tests/evolution/propose-description.test.ts index 39a03682..c6c28074 100644 --- a/tests/evolution/propose-description.test.ts +++ b/tests/evolution/propose-description.test.ts @@ -107,6 +107,34 @@ describe("buildProposalPrompt", () => { const prompt = buildProposalPrompt(currentDescription, patterns, missedQueries, skillName); expect(prompt).not.toContain("Structured Failure Analysis"); }); + + test("includes aggregate metrics section when provided", () => { + const metrics = { + mean_score: 0.72, + score_std_dev: 0.15, + failed_session_rate: 0.33, + mean_errors: 2.5, + total_graded: 12, + }; + const prompt = buildProposalPrompt( + currentDescription, + patterns, + missedQueries, + skillName, + metrics, + ); + expect(prompt).toContain("Mean grading score: 0.72/1.0"); + expect(prompt).toContain("σ=0.15"); + expect(prompt).toContain("Failed session rate: 33%"); + expect(prompt).toContain("Mean execution errors per session: 2.5"); + expect(prompt).toContain("Sessions graded: 12"); + }); + + test("omits aggregate metrics section when not provided", () => { + const prompt = buildProposalPrompt(currentDescription, patterns, missedQueries, skillName); + expect(prompt).not.toContain("Mean grading score"); + expect(prompt).not.toContain("Failed session rate"); + }); }); // --------------------------------------------------------------------------- diff --git a/tests/helpers/isolated-store.ts b/tests/helpers/isolated-store.ts new file mode 100644 index 00000000..53095d4b --- /dev/null +++ b/tests/helpers/isolated-store.ts @@ -0,0 +1,47 @@ +/** + * Creates a temporary isolated store directory for hermetic testing. + * Returns paths and env vars that redirect all selftune storage, + * plus a cleanup function to remove the temp directory. 
+ */ + +import { mkdirSync, mkdtempSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; + +export interface IsolatedStore { + /** Root temp directory (acts as SELFTUNE_HOME) */ + root: string; + /** Environment variables to set for isolation */ + env: { + SELFTUNE_HOME: string; + SELFTUNE_CONFIG_DIR: string; + SELFTUNE_LOG_DIR: string; + }; + /** Remove the temp directory and all contents */ + cleanup: () => void; +} + +export function createIsolatedStore(): IsolatedStore { + const root = mkdtempSync(join(tmpdir(), "selftune-test-")); + const configDir = join(root, ".selftune"); + const logDir = join(root, ".claude"); + + mkdirSync(configDir, { recursive: true }); + mkdirSync(logDir, { recursive: true }); + + return { + root, + env: { + SELFTUNE_HOME: root, + SELFTUNE_CONFIG_DIR: configDir, + SELFTUNE_LOG_DIR: logDir, + }, + cleanup: () => { + try { + rmSync(root, { recursive: true, force: true }); + } catch { + /* best-effort */ + } + }, + }; +} diff --git a/tests/hooks/prompt-log.test.ts b/tests/hooks/prompt-log.test.ts index 31265de8..f79cf394 100644 --- a/tests/hooks/prompt-log.test.ts +++ b/tests/hooks/prompt-log.test.ts @@ -4,7 +4,7 @@ import { tmpdir } from "node:os"; import { join } from "node:path"; import { processPrompt } from "../../cli/selftune/hooks/prompt-log.js"; import { _setTestDb, getDb, openDb } from "../../cli/selftune/localdb/db.js"; -import type { PromptSubmitPayload, QueryLogRecord } from "../../cli/selftune/types.js"; +import type { PromptSubmitPayload } from "../../cli/selftune/types.js"; let tmpDir: string; let canonicalLogPath: string; diff --git a/tests/hooks/session-stop.test.ts b/tests/hooks/session-stop.test.ts index de8b2c9b..e6716426 100644 --- a/tests/hooks/session-stop.test.ts +++ b/tests/hooks/session-stop.test.ts @@ -5,7 +5,6 @@ import { join } from "node:path"; import { processPrompt } from "../../cli/selftune/hooks/prompt-log.js"; import { processSessionStop } from 
"../../cli/selftune/hooks/session-stop.js"; import { _setTestDb, getDb, openDb } from "../../cli/selftune/localdb/db.js"; -import type { SessionTelemetryRecord } from "../../cli/selftune/types.js"; let tmpDir: string; let canonicalLogPath: string; diff --git a/tests/hooks/skill-eval.test.ts b/tests/hooks/skill-eval.test.ts index 1c57b456..c25d1202 100644 --- a/tests/hooks/skill-eval.test.ts +++ b/tests/hooks/skill-eval.test.ts @@ -5,18 +5,18 @@ import { join } from "node:path"; import { processPrompt } from "../../cli/selftune/hooks/prompt-log.js"; import { extractSkillName, processToolUse } from "../../cli/selftune/hooks/skill-eval.js"; import { _setTestDb, getDb, openDb } from "../../cli/selftune/localdb/db.js"; -import type { PostToolUsePayload, SkillUsageRecord } from "../../cli/selftune/types.js"; +import type { PostToolUsePayload } from "../../cli/selftune/types.js"; let tmpDir: string; let canonicalLogPath: string; let promptStatePath: string; -let queryLogPath: string; +let _queryLogPath: string; beforeEach(() => { tmpDir = mkdtempSync(join(tmpdir(), "selftune-skill-eval-")); canonicalLogPath = join(tmpDir, "canonical.jsonl"); promptStatePath = join(tmpDir, "canonical-session-state.json"); - queryLogPath = join(tmpDir, "queries.jsonl"); + _queryLogPath = join(tmpDir, "queries.jsonl"); const testDb = openDb(":memory:"); _setTestDb(testDb); diff --git a/tests/ingestors/claude-replay.test.ts b/tests/ingestors/claude-replay.test.ts index ba67fca0..2bbd9401 100644 --- a/tests/ingestors/claude-replay.test.ts +++ b/tests/ingestors/claude-replay.test.ts @@ -1,13 +1,5 @@ import { afterEach, beforeEach, describe, expect, test } from "bun:test"; -import { - existsSync, - mkdirSync, - mkdtempSync, - readFileSync, - rmSync, - utimesSync, - writeFileSync, -} from "node:fs"; +import { existsSync, mkdirSync, mkdtempSync, rmSync, utimesSync, writeFileSync } from "node:fs"; import { tmpdir } from "node:os"; import { join } from "node:path"; import { diff --git 
a/tests/init/alpha-consent.test.ts b/tests/init/alpha-consent.test.ts new file mode 100644 index 00000000..f96c8082 --- /dev/null +++ b/tests/init/alpha-consent.test.ts @@ -0,0 +1,299 @@ +import { afterEach, beforeEach, describe, expect, test } from "bun:test"; +import { mkdtempSync, readFileSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; + +import { + ALPHA_CONSENT_NOTICE, + generateUserId, + readAlphaIdentity, + writeAlphaIdentity, +} from "../../cli/selftune/alpha-identity.js"; +import { runInit } from "../../cli/selftune/init.js"; +import type { AlphaIdentity, SelftuneConfig } from "../../cli/selftune/types.js"; + +let tmpDir: string; + +beforeEach(() => { + tmpDir = mkdtempSync(join(tmpdir(), "selftune-alpha-")); +}); + +afterEach(() => { + rmSync(tmpDir, { recursive: true, force: true }); +}); + +// --------------------------------------------------------------------------- +// alpha-identity module +// --------------------------------------------------------------------------- + +describe("generateUserId", () => { + test("returns a valid UUID string", () => { + const id = generateUserId(); + // UUID v4 format: xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx + expect(id).toMatch(/^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$/); + }); + + test("generates unique IDs on each call", () => { + const id1 = generateUserId(); + const id2 = generateUserId(); + expect(id1).not.toBe(id2); + }); +}); + +describe("readAlphaIdentity", () => { + test("returns null when config does not exist", () => { + const result = readAlphaIdentity(join(tmpDir, "nonexistent.json")); + expect(result).toBeNull(); + }); + + test("returns null when config has no alpha block", () => { + const configPath = join(tmpDir, "config.json"); + writeFileSync(configPath, JSON.stringify({ agent_type: "claude_code" }), "utf-8"); + const result = readAlphaIdentity(configPath); + expect(result).toBeNull(); + }); + + 
test("returns null when config contains malformed JSON", () => { + const configPath = join(tmpDir, "bad.json"); + writeFileSync(configPath, "{invalid json!!!", "utf-8"); + const result = readAlphaIdentity(configPath); + expect(result).toBeNull(); + }); + + test("returns alpha block when present", () => { + const configPath = join(tmpDir, "config.json"); + const alpha: AlphaIdentity = { + enrolled: true, + user_id: "test-uuid", + email: "test@example.com", + consent_timestamp: "2026-03-18T00:00:00Z", + }; + writeFileSync(configPath, JSON.stringify({ agent_type: "claude_code", alpha }), "utf-8"); + + const result = readAlphaIdentity(configPath); + expect(result).not.toBeNull(); + expect(result?.enrolled).toBe(true); + expect(result?.user_id).toBe("test-uuid"); + expect(result?.email).toBe("test@example.com"); + }); +}); + +describe("writeAlphaIdentity", () => { + test("writes alpha block to new config file", () => { + const configPath = join(tmpDir, "config.json"); + const identity: AlphaIdentity = { + enrolled: true, + user_id: "new-uuid", + email: "new@example.com", + consent_timestamp: "2026-03-18T00:00:00Z", + }; + + writeAlphaIdentity(configPath, identity); + + const raw = JSON.parse(readFileSync(configPath, "utf-8")); + expect(raw.alpha).toEqual(identity); + }); + + test("throws when config contains malformed JSON", () => { + const configPath = join(tmpDir, "corrupt.json"); + writeFileSync(configPath, "not valid json{{{", "utf-8"); + const identity: AlphaIdentity = { + enrolled: true, + user_id: "test-uuid", + email: "test@example.com", + consent_timestamp: "2026-03-18T00:00:00Z", + }; + expect(() => writeAlphaIdentity(configPath, identity)).toThrow(); + }); + + test("merges alpha block into existing config without clobbering other fields", () => { + const configPath = join(tmpDir, "config.json"); + writeFileSync( + configPath, + JSON.stringify({ agent_type: "claude_code", cli_path: "/test" }), + "utf-8", + ); + + const identity: AlphaIdentity = { + enrolled: 
true, + user_id: "merged-uuid", + email: "merged@example.com", + consent_timestamp: "2026-03-18T00:00:00Z", + }; + + writeAlphaIdentity(configPath, identity); + + const raw = JSON.parse(readFileSync(configPath, "utf-8")); + expect(raw.agent_type).toBe("claude_code"); + expect(raw.cli_path).toBe("/test"); + expect(raw.alpha.user_id).toBe("merged-uuid"); + }); +}); + +describe("ALPHA_CONSENT_NOTICE", () => { + test("contains key disclosure elements", () => { + expect(ALPHA_CONSENT_NOTICE).toContain("alpha"); + expect(ALPHA_CONSENT_NOTICE).toContain("WHAT IS COLLECTED"); + expect(ALPHA_CONSENT_NOTICE).toContain("WHAT IS NOT COLLECTED"); + expect(ALPHA_CONSENT_NOTICE).toContain("Raw user prompt/query text"); + expect(ALPHA_CONSENT_NOTICE).toContain("selftune init --no-alpha"); + }); +}); + +// --------------------------------------------------------------------------- +// runInit alpha integration +// --------------------------------------------------------------------------- + +describe("runInit with alpha", () => { + function makeInitOpts(overrides: Record = {}) { + const configDir = join(tmpDir, ".selftune"); + const configPath = join(configDir, "config.json"); + return { + configDir, + configPath, + force: false, + agentOverride: "claude_code", + cliPathOverride: "/test/cli/selftune/index.ts", + homeDir: tmpDir, + ...overrides, + }; + } + + test("writes alpha block with valid UUID when alpha=true with key and email", async () => { + const opts = makeInitOpts({ + alpha: true, + alphaEmail: "user@example.com", + alphaName: "Test User", + alphaKey: "st_live_testkey123", + }); + + const config = await runInit(opts); + + expect(config.alpha).toBeDefined(); + expect(config.alpha?.enrolled).toBe(true); + expect(config.alpha?.user_id).toMatch( + /^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$/, + ); + expect(config.alpha?.email).toBe("user@example.com"); + expect(config.alpha?.display_name).toBe("Test User"); + 
expect(config.alpha?.consent_timestamp).toBeTruthy(); + }); + + test("does NOT write alpha block when alpha flag is absent", async () => { + const opts = makeInitOpts(); + const config = await runInit(opts); + expect(config.alpha).toBeUndefined(); + }); + + test("throws error when alpha=true with key but no email provided", async () => { + const opts = makeInitOpts({ alpha: true, alphaKey: "st_live_test" }); + await expect(runInit(opts)).rejects.toThrow( + "--alpha-email flag is required when using --alpha-key", + ); + }); + + test("--no-alpha sets enrolled=false but preserves user_id", async () => { + const configDir = join(tmpDir, ".selftune"); + const _configPath = join(configDir, "config.json"); + + // First, enroll with direct key path + const enrollConfig = await runInit( + makeInitOpts({ + alpha: true, + alphaEmail: "user@example.com", + alphaKey: "st_live_testkey123", + force: true, + }), + ); + const originalUserId = enrollConfig.alpha?.user_id; + + // Then unenroll + const unenrollConfig = await runInit( + makeInitOpts({ + noAlpha: true, + force: true, + }), + ); + + expect(unenrollConfig.alpha).toBeDefined(); + expect(unenrollConfig.alpha?.enrolled).toBe(false); + expect(unenrollConfig.alpha?.user_id).toBe(originalUserId); + }); + + test("reinit with force + alpha preserves existing user_id", async () => { + const configDir = join(tmpDir, ".selftune"); + const _configPath = join(configDir, "config.json"); + + // First enrollment with key + const firstConfig = await runInit( + makeInitOpts({ + alpha: true, + alphaEmail: "first@example.com", + alphaKey: "st_live_firstkey", + force: true, + }), + ); + const originalUserId = firstConfig.alpha?.user_id; + + // Re-init with force + alpha + new key (should preserve user_id) + const secondConfig = await runInit( + makeInitOpts({ + alpha: true, + alphaEmail: "second@example.com", + alphaKey: "st_live_secondkey", + force: true, + }), + ); + + expect(secondConfig.alpha?.user_id).toBe(originalUserId); + 
expect(secondConfig.alpha?.email).toBe("second@example.com"); + }); + + test("plain force reinit preserves existing alpha enrollment", async () => { + const firstConfig = await runInit( + makeInitOpts({ + alpha: true, + alphaEmail: "first@example.com", + alphaKey: "st_live_testkey123", + force: true, + }), + ); + + const secondConfig = await runInit( + makeInitOpts({ + force: true, + }), + ); + + expect(secondConfig.alpha).toBeDefined(); + expect(secondConfig.alpha?.enrolled).toBe(true); + expect(secondConfig.alpha?.user_id).toBe(firstConfig.alpha?.user_id); + expect(secondConfig.alpha?.email).toBe("first@example.com"); + }); + + test("config round-trips correctly (read after write)", async () => { + const opts = makeInitOpts({ + alpha: true, + alphaEmail: "roundtrip@example.com", + alphaName: "Round Trip", + alphaKey: "st_live_roundtrip", + }); + + await runInit(opts); + + // Read back from disk + const raw = JSON.parse(readFileSync(opts.configPath, "utf-8")) as SelftuneConfig; + expect(raw.alpha).toBeDefined(); + expect(raw.alpha?.enrolled).toBe(true); + expect(raw.alpha?.email).toBe("roundtrip@example.com"); + expect(raw.alpha?.display_name).toBe("Round Trip"); + expect(raw.alpha?.user_id).toMatch( + /^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$/, + ); + + // Read via the identity module + const identity = readAlphaIdentity(opts.configPath); + expect(identity).not.toBeNull(); + expect(identity?.user_id).toBe(raw.alpha?.user_id); + }); +}); diff --git a/tests/init/alpha-onboarding-e2e.test.ts b/tests/init/alpha-onboarding-e2e.test.ts new file mode 100644 index 00000000..f1119315 --- /dev/null +++ b/tests/init/alpha-onboarding-e2e.test.ts @@ -0,0 +1,171 @@ +/** + * E2E smoke test: fresh config → alpha-enrolled → upload-ready + * + * Exercises the real runInit() path, not synthetic config writes. 
+ */ + +import { afterEach, beforeEach, describe, expect, test } from "bun:test"; +import { mkdtempSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; + +import { getAlphaLinkState, readAlphaIdentity } from "../../cli/selftune/alpha-identity.js"; +import { checkAlphaReadiness, runInit } from "../../cli/selftune/init.js"; +import { checkCloudLinkHealth } from "../../cli/selftune/observability.js"; + +let tmpDir: string; + +beforeEach(() => { + tmpDir = mkdtempSync(join(tmpdir(), "selftune-onboarding-e2e-")); +}); + +afterEach(() => { + rmSync(tmpDir, { recursive: true, force: true }); +}); + +function makeInitOpts(overrides: Record = {}) { + const configDir = join(tmpDir, ".selftune"); + const configPath = join(configDir, "config.json"); + return { + configDir, + configPath, + force: false, + agentOverride: "claude_code", + cliPathOverride: "/test/cli/selftune/index.ts", + homeDir: tmpDir, + ...overrides, + }; +} + +describe("Agent-first alpha onboarding E2E", () => { + test("fresh config → selftune init --alpha --alpha-key → upload-ready", async () => { + const testApiKey = ["st_live", "abc123xyz"].join("_"); + const opts = makeInitOpts(); + + // Step 1: Fresh machine — no config exists + expect(readAlphaIdentity(opts.configPath)).toBeNull(); + const readiness0 = checkAlphaReadiness(opts.configPath); + expect(readiness0.ready).toBe(false); + expect(readiness0.missing).toContain("alpha identity not configured"); + expect(readiness0.guidance.blocking).toBe(true); + expect(readiness0.guidance.next_command).toContain("selftune init --alpha"); + + // Step 2: Enroll with email + key (direct key path) + const config1 = await runInit( + makeInitOpts({ + alpha: true, + alphaEmail: "user@example.com", + alphaName: "Test User", + alphaKey: testApiKey, + }), + ); + + expect(config1.alpha?.enrolled).toBe(true); + expect(config1.alpha?.email).toBe("user@example.com"); + expect(config1.alpha?.api_key).toBe(testApiKey); + + // Step 
3: Readiness check — api_key is valid so readiness passes + const readiness1 = checkAlphaReadiness(opts.configPath); + expect(readiness1.ready).toBe(true); + expect(readiness1.missing).toHaveLength(0); + + // Note: guidance uses getAlphaLinkState which requires cloud_user_id for "ready". + // Direct-key path doesn't set cloud_user_id, so guidance still shows blocking. + // This is expected — device-code flow is the recommended path for full linking. + + // Step 4: Health checks + const identity1 = readAlphaIdentity(opts.configPath); + const healthChecks = checkCloudLinkHealth(identity1); + expect(healthChecks.length).toBeGreaterThan(0); + }); + + test("invalid credential format rejected by init", async () => { + await expect( + runInit( + makeInitOpts({ + alpha: true, + alphaEmail: "user@example.com", + alphaKey: "bad_key_format", + }), + ), + ).rejects.toThrow("API key must start with 'st_live_' or 'st_test_'"); + }); + + test("--alpha without --alpha-key requires device-code flow (no email needed)", async () => { + // When --alpha is provided without --alpha-key, init triggers device-code flow. + // Without a mock server, this will fail on the fetch — confirming the flow is entered. 
+ await expect( + runInit( + makeInitOpts({ + alpha: true, + alphaEmail: "user@example.com", + }), + ), + ).rejects.toThrow(); // fetch will fail since no server is running + }); + + test("--alpha --alpha-key without --alpha-email throws", async () => { + await expect( + runInit( + makeInitOpts({ + alpha: true, + alphaKey: "st_live_abc123", + }), + ), + ).rejects.toThrow("--alpha-email flag is required when using --alpha-key"); + }); + + test("link state transitions are correct", () => { + expect(getAlphaLinkState(null)).toBe("not_linked"); + + // enrolled=false, no cloud_user_id -> not_linked + expect( + getAlphaLinkState({ + enrolled: false, + user_id: "u1", + consent_timestamp: "", + }), + ).toBe("not_linked"); + + // enrolled=false, has cloud_user_id -> linked_not_enrolled + expect( + getAlphaLinkState({ + enrolled: false, + user_id: "u1", + consent_timestamp: "", + cloud_user_id: "cloud-1", + }), + ).toBe("linked_not_enrolled"); + + // enrolled=true, no api_key -> enrolled_no_credential + expect( + getAlphaLinkState({ + enrolled: true, + user_id: "u1", + consent_timestamp: "", + }), + ).toBe("enrolled_no_credential"); + + // enrolled=true, has cloud_user_id + valid api_key -> ready + expect( + getAlphaLinkState({ + enrolled: true, + user_id: "u1", + consent_timestamp: "", + cloud_user_id: "cloud-1", + api_key: "st_live_x", + }), + ).toBe("ready"); + + // enrolled=true, has valid api_key but no cloud_user_id -> ready + // (cloud_user_id is bonus enrichment, not a gate for readiness) + expect( + getAlphaLinkState({ + enrolled: true, + user_id: "u1", + consent_timestamp: "", + api_key: "st_live_x", + }), + ).toBe("ready"); + }); +}); diff --git a/tests/init/device-code.test.ts b/tests/init/device-code.test.ts new file mode 100644 index 00000000..e8b1d7eb --- /dev/null +++ b/tests/init/device-code.test.ts @@ -0,0 +1,186 @@ +/** + * Tests for the device-code authentication client. 
+ * + * Mocks globalThis.fetch to test requestDeviceCode, pollDeviceCode, and getBaseUrl + * without making real network calls. + */ + +import { afterEach, describe, expect, it } from "bun:test"; + +import { + getBaseUrl, + pollDeviceCode, + requestDeviceCode, +} from "../../cli/selftune/auth/device-code.js"; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +const originalFetch = globalThis.fetch; +const originalEnv = { ...process.env }; + +function mockFetch(handler: (url: string, init?: RequestInit) => Promise): void { + globalThis.fetch = handler as typeof globalThis.fetch; +} + +function restoreFetch(): void { + globalThis.fetch = originalFetch; +} + +// --------------------------------------------------------------------------- +// getBaseUrl +// --------------------------------------------------------------------------- + +describe("getBaseUrl", () => { + afterEach(() => { + process.env = { ...originalEnv }; + }); + + it("strips /push from SELFTUNE_ALPHA_ENDPOINT", () => { + process.env.SELFTUNE_ALPHA_ENDPOINT = "https://api.example.com/api/v1/push"; + expect(getBaseUrl()).toBe("https://api.example.com/api/v1"); + }); + + it("returns the endpoint unchanged when it does not end with /push", () => { + process.env.SELFTUNE_ALPHA_ENDPOINT = "https://api.example.com/api/v1"; + expect(getBaseUrl()).toBe("https://api.example.com/api/v1"); + }); + + it("uses default endpoint when env var is not set", () => { + delete process.env.SELFTUNE_ALPHA_ENDPOINT; + expect(getBaseUrl()).toBe("https://api.selftune.dev/api/v1"); + }); +}); + +// --------------------------------------------------------------------------- +// requestDeviceCode +// --------------------------------------------------------------------------- + +describe("requestDeviceCode", () => { + afterEach(() => { + restoreFetch(); + process.env = { ...originalEnv }; + }); + + it("returns 
a DeviceCodeGrant on success", async () => { + process.env.SELFTUNE_ALPHA_ENDPOINT = "https://test.local/api/v1/push"; + + const grant = { + device_code: "dc_abc123", + user_code: "ABCD-1234", + verification_url: "https://test.local/verify", + expires_in: 300, + interval: 5, + }; + + mockFetch(async (url, init) => { + expect(url).toBe("https://test.local/api/v1/device-code"); + expect(init?.method).toBe("POST"); + const body = JSON.parse(init?.body as string); + expect(body.client_id).toBe("selftune-cli"); + expect(body.scope).toBe("push read"); + return new Response(JSON.stringify(grant), { status: 200 }); + }); + + const result = await requestDeviceCode(); + expect(result).toEqual(grant); + }); + + it("throws on non-200 response", async () => { + process.env.SELFTUNE_ALPHA_ENDPOINT = "https://test.local/api/v1/push"; + + mockFetch(async () => { + return new Response("Server Error", { status: 500, statusText: "Internal Server Error" }); + }); + + await expect(requestDeviceCode()).rejects.toThrow("Device code request failed: 500"); + }); +}); + +// --------------------------------------------------------------------------- +// pollDeviceCode +// --------------------------------------------------------------------------- + +describe("pollDeviceCode", () => { + afterEach(() => { + restoreFetch(); + process.env = { ...originalEnv }; + }); + + it("resolves on approved after pending polls", async () => { + process.env.SELFTUNE_ALPHA_ENDPOINT = "https://test.local/api/v1/push"; + + let callCount = 0; + mockFetch(async (url) => { + expect(url).toBe("https://test.local/api/v1/device-code/poll"); + callCount++; + if (callCount < 3) { + return new Response(JSON.stringify({ status: "pending" }), { status: 200 }); + } + return new Response( + JSON.stringify({ + status: "approved", + api_key: "st_live_newkey123", + cloud_user_id: "cloud_user_abc", + org_id: "org_xyz", + }), + { status: 200 }, + ); + }); + + // Use very short interval (0.01s) and long expiry for test speed + 
const result = await pollDeviceCode("dc_test", 0.01, 30); + expect(result.api_key).toBe("st_live_newkey123"); + expect(result.cloud_user_id).toBe("cloud_user_abc"); + expect(result.org_id).toBe("org_xyz"); + expect(callCount).toBe(3); + }); + + it("throws on expired status (HTTP 410 with JSON body)", async () => { + process.env.SELFTUNE_ALPHA_ENDPOINT = "https://test.local/api/v1/push"; + + mockFetch(async () => { + return new Response(JSON.stringify({ status: "expired" }), { status: 410 }); + }); + + await expect(pollDeviceCode("dc_test", 0.01, 30)).rejects.toThrow( + "Device code expired. Please retry.", + ); + }); + + it("throws on denied status (HTTP 403 with JSON body)", async () => { + process.env.SELFTUNE_ALPHA_ENDPOINT = "https://test.local/api/v1/push"; + + mockFetch(async () => { + return new Response(JSON.stringify({ status: "denied" }), { status: 403 }); + }); + + await expect(pollDeviceCode("dc_test", 0.01, 30)).rejects.toThrow( + "Device code denied by user.", + ); + }); + + it("throws on poll HTTP failure with non-JSON body", async () => { + process.env.SELFTUNE_ALPHA_ENDPOINT = "https://test.local/api/v1/push"; + + mockFetch(async () => { + return new Response("Bad", { status: 503 }); + }); + + await expect(pollDeviceCode("dc_test", 0.01, 30)).rejects.toThrow("Poll failed: 503"); + }); + + it("times out when deadline passes without approval", async () => { + process.env.SELFTUNE_ALPHA_ENDPOINT = "https://test.local/api/v1/push"; + + mockFetch(async () => { + return new Response(JSON.stringify({ status: "pending" }), { status: 200 }); + }); + + // expiresIn=0 means deadline is already passed before first poll attempt + // But the first poll still runs because we sleep first then check deadline + // Use a tiny expiry so it times out quickly + await expect(pollDeviceCode("dc_test", 0.01, 0.01)).rejects.toThrow(/timed out|expired/); + }); +}); diff --git a/tests/init/init.test.ts b/tests/init/init.test.ts index 18627241..193ab706 100644 --- 
a/tests/init/init.test.ts +++ b/tests/init/init.test.ts @@ -161,11 +161,11 @@ describe("checkClaudeCodeHooks", () => { // --------------------------------------------------------------------------- describe("runInit", () => { - test("writes config to specified directory", () => { + test("writes config to specified directory", async () => { const configDir = join(tmpDir, ".selftune"); const configPath = join(configDir, "config.json"); - const result = runInit({ + const result = await runInit({ configDir, configPath, force: false, @@ -184,7 +184,7 @@ describe("runInit", () => { expect(written.agent_type).toBe("claude_code"); }); - test("returns existing config without force flag", () => { + test("returns existing config without force flag", async () => { const configDir = join(tmpDir, ".selftune"); const configPath = join(configDir, "config.json"); mkdirSync(configDir, { recursive: true }); @@ -199,7 +199,7 @@ describe("runInit", () => { }; writeFileSync(configPath, JSON.stringify(existingConfig, null, 2), "utf-8"); - const result = runInit({ + const result = await runInit({ configDir, configPath, force: false, @@ -210,7 +210,7 @@ describe("runInit", () => { expect(result.initialized_at).toBe("2025-01-01T00:00:00.000Z"); }); - test("overwrites existing config with force flag", () => { + test("overwrites existing config with force flag", async () => { const configDir = join(tmpDir, ".selftune"); const configPath = join(configDir, "config.json"); mkdirSync(configDir, { recursive: true }); @@ -225,7 +225,7 @@ describe("runInit", () => { }; writeFileSync(configPath, JSON.stringify(existingConfig, null, 2), "utf-8"); - const result = runInit({ + const result = await runInit({ configDir, configPath, force: true, @@ -239,11 +239,11 @@ describe("runInit", () => { expect(result.initialized_at).not.toBe("2025-01-01T00:00:00.000Z"); }); - test("creates config directory if it does not exist", () => { + test("creates config directory if it does not exist", async () => { const 
configDir = join(tmpDir, "nested", "deep", ".selftune"); const configPath = join(configDir, "config.json"); - const result = runInit({ + const result = await runInit({ configDir, configPath, force: false, @@ -257,11 +257,11 @@ describe("runInit", () => { expect(result.agent_type).toBe("codex"); }); - test("config file is valid JSON with pretty formatting", () => { + test("config file is valid JSON with pretty formatting", async () => { const configDir = join(tmpDir, ".selftune"); const configPath = join(configDir, "config.json"); - runInit({ + await runInit({ configDir, configPath, force: false, @@ -283,7 +283,7 @@ describe("runInit", () => { expect(parsed).toHaveProperty("initialized_at"); }); - test("sets hooks_installed correctly for claude_code", () => { + test("sets hooks_installed correctly for claude_code", async () => { const claudeDir = join(tmpDir, ".claude"); mkdirSync(claudeDir, { recursive: true }); @@ -300,7 +300,7 @@ describe("runInit", () => { const configDir = join(tmpDir, ".selftune"); const configPath = join(configDir, "config.json"); - const result = runInit({ + const result = await runInit({ configDir, configPath, force: false, diff --git a/tests/last/last.test.ts b/tests/last/last.test.ts index 7b06a522..c45e98de 100644 --- a/tests/last/last.test.ts +++ b/tests/last/last.test.ts @@ -126,7 +126,7 @@ describe("computeLastInsight", () => { const result = computeLastInsight(telemetry, skills, queries); expect(result).not.toBeNull(); expect(result?.recommendation).toBe( - "2 queries had no skill match. Run 'selftune evals --list-skills' to investigate.", + "2 queries had no skill match. Run 'selftune eval generate --list-skills' to investigate.", ); }); @@ -177,7 +177,7 @@ describe("formatInsight", () => { errors: 0, toolCalls: 14, recommendation: - "3 queries had no skill match. Run 'selftune evals --list-skills' to investigate.", + "3 queries had no skill match. 
Run 'selftune eval generate --list-skills' to investigate.", }; const output = formatInsight(insight); expect(output).toContain("a1b2c3d4"); @@ -189,7 +189,7 @@ describe("formatInsight", () => { expect(output).toContain("Errors:"); expect(output).toContain("Tool calls:"); expect(output).toContain("14"); - expect(output).toContain("selftune evals --list-skills"); + expect(output).toContain("selftune eval generate --list-skills"); }); test("output omits unmatched section when no unmatched queries", () => { diff --git a/tests/localdb/localdb.test.ts b/tests/localdb/localdb.test.ts index 7c7f9a24..ab10b0af 100644 --- a/tests/localdb/localdb.test.ts +++ b/tests/localdb/localdb.test.ts @@ -8,7 +8,8 @@ import { afterEach, beforeEach, describe, expect, it } from "bun:test"; * All tests use :memory: databases — no filesystem side effects. */ -import { getMeta, openDb, setMeta } from "../../cli/selftune/localdb/db.js"; +import { _setTestDb, getMeta, openDb, setMeta } from "../../cli/selftune/localdb/db.js"; +import { writeEvolutionAuditToDb } from "../../cli/selftune/localdb/direct-write.js"; import { getOverviewPayload, getSkillReportPayload, @@ -410,6 +411,57 @@ describe("localdb queries", () => { }); }); +// --------------------------------------------------------------------------- +// Direct-write: iterations_used column +// --------------------------------------------------------------------------- + +describe("writeEvolutionAuditToDb iterations_used", () => { + let db: Database; + + beforeEach(() => { + db = openDb(":memory:"); + _setTestDb(db); + }); + + afterEach(() => { + db.close(); + _setTestDb(null); + }); + + it("persists iterations_used and reads it back", () => { + const ok = writeEvolutionAuditToDb({ + timestamp: "2026-03-18T12:00:00Z", + proposal_id: "prop-iter-1", + skill_name: "TestSkill", + action: "deployed", + details: "Deployed after 3 iterations", + iterations_used: 3, + }); + expect(ok).toBe(true); + + const row = db + .query("SELECT iterations_used 
FROM evolution_audit WHERE proposal_id = ?") + .get("prop-iter-1") as { iterations_used: number | null }; + expect(row.iterations_used).toBe(3); + }); + + it("stores null when iterations_used is omitted", () => { + const ok = writeEvolutionAuditToDb({ + timestamp: "2026-03-18T12:01:00Z", + proposal_id: "prop-iter-2", + skill_name: "TestSkill", + action: "created", + details: "No iterations yet", + }); + expect(ok).toBe(true); + + const row = db + .query("SELECT iterations_used FROM evolution_audit WHERE proposal_id = ?") + .get("prop-iter-2") as { iterations_used: number | null }; + expect(row.iterations_used).toBeNull(); + }); +}); + // --------------------------------------------------------------------------- // Test data seeder // --------------------------------------------------------------------------- diff --git a/tests/localdb/read-queries.test.ts b/tests/localdb/read-queries.test.ts index 0dfbf5a2..8548b8c3 100644 --- a/tests/localdb/read-queries.test.ts +++ b/tests/localdb/read-queries.test.ts @@ -1,7 +1,7 @@ import type { Database } from "bun:sqlite"; import { afterEach, beforeEach, describe, expect, it } from "bun:test"; -import { _setTestDb, openDb } from "../../cli/selftune/localdb/db.js"; +import { openDb } from "../../cli/selftune/localdb/db.js"; import { getOrchestrateRuns, getOverviewPayload, diff --git a/tests/localdb/write.test.ts b/tests/localdb/write.test.ts index c158ef2d..73f87a01 100644 --- a/tests/localdb/write.test.ts +++ b/tests/localdb/write.test.ts @@ -1,4 +1,3 @@ -import type { Database } from "bun:sqlite"; import { afterEach, beforeEach, describe, expect, it } from "bun:test"; import type { CanonicalExecutionFactRecord, diff --git a/tests/observability.test.ts b/tests/observability.test.ts index d314ef9e..84ac1e18 100644 --- a/tests/observability.test.ts +++ b/tests/observability.test.ts @@ -3,6 +3,8 @@ import { existsSync, mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node: import { homedir, tmpdir } from "node:os"; import 
{ join } from "node:path"; import { + checkCloudLinkHealth, + checkDashboardIntegrityHealth, checkEvolutionHealth, checkHookInstallation, checkLogHealth, @@ -99,7 +101,54 @@ describe("checkEvolutionHealth", () => { }); }); +describe("checkDashboardIntegrityHealth", () => { + test("returns a warning about legacy dashboard freshness mode", () => { + const checks = checkDashboardIntegrityHealth(); + expect(checks).toHaveLength(1); + expect(checks[0]?.name).toBe("dashboard_freshness_mode"); + expect(checks[0]?.status).toBe("warn"); + expect(checks[0]?.message).toContain("JSONL watcher invalidation"); + }); +}); + describe("checkConfigHealth", () => { + test("returns guidance when config is missing", () => { + const tempHome = mkdtempSync(join(tmpdir(), "selftune-observability-missing-")); + const moduleUrl = new URL("../cli/selftune/observability.ts", import.meta.url).href; + + try { + const proc = Bun.spawnSync( + [ + process.execPath, + "-e", + `const { checkConfigHealth } = await import(${JSON.stringify(moduleUrl)}); console.log(JSON.stringify(checkConfigHealth()));`, + ], + { + env: { ...process.env, HOME: tempHome }, + stdout: "pipe", + stderr: "pipe", + }, + ); + + if (proc.exitCode !== 0) { + const stderr = new TextDecoder().decode(proc.stderr); + throw new Error(`Subprocess failed (exit ${proc.exitCode}): ${stderr}`); + } + + const output = new TextDecoder().decode(proc.stdout).trim(); + const checks = JSON.parse(output) as Array<{ + status: string; + guidance?: { next_command?: string; blocking?: boolean }; + }>; + expect(checks).toHaveLength(1); + expect(checks[0]?.status).toBe("warn"); + expect(checks[0]?.guidance?.blocking).toBe(true); + expect(checks[0]?.guidance?.next_command).toBe("selftune init"); + } finally { + rmSync(tempHome, { recursive: true, force: true }); + } + }); + test("accepts openclaw agent_type values written by init", () => { const tempHome = mkdtempSync(join(tmpdir(), "selftune-observability-")); const configDir = join(tempHome, 
".selftune"); @@ -150,6 +199,22 @@ describe("checkConfigHealth", () => { }); }); +describe("checkCloudLinkHealth", () => { + test("returns remediation guidance when credential is missing", () => { + const checks = checkCloudLinkHealth({ + enrolled: true, + user_id: "user-1", + email: "user@example.com", + consent_timestamp: "2026-03-20T00:00:00.000Z", + }); + + expect(checks).toHaveLength(1); + expect(checks[0]?.status).toBe("warn"); + expect(checks[0]?.guidance?.blocking).toBe(true); + expect(checks[0]?.guidance?.next_command).toContain("--alpha-key "); + }); +}); + describe("doctor", () => { test("returns structured result", async () => { const result = await doctor(); @@ -173,6 +238,13 @@ describe("doctor", () => { expect(evolutionChecks.length).toBeGreaterThanOrEqual(1); }); + test("includes dashboard integrity warning", async () => { + const result = await doctor(); + const integrityCheck = result.checks.find((c) => c.name === "dashboard_freshness_mode"); + expect(integrityCheck).toBeDefined(); + expect(integrityCheck?.status).toBe("warn"); + }); + test("doctor does not produce false positives from git hook checks", async () => { const result = await doctor(); // With the git hook checks removed, doctor should not produce false diff --git a/tests/orchestrate-overlap.test.ts b/tests/orchestrate-overlap.test.ts new file mode 100644 index 00000000..3a19ee19 --- /dev/null +++ b/tests/orchestrate-overlap.test.ts @@ -0,0 +1,137 @@ +/** + * Tests for detectCrossSkillOverlap — cross-skill eval set overlap detection. + * + * This function is an internal helper in orchestrate.ts, exported only for testing. 
+ */ + +import { describe, expect, test } from "bun:test"; +import { detectCrossSkillOverlap } from "../cli/selftune/orchestrate.js"; +import type { QueryLogRecord, SkillUsageRecord } from "../cli/selftune/types.js"; + +// --------------------------------------------------------------------------- +// Helper factories +// --------------------------------------------------------------------------- + +function makeSkillRecord(skillName: string, query: string): SkillUsageRecord { + return { + timestamp: new Date().toISOString(), + session_id: "sess-001", + skill_name: skillName, + skill_path: `/skills/${skillName}/SKILL.md`, + query, + triggered: true, + source: "claude_code_replay", + }; +} + +function makeQueryRecord(query: string): QueryLogRecord { + return { + timestamp: new Date().toISOString(), + session_id: "sess-001", + query, + source: "hook", + }; +} + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +describe("detectCrossSkillOverlap", () => { + test("detects overlap when two skills share >30% queries", async () => { + // Skill A: queries 1-5 + // Skill B: queries 3-7 + // Shared: 3, 4, 5 = 3 out of min(5,5) = 60% overlap + const skillRecords: SkillUsageRecord[] = [ + makeSkillRecord("SkillA", "deploy the app"), + makeSkillRecord("SkillA", "run the tests"), + makeSkillRecord("SkillA", "check the logs"), + makeSkillRecord("SkillA", "restart the server"), + makeSkillRecord("SkillA", "update the config"), + makeSkillRecord("SkillB", "check the logs"), + makeSkillRecord("SkillB", "restart the server"), + makeSkillRecord("SkillB", "update the config"), + makeSkillRecord("SkillB", "scale the pods"), + makeSkillRecord("SkillB", "monitor metrics"), + ]; + + const queryRecords: QueryLogRecord[] = [ + makeQueryRecord("deploy the app"), + makeQueryRecord("run the tests"), + makeQueryRecord("check the logs"), + makeQueryRecord("restart the 
server"), + makeQueryRecord("update the config"), + makeQueryRecord("scale the pods"), + makeQueryRecord("monitor metrics"), + ]; + + const candidates = [{ skill: "SkillA" }, { skill: "SkillB" }]; + const result = await detectCrossSkillOverlap(candidates, skillRecords, queryRecords); + + expect(result.length).toBe(1); + expect(result[0].skill_a).toBe("SkillA"); + expect(result[0].skill_b).toBe("SkillB"); + expect(result[0].overlap_pct).toBeGreaterThan(0.3); + expect(result[0].shared_queries.length).toBe(3); + expect(result[0].shared_queries).toContain("check the logs"); + expect(result[0].shared_queries).toContain("restart the server"); + expect(result[0].shared_queries).toContain("update the config"); + }); + + test("returns empty array when skills have disjoint queries", async () => { + const skillRecords: SkillUsageRecord[] = [ + makeSkillRecord("SkillA", "deploy the app"), + makeSkillRecord("SkillA", "run the tests"), + makeSkillRecord("SkillA", "check the logs"), + makeSkillRecord("SkillB", "scale the pods"), + makeSkillRecord("SkillB", "monitor metrics"), + makeSkillRecord("SkillB", "rotate secrets"), + ]; + + const queryRecords: QueryLogRecord[] = [ + makeQueryRecord("deploy the app"), + makeQueryRecord("run the tests"), + makeQueryRecord("check the logs"), + makeQueryRecord("scale the pods"), + makeQueryRecord("monitor metrics"), + makeQueryRecord("rotate secrets"), + ]; + + const candidates = [{ skill: "SkillA" }, { skill: "SkillB" }]; + const result = await detectCrossSkillOverlap(candidates, skillRecords, queryRecords); + + expect(result).toEqual([]); + }); + + test("returns empty array with empty candidates", async () => { + const result = await detectCrossSkillOverlap([], [], []); + expect(result).toEqual([]); + }); + + test("returns empty array with single candidate", async () => { + const skillRecords: SkillUsageRecord[] = [makeSkillRecord("SkillA", "deploy the app")]; + const queryRecords: QueryLogRecord[] = [makeQueryRecord("deploy the app")]; + + 
const candidates = [{ skill: "SkillA" }]; + const result = await detectCrossSkillOverlap(candidates, skillRecords, queryRecords); + + expect(result).toEqual([]); + }); + + test("caps shared_queries at 10 entries", async () => { + // Create two skills that share 15 queries + const sharedQueries = Array.from({ length: 15 }, (_, i) => `shared query number ${i + 1}`); + const skillRecords: SkillUsageRecord[] = [ + ...sharedQueries.map((q) => makeSkillRecord("SkillA", q)), + ...sharedQueries.map((q) => makeSkillRecord("SkillB", q)), + ]; + const queryRecords: QueryLogRecord[] = sharedQueries.map((q) => makeQueryRecord(q)); + + const candidates = [{ skill: "SkillA" }, { skill: "SkillB" }]; + const result = await detectCrossSkillOverlap(candidates, skillRecords, queryRecords); + + expect(result.length).toBe(1); + expect(result[0].shared_queries.length).toBe(10); + expect(result[0].overlap_pct).toBe(1.0); // 100% overlap + }); +}); diff --git a/tests/trust-floor/health.test.ts b/tests/trust-floor/health.test.ts new file mode 100644 index 00000000..4719223c --- /dev/null +++ b/tests/trust-floor/health.test.ts @@ -0,0 +1,83 @@ +/** + * Tests for the expanded /api/health endpoint with runtime identity fields. 
+ */ + +import { afterAll, beforeAll, describe, expect, it } from "bun:test"; +import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import type { HealthResponse } from "../../cli/selftune/dashboard-contract.js"; + +let startDashboardServer: typeof import("../../cli/selftune/dashboard-server.js").startDashboardServer; +let testSpaDir: string; +let server: Awaited> | null = null; + +beforeAll(async () => { + const mod = await import("../../cli/selftune/dashboard-server.js"); + startDashboardServer = mod.startDashboardServer; + testSpaDir = mkdtempSync(join(tmpdir(), "selftune-health-test-")); + mkdirSync(join(testSpaDir, "assets"), { recursive: true }); + writeFileSync(join(testSpaDir, "index.html"), ``); +}); + +afterAll(async () => { + if (server) await server.stop(); + try { + rmSync(testSpaDir, { recursive: true, force: true }); + } catch { + /* best-effort */ + } +}); + +describe("/api/health runtime identity", () => { + it("returns all expected fields", async () => { + server = await startDashboardServer({ + port: 0, + host: "127.0.0.1", + spaDir: testSpaDir, + openBrowser: false, + runtimeMode: "test", + overviewLoader: () => ({ + overview: { + telemetry: [], + skills: [], + evolution: [], + counts: { telemetry: 0, skills: 0, evolution: 0, evidence: 0, sessions: 0, prompts: 0 }, + unmatched_queries: [], + pending_proposals: [], + }, + skills: [], + }), + }); + + const res = await fetch(`http://127.0.0.1:${server.port}/api/health`); + expect(res.status).toBe(200); + + const body: HealthResponse = await res.json(); + + // Original fields + expect(body.ok).toBe(true); + expect(body.service).toBe("selftune-dashboard"); + expect(typeof body.version).toBe("string"); + expect(typeof body.spa).toBe("boolean"); + expect(typeof body.v2_data_available).toBe("boolean"); + + // New runtime identity fields + expect(typeof body.workspace_root).toBe("string"); + 
expect(body.workspace_root).toBeTruthy(); + + expect(typeof body.git_sha).toBe("string"); + + expect(typeof body.db_path).toBe("string"); + + expect(typeof body.log_dir).toBe("string"); + expect(typeof body.config_dir).toBe("string"); + + expect(["jsonl", "none"]).toContain(body.watcher_mode); + expect(body.process_mode).toBe("test"); + + expect(body.host).toBe("127.0.0.1"); + expect(typeof body.port).toBe("number"); + expect(body.port).toBeGreaterThan(0); + }); +}); diff --git a/tests/trust-floor/hermetic-store.test.ts b/tests/trust-floor/hermetic-store.test.ts new file mode 100644 index 00000000..f9fde1cb --- /dev/null +++ b/tests/trust-floor/hermetic-store.test.ts @@ -0,0 +1,102 @@ +/** + * Tests that SELFTUNE_HOME redirects all derived paths correctly. + * + * Because constants.ts evaluates at import time, we must spawn a + * subprocess with the env vars set rather than mutating process.env + * after import. + */ + +import { afterAll, beforeAll, describe, expect, it } from "bun:test"; +import { createIsolatedStore, type IsolatedStore } from "../helpers/isolated-store.js"; + +let store: IsolatedStore; + +beforeAll(() => { + store = createIsolatedStore(); +}); + +afterAll(() => { + store.cleanup(); +}); + +describe("SELFTUNE_HOME environment override", () => { + it("redirects config, log, claude, and openclaw paths via subprocess", async () => { + // We run a small inline script that imports constants and prints them. + // This ensures the env vars are set BEFORE the module evaluates. 
+ const script = ` + const c = await import("./cli/selftune/constants.js"); + console.log(JSON.stringify({ + configDir: c.SELFTUNE_CONFIG_DIR, + logDir: c.LOG_DIR, + telemetryLog: c.TELEMETRY_LOG, + configPath: c.SELFTUNE_CONFIG_PATH, + claudeSettingsPath: c.CLAUDE_SETTINGS_PATH, + claudeProjectsDir: c.CLAUDE_CODE_PROJECTS_DIR, + claudeMarker: c.CLAUDE_CODE_MARKER, + codexMarker: c.CODEX_INGEST_MARKER, + opencodeMarker: c.OPENCODE_INGEST_MARKER, + openclawAgentsDir: c.OPENCLAW_AGENTS_DIR, + })); + `; + + const cleanEnv = { ...process.env }; + delete cleanEnv.SELFTUNE_CONFIG_DIR; + delete cleanEnv.SELFTUNE_LOG_DIR; + cleanEnv.SELFTUNE_HOME = store.root; + + const result = Bun.spawnSync(["bun", "-e", script], { + env: cleanEnv, + cwd: process.cwd(), + }); + + if (result.exitCode !== 0) { + throw new Error(`Subprocess failed: ${result.stderr.toString()}`); + } + + const stdout = result.stdout.toString().trim(); + expect(stdout.length).toBeGreaterThan(0); + + const paths = JSON.parse(stdout); + expect(paths.configDir).toBe(`${store.root}/.selftune`); + expect(paths.logDir).toBe(`${store.root}/.claude`); + expect(paths.telemetryLog).toContain(`${store.root}/.claude/`); + expect(paths.configPath).toContain(`${store.root}/.selftune/`); + expect(paths.claudeSettingsPath).toBe(`${store.root}/.claude/settings.json`); + expect(paths.claudeProjectsDir).toBe(`${store.root}/.claude/projects`); + expect(paths.claudeMarker).toBe(`${store.root}/.claude/claude_code_ingested_sessions.json`); + expect(paths.codexMarker).toBe(`${store.root}/.claude/codex_ingested_rollouts.json`); + expect(paths.opencodeMarker).toBe(`${store.root}/.claude/opencode_ingested_sessions.json`); + expect(paths.openclawAgentsDir).toBe(`${store.root}/.openclaw/agents`); + }); + + it("specific overrides take precedence over SELFTUNE_HOME", async () => { + const script = ` + const c = await import("./cli/selftune/constants.js"); + console.log(JSON.stringify({ + configDir: c.SELFTUNE_CONFIG_DIR, + logDir: 
c.LOG_DIR, + })); + `; + + const customConfig = `${store.root}/custom-config`; + const customLog = `${store.root}/custom-log`; + + const result = Bun.spawnSync(["bun", "-e", script], { + env: { + ...process.env, + SELFTUNE_HOME: "/should/be/ignored", + SELFTUNE_CONFIG_DIR: customConfig, + SELFTUNE_LOG_DIR: customLog, + }, + cwd: process.cwd(), + }); + + if (result.exitCode !== 0) { + throw new Error(`Subprocess failed: ${result.stderr.toString()}`); + } + + const paths = JSON.parse(result.stdout.toString().trim()); + expect(paths.configDir).toBe(customConfig); + expect(paths.logDir).toBe(customLog); + }); +}); diff --git a/tests/trust-floor/rebuild-preflight.test.ts b/tests/trust-floor/rebuild-preflight.test.ts new file mode 100644 index 00000000..b2324d64 --- /dev/null +++ b/tests/trust-floor/rebuild-preflight.test.ts @@ -0,0 +1,144 @@ +/** + * Tests for the rebuild preflight guard in materializeFull. + * + * Verifies that materializeFull throws when SQLite has rows newer than + * the corresponding JSONL file, unless `force` is set. 
+ */ + +import { afterEach, describe, expect, it } from "bun:test"; +import { mkdtempSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { openDb } from "../../cli/selftune/localdb/db.js"; +import { materializeFull } from "../../cli/selftune/localdb/materialize.js"; + +function makeTempDir(): string { + const dir = mkdtempSync(join(tmpdir(), "selftune-preflight-")); + return dir; +} + +describe("rebuild preflight guard", () => { + const cleanups: Array<() => void> = []; + + afterEach(() => { + for (const fn of cleanups) { + try { + fn(); + } catch { + /* best-effort */ + } + } + cleanups.length = 0; + }); + + it("throws when SQLite has newer evolution_audit rows than JSONL", () => { + const tmp = makeTempDir(); + cleanups.push(() => rmSync(tmp, { recursive: true, force: true })); + + // Create empty JSONL files + const auditLog = join(tmp, "evolution_audit_log.jsonl"); + const evidenceLog = join(tmp, "evolution_evidence_log.jsonl"); + const orchestrateLog = join(tmp, "orchestrate_runs.jsonl"); + const telemetryLog = join(tmp, "session_telemetry_log.jsonl"); + const canonicalLog = join(tmp, "canonical_telemetry_log.jsonl"); + + writeFileSync(auditLog, ""); + writeFileSync(evidenceLog, ""); + writeFileSync(orchestrateLog, ""); + writeFileSync(telemetryLog, ""); + writeFileSync(canonicalLog, ""); + + // Create in-memory DB and insert a row into evolution_audit + const db = openDb(":memory:"); + + db.run( + `INSERT INTO evolution_audit (timestamp, proposal_id, skill_name, action, details) + VALUES (?, ?, ?, ?, ?)`, + ["2026-03-18T12:00:00Z", "prop-1", "test-skill", "created", "test details"], + ); + + // materializeFull should throw because SQLite has data JSONL doesn't + expect(() => + materializeFull(db, { + evolutionAuditPath: auditLog, + evolutionEvidencePath: evidenceLog, + orchestrateRunLogPath: orchestrateLog, + telemetryLogPath: telemetryLog, + canonicalLogPath: canonicalLog, + }), + 
).toThrow(/Rebuild blocked/); + + db.close(); + }); + + it("allows rebuild when force is set", () => { + const tmp = makeTempDir(); + cleanups.push(() => rmSync(tmp, { recursive: true, force: true })); + + const auditLog = join(tmp, "evolution_audit_log.jsonl"); + const evidenceLog = join(tmp, "evolution_evidence_log.jsonl"); + const orchestrateLog = join(tmp, "orchestrate_runs.jsonl"); + const telemetryLog = join(tmp, "session_telemetry_log.jsonl"); + const canonicalLog = join(tmp, "canonical_telemetry_log.jsonl"); + + writeFileSync(auditLog, ""); + writeFileSync(evidenceLog, ""); + writeFileSync(orchestrateLog, ""); + writeFileSync(telemetryLog, ""); + writeFileSync(canonicalLog, ""); + + const db = openDb(":memory:"); + + db.run( + `INSERT INTO evolution_audit (timestamp, proposal_id, skill_name, action, details) + VALUES (?, ?, ?, ?, ?)`, + ["2026-03-18T12:00:00Z", "prop-1", "test-skill", "created", "test details"], + ); + + // Should NOT throw with force: true + expect(() => + materializeFull(db, { + force: true, + evolutionAuditPath: auditLog, + evolutionEvidencePath: evidenceLog, + orchestrateRunLogPath: orchestrateLog, + telemetryLogPath: telemetryLog, + canonicalLogPath: canonicalLog, + }), + ).not.toThrow(); + + db.close(); + }); + + it("allows rebuild when SQLite tables are empty", () => { + const tmp = makeTempDir(); + cleanups.push(() => rmSync(tmp, { recursive: true, force: true })); + + const auditLog = join(tmp, "evolution_audit_log.jsonl"); + const evidenceLog = join(tmp, "evolution_evidence_log.jsonl"); + const orchestrateLog = join(tmp, "orchestrate_runs.jsonl"); + const telemetryLog = join(tmp, "session_telemetry_log.jsonl"); + const canonicalLog = join(tmp, "canonical_telemetry_log.jsonl"); + + writeFileSync(auditLog, ""); + writeFileSync(evidenceLog, ""); + writeFileSync(orchestrateLog, ""); + writeFileSync(telemetryLog, ""); + writeFileSync(canonicalLog, ""); + + const db = openDb(":memory:"); + + // No rows in any table — should not throw + 
expect(() => + materializeFull(db, { + evolutionAuditPath: auditLog, + evolutionEvidencePath: evidenceLog, + orchestrateRunLogPath: orchestrateLog, + telemetryLogPath: telemetryLog, + canonicalLogPath: canonicalLog, + }), + ).not.toThrow(); + + db.close(); + }); +});