diff --git a/.github/workflows/deploy-docs.yml b/.github/workflows/deploy-docs.yml
index a274458..22dda15 100644
--- a/.github/workflows/deploy-docs.yml
+++ b/.github/workflows/deploy-docs.yml
@@ -4,8 +4,12 @@ on:
push:
branches: [main]
paths:
- - "docs/**"
- workflow_dispatch:
+ - 'docs/**'
+ - '.github/workflows/deploy-docs.yml'
+ pull_request:
+ paths:
+ - 'docs/**'
+ - '.github/workflows/deploy-docs.yml'
jobs:
deploy:
@@ -13,30 +17,40 @@ jobs:
permissions:
contents: read
deployments: write
+ pull-requests: write
steps:
- uses: actions/checkout@v4
- - uses: pnpm/action-setup@v4
- with:
- version: 9
-
- uses: actions/setup-node@v4
with:
- node-version: "22"
- cache: pnpm
- cache-dependency-path: docs/pnpm-lock.yaml
+ node-version: 22
+ cache: npm
- name: Install dependencies
- working-directory: docs
- run: pnpm install --frozen-lockfile
+      run: cd docs && npm ci
- - name: Build
- working-directory: docs
- run: pnpm run build
+ - name: Build docs
+ run: cd docs && npm run build
- name: Deploy to Cloudflare Pages
+ id: deploy
uses: cloudflare/wrangler-action@v3
with:
apiToken: ${{ secrets.CLOUDFLARE_API_TOKEN }}
accountId: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }}
- command: pages deploy docs/build --project-name=attache-docs
+ command: pages deploy docs/build --project-name=attache-docs --branch=${{ github.head_ref || github.ref_name }} --commit-dirty=true
+
+ - name: Comment preview URL on PR
+ if: github.event_name == 'pull_request'
+ uses: actions/github-script@v7
+ with:
+ script: |
+ const url = '${{ steps.deploy.outputs.deployment-url }}';
+ if (url) {
+            await github.rest.issues.createComment({
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ issue_number: context.issue.number,
+ body: `📚 Docs preview: ${url}`
+ });
+ }
diff --git a/docs/docs/architecture/agent-orchestration.md b/docs/docs/architecture/agent-orchestration.md
new file mode 100644
index 0000000..03036ef
--- /dev/null
+++ b/docs/docs/architecture/agent-orchestration.md
@@ -0,0 +1,63 @@
+---
+sidebar_label: Agent Orchestration
+sidebar_position: 2
+---
+
+# Agent Orchestration
+
+Evie Platform's orchestration layer is runtime-agnostic. Any agent that can read files, write files, and commit to Git works as a participant. The coordination layer is the Git repository itself -- not a proprietary protocol, not a message queue, not a shared database.
+
+## Runtime-Agnostic Delegation
+
+The orchestrating agent (typically OpenClaw) delegates tasks to coding agents without coupling to a specific runtime. Claude Code, Codex, Gemini CLI, Aider, or a local model running via Ollama -- all participate through the same interface:
+
+1. The orchestrator writes a brief (a markdown file describing the task, context, and constraints)
+2. The coding agent picks up the brief, does the work, and commits the result
+3. The orchestrator reads the commit, evaluates the output, and decides what's next
+
+This works because the contract is files and Git, not an API. A coding agent doesn't need a plugin, SDK, or integration to participate. It needs a shell, a file system, and Git.
+
+The `evie-orchestrate` bounded context manages delegation lifecycle: brief generation, session launch, progress monitoring, and result collection. It's the switchboard, not the worker.
+
+## Git as Coordination Layer
+
+Git is the universal coordination layer because every coding agent already speaks it. Branches isolate parallel work. Commits provide atomic checkpoints. Diffs show exactly what changed. Merge conflicts surface when two agents touch the same code.
+
+The orchestrator uses Git worktrees to give each coding agent an isolated copy of the repository. Agents work in parallel on separate branches without stepping on each other. When work completes, the orchestrator evaluates the branch and decides whether to merge, request changes, or discard.
+
+This pattern scales to any number of concurrent agents. The coordination overhead is Git's merge machinery, which handles the hard problems (conflict detection, three-way merge, history linearization) that a custom protocol would need to reimplement.
+
+## Multi-Model Evaluation
+
+When output quality matters more than speed, the orchestrator runs blind parallel evaluation from two to three different model providers. See [Blind Multi-Model Evaluation](./design-decisions/blind-multi-model) for the full design decision.
+
+In practice, this applies to:
+
+- **Code review** -- send the same diff to Claude, GPT, and Gemini. Disagreements surface real issues that single-model review would miss.
+- **Research synthesis** -- three models independently summarize source material. Overlapping conclusions are high-confidence; divergent conclusions need human judgment.
+- **Risk assessment** -- independent security evaluations of a proposed change. Unanimous "safe" is a stronger signal than one model saying "safe."
+
+The orchestrator collects all evaluations before synthesizing a result. Models don't see each other's assessments. This eliminates anchoring and herding biases.
+
+## Local Reflection
+
+Quantized local models (Llama 3, Mistral, Phi) running on Apple Silicon handle a specific class of work: extracting procedural knowledge from session logs.
+
+After a coding session, the local model reads the session transcript and extracts patterns: "When reviewing TypeScript, always check for unhandled promise rejections." "This codebase uses barrel exports -- follow that convention." These observations become candidate entries for procedural memory (SKILL.md updates or new skills).
+
+Local reflection runs on-device with no API calls. The session logs -- which may contain sensitive code, credentials references, or internal discussion -- never leave the machine. The local model's output is lower quality than a frontier model, but the privacy tradeoff is worth it for this use case.
+
+The Dream Cycle (overnight consolidation) uses a mix of local and API-based models depending on the task. Privacy-sensitive consolidation runs locally; quality-critical synthesis uses frontier models.
+
+## Tmux as Session Management
+
+Each coding agent runs in a tmux session. Tmux provides the session lifecycle that agent orchestration needs:
+
+- **Named sessions** -- `evie-cc-auth-rewrite`, `evie-codex-review-42`. Find any agent's work by name.
+- **Detached execution** -- agents run in the background. The orchestrator launches a session and checks back later.
+- **Output capture** -- tmux's scrollback buffer captures the full session transcript for post-hoc analysis and local reflection.
+- **Multiplexing** -- multiple agents run concurrently in separate sessions on the same machine.
+
+The dispatch harness creates a tmux session, injects the brief, starts the coding agent, and monitors for completion. When the agent finishes (detected by process exit or a sentinel file), the harness collects the results and notifies the orchestrator.
+
+This is intentionally low-tech. Tmux is battle-tested, available on every Unix system, and requires zero infrastructure. The alternative -- a custom daemon managing agent processes -- would add complexity without meaningful benefit.
diff --git a/docs/docs/architecture/design-decisions/_category_.json b/docs/docs/architecture/design-decisions/_category_.json
new file mode 100644
index 0000000..355db22
--- /dev/null
+++ b/docs/docs/architecture/design-decisions/_category_.json
@@ -0,0 +1,6 @@
+{
+ "label": "Design Decisions",
+ "position": 4,
+ "collapsible": true,
+ "collapsed": false
+}
diff --git a/docs/docs/architecture/design-decisions/blind-multi-model.md b/docs/docs/architecture/design-decisions/blind-multi-model.md
new file mode 100644
index 0000000..e9fb076
--- /dev/null
+++ b/docs/docs/architecture/design-decisions/blind-multi-model.md
@@ -0,0 +1,34 @@
+---
+sidebar_label: Blind Multi-Model Evaluation
+---
+
+# Blind Multi-Model Evaluation
+
+## Problem
+
+When an AI agent evaluates its own output, it has a systematic bias toward confirming its own work. A Claude-based agent reviewing Claude-generated code will find fewer issues than an independent reviewer would. Self-evaluation is better than nothing, but it creates a ceiling on quality assurance.
+
+## Options Considered
+
+1. **Single-model self-evaluation** -- the same model that generates output also reviews it
+2. **Human review for everything** -- highest quality, doesn't scale
+3. **Blind parallel evaluation** -- send the same prompt to two or three different models, compare results without revealing which model produced what
+
+## Decision
+
+Blind parallel evaluation from two to three different model providers (Anthropic, OpenAI, Google). The evaluating models don't know which model produced the original output or which models are co-evaluating.
+
+"Blind" means two things: the evaluating model doesn't know the identity of the model that produced the work, and evaluating models don't see each other's assessments until all responses are collected. This eliminates anchoring bias (where a reviewer defers to a known-good model) and herding (where later reviewers converge on early assessments).
+
+The orchestrator collects all evaluations, then synthesizes a final assessment. Disagreements between models are flagged for human review rather than auto-resolved. When two out of three models agree on an issue, it's likely real. When all three disagree, the uncertainty itself is the signal.
+
+This pattern applies to code review, research synthesis, risk assessment, and any task where confidence in the output matters more than speed.
+
+## Tradeoffs
+
+- **Won.** Eliminates self-evaluation bias. Independent models catch different classes of errors.
+- **Won.** Disagreement detection surfaces genuine ambiguity that single-model evaluation would miss.
+- **Won.** Model-agnostic by design. Swap providers without changing behavior. If one provider has an outage, the system degrades to two-model evaluation rather than failing.
+- **Lost.** Two to three times the API cost per evaluation. Acceptable for high-stakes operations (code review, security assessment), expensive for routine tasks.
+- **Lost.** Latency increases -- you wait for the slowest model to respond. Mitigated by parallel execution, but still slower than single-model.
+- **Lost.** Synthesizing disagreements is a hard problem. The orchestrator's merge logic is itself a source of potential error.
diff --git a/docs/docs/architecture/design-decisions/index.md b/docs/docs/architecture/design-decisions/index.md
new file mode 100644
index 0000000..0185719
--- /dev/null
+++ b/docs/docs/architecture/design-decisions/index.md
@@ -0,0 +1,20 @@
+---
+sidebar_label: Design Decisions
+sidebar_position: 1
+---
+
+# Design Decisions
+
+Each page in this section documents a key architectural choice using a consistent format: the problem, the options considered, the decision, and the tradeoffs.
+
+These are not retrospective justifications. They capture the reasoning at the time the decision was made, so future contributors understand the constraints and can revisit decisions when the constraints change.
+
+## Decisions
+
+- **[Why Postgres](./why-postgres)** -- single database with JSONB, pgvector, ParadeDB, TimescaleDB and pg_trgm. Graph traversal via recursive CTEs.
+- **[Why Mac Mini](./why-mac-mini)** -- per-user dedicated hardware, Apple Silicon for local inference, physical data sovereignty.
+- **[Why Bun](./why-bun)** -- one language ecosystem, native TypeScript, Python only as escape hatch.
+- **[Why Discord](./why-discord)** -- named threads, auto-hide, personal server model. Slack and others come later.
+- **[Why Not Neo4j](./why-not-neo4j)** -- heavyweight Java dependency, CTEs outperform AGE by 40x for our query patterns.
+- **[Why Local First](./why-local-first)** -- progressive trust, no cloud dependency for core ops, network for enrichment only.
+- **[Blind Multi-Model Evaluation](./blind-multi-model)** -- parallel eval from two to three models eliminates self-evaluation bias.
diff --git a/docs/docs/architecture/design-decisions/why-bun.md b/docs/docs/architecture/design-decisions/why-bun.md
new file mode 100644
index 0000000..622dd3b
--- /dev/null
+++ b/docs/docs/architecture/design-decisions/why-bun.md
@@ -0,0 +1,32 @@
+---
+sidebar_label: Why Bun
+---
+
+# Why Bun
+
+## Problem
+
+Evie Platform scripts, skills, and tooling need a runtime. OpenClaw itself runs on Node.js. The question is whether to standardize on Node, adopt Bun, or split between TypeScript and Python.
+
+## Options Considered
+
+1. **Node.js only** -- the runtime OpenClaw already uses
+2. **Bun** -- binary drop-in replacement for Node with built-in TypeScript, bundler, and package manager
+3. **Python for tooling, Node for runtime** -- common in AI/ML ecosystems
+4. **Mixed Bun + Python** -- Bun as primary, Python as escape hatch
+
+## Decision
+
+Bun as the primary runtime for all Evie Platform scripts, skills, and tooling. Python only as an escape hatch for workloads that have no TypeScript equivalent (e.g., OpenCV keyframe extraction from video).
+
+One language ecosystem eliminates the dev-tool drift that comes from maintaining both `pyproject.toml` and `package.json`, both `pip` and `npm`, both `ruff` and `eslint`. Bun is a binary drop-in: it runs TypeScript natively, bundles without a separate tool, and manages packages faster than npm.
+
+Bun's built-in test runner, HTTP server, and file I/O APIs reduce the dependency count for common operations. A skill script that needs to make HTTP calls and parse JSON doesn't need axios or node-fetch.
+
+## Tradeoffs
+
+- **Won.** Single language ecosystem for the entire platform. Every contributor needs to know TypeScript, not TypeScript and Python.
+- **Won.** Native TypeScript execution -- no compile step, no tsconfig complexity for scripts.
+- **Won.** Faster package installs and script startup vs. Node.
+- **Lost.** Bun's Node.js compatibility is not 100%. Some npm packages with native addons or Node-specific APIs may not work. Mitigated by falling back to Node for those cases.
+- **Lost.** Python's ML/AI library ecosystem is unmatched. The escape hatch exists because some tasks (video processing, specialized ML inference) have no viable TypeScript alternative.
diff --git a/docs/docs/architecture/design-decisions/why-discord.md b/docs/docs/architecture/design-decisions/why-discord.md
new file mode 100644
index 0000000..ba191fa
--- /dev/null
+++ b/docs/docs/architecture/design-decisions/why-discord.md
@@ -0,0 +1,36 @@
+---
+sidebar_label: Why Discord
+---
+
+# Why Discord
+
+## Problem
+
+Evie Platform agents need a messaging surface for human-agent interaction: approval prompts, status updates, conversational commands, and trust-tier escalations. The channel needs to support structured conversations that stay organized over time.
+
+## Options Considered
+
+1. **Slack** -- dominant in enterprise, rich API, but threads are unnamed and don't auto-hide
+2. **Discord** -- named threads, auto-hide for inactive threads, strong bot API
+3. **Telegram** -- lightweight, good bot API, limited thread support
+4. **Signal** -- privacy-first, minimal bot support
+5. **Custom web UI** -- full control, high development cost
+
+## Decision
+
+Discord as the V1 messaging channel. Slack, Telegram, and Signal come later as additional surfaces.
+
+Discord's named threads solve a real organizational problem. When your agent opens a thread called "PR Review: auth-middleware-rewrite," you can find it by name, archive it, and come back to it. Slack threads are unnamed replies to a message -- they disappear into the scroll. For an agent that opens dozens of threads per day, the naming matters.
+
+Inactive threads auto-hide after a configurable period. Your channel stays clean without manual archiving. Active conversations surface; finished ones fade.
+
+The personal server model (one Discord server per agent) creates a defensible 1:1 space. Your agent's server is yours. No shared workspace admins, no IT policies restricting bot permissions, no enterprise licensing.
+
+## Tradeoffs
+
+- **Won.** Named threads keep agent conversations organized and searchable.
+- **Won.** Auto-hide prevents channel clutter from resolved conversations.
+- **Won.** Personal server model -- no dependency on organizational Slack admin permissions.
+- **Lost.** Enterprise teams already on Slack face friction adopting a second messaging tool. The Slack integration is planned but not yet built.
+- **Lost.** Discord's reputation as a "gaming platform" can create perception issues in professional contexts.
+- **Lost.** No built-in email integration. Slack Connect bridges to external parties; Discord doesn't.
diff --git a/docs/docs/architecture/design-decisions/why-local-first.md b/docs/docs/architecture/design-decisions/why-local-first.md
new file mode 100644
index 0000000..ec5ace6
--- /dev/null
+++ b/docs/docs/architecture/design-decisions/why-local-first.md
@@ -0,0 +1,32 @@
+---
+sidebar_label: Why Local First
+---
+
+# Why Local First
+
+## Problem
+
+AI agent platforms face a fundamental tension: cloud services offer convenience and scale, but they require sending your data to someone else's infrastructure. For a personal agent with access to your email, calendar, credentials, and file system, the data sovereignty question is not abstract.
+
+## Options Considered
+
+1. **Cloud-hosted** -- agent runs on managed infrastructure (AWS, GCP, or a SaaS platform)
+2. **Hybrid** -- agent runs locally, memory and state stored in cloud
+3. **Local-first** -- everything runs on your hardware, network used only for enrichment
+
+## Decision
+
+Local-first architecture with a progressive trust model. The agent sees your data locally. No cloud dependency for core operations. Network connectivity is used for enrichment only: LLM API calls, web fetches, integration syncs.
+
+The progressive trust model means the agent starts with no network access to external services and gains it incrementally as you configure integrations. Your Postgres instance, memory files, knowledge graph, and activity log all live on your Mac. If you disconnect from the internet, the agent still works -- it just can't call LLM APIs or sync external services.
+
+This design aligns with the dedicated Mac mini model. The hardware is yours. The data is yours. The agent process runs under a restricted OS user on hardware you physically control.
+
+## Tradeoffs
+
+- **Won.** Data sovereignty -- your conversations, credentials, and knowledge graph never leave your hardware unless you explicitly configure an integration to sync them.
+- **Won.** Latency -- local Postgres queries are faster than round-trips to a cloud database. Memory retrieval and knowledge graph lookups run in single-digit milliseconds.
+- **Won.** Availability -- core agent functionality works offline. No dependency on cloud uptime for local operations.
+- **Lost.** No automatic backups without configuration. Cloud-hosted solutions handle this by default. You need to set up your own backup strategy (Time Machine, rsync, or Restic).
+- **Lost.** No multi-device sync out of the box. Your agent's state lives on one machine. Accessing it from elsewhere requires Tailscale or similar remote access.
+- **Lost.** Compute is bounded by your hardware. Cloud solutions can scale up for heavy workloads. A Mac mini has fixed CPU, memory, and storage.
diff --git a/docs/docs/architecture/design-decisions/why-mac-mini.md b/docs/docs/architecture/design-decisions/why-mac-mini.md
new file mode 100644
index 0000000..30889e8
--- /dev/null
+++ b/docs/docs/architecture/design-decisions/why-mac-mini.md
@@ -0,0 +1,33 @@
+---
+sidebar_label: Why Mac Mini
+---
+
+# Why Mac Mini
+
+## Problem
+
+Every Evie Platform agent needs dedicated compute. The agent runs Docker containers, a Postgres instance, local inference models, and the OpenClaw gateway. It needs to be always-on, physically isolated from your primary workstation, and powerful enough for real-time work.
+
+## Options Considered
+
+1. **Cloud VM** -- AWS EC2, GCP, or a VPS provider
+2. **Linux mini PC** -- Intel NUC or similar
+3. **Mac mini with Apple Silicon** -- M4 now, M5 when available (May 2026)
+
+## Decision
+
+Dedicated Mac mini per agent, 512 GB SSD minimum. M4 for current deployments, upgrading to M5 when it ships.
+
+The model is the same as Vision Pro: the device is yours. Your agent runs on your hardware, on your desk or in your closet, under your physical control.
+
+Apple Silicon provides the unified memory architecture that makes local inference practical. A Mac mini with 24 GB unified memory can run quantized models (Llama 3, Mistral, Phi) for local reflection tasks without a discrete GPU. The Neural Engine accelerates inference workloads that would require an expensive GPU on x86.
+
+macOS is the native target for OpenClaw. The gateway, tools, and ecosystem assume macOS or Linux -- and macOS has better support for the desktop integration patterns Evie Platform uses (launchd agents, Keychain Access, Shortcuts).
+
+## Tradeoffs
+
+- **Won.** Physical sovereignty -- no cloud provider can access, throttle, or terminate your agent. Data never leaves the box unless the agent explicitly sends it.
+- **Won.** Local inference capability -- Apple Silicon's unified memory makes running quantized models practical without a separate GPU budget.
+- **Lost.** Higher upfront cost than a cloud VM (though break-even is typically three to five months vs. a comparable EC2 instance).
+- **Lost.** macOS-specific. Teams running Linux infrastructure need the [macOS vs. Linux](../macos-vs-linux.md) guide to evaluate the gap.
+- **Lost.** Single point of failure without redundancy planning. A dead Mac mini means a dead agent until you replace it.
diff --git a/docs/docs/architecture/design-decisions/why-not-neo4j.md b/docs/docs/architecture/design-decisions/why-not-neo4j.md
new file mode 100644
index 0000000..e6d813f
--- /dev/null
+++ b/docs/docs/architecture/design-decisions/why-not-neo4j.md
@@ -0,0 +1,33 @@
+---
+sidebar_label: Why Not Neo4j
+---
+
+# Why Not Neo4j
+
+## Problem
+
+Evie Platform's memory system stores entity relations (people linked to organizations, projects linked to repositories, meetings linked to participants). Graph databases are the conventional tool for this. Neo4j is the most prominent option.
+
+## Options Considered
+
+1. **Neo4j** -- purpose-built graph database with Cypher query language
+2. **Apache AGE** -- Postgres extension that adds openCypher support
+3. **Recursive CTEs in Postgres** -- native SQL graph traversal, no extensions needed
+
+## Decision
+
+Recursive CTEs in standard Postgres. No Neo4j, no Apache AGE.
+
+Neo4j is a heavyweight dependency. It's a Java application that needs its own JVM, its own memory allocation, its own backup strategy, and its own monitoring. For a per-agent platform that already runs Postgres with four extensions queued (pgvector, ParadeDB, TimescaleDB, pg_trgm), adding a separate database server for graph queries is hard to justify.
+
+Apache AGE was evaluated as a lighter alternative -- it adds Cypher support to Postgres. But benchmarks showed recursive CTEs outperform AGE by roughly 40x for Evie Platform's typical graph patterns (two to three hops through entity relations). AGE adds a custom query language and catalog overhead without a performance benefit at this scale.
+
+Recursive CTEs handle the graph traversal patterns Evie Platform needs: "find all people connected to this organization," "trace the meeting chain for this project," "show me everything two hops from this entity." The queries are verbose compared to Cypher but they run on the same Postgres instance that handles everything else.
+
+## Tradeoffs
+
+- **Won.** No additional database to operate, monitor, back up, or patch. One Postgres instance handles relational, vector, full-text, time-series, and graph queries.
+- **Won.** 40x faster than AGE for typical two to three hop traversals.
+- **Won.** No Java dependency. Evie Platform's runtime stack stays TypeScript + Postgres.
+- **Lost.** Cypher is more expressive than recursive CTEs for complex graph patterns. If query complexity grows beyond three to four hops, CTEs become unwieldy.
+- **Lost.** No visual graph explorer out of the box. Neo4j's browser is genuinely useful for exploring relationships. Evie Platform would need a custom visualization if that becomes a need.
diff --git a/docs/docs/architecture/design-decisions/why-postgres.md b/docs/docs/architecture/design-decisions/why-postgres.md
new file mode 100644
index 0000000..d17da04
--- /dev/null
+++ b/docs/docs/architecture/design-decisions/why-postgres.md
@@ -0,0 +1,31 @@
+---
+sidebar_label: Why Postgres
+---
+
+# Why Postgres
+
+## Problem
+
+Evie Platform needs search (vector, full-text, fuzzy), time-series storage, graph traversal, and relational data. The typical answer is to run separate databases for each concern: Pinecone for vectors, Elasticsearch for full-text, Neo4j for graphs, InfluxDB for time-series.
+
+## Options Considered
+
+1. **Polyglot persistence** -- separate databases per concern (Pinecone + Elasticsearch + Neo4j + InfluxDB + Postgres)
+2. **Postgres with extensions** -- single database using pgvector, ParadeDB (BM25), TimescaleDB and pg_trgm
+3. **SQLite with extensions** -- embedded database, simpler ops but weaker extension ecosystem
+
+## Decision
+
+Postgres with four extensions on a single instance, deployed via Supabase.
+
+**JSONB** handles semi-structured data (activity logs, entity metadata) without separate document stores. **pgvector** provides cosine-distance similarity search over embeddings. **ParadeDB** adds BM25-ranked full-text search. **pg_trgm** handles fuzzy matching for typos and partial names. **TimescaleDB** powers time-series queries for the activity log and memory vitality decay.
+
+Graph traversal uses recursive CTEs rather than a graph database. Benchmarks show CTEs run 40x faster than Apache AGE for our query patterns (two to three hops through entity relations). AGE adds a custom query language and a maintenance burden for marginal benefit at our scale.
+
+Supabase wraps Postgres with Auth, Realtime subscriptions, a management dashboard, and managed migrations. One `docker compose up` gives you the full stack.
+
+## Tradeoffs
+
+- **Won.** One backup strategy, one connection pool, one set of migrations. Operational simplicity at the cost of Postgres expertise being a hard requirement.
+- **Lost.** Dedicated vector databases like Pinecone optimize for billion-scale similarity search. Evie Platform's per-agent corpus is small enough that pgvector performs well, but this decision would need revisiting at enterprise scale.
+- **Lost.** No native graph query language. CTEs are powerful but verbose compared to Cypher. Acceptable because graph queries are a small fraction of total queries.
diff --git a/docs/docs/architecture/index.md b/docs/docs/architecture/index.md
index 8aa1621..11531ed 100644
--- a/docs/docs/architecture/index.md
+++ b/docs/docs/architecture/index.md
@@ -8,13 +8,15 @@ Evie Platform turns OpenClaw from a generic agent runtime into a personal AI age
## Platform Overview
-
+import ImageLightbox from '@site/src/components/ImageLightbox';
+
+
OpenClaw provides the agent runtime: a gateway daemon, message routing across channels, tool execution, and the SOUL.md personality system. Evie Platform adds five subsystems on top:
- **Reasoning and Orchestration** -- Ego (identity governance), blind multi-model evaluation (swap between Anthropic, OpenAI, Google without changing behavior), and CC/Codex Dispatch for bidirectional agent-to-CLI orchestration via tmux
-
+
- **Memory System** -- A [five-layer model](/memory/) (episodic, identity, topical, procedural, artifact) built on an activity log foundation, with hybrid search and overnight Dream Cycle consolidation
- **Secrets and Security** -- [Progressive trust](/security/), agent-blind credential injection, four-tier risk model, and leak detection
- **Skills and Extensibility** -- [SKILL.md convention](/specifications/skill-manifests), security-first custom skills, MCP bridge via mcporter
@@ -122,3 +124,8 @@ Secure tunneling is required for every Evie Platform deployment. Agent machines
**Tailscale was chosen for practical reasons.** Zero-config mesh networking means agent machines are reachable by hostname without port forwarding or dynamic DNS. MagicDNS gives you `agent-mac.tailnet.ts.net` out of the box. ACLs control who can reach the machine and which ports are open. And Tailscale Serve/Funnel lets you expose specific services without touching the firewall.
**Future tunnel providers** (Cloudflare Tunnel, WireGuard) can be added as alternatives, but every deployment must have at least one configured via `backends.tunnel` in `evie.config.json`.
+
+## Further Reading
+
+- **[Design Decisions](./design-decisions/)** -- the reasoning behind key architectural choices: why Postgres, why Mac mini, why Bun, why Discord, why not Neo4j, why local-first, and blind multi-model evaluation.
+- **[Agent Orchestration](./agent-orchestration)** -- runtime-agnostic agent delegation, Git as coordination layer, and multi-model evaluation.
diff --git a/docs/docs/intro.md b/docs/docs/intro.md
index 35160da..46e0685 100644
--- a/docs/docs/intro.md
+++ b/docs/docs/intro.md
@@ -22,7 +22,9 @@ Evie Platform builds on that foundation with three layers OpenClaw doesn't provi
Think of it this way: OpenClaw gives you the engine. Evie Platform gives you the car.
-
+import ImageLightbox from '@site/src/components/ImageLightbox';
+
+
## Why Evie Platform?
diff --git a/docs/docs/memory/index.md b/docs/docs/memory/index.md
index fef64a3..230b6ea 100644
--- a/docs/docs/memory/index.md
+++ b/docs/docs/memory/index.md
@@ -13,7 +13,9 @@ An Evie Platform agent wakes up fresh every session. It has no built-in memory o
The memory system gives agents continuity across sessions. It's built on an activity log foundation and organized into five layers, each handling a different recall pattern with storage optimized for how that information gets accessed.
-
+import ImageLightbox from '@site/src/components/ImageLightbox';
+
+
## The Activity Log
@@ -61,14 +63,14 @@ With five layers accumulating over months, surfacing the right memories at the r
All five layers plus the activity log converge on Supabase (Postgres) as the data backbone, though they use different access patterns.
-| Layer | Primary Storage | Access Pattern |
-|---|---|---|
-| Activity Log | Postgres (TimescaleDB) | Time-range queries + full-text search |
-| [Episodic Memory](./episodic-layer) | Markdown files | File reads + embedding search |
-| [Identity](./identity-layer) | Postgres | Lookup by identifier + fuzzy matching |
-| [Topical (Knowledge)](./knowledge-layer) | Markdown + Postgres (basic-memory) | Structured queries + semantic search |
-| Procedural | SKILL.md files | File reads + skill registry |
-| Artifact | Markdown + Postgres | Content-addressed retrieval |
+| Layer | Primary Storage | Access Pattern |
+| ---------------------------------------- | ---------------------------------- | ------------------------------------- |
+| Activity Log | Postgres (TimescaleDB) | Time-range queries + full-text search |
+| [Episodic Memory](./episodic-layer) | Markdown files | File reads + embedding search |
+| [Identity](./identity-layer) | Postgres | Lookup by identifier + fuzzy matching |
+| [Topical (Knowledge)](./knowledge-layer) | Markdown + Postgres (basic-memory) | Structured queries + semantic search |
+| Procedural | SKILL.md files | File reads + skill registry |
+| Artifact | Markdown + Postgres | Content-addressed retrieval |
### Why Postgres?
diff --git a/docs/docs/security/credential-management.md b/docs/docs/security/credential-management.md
new file mode 100644
index 0000000..b4d8a5f
--- /dev/null
+++ b/docs/docs/security/credential-management.md
@@ -0,0 +1,81 @@
+---
+sidebar_label: Credential Management
+draft: true
+---
+
+# Credential Management
+
+:::caution Draft
+This page is a stub. Full implementation details are coming.
+:::
+
+Evie Platform treats credential management as a graduated system, not a binary gate. The agent earns access incrementally, credentials stay invisible to the LLM, and high-risk operations require explicit human approval.
+
+## Four-Tier Trust Model
+
+Every credential-bearing action falls into one of four tiers, matching the [progressive trust model](/security/#progressive-trust):
+
+| Tier | Policy | Credential Behavior |
+| -------------------------- | --------------------------- | ------------------------------------------------------------------------------------- |
+| **Tier 1: Run Freely** | Read-only operations | No credentials needed, or credentials auto-injected silently |
+| **Tier 2: Pre-approved** | Reversible write operations | Credentials injected from vault via allowlist. No human prompt. |
+| **Tier 3: Approve Once** | External side effects | Agent requests credential through proxy. You approve once per session via Discord DM. |
+| **Tier 4: Always Approve** | High-consequence operations | Every use requires explicit approval. Discord yes/no prompt with context. |
+
+The tier assignment is per-credential, not per-action. Your GitHub token might be tier 2 (pre-approved for commits and PRs) while your production database password is tier 4 (always approve).
+
+## Bitwarden as Vault
+
+Evie Platform uses Bitwarden as the credential vault. 1Password was the original default and remains supported, but Bitwarden is the recommended choice going forward.
+
+**Why not HashiCorp Vault?** HashiCorp Vault is designed for enterprise secret management at scale -- dynamic secrets, lease rotation, policy engines. For a single-agent platform, it's overengineered and expensive. Bitwarden provides what Evie Platform needs: encrypted storage, CLI access, organizational vaults, and mobile push approval -- at a fraction of the cost.
+
+Credentials are organized in scoped vaults:
+
+- **Agent vault** -- credentials the agent uses for integrations (API keys, OAuth tokens)
+- **Infrastructure vault** -- credentials for platform services (Postgres passwords, Tailscale auth keys)
+- **Personal vault** -- your credentials that the agent should never access directly
+
+## Agent-Blind Injection
+
+The core security principle: **the LLM never sees secrets**. The agent process (and by extension, the LLM) never has direct access to credential values.
+
+The secrets proxy daemon sits between the agent and Bitwarden. When a skill needs an API key:
+
+1. The skill requests a credential by name through the proxy
+2. The proxy checks the credential's tier and allowlist
+3. For tier 3 and 4 credentials, the proxy sends a Discord DM asking for approval
+4. You tap yes or no from your phone
+5. The proxy injects the credential into the environment variable or HTTP header at the tool execution layer
+6. The LLM context never contains the credential value
+
+Even if the agent's context is fully compromised via prompt injection, the attacker gets credential names, not credential values.
+
+## Risk-Scoring Proxy
+
+A separate proxy service assigns risk scores to credential requests based on:
+
+- **Credential tier** -- base risk from the tier assignment
+- **Request context** -- what tool is requesting the credential and why
+- **Recency** -- recently-approved credentials decay to lower risk scores; stale approvals climb back
+- **Frequency** -- unusual access patterns (a credential requested ten times in a minute) trigger escalation
+
+The proxy runs as its own process, isolated from the agent runtime. It maintains an audit log of every credential request, approval, denial, and injection.
+
+## Discord Approval Prompts
+
+For tier 3 and 4 operations, the proxy sends a structured Discord DM:
+
+```
+🔐 Credential Request
+Skill: slack-integration
+Credential: SLACK_BOT_TOKEN
+Tier: 3 (approve once per session)
+Context: Posting status update to #general
+
+✅ Approve ❌ Deny
+```
+
+You respond with a reaction or button tap. Approved tier 3 credentials remain available for the rest of the session. Tier 4 credentials require approval every time.
+
+If you don't respond within a configurable timeout (default: five minutes), the request is denied automatically. The agent receives a "credential unavailable" response and can choose to skip the operation or ask you directly.
diff --git a/docs/docs/security/index.md b/docs/docs/security/index.md
index 6ee4edb..ace7467 100644
--- a/docs/docs/security/index.md
+++ b/docs/docs/security/index.md
@@ -28,16 +28,16 @@ OpenClaw maintains its own security documentation at [docs.openclaw.ai/gateway/s
A fresh Evie Platform deployment ships with these decisions already made:
-| Default | What it prevents |
-| -------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| **Dedicated Mac mini** | Agent runs on separate hardware. A compromise doesn't touch your primary workstation. |
-| **Dedicated `openclaw` OS user** | Agent process runs under a restricted account, not your admin user. |
+| Default | What it prevents |
+| -------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| **Dedicated Mac mini** | Agent runs on separate hardware. A compromise doesn't touch your primary workstation. |
+| **Dedicated `openclaw` OS user** | Agent process runs under a restricted account, not your admin user. |
| **Key-only SSH** | Evie Platform's setup playbook disables password authentication. Brute-force isn't viable. |
-| **Tailscale for remote access** | Gateway never touches the public internet. Every connection requires authenticated tailnet membership. |
-| **Loopback gateway binding** | Gateway listens on `127.0.0.1` only. Not reachable from LAN. Traffic goes through Tailscale or stays on localhost. |
+| **Tailscale for remote access** | Gateway never touches the public internet. Every connection requires authenticated tailnet membership. |
+| **Loopback gateway binding** | Gateway listens on `127.0.0.1` only. Not reachable from LAN. Traffic goes through Tailscale or stays on localhost. |
| **Token-based auth** | Every connection presents a token. Evie Platform does not use `trusted-proxy` mode, which has been the subject of a [critical advisory](#the-cve-and-advisory-situation). |
-| **1Password for secrets** | Credentials live in scoped 1Password vaults, not plaintext config files. |
-| **Ansible-managed setup** | Infrastructure is declarative. Drift is detectable. Re-running a playbook converges back to a known-good configuration. |
+| **1Password for secrets** | Credentials live in scoped 1Password vaults, not plaintext config files. |
+| **Ansible-managed setup** | Infrastructure is declarative. Drift is detectable. Re-running a playbook converges back to a known-good configuration. |
Most OpenClaw tutorials have you running the gateway on your laptop, exposed on all interfaces, with full exec permissions. Evie Platform exists because that's a terrible idea.
@@ -51,12 +51,12 @@ Even on day one, the agent can draft emails freely. It just can't send them with
Evie Platform encodes this progression into a **four-tier trust model**:
-| Tier | Policy | Examples |
-|---|---|---|
-| **Tier 1: Run freely** | Read-only, no side effects | Reading email, checking calendar, searching Slack, browsing repos, fetching web pages |
-| **Tier 2: Pre-approved** | Write operations within reversible boundaries | Creating git branches, committing code, opening PRs, drafting emails |
-| **Tier 3: Approve once per session** | External side effects, used regularly | Sending email, posting to Slack channels, triggering deployments |
-| **Tier 4: Always approve** | High-consequence, irreversible operations | Deleting data, modifying SSH config, accessing production DB credentials, financial transactions |
+| Tier | Policy | Examples |
+| ------------------------------------ | --------------------------------------------- | ------------------------------------------------------------------------------------------------ |
+| **Tier 1: Run freely** | Read-only, no side effects | Reading email, checking calendar, searching Slack, browsing repos, fetching web pages |
+| **Tier 2: Pre-approved** | Write operations within reversible boundaries | Creating git branches, committing code, opening PRs, drafting emails |
+| **Tier 3: Approve once per session** | External side effects, used regularly | Sending email, posting to Slack channels, triggering deployments |
+| **Tier 4: Always approve** | High-consequence, irreversible operations | Deleting data, modifying SSH config, accessing production DB credentials, financial transactions |
Tiers 1 and 2 cover the vast majority of daily agent activity. The agent only blocks on tier 3 and 4 operations, which come up far less frequently. See [Hardening: the four-tier model](./hardening.md#the-four-tier-model) for implementation details.
@@ -74,7 +74,9 @@ The [secrets proxy daemon](./hardening.md#secrets-proxy-daemon) sits between the
There is no direct interaction between the requesting agent and the credential itself. Even a fully compromised agent can only access secrets the proxy's allowlist permits. The proxy also runs **leak detection** (regex + entropy scanning) on every outbound message to catch accidental credential exposure.
-
+import ImageLightbox from '@site/src/components/ImageLightbox';
+
+
## The threat landscape
@@ -143,14 +145,14 @@ Evie Platform's security controls today cover gateway hardening, exec allowlists
**Bloom filter credential scanning** — Build a bloom filter from 1Password vault entries and scan outbound commands for potential credential exfiltration. The target is sub-5ms per command, though we haven't benchmarked at scale yet. Catches the most dangerous exfiltration pattern: an agent embedding an API key in a curl command or URL.
:::info What's a bloom filter?
-A bloom filter is a compact data structure that answers one question very fast: *"Have I seen this before?"* You feed it a set of known values (in this case, your credentials from 1Password), and it builds a bit array using multiple hash functions. Later, you can test any string against it and get one of two answers: **definitely not in the set** or **probably in the set**.
+A bloom filter is a compact data structure that answers one question very fast: _"Have I seen this before?"_ You feed it a set of known values (in this case, your credentials from 1Password), and it builds a bit array using multiple hash functions. Later, you can test any string against it and get one of two answers: **definitely not in the set** or **probably in the set**.
The key properties that make it useful here:
- **Speed.** Testing a string takes microseconds, not milliseconds. You can scan every command the agent runs without perceptible delay.
- **Size.** A bloom filter holding 1,000 credentials uses roughly 1.2 KB of memory. The full credential values never exist in the filter — only their hashed fingerprints.
- **One-way.** You can't extract the original credentials from the filter. Even if an attacker accessed the filter itself, they'd get a bit array, not your API keys.
-- **False positives, never false negatives.** The filter might occasionally flag an innocent string as a match (tunable — typically under 0.1%), but it will *never* miss a real credential. For security scanning, that's exactly the tradeoff you want.
+- **False positives, never false negatives.** The filter might occasionally flag an innocent string as a match (tunable — typically under 0.1%), but it will _never_ miss a real credential. For security scanning, that's exactly the tradeoff you want.
In practice: Evie Platform syncs your 1Password vault into a bloom filter on startup, then tests every outbound shell command and URL against it. A credential appearing in a `curl` command or git push triggers an immediate block and DM approval request — all in under 5ms.
diff --git a/docs/docs/specifications/service-architecture.md b/docs/docs/specifications/service-architecture.md
index 7485370..c308b24 100644
--- a/docs/docs/specifications/service-architecture.md
+++ b/docs/docs/specifications/service-architecture.md
@@ -1,6 +1,8 @@
# Service Architecture
-
+import ImageLightbox from '@site/src/components/ImageLightbox';
+
+
Evie Platform uses Docker Compose as its service layer. The base platform ships required services (Supabase). Skills can add optional services via their own compose files. Each skill runs its own independent compose project — no merging, no assembly.
diff --git a/docs/docs/specifications/skill-manifests.md b/docs/docs/specifications/skill-manifests.md
index 3352f42..de7c405 100644
--- a/docs/docs/specifications/skill-manifests.md
+++ b/docs/docs/specifications/skill-manifests.md
@@ -1,8 +1,10 @@
# Skill Manifests
-Skills are the primary extension point for Evie Platform agents. A skill teaches an agent *how* to do something: code review, research, deployment, whatever. Some skills are pure logic (just markdown and scripts). Others need infrastructure like Docker services, API keys, or CLI tools.
+Skills are the primary extension point for Evie Platform agents. A skill teaches an agent _how_ to do something: code review, research, deployment, whatever. Some skills are pure logic (just markdown and scripts). Others need infrastructure like Docker services, API keys, or CLI tools.
-
+import ImageLightbox from '@site/src/components/ImageLightbox';
+
+
## The SKILL.md Convention
diff --git a/docs/package.json b/docs/package.json
index 3a81fc2..fa99d44 100644
--- a/docs/package.json
+++ b/docs/package.json
@@ -21,7 +21,8 @@
"react": "^18.3.1",
"react-dom": "^18.3.1",
"rehype-katex": "^7.0.1",
- "remark-math": "^6.0.0"
+ "remark-math": "^6.0.0",
+ "yet-another-react-lightbox": "^3.30.1"
},
"devDependencies": {
"@docusaurus/module-type-aliases": "3.9.2",
diff --git a/docs/src/components/ImageLightbox.tsx b/docs/src/components/ImageLightbox.tsx
new file mode 100644
index 0000000..d3dda7b
--- /dev/null
+++ b/docs/src/components/ImageLightbox.tsx
@@ -0,0 +1,41 @@
+import React, { useState, useCallback } from "react";
+import Lightbox from "yet-another-react-lightbox";
+import "yet-another-react-lightbox/styles.css";
+
+interface ImageLightboxProps {
+ src: string;
+ alt: string;
+}
+
+export default function ImageLightbox({ src, alt }: ImageLightboxProps) {
+ const [open, setOpen] = useState(false);
+
+ const handleOpen = useCallback(() => setOpen(true), []);
+ const handleClose = useCallback(() => setOpen(false), []);
+
+ return (
+    <>
+      <img
+        src={src}
+        alt={alt}
+        onClick={handleOpen}
+        role="button"
+        tabIndex={0}
+        onKeyDown={(e) => {
+          if (e.key === "Enter" || e.key === " ") handleOpen();
+        }}
+        style={{ cursor: "zoom-in", maxWidth: "100%" }}
+      />
+      <Lightbox
+        open={open}
+        close={handleClose}
+        slides={[{ src, alt }]}
+        render={{
+          buttonPrev: () => null,
+          buttonNext: () => null,
+        }}
+      />
+    </>
+ );
+}
diff --git a/package-lock.json b/package-lock.json
index 8125852..328e56d 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -24,7 +24,8 @@
"react": "^18.3.1",
"react-dom": "^18.3.1",
"rehype-katex": "^7.0.1",
- "remark-math": "^6.0.0"
+ "remark-math": "^6.0.0",
+ "yet-another-react-lightbox": "^3.30.1"
},
"devDependencies": {
"@docusaurus/module-type-aliases": "3.9.2",
@@ -16976,6 +16977,32 @@
"version": "3.1.1",
"license": "ISC"
},
+ "node_modules/yet-another-react-lightbox": {
+ "version": "3.30.1",
+ "resolved": "https://registry.npmjs.org/yet-another-react-lightbox/-/yet-another-react-lightbox-3.30.1.tgz",
+ "integrity": "sha512-VYy9UZbBtHkuU6FnABF9G9UCAr56TPUcb3uFXHpgMd+FhqvPQDTbfSmwsax3cdMtSc/udu5ycfHz8jwFeWMP3g==",
+ "license": "MIT",
+ "engines": {
+ "node": ">=14"
+ },
+ "funding": {
+ "url": "https://github.com/sponsors/igordanchenko"
+ },
+ "peerDependencies": {
+ "@types/react": "^16 || ^17 || ^18 || ^19",
+ "@types/react-dom": "^16 || ^17 || ^18 || ^19",
+ "react": "^16.8.0 || ^17 || ^18 || ^19",
+ "react-dom": "^16.8.0 || ^17 || ^18 || ^19"
+ },
+ "peerDependenciesMeta": {
+ "@types/react": {
+ "optional": true
+ },
+ "@types/react-dom": {
+ "optional": true
+ }
+ }
+ },
"node_modules/yocto-queue": {
"version": "1.2.2",
"license": "MIT",