diff --git a/HEADLESS.md b/HEADLESS.md new file mode 100644 index 0000000..3f90207 --- /dev/null +++ b/HEADLESS.md @@ -0,0 +1,406 @@ +# Hawkeye Headless Operations Guide + +Hawkeye runs in three modes: + +| Mode | Binary | Use case | +|---|---|---| +| **Desktop** | `hawkeye-desktop` (Tauri) | Full UI: chat panel, life tree, gaze overlay, observe HUD | +| **Node CLI** | `hawkeye` (npm package `@hawkeye/cli`) | Cross-platform scripting; one-shot perceive / plan / execute via `@hawkeye/core` | +| **Rust CLI** | `hawkeye-cli` (single static binary) | Single-binary deployment; reuses the Tauri Rust backend without any webview | + +The Desktop and Rust modes share the same Rust backend (`packages/desktop-tauri/src-tauri/`); the Node CLI wraps `@hawkeye/core` directly. + +This document covers the two headless modes (Node + Rust). For the Tauri desktop UI plus its agent (cua-driver) integration, see [`packages/desktop-tauri/AGENT_INTEGRATION.md`](packages/desktop-tauri/AGENT_INTEGRATION.md). + +--- + +## Table of contents + +- [Quick start](#quick-start) +- [Architecture](#architecture) +- [Node CLI (`hawkeye`)](#node-cli-hawkeye) +- [Rust CLI (`hawkeye-cli`)](#rust-cli-hawkeye-cli) +- [Configuration](#configuration) +- [Choosing between Node and Rust](#choosing-between-node-and-rust) +- [cua-driver agent mode](#cua-driver-agent-mode) +- [Phase 3 roadmap](#phase-3-roadmap) +- [File-level reference](#file-level-reference) + +--- + +## Quick start + +### Node CLI + +```bash +cd packages/cli +pnpm install && pnpm build + +# Optional: link globally so `hawkeye` is on $PATH +ln -s "$(pwd)/dist/main.js" /usr/local/bin/hawkeye + +hawkeye init # writes ~/.config/hawkeye/cli.json +export GEMINI_API_KEY=… +hawkeye chat "what model are you?" 
# one-turn chat +hawkeye perceive --json | jq # screenshot + OCR + intent +hawkeye run "open Safari" # end-to-end perceive→plan→execute +hawkeye daemon # NDJSON event stream +``` + +### Rust CLI + +```bash +cd packages/desktop-tauri/src-tauri +cargo build --release --bin hawkeye-cli + +# Single-binary deployment +cp target/release/hawkeye-cli /usr/local/bin/ + +hawkeye-cli config # print effective AppConfig +hawkeye-cli chat "hello" # one-turn chat +hawkeye-cli observe # NDJSON event stream +hawkeye-cli agent-status # cua-driver health probe +hawkeye-cli agent "list windows" # tool-using turn (needs cua-driver) +``` + +--- + +## Architecture + +``` + ┌─────────────────────┐ + │ @hawkeye/core │ pure Node lib (zero UI deps) + │ - perception │ + │ - reasoning │ + │ - execution │ + │ - storage / memory │ + └──────────▲──────────┘ + │ + ┌─────────────────────┼─────────────────────┐ + │ │ │ + ┌────────┴───────┐ ┌────────┴────────┐ ┌──────┴───────┐ + │ @hawkeye/cli │ │ Electron (old) │ │ desktop-tauri │ + │ (Node CLI) │ │ packages/ │ │ Rust backend │ + │ │ │ desktop/ │ │ + React UI │ + └────────────────┘ └─────────────────┘ └───┬───────┬───┘ + │ │ + ┌─────┴──┐ ┌──┴────────┐ + │ Tauri │ │ hawkeye- │ + │ webview│ │ cli │ + └────────┘ │ (Rust) │ + └───────────┘ + ┌────────────────────┐ + │ cua-driver daemon │ + │ (Swift, macOS) │ + └────────────────────┘ +``` + +### EventSink decoupling + +The Tauri Rust backend was decoupled from `tauri::AppHandle` so the same observe / agent code paths run from CLI: + +```rust +// packages/desktop-tauri/src-tauri/src/event_sink.rs +pub trait EventSink: Send + Sync { + fn emit(&self, event: &str, payload: Value); +} +pub struct TauriSink { handle: AppHandle } // GUI: forwards to webview +pub struct StdoutSink; // CLI: NDJSON to stdout +pub struct NoopSink; // tests: drops events +pub type SharedSink = Arc; +``` + +`ObserveLoop::start` and `agent::run_user_turn` both take `Arc` — the GUI plugs in a `TauriSink` during Tauri setup, the CLI 
plugs in a `StdoutSink`. Tests can use `NoopSink`. + +--- + +## Node CLI (`hawkeye`) + +**Source**: [`packages/cli/`](packages/cli/) — TypeScript package `@hawkeye/cli` v0.1.0, ESM-only, ~17 KB compiled output. + +### Commands + +| Command | Description | +|---|---| +| `hawkeye init [--force]` | Write a starter `~/.config/hawkeye/cli.json`, create `~/.hawkeye/` data dir | +| `hawkeye perceive [--json]` | One-shot screenshot + OCR + intent recognition; emits `UserIntent[]` | +| `hawkeye plan ` | Generate `ExecutionPlan` from a stored intent (use `-` for stdin) | +| `hawkeye execute ` | Execute a plan, streaming step results | +| `hawkeye run ""` | End-to-end: perceive → top intent → plan → execute | +| `hawkeye chat ""` | One-turn AI chat (no perception, no tools) | +| `hawkeye daemon [--interval=3000]` | Long-running observe loop, NDJSON events to stdout | + +Global flag `--json` switches to NDJSON output for any command. + +### Build + +```bash +cd packages/cli +pnpm install # resolves @hawkeye/core via workspace link +pnpm build # tsup → dist/main.js (shebang'd, exec) +pnpm typecheck +``` + +**Bundle**: 17 KB ESM file (`dist/main.js`), `@hawkeye/core` and its native deps (`better-sqlite3`, `screenshot-desktop`) externalized — resolved at runtime from `node_modules`. + +### Disabled-by-default modules + +`buildHawkeyeConfig()` in [`packages/cli/src/config.ts`](packages/cli/src/config.ts) turns off behavior tracking, memory, dashboard, workflow, plugins, autonomous, and the task queue. The CLI is one-shot; these modules add startup cost and pull native deps. The `daemon` subcommand can opt in via env vars (Phase 3 work). + +### Daemon polling caveat + +`@hawkeye/core` does not currently expose a single `observation` event. The `daemon` subcommand falls back to polling `perceiveAndRecognize` on the configured interval, while also subscribing to 11 real `Hawkeye` events (`ready`, `perceiving`, `intents:detected`, `plan:generated`, `execution:step:*`, etc.). 
All emitted as NDJSON to stdout. + +--- + +## Rust CLI (`hawkeye-cli`) + +**Source**: [`packages/desktop-tauri/src-tauri/src/bin/cli.rs`](packages/desktop-tauri/src-tauri/src/bin/cli.rs) — clap-based, 173 LOC, reuses `hawkeye_lib` crate. + +### Commands + +| Command | Description | +|---|---| +| `hawkeye-cli config` | Pretty-print the effective `AppConfig` (after env + file resolution) | +| `hawkeye-cli observe [--interval-ms=3000] [--change-threshold=0.05]` | Run the observe loop, NDJSON events to stdout, Ctrl+C to stop | +| `hawkeye-cli chat <message>` | One-turn AI chat using the configured provider (Gemini default) | +| `hawkeye-cli agent <task>` | Tool-using agent turn (requires cua-driver running) | +| `hawkeye-cli agent-status` | Probe cua-driver socket connectivity, print JSON status | + +### Build + +```bash +cd packages/desktop-tauri/src-tauri + +# Debug build (~80 MB) +cargo build --bin hawkeye-cli + +# Release build (~7-10 MB with LTO + opt-level="s" + strip) +cargo build --release --bin hawkeye-cli +``` + +The Tauri desktop binary still builds normally: + +```bash +cargo build --bin hawkeye-desktop # original, unchanged +``` + +`Cargo.toml` declares both: + +```toml +[[bin]] +name = "hawkeye-desktop" +path = "src/main.rs" + +[[bin]] +name = "hawkeye-cli" +path = "src/bin/cli.rs" +``` + +### Provider support + +| Provider | Status | +|---|---| +| Gemini | ✅ full (chat + tools / function-calling) | +| OpenAI | ✅ chat only (tool calling: not yet implemented) | +| Local llama-cpp | ❌ rejected at startup — requires Tauri-only `init_ai` lifecycle. Will be wired up later. 
| + +--- + +## Configuration + +### Node CLI: `~/.config/hawkeye/cli.json` + +```jsonc +{ + "ai": { + "provider": "gemini", + "apiKey": "…", // OR set GEMINI_API_KEY env var + "model": "gemini-2.5-flash", + "baseUrl": "https://generativelanguage.googleapis.com/v1beta" + }, + "perception": { "enableScreen": true, "enableOCR": true }, + "storage": { "dataDir": "~/.hawkeye" }, + "observe": { "intervalMs": 3000, "changeThreshold": 0.05 } +} +``` + +### Rust CLI: `~/.config/hawkeye/config.json` + +The Rust backend was already file-driven via `dirs::config_dir().join("hawkeye/config.json")` — that path is reused untouched. + +### Resolution order (Node CLI) + +1. CLI args (`--json`, etc.) +2. Env vars: `HAWKEYE_CONFIG` (path override), `HAWKEYE_DATA_DIR`, `GEMINI_API_KEY` / `GOOGLE_API_KEY`, `OPENAI_API_KEY` +3. JSON file at `$HAWKEYE_CONFIG` (or default path) +4. Built-in defaults + +### Custom binary location for cua-driver + +Override `CUA_DRIVER_BIN`: + +```bash +export CUA_DRIVER_BIN="$HOME/Applications/CuaDriver.app/Contents/MacOS/cua-driver" +``` + +Search order: `$CUA_DRIVER_BIN` → `/usr/local/bin/cua-driver` → `/Applications/CuaDriver.app/Contents/MacOS/cua-driver`. 
+ +--- + +## Choosing between Node and Rust + +| Concern | Node (`hawkeye`) | Rust (`hawkeye-cli`) | +|---|---|---| +| **Setup** | requires Node 20+, pnpm, `@hawkeye/core` workspace deps | single static binary | +| **Bundle size** | 17 KB CLI + ~150 MB `node_modules` (shared with workspace) | ~7-10 MB release | +| **Cross-platform** | macOS / Linux / Windows | macOS only currently (cua-driver, Swift OCR/Speech, Metal llama.cpp) | +| **Coverage** | full `@hawkeye/core` (memory, life-tree, knowledge graph, browser-agent, MCP) | observe + chat + agent (cua-driver) only | +| **Use cases** | scripts, CI, Docker, dev workflows | distributable single binary, embedded in other macOS tools | +| **Startup** | ~500 ms (Node bootstrap + better-sqlite3 native init) | ~50 ms | +| **Fits remote dev** | ✅ works fine via SSH (no display needed) | ✅ same | + +Rule of thumb: **Node CLI for breadth, Rust CLI for distribution.** Most users on macOS dev machines prefer Node — it covers all of `@hawkeye/core`. Rust CLI shines when you want one binary you can `scp` to another box. + +--- + +## cua-driver agent mode + +When `agent` mode is enabled, Hawkeye gains "hands" via [trycua/cua's `cua-driver`](https://github.com/trycua/cua) — a Swift daemon that drives macOS apps **in the background without stealing focus**. + +### Install + +```bash +/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/cua-driver/scripts/install.sh)" +``` + +This downloads a signed/notarized release tarball, installs `CuaDriver.app` to `/Applications/` and symlinks `/usr/local/bin/cua-driver`. macOS will prompt for **Accessibility** and **Screen Recording** permissions on first launch. 
+ +### Verify + +```bash +cua-driver --version +cua-driver serve & # starts daemon +ls ~/Library/Caches/cua-driver/cua-driver.sock # should exist + +# From either CLI: +hawkeye-cli agent-status # JSON status: binaryInstalled, daemonRunning, socketPath +``` + +### 8 curated tools + +The model sees these via Gemini function-calling. Defined in [`packages/desktop-tauri/src-tauri/src/agent/tools.rs`](packages/desktop-tauri/src-tauri/src/agent/tools.rs): + +| Tool | Purpose | +|---|---| +| `screenshot` | Capture PNG of full screen or a specific window | +| `list_windows` | Enumerate visible windows with pid/title/bounds | +| `get_window_state` | AX tree + PNG snapshot of a window | +| `click` | AX-element OR pixel-coord click without focus theft | +| `type_text` | Type into a focused field without raising the window | +| `press_key` | Hotkey combo (`cmd+s`, `return`, `escape`) | +| `scroll` | Scroll within a window | +| `launch_app` | Open a macOS app by bundle id or path | + +The model can chain these — e.g., `screenshot → click(x,y) → type_text → screenshot` — for up to **`MAX_TOOL_ROUNDS = 8`** rounds per user turn. Anything beyond hard-stops with an error. + +### Security + +- **Allow-list**: only the 8 tools above are accepted; anything else returns `{ok: false, error: "not in allow-list"}` to the model so it can recover, without ever reaching the daemon. +- **Socket permissions**: cua-driver creates the Unix socket with mode `0o600` — only the owning user can speak to it. +- **No daemon auth**: filesystem permissions are the only gate. Accessibility/Screen Recording grants live at the OS level on the cua-driver app bundle. +- **Round cap**: `MAX_TOOL_ROUNDS = 8`. Exceeding raises a hard error. +- **Failures degrade gracefully**: tool errors become `{ok:false, error:…}` payloads fed back to the model, not exceptions to the user. 
+ +For the full design — Swift SkyLight SPIs, focus-without-raise, AX-vs-pixel addressing — see [`packages/desktop-tauri/AGENT_INTEGRATION.md`](packages/desktop-tauri/AGENT_INTEGRATION.md). + +--- + +## Phase 3 roadmap + +The following are documented in [`HEADLESS_PLAN.md`](HEADLESS_PLAN.md) and not yet built: + +- **3a. YAML / TOML config schema** — let Rust + Node share a single config format with IDE auto-complete via JSON Schema +- **3b. REST / gRPC server mode** — `hawkeye-cli serve --port 8080` with axum, exposing `POST /v1/{perceive,plan,execute,chat,agent}` + WebSocket `/v1/observe` for event streaming +- **3c. systemd / launchd service** — daemon mode runnable as a managed service on macOS / Linux +- **3d. Multi-platform CI builds** — release matrix for macOS arm64+x86_64, Linux x86_64+arm64 +- **3e. Docker image** — `hawkeye-cli` on Linux (no agent mode — cua-driver is macOS-only) +- **OpenAI / local-llama tool calling** — extend `chat_with_tools` for non-Gemini providers + +--- + +## File-level reference + +### Node CLI (`packages/cli/`) + +| Path | Purpose | +|---|---| +| `package.json` | `@hawkeye/cli@0.1.0`, `bin: hawkeye`, workspace dep on `@hawkeye/core` | +| `tsup.config.ts` | ESM bundle, `node20` target, shebang banner | +| `src/main.ts` | Commander entrypoint + global `--json` flag | +| `src/config.ts` | 4-layer config merge (defaults → file → env → overrides), `CliConfig`/`HawkeyeConfig` translation | +| `src/output.ts` | `pretty` (ANSI, TTY-aware) and `json` (NDJSON) modes | +| `src/commands/{init,perceive,plan,execute,run,chat,daemon}.ts` | One file per subcommand | +| `README.md` | One-page usage doc | + +### Rust CLI + EventSink decoupling (`packages/desktop-tauri/src-tauri/`) + +| Path | Status | Purpose | +|---|---|---| +| `src/event_sink.rs` | NEW | `EventSink` trait + `TauriSink` / `StdoutSink` / `NoopSink` impls | +| `src/bin/cli.rs` | NEW | clap-based CLI: `config` / `observe` / `chat` / `agent` / `agent-status` | +| `Cargo.toml` | MOD 
| Two `[[bin]]` entries; new `clap = "4"` dep | +| `src/lib.rs` | MOD | `pub mod` everywhere; setup installs `TauriSink` into `AppState.event_sink` | +| `src/state.rs` | MOD | Added `event_sink: RwLock>` | +| `src/agent/runner.rs` | MOD | `run_user_turn(sink: Arc, …)` instead of `AppHandle` | +| `src/observe/loop_runner.rs` | MOD | `ObserveLoop::start(sink: Arc, …)` | +| `src/perception/mod.rs` | MOD | Dropped unused `_app: &AppHandle` parameter | +| `src/commands/{observe_cmd,agent_cmd}.rs` | MOD | Resolve sink from state, fall back to ad-hoc `TauriSink::new(app)` | + +### Agent / cua-driver integration (`packages/desktop-tauri/src-tauri/src/agent/`) + +| Path | Status | Purpose | +|---|---|---| +| `agent/protocol.rs` | NEW | Wire types for cua-driver line-delimited JSON protocol | +| `agent/cua_driver.rs` | NEW | Async Unix-socket client + `DaemonSupervisor` (binary discovery, spawn) | +| `agent/tools.rs` | NEW | Curated 8-tool catalog → Gemini `FunctionDeclaration`s | +| `agent/runner.rs` | NEW | Tool-use loop, `MAX_TOOL_ROUNDS = 8`, emits `agent:tool-call-{start,end}` | +| `agent/mod.rs` | NEW | Module exports | +| `commands/agent_cmd.rs` | NEW | Tauri commands: `get_agent_status`, `start_agent`, `chat_with_agent`, `invoke_cua_tool` | +| `ai/types.rs` | MOD | `FunctionDeclaration`, `FunctionCall`, `FunctionResult`, `ToolMessage`, `ToolTurn` + Gemini wire types | +| `ai/provider.rs` | MOD | `chat_with_tools()` default-impl (unsupported), `supports_tools()` | +| `ai/gemini.rs` | MOD | Full `chat_with_tools` impl + `tool_config` + `function_call`/`function_response` translation | + +### Frontend (`packages/desktop-tauri/src/`) + +| Path | Status | Purpose | +|---|---|---| +| `hooks/useAgent.ts` | NEW | React hook: live tool-call stream from `agent:tool-call-*` events | +| `hooks/useTauri.ts` | MOD | Types + invoke wrappers: `AgentStatus`, `AgentTurnResult`, `ToolCallRecord`, `getAgentStatus`, `startAgent`, `chatWithAgent`, `invokeCuaTool` | +| 
`components/ChatPanel.tsx` | MOD | Agent-mode toggle, tool-call audit trail, live-stream UI | + +--- + +## Verification log + +```text +$ cd packages/cli && pnpm build && node dist/main.js --version +0.1.0 +$ node dist/main.js init +Wrote ~/.config/hawkeye/cli.json +Created data dir at ~/.hawkeye + +$ cd packages/desktop-tauri/src-tauri && cargo test --lib agent:: +test result: ok. 5 passed; 0 failed; 0 ignored +$ cargo build --bin hawkeye-cli + Finished `dev` profile [unoptimized + debuginfo] target(s) in 7.83s +$ cargo build --bin hawkeye-desktop + Finished `dev` profile [unoptimized + debuginfo] target(s) in 8.13s +$ ./target/debug/hawkeye-cli agent-status +{ + "binaryInstalled": false, + "binaryPath": null, + "daemonRunning": false, + "socketPath": "/Users/.../Library/Caches/cua-driver/cua-driver.sock" +} +``` diff --git a/HEADLESS_PLAN.md b/HEADLESS_PLAN.md new file mode 100644 index 0000000..fb9f2d4 --- /dev/null +++ b/HEADLESS_PLAN.md @@ -0,0 +1,365 @@ +# Hawkeye 无 UI 运行能力 —— 执行计划 + +> **状态(2026-04-26)**:✅ Phase 1 完成 · ✅ Phase 2 完成 · 📋 Phase 3 待启动 +> 完整使用文档见 [`HEADLESS.md`](HEADLESS.md);cua-driver agent 集成见 [`packages/desktop-tauri/AGENT_INTEGRATION.md`](packages/desktop-tauri/AGENT_INTEGRATION.md)。 + +> **结论**:3 条路径技术上都可行,且彼此不互斥。建议 **Phase 1(Node CLI)** 立即开干(1 天交付),**Phase 2(Tauri Rust CLI)** 在 Phase 1 验证完后启动,**Phase 3** 按需推进。 + +--- + +## 验证后的事实基线 + +| 假设 | 状态 | 关键证据 | +|---|---|---| +| `@hawkeye/core` 完全 UI-agnostic | ✅ 真 | 零 `electron/react/@tauri-apps/document` 引用 | +| 已发布为双格式 (ESM+CJS+types) 库 | ✅ 真 | `package.json:5-29` 完整 `exports` map | +| 子路径导入可用 | ✅ 真 | `/perception` `/reasoning` `/execution` 都已 export | +| `createHawkeye()` 工厂 + 核心方法 | ✅ 真 | `hawkeye.ts:1554-1557` 工厂;`initialize/perceiveAndRecognize/generatePlan/executePlan` 全部 `hawkeye.ts:265-562` | +| `~/.hawkeye/` 存储惯例可被 config 覆盖 | ✅ 真 | `storage/storage.ts:19` `config.dataDir \|\| os.homedir()/.hawkeye` | +| Tauri Cargo.toml 已是库形式 | ✅ 真 | `crate-type = ["staticlib","cdylib","rlib"]` 已声明 | +| 
现有 `bin` CLI 入口 | ❌ **缺** | `package.json` 无 `bin` 字段;需要新建 | + +**已知风险**: +- ⚠️ `better-sqlite3` 需 native compile(Python + 编译工具链)—— 文档说明即可 +- ⚠️ `nutjs-executor` 在纯 Node 下 GUI 操作会失败 → 默认禁用,引导用 cua-driver 或 browser-agent +- ⚠️ `screenshot-desktop` macOS 需 Screen Recording 权限 + +--- + +## Phase 1:Node CLI(路径 1)—— ✅ 已完成 + +**目标**:新增 `packages/cli/`,零改动现有代码,立即拿到 `hawkeye` 命令。 + +**完成情况**: +- ✅ 781 LOC TypeScript across 10 source files +- ✅ ESM bundle 17 KB (`packages/cli/dist/main.js`, shebang'd, exec) +- ✅ 7 子命令:`init / perceive / plan / execute / run / chat / daemon` +- ✅ 全局 `--json` flag for NDJSON output +- ✅ 4 层 config merge(defaults → file → env → overrides) +- ✅ `pnpm build` + `pnpm typecheck` 全绿;`hawkeye init` / `--version` / `--help` smoke 通过 +- ⚠️ Daemon 子命令暂用 polling fallback —— `@hawkeye/core` 未暴露统一 `observation` event;订阅了 11 个真实事件。已在 README 注明。 + +### 文件清单 +``` +packages/cli/ +├── package.json # name=@hawkeye/cli, bin: { hawkeye } +├── tsconfig.json # 继承根 +├── tsup.config.ts # 单文件打包,shebang +├── src/ +│ ├── main.ts # commander/yargs 入口 +│ ├── config.ts # 加载顺序:CLI args > env > ~/.config/hawkeye/cli.json > defaults +│ ├── output.ts # JSON / pretty 双格式 +│ └── commands/ +│ ├── init.ts # 生成默认 cli.json + 创建 ~/.hawkeye/ +│ ├── perceive.ts # 单次截屏+OCR+意图,输出 JSON +│ ├── plan.ts # 读 intent.json → 生成 ExecutionPlan +│ ├── execute.ts # 读 plan.json → 执行 +│ ├── run.ts # 端到端 perceive→plan→execute +│ ├── chat.ts # 单轮 chat(无工具) +│ └── daemon.ts # 长连接 observe loop,stdout 流式输出 +└── README.md +``` + +### 子命令定义 + +| 命令 | 功能 | I/O | +|---|---|---| +| `hawkeye init` | 写 `~/.config/hawkeye/cli.json` 默认配置 | stdout: 路径 | +| `hawkeye perceive [--json]` | 单次感知 | stdout: `UserIntent[]` | +| `hawkeye plan ` | 从 intent.json 生成计划 | stdout: `ExecutionPlan` JSON | +| `hawkeye execute ` | 执行计划 | stdout: PlanExecution 状态流 | +| `hawkeye run ""` | 端到端 | stdout: 执行结果 | +| `hawkeye chat ""` | 一次 chat | stdout: assistant text | +| `hawkeye daemon [--interval=3000]` | 持续 observe | stdout: NDJSON 事件流 
| +| `hawkeye --version` / `--help` | 元信息 | stdout | + +### 配置 schema (cli.json) +```jsonc +{ + "ai": { + "providers": [ + { "type": "gemini", "apiKey": "...", "model": "gemini-2.5-flash" } + ], + "preferredProvider": "gemini" + }, + "perception": { "enableScreen": true, "enableOCR": true }, + "storage": { "database": { "dbPath": "~/.hawkeye/hawkeye.db" } }, + "observe": { "intervalMs": 3000, "changeThreshold": 0.05 } +} +``` + +### 环境变量覆盖 +- `HAWKEYE_CONFIG` → 自定义 config 路径 +- `HAWKEYE_DATA_DIR` → 覆盖 `~/.hawkeye` +- `GEMINI_API_KEY` / `GOOGLE_API_KEY` / `OPENAI_API_KEY` → 已在 core 内置支持 + +### 工作量 +- ~400 行 TS +- 0 行核心改动 +- **1 个工程师 1 天** + +### 验收 +- [ ] `hawkeye init` 写出配置 +- [ ] `hawkeye perceive --json | jq '.[].intentType'` 输出意图 +- [ ] `hawkeye run "open Safari"` 端到端跑通 +- [ ] `hawkeye daemon` 流式输出事件 +- [ ] 完全脱离 Electron/Tauri(用 `pmap` / `lsof` 验证无 webview 进程) + +--- + +## Phase 2:Tauri Rust CLI bin(路径 2)—— ✅ 已完成 + +**目标**:复用 desktop-tauri 的 Rust 后端,编译成 7-10MB 单二进制(无 webview / 无 Node)。 + +**完成情况**: +- ✅ 新增 `src/event_sink.rs`(54 LOC)—— `EventSink` trait + `TauriSink` / `StdoutSink` / `NoopSink` +- ✅ 新增 `src/bin/cli.rs`(173 LOC)—— clap-based 5 子命令 +- ✅ `Cargo.toml` 显式声明 `[[bin]] hawkeye-desktop` + `[[bin]] hawkeye-cli`,新增 `clap = "4"` dep +- ✅ `lib.rs` 全部 `mod` → `pub mod`;setup 时把 `TauriSink` 装进 `AppState.event_sink` +- ✅ `state.rs` 加 `event_sink: RwLock>` 字段 +- ✅ `agent/runner.rs` 的 `run_user_turn` 改用 `Arc` +- ✅ `observe/loop_runner.rs::ObserveLoop::start` 改用 `Arc` +- ✅ `perception/mod.rs::init` 删了未使用的 AppHandle 参数 +- ✅ Verification:`cargo build --bin hawkeye-{desktop,cli}` 双绿;`cargo test --lib agent::` 5/5 通过;CLI smoke (`--help` / `--version` / `agent-status` / `config`) 全部通过 +- 📦 ~227 LOC 新增 + ~70 LOC 重构跨 7 个现有文件 + +### 重构步骤 + +#### Step 2.1:抽 EventSink trait(解耦核心和 Tauri) + +新文件 `src-tauri/src/event_sink.rs`: +```rust +use serde_json::Value; +use std::sync::Arc; + +pub trait EventSink: Send + Sync { + fn emit(&self, event: &str, payload: Value); +} + +/// 
Tauri 实现 —— 转发到 AppHandle.emit +pub struct TauriSink(pub tauri::AppHandle); +impl EventSink for TauriSink { + fn emit(&self, event: &str, payload: Value) { + let _ = self.0.emit(event, payload); + } +} + +/// CLI 实现 —— stdout NDJSON +pub struct StdoutSink; +impl EventSink for StdoutSink { + fn emit(&self, event: &str, payload: Value) { + println!("{}", serde_json::json!({ "event": event, "data": payload })); + } +} + +/// 静默实现 —— 给一次性命令用 +pub struct NoopSink; +impl EventSink for NoopSink { + fn emit(&self, _: &str, _: Value) {} +} +``` + +#### Step 2.2:把 `lib.rs::run()` 拆三段 + +```rust +// 现状:单个 pub fn run() { tauri::Builder::default().setup(|app| {...}).invoke_handler!(...).run(...) } + +// 改成: +pub fn run() { // Tauri 入口(保留) + let cfg = config::load_config().unwrap_or_default(); + let state = init_core(cfg); + tauri::Builder::default() + .setup(move |app| init_tauri(app, state.clone())) + .invoke_handler(tauri::generate_handler![...]) + .run(...); +} + +pub fn init_core(cfg: AppConfig) -> Arc<AppState> { + state::AppState::new(cfg) // 纯逻辑 +} + +fn init_tauri(app: &mut App, state: Arc<AppState>) -> Result<()> { + let handle = app.handle().clone(); + app.manage(state.clone()); + spawn_perception(&handle); + spawn_agent_supervisor(&handle, &state); + setup_tray(...)?; + Ok(()) +} +``` + +#### Step 2.3:observe/loop_runner.rs 解耦 + +```rust +// 旧: +pub fn start(handle: AppHandle, state: Arc<AppState>, ...) -> ObserveLoop { + handle.emit(events::OBSERVE_STOPPED, ...); +} + +// 新: +pub fn start(sink: Arc<dyn EventSink>, state: Arc<AppState>, ...) 
-> ObserveLoop { + sink.emit(events::OBSERVE_STOPPED, json!({})); +} +``` + +约 5 处调用站点要改(`loop_runner.rs:6, 64`、`gaze_cmd.rs`、`commands/observe_cmd.rs` 等)。 + +#### Step 2.4:agent/runner.rs 解耦 + +`run_user_turn(app: AppHandle, ...)` → `run_user_turn(sink: Arc<dyn EventSink>, ...)`。 +`commands/agent_cmd.rs::chat_with_agent` 在 Tauri 上下文里把 `app` 包成 `TauriSink`。 + +#### Step 2.5:perception/mod.rs::init 删 AppHandle 参数(你的报告说没用) + +#### Step 2.6:新增 CLI bin + +`src-tauri/Cargo.toml`: +```toml +[[bin]] +name = "hawkeye-cli" +path = "src/bin/cli.rs" + +[dependencies] +clap = { version = "4", features = ["derive"] } # 新增 +``` + +`src-tauri/src/bin/cli.rs`: +```rust +use clap::{Parser, Subcommand}; +use hawkeye_lib::{config, event_sink::StdoutSink, init_core, observe}; +use std::sync::Arc; + +#[derive(Parser)] +#[command(version, about = "Hawkeye headless CLI")] +struct Cli { + #[command(subcommand)] + cmd: Cmd, +} + +#[derive(Subcommand)] +enum Cmd { + /// Single-shot screen perception (screenshot + OCR + window). + Perceive, + /// Continuous observe loop, NDJSON events to stdout. + Observe { #[arg(long, default_value_t = 3000)] interval_ms: u64 }, + /// One-turn AI chat. + Chat { text: String }, + /// Tool-using agent turn (requires cua-driver installed). + Agent { text: String }, + /// Print effective config. 
+ Config, +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + env_logger::init(); + let cli = Cli::parse(); + let cfg = config::load_config()?; + let state = init_core(cfg); + let sink = Arc::new(StdoutSink); + + match cli.cmd { + Cmd::Perceive => { /* call perception once, print JSON */ } + Cmd::Observe { interval_ms } => { + let _loop = observe::ObserveLoop::start(sink.clone(), state, interval_ms, 0.05); + tokio::signal::ctrl_c().await?; + } + Cmd::Chat { text } => { /* init AI, call chat, print */ } + Cmd::Agent { text } => { /* init AI, ensure daemon, run agent turn */ } + Cmd::Config => { println!("{}", serde_json::to_string_pretty(&*state.config.read().await)?); } + } + Ok(()) +} +``` + +#### Step 2.7:Build matrix 验证 +- `cargo build --release --bin hawkeye-cli` 跑通 +- `cargo build --release --bin hawkeye-desktop` 仍跑通(Tauri 主程序不破) +- `pnpm tauri:dev` 仍跑通(端到端) +- `ls -lh target/release/hawkeye-cli` < 10MB + +### 工作量 +- 重构 lib.rs:~80 行变动 +- EventSink + 实现:~80 行新增 +- observe/agent 解耦:~50 行变动 +- src/bin/cli.rs:~150 行新增 +- Cargo.toml + clap:5 行 +- **~200 行重构 + 250 行新增** +- **1 个 Rust 工程师 3-5 天** + +### 验收 +- [ ] `cargo build --bin hawkeye-cli` 通过 +- [ ] 二进制 < 10MB +- [ ] `hawkeye-cli observe` 流式输出 NDJSON +- [ ] `hawkeye-cli agent "list windows"` 联通 cua-driver 并执行 +- [ ] Tauri app 仍正常构建启动 +- [ ] 单元测试全过(agent 模块 5 个 + 其他) + +### 风险 +- ⚠️ Swift 子进程(hawkeye-ocr/speech/ane)由 build.rs 编译路径硬编码 —— CLI bin 也要包路径常量。`HAWKEYE_OCR_PATH` 已是 `option_env!()` 模式(你 memory 里写的),CLI bin 会自动继承。✅ 不阻塞。 +- ⚠️ llama-cpp-2 + Metal feature 是 macOS-only。Linux build 需要 `#[cfg(target_os = "macos")]` 守卫或 stub。Phase 2 只做 macOS。 + +--- + +## Phase 3:高级特性(按需) + +### 3a. YAML/TOML 配置 schema(半天) +- 加 `serde_yaml` / `toml` deps +- `config::load_config()` 自动检测 `.json` / `.yaml` / `.toml` 后缀 +- 写 `config.schema.json`(JSON Schema) 让 IDE 提供补全 + +### 3b. 
REST/gRPC server 模式(2 天) +- `hawkeye-cli serve --port 8080` +- axum router:`POST /v1/perceive`、`/v1/plan`、`/v1/execute`、`/v1/chat`、`/v1/agent` +- WebSocket `/v1/observe` 推送事件 +- 直接复用 `EventSink` 的 broadcaster 实现 + +### 3c. systemd / launchd 服务(半天) +- macOS:`packages/desktop-tauri/scripts/com.hawkeye.cli.plist` + 安装脚本 +- Linux:`hawkeye.service` unit +- `hawkeye-cli daemon --foreground` 给 launchd 调用 + +### 3d. Multi-platform CI 构建(1 天) +- `.github/workflows/cli-release.yml` +- macOS(arm64+x86_64)+ Linux(x86_64+arm64)矩阵 +- 自动产生 release artifact + +### 3e. Docker image(半天) +- `Dockerfile` for hawkeye-cli on Linux +- 注意:cua-driver 不能跑(macOS only),所以 Docker 镜像里 agent 模式要 disable + +--- + +## 推荐执行顺序与并行度 + +``` +Day 1 ───────────► Phase 1(Node 工程师) +Day 2 ─► Phase 1 验收 + 文档 +Day 3-5 ───────────► Phase 2 重构 + bin(Rust 工程师,可与 Phase 1 并行启动) +Day 6 ─► Phase 2 验收 +Day 7+ ───────────► Phase 3 按需扩展 +``` + +Phase 1 和 Phase 2 完全可以并行 —— Node CLI 不动 Rust,Rust 重构不动 core 包,互不干扰。 + +--- + +## 立即可做的 3 件事 + +1. **创建 `packages/cli/` 包**(Phase 1.1-1.4):~4 小时 +2. **写 EventSink trait + 改 ObserveLoop**(Phase 2.3):~2 小时 +3. **加 [[bin]] hawkeye-cli + clap dispatch**(Phase 2.6 骨架):~2 小时 + +任意一个我都可以现在就开干。 + +--- + +## 关键决策点(需要你拍板) + +| # | 决策 | 我的推荐 | +|---|---|---| +| A | 先做 Phase 1 还是 Phase 2 | **先 Phase 1** —— 1 天交付,验证 core 真的能脱壳 | +| B | Phase 2 的 CLI bin 名称 | `hawkeye-cli`(避免与 Node CLI `hawkeye` 撞名) | +| C | 配置文件格式 | **JSON 起步**(Rust 端已经是 json),Phase 3 加 YAML | +| D | hawkeye CLI 是否包含 cua-driver 集成 | **是**,作为 `agent` 子命令,复用 Phase 1 已建好的 Rust agent 模块 | +| E | Linux 支持范围 | Phase 1 全平台 / Phase 2 macOS only / Phase 3 加 Linux | diff --git a/packages/cli/README.md b/packages/cli/README.md new file mode 100644 index 0000000..5e9c444 --- /dev/null +++ b/packages/cli/README.md @@ -0,0 +1,101 @@ +# @hawkeye/cli + +A standalone command-line interface to the Hawkeye Core engine. 
+ +## Install (within the monorepo) + +```bash +cd packages/cli +pnpm install +pnpm build +node dist/main.js --help +``` + +The `dist/main.js` file is a shebanged ESM bundle, so you can also do: + +```bash +chmod +x dist/main.js +./dist/main.js --help +``` + +If you publish or `npm link` the package, the `hawkeye` binary will be on PATH. + +## Setup + +```bash +hawkeye init +``` + +That writes a starter config to `~/.config/hawkeye/cli.json` and creates the data +directory at `~/.hawkeye/`. Edit the config to set your API key, or export one of: + +- `GEMINI_API_KEY` (or `GOOGLE_API_KEY`) +- `OPENAI_API_KEY` +- `HAWKEYE_DATA_DIR` (overrides storage path) +- `HAWKEYE_CONFIG` (overrides config path) + +Resolution order (highest priority first): CLI args → env vars → config file → built-in defaults. + +## Commands + +| Command | Description | +|---------|-------------| +| `hawkeye init [--force]` | Write starter config + create data dir | +| `hawkeye perceive [--json]` | Capture screen, recognize intents | +| `hawkeye plan <intent.json> [--json]` | Generate plan for a stored UserIntent (use `-` for stdin) | +| `hawkeye execute <plan.json> [--json]` | Run a previously generated plan | +| `hawkeye run "<goal>" [--json]` | End-to-end: perceive → plan → execute | +| `hawkeye chat "<message>" [--json]` | One-turn chat against the configured AI provider | +| `hawkeye daemon [--interval=3000]` | Long-running observe loop, NDJSON output to stdout | + +The `--json` global flag switches output to NDJSON (one JSON value per line on +stdout, errors on stderr). Without it, output is colored/pretty. 
+ +## Examples + +```bash +# Quick chat +hawkeye chat "summarize the OAuth 2.0 device flow" + +# One-shot capture, intents to JSON for later: +hawkeye perceive --json > intents.json + +# Pipe an intent into plan: +jq '.value[0]' intents.json | hawkeye plan - + +# End-to-end: +hawkeye run "rename the screenshots in ~/Desktop to today's date" +``` + +## Known limitations + +- The CLI runs `@hawkeye/core` directly, which depends on native modules + (`better-sqlite3`, `screenshot-desktop`, etc.). You need the same Node + version that built core's native bindings. +- Most subcommands require an AI API key. `init`, `--version`, and `--help` + do not. +- `perceive`/`run` capture the screen on macOS — grant screen-recording + permission to your terminal first. +- `daemon` falls back to interval polling because @hawkeye/core does not yet + expose a dedicated `observation` event. + +## Architecture + +``` +src/ + main.ts # commander setup, dispatch + config.ts # CliConfig + 4-layer merge + buildHawkeyeConfig() + output.ts # pretty / json output modes + commands/ + init.ts + perceive.ts + plan.ts + execute.ts + run.ts + chat.ts + daemon.ts +``` + +Zero changes are made to `@hawkeye/core`. The CLI translates a small `CliConfig` +into the full `HawkeyeConfig` and consumes the same public API as +`@hawkeye/desktop` and `@hawkeye/desktop-tauri`. 
diff --git a/packages/cli/package.json b/packages/cli/package.json new file mode 100644 index 0000000..333dc64 --- /dev/null +++ b/packages/cli/package.json @@ -0,0 +1,32 @@ +{ + "name": "@hawkeye/cli", + "version": "0.1.0", + "description": "Hawkeye CLI - command-line interface to the Hawkeye Core engine", + "type": "module", + "private": false, + "bin": { + "hawkeye": "./dist/main.js" + }, + "files": [ + "dist", + "README.md" + ], + "scripts": { + "build": "tsup", + "dev": "tsup --watch", + "clean": "rm -rf dist", + "typecheck": "tsc --noEmit" + }, + "dependencies": { + "@hawkeye/core": "workspace:*", + "commander": "^12.0.0" + }, + "devDependencies": { + "@types/node": "^20.10.0", + "tsup": "^8.0.1", + "typescript": "^5.3.3" + }, + "engines": { + "node": ">=20.0.0" + } +} diff --git a/packages/cli/src/commands/chat.ts b/packages/cli/src/commands/chat.ts new file mode 100644 index 0000000..960aef2 --- /dev/null +++ b/packages/cli/src/commands/chat.ts @@ -0,0 +1,50 @@ +import type { Command } from 'commander'; +import { + createAIManager, + type AIManagerConfig, + type AIMessage, +} from '@hawkeye/core'; +import { loadConfig } from '../config.js'; +import { printError, printResult } from '../output.js'; + +export function registerChat(program: Command): void { + program + .command('chat <message>') + .description('one-turn AI chat (no perception, no execution)') + .action(async (message: string) => { + try { + const cliCfg = loadConfig(); + if (!cliCfg.ai.apiKey) { + throw new Error( + 'No AI API key configured (set GEMINI_API_KEY / OPENAI_API_KEY).' 
+ ); + } + + const aiConfig: AIManagerConfig = { + providers: [ + { + type: cliCfg.ai.provider, + apiKey: cliCfg.ai.apiKey, + model: cliCfg.ai.model, + baseUrl: cliCfg.ai.baseUrl, + }, + ], + preferredProvider: cliCfg.ai.provider, + enableFailover: false, + }; + + const ai = createAIManager(aiConfig); + await ai.initialize(); + + const messages: AIMessage[] = [{ role: 'user', content: message }]; + const response = await ai.chat(messages); + printResult('reply', response.text); + + await ai.terminate(); + process.exit(0); + } catch (err) { + printError(err); + process.exit(1); + } + }); +} diff --git a/packages/cli/src/commands/daemon.ts b/packages/cli/src/commands/daemon.ts new file mode 100644 index 0000000..025b39d --- /dev/null +++ b/packages/cli/src/commands/daemon.ts @@ -0,0 +1,106 @@ +import type { Command } from 'commander'; +import { createHawkeye } from '@hawkeye/core'; +import { buildHawkeyeConfig, loadConfig } from '../config.js'; +import { printError, printEvent, printInfo } from '../output.js'; + +/** + * Long-running observe loop. + * + * The Hawkeye core engine emits events like `intents:detected`, `perceiving`, + * and `execution:*` via its EventEmitter. We subscribe to a handful of them + * and re-emit each as NDJSON on stdout. SIGINT triggers a clean shutdown. + * + * If event subscriptions yield nothing within an interval, we still poll + * `perceiveAndRecognize` so the daemon is useful even before any signal fires. 
+ */
+export function registerDaemon(program: Command): void {
+  program
+    .command('daemon')
+    .description('long-running observe + intent loop, emitting NDJSON events to stdout')
+    .option('--interval <ms>', 'polling interval in milliseconds', '3000')
+    .action(async (opts: { interval: string }) => {
+      const intervalMs = Number.parseInt(opts.interval, 10) || 3000;
+      let hawkeye: ReturnType<typeof createHawkeye> | null = null;
+      let timer: NodeJS.Timeout | null = null;
+      let stopping = false;
+
+      const shutdown = async (signal: string): Promise<void> => {
+        if (stopping) return;
+        stopping = true;
+        printInfo(`Received ${signal}, shutting down...`);
+        if (timer) clearInterval(timer);
+        try {
+          await hawkeye?.shutdown();
+        } catch (err) {
+          printError(err);
+        }
+        process.exit(0);
+      };
+
+      process.on('SIGINT', () => {
+        void shutdown('SIGINT');
+      });
+      process.on('SIGTERM', () => {
+        void shutdown('SIGTERM');
+      });
+
+      try {
+        const cliCfg = loadConfig({
+          observe: {
+            intervalMs,
+            changeThreshold: 0.05,
+          },
+        });
+        if (!cliCfg.ai.apiKey) {
+          throw new Error('No AI API key configured (set GEMINI_API_KEY / OPENAI_API_KEY).');
+        }
+
+        hawkeye = createHawkeye(buildHawkeyeConfig(cliCfg));
+
+        // Best-effort event subscriptions. We pick the events that actually
+        // exist on the Hawkeye class (verified in packages/core/src/hawkeye.ts).
+        const events = [
+          'ready',
+          'perceiving',
+          'intents:detected',
+          'plan:generated',
+          'execution:step:start',
+          'execution:step:complete',
+          'execution:step:error',
+          'execution:completed',
+          'autonomous:suggestions',
+          'autonomous:intent',
+          'error',
+        ];
+        for (const evt of events) {
+          hawkeye.on(evt, (data: unknown) => printEvent(evt, data));
+        }
+
+        await hawkeye.initialize();
+        printEvent('daemon:started', { intervalMs });
+
+        // Poll the perception loop on the interval — events fire as a side
+        // effect and `intents:detected` is emitted from inside. 
+        timer = setInterval(() => {
+          if (stopping || !hawkeye) return;
+          hawkeye.perceiveAndRecognize().catch((err) => {
+            printEvent('poll:error', { message: (err as Error).message });
+          });
+        }, intervalMs);
+
+        // Block forever (until SIGINT/SIGTERM).
+        await new Promise(() => {
+          /* never resolves */
+        });
+      } catch (err) {
+        printError(err);
+        if (timer) clearInterval(timer);
+        try {
+          await hawkeye?.shutdown();
+        } catch {
+          /* ignore */
+        }
+        process.exit(1);
+      }
+    });
+}
diff --git a/packages/cli/src/commands/execute.ts b/packages/cli/src/commands/execute.ts
new file mode 100644
index 0000000..f092720
--- /dev/null
+++ b/packages/cli/src/commands/execute.ts
@@ -0,0 +1,57 @@
+import * as fs from 'node:fs';
+import type { Command } from 'commander';
+import { createHawkeye, type ExecutionPlan } from '@hawkeye/core';
+import { buildHawkeyeConfig, loadConfig } from '../config.js';
+import { printError, printEvent, printResult } from '../output.js';
+
+function readPlan(source: string): ExecutionPlan {
+  const raw = fs.readFileSync(source, 'utf8');
+  const parsed = JSON.parse(raw) as unknown;
+  if (!parsed || typeof parsed !== 'object') {
+    throw new Error('Plan file must be a JSON object matching ExecutionPlan.');
+  }
+  return parsed as ExecutionPlan;
+}
+
+export function registerExecute(program: Command): void {
+  program
+    .command('execute <plan-file>')
+    .description('execute a previously generated plan, streaming step results')
+    .action(async (planFile: string) => {
+      let hawkeye: ReturnType<typeof createHawkeye> | null = null;
+      try {
+        const plan = readPlan(planFile);
+        const cliCfg = loadConfig();
+        if (!cliCfg.ai.apiKey) {
+          throw new Error('No AI API key configured (set GEMINI_API_KEY / OPENAI_API_KEY).');
+        }
+
+        hawkeye = createHawkeye(buildHawkeyeConfig(cliCfg));
+        await hawkeye.initialize();
+
+        // Stream step events as they happen so JSON consumers see NDJSON. 
+ hawkeye.on('execution:step:start', (data: unknown) => + printEvent('step:start', data) + ); + hawkeye.on('execution:step:complete', (data: unknown) => + printEvent('step:complete', data) + ); + hawkeye.on('execution:step:error', (data: unknown) => + printEvent('step:error', data) + ); + + const execution = await hawkeye.executePlan(plan); + printResult('execution', execution); + await hawkeye.shutdown(); + process.exit(0); + } catch (err) { + printError(err); + try { + await hawkeye?.shutdown(); + } catch { + /* ignore */ + } + process.exit(1); + } + }); +} diff --git a/packages/cli/src/commands/init.ts b/packages/cli/src/commands/init.ts new file mode 100644 index 0000000..232495b --- /dev/null +++ b/packages/cli/src/commands/init.ts @@ -0,0 +1,49 @@ +import * as fs from 'node:fs'; +import type { Command } from 'commander'; +import { + defaultConfigPath, + defaultDataDir, + writeDefaultConfig, +} from '../config.js'; +import { printError, printResult, printSuccess, printInfo } from '../output.js'; + +export function registerInit(program: Command): void { + program + .command('init') + .description('write a starter config to ~/.config/hawkeye/cli.json and create the data dir') + .option('-f, --force', 'overwrite an existing config') + .action(async (opts: { force?: boolean }) => { + try { + const dataDir = defaultDataDir(); + if (!fs.existsSync(dataDir)) { + fs.mkdirSync(dataDir, { recursive: true }); + printSuccess(`Created data directory: ${dataDir}`); + } else { + printInfo(`Data directory already exists: ${dataDir}`); + } + + const target = process.env.HAWKEYE_CONFIG || defaultConfigPath(); + const exists = fs.existsSync(target); + if (exists && !opts.force) { + printError( + new Error( + `Config already exists at ${target}. 
Re-run with --force to overwrite.`
+            )
+          );
+          process.exit(1);
+          return;
+        }
+
+        const path = writeDefaultConfig(opts.force === true);
+        printSuccess(`Wrote starter config: ${path}`);
+        printResult('next-steps', {
+          edit: path,
+          envVars: ['GEMINI_API_KEY', 'GOOGLE_API_KEY', 'OPENAI_API_KEY'],
+          tryNext: 'hawkeye perceive --help',
+        });
+      } catch (err) {
+        printError(err);
+        process.exit(1);
+      }
+    });
+}
diff --git a/packages/cli/src/commands/perceive.ts b/packages/cli/src/commands/perceive.ts
new file mode 100644
index 0000000..fbb4d5b
--- /dev/null
+++ b/packages/cli/src/commands/perceive.ts
@@ -0,0 +1,36 @@
+import type { Command } from 'commander';
+import { createHawkeye } from '@hawkeye/core';
+import { buildHawkeyeConfig, loadConfig } from '../config.js';
+import { printError, printResult } from '../output.js';
+
+export function registerPerceive(program: Command): void {
+  program
+    .command('perceive')
+    .description('capture the current screen + context and recognize user intents')
+    .action(async () => {
+      let hawkeye: ReturnType<typeof createHawkeye> | null = null;
+      try {
+        const cliCfg = loadConfig();
+        if (!cliCfg.ai.apiKey) {
+          throw new Error(
+            'No AI API key found. Set GEMINI_API_KEY (or OPENAI_API_KEY), or run `hawkeye init` and edit the config file.'
+          );
+        }
+
+        hawkeye = createHawkeye(buildHawkeyeConfig(cliCfg));
+        await hawkeye.initialize();
+        const intents = await hawkeye.perceiveAndRecognize();
+        printResult('intents', intents);
+        await hawkeye.shutdown();
+        process.exit(0);
+      } catch (err) {
+        printError(err);
+        try {
+          await hawkeye?.shutdown();
+        } catch {
+          /* ignore secondary errors during cleanup */
+        }
+        process.exit(1);
+      }
+    });
+}
diff --git a/packages/cli/src/commands/plan.ts b/packages/cli/src/commands/plan.ts
new file mode 100644
index 0000000..497cc1e
--- /dev/null
+++ b/packages/cli/src/commands/plan.ts
@@ -0,0 +1,56 @@
+import * as fs from 'node:fs';
+import type { Command } from 'commander';
+import { createHawkeye, type UserIntent } from '@hawkeye/core';
+import { buildHawkeyeConfig, loadConfig } from '../config.js';
+import { printError, printResult } from '../output.js';
+
+async function readIntent(source: string): Promise<UserIntent> {
+  const raw = source === '-' ? await readStdin() : fs.readFileSync(source, 'utf8');
+  const parsed = JSON.parse(raw) as unknown;
+  if (!parsed || typeof parsed !== 'object') {
+    throw new Error('Intent file must contain a JSON object matching UserIntent.');
+  }
+  return parsed as UserIntent;
+}
+
+function readStdin(): Promise<string> {
+  return new Promise((resolve, reject) => {
+    let data = '';
+    process.stdin.setEncoding('utf8');
+    process.stdin.on('data', (chunk) => {
+      data += chunk;
+    });
+    process.stdin.on('end', () => resolve(data));
+    process.stdin.on('error', reject);
+  });
+}
+
+export function registerPlan(program: Command): void {
+  program
+    .command('plan <intent-file>')
+    .description('generate an execution plan for a stored UserIntent (use "-" to read stdin)')
+    .action(async (intentFile: string) => {
+      let hawkeye: ReturnType<typeof createHawkeye> | null = null;
+      try {
+        const intent = await readIntent(intentFile);
+        const cliCfg = loadConfig();
+        if (!cliCfg.ai.apiKey) {
+          throw new Error('No AI API key configured (set GEMINI_API_KEY / OPENAI_API_KEY).');
+        }
+        hawkeye = 
createHawkeye(buildHawkeyeConfig(cliCfg));
+        await hawkeye.initialize();
+        const plan = await hawkeye.generatePlan(intent);
+        printResult('plan', plan);
+        await hawkeye.shutdown();
+        process.exit(0);
+      } catch (err) {
+        printError(err);
+        try {
+          await hawkeye?.shutdown();
+        } catch {
+          /* ignore */
+        }
+        process.exit(1);
+      }
+    });
+}
diff --git a/packages/cli/src/commands/run.ts b/packages/cli/src/commands/run.ts
new file mode 100644
index 0000000..a1f2ef1
--- /dev/null
+++ b/packages/cli/src/commands/run.ts
@@ -0,0 +1,56 @@
+import type { Command } from 'commander';
+import { createHawkeye, type UserIntent } from '@hawkeye/core';
+import { buildHawkeyeConfig, loadConfig } from '../config.js';
+import { printError, printEvent, printResult } from '../output.js';
+
+function pickTopIntent(intents: UserIntent[]): UserIntent | null {
+  if (intents.length === 0) return null;
+  return [...intents].sort((a, b) => b.confidence - a.confidence)[0];
+}
+
+export function registerRun(program: Command): void {
+  program
+    .command('run <task-description>')
+    .description('end-to-end: perceive → pick top intent → plan → execute')
+    .action(async (taskDescription: string) => {
+      let hawkeye: ReturnType<typeof createHawkeye> | null = null;
+      try {
+        const cliCfg = loadConfig();
+        if (!cliCfg.ai.apiKey) {
+          throw new Error('No AI API key configured (set GEMINI_API_KEY / OPENAI_API_KEY).');
+        }
+
+        hawkeye = createHawkeye(buildHawkeyeConfig(cliCfg));
+        await hawkeye.initialize();
+
+        printEvent('phase', { name: 'perceive', task: taskDescription });
+        const intents = await hawkeye.perceiveAndRecognize();
+        const top = pickTopIntent(intents);
+        if (!top) {
+          throw new Error('Perception returned no intents — nothing to plan.');
+        }
+        // Override the description with the user's actual ask so the plan matches it. 
+        const intent: UserIntent = { ...top, description: taskDescription };
+        printEvent('intent:selected', intent);
+
+        printEvent('phase', { name: 'plan' });
+        const plan = await hawkeye.generatePlan(intent);
+        printEvent('plan:generated', { id: plan.id, steps: plan.steps.length });
+
+        printEvent('phase', { name: 'execute' });
+        const execution = await hawkeye.executePlan(plan);
+        printResult('execution', execution);
+
+        await hawkeye.shutdown();
+        process.exit(execution.status === 'completed' ? 0 : 1);
+      } catch (err) {
+        printError(err);
+        try {
+          await hawkeye?.shutdown();
+        } catch {
+          /* ignore */
+        }
+        process.exit(1);
+      }
+    });
+}
diff --git a/packages/cli/src/config.ts b/packages/cli/src/config.ts
new file mode 100644
index 0000000..76f5b59
--- /dev/null
+++ b/packages/cli/src/config.ts
@@ -0,0 +1,227 @@
+/**
+ * CLI configuration loader.
+ *
+ * Four-layer resolution (highest priority wins):
+ * 1. CLI args (passed in by command handlers as `overrides`)
+ * 2. Environment variables
+ * 3. JSON file at $HAWKEYE_CONFIG or ~/.config/hawkeye/cli.json
+ * 4. Built-in defaults
+ */
+
+import * as fs from 'node:fs';
+import * as os from 'node:os';
+import * as path from 'node:path';
+import type { HawkeyeConfig } from '@hawkeye/core';
+
+export interface CliConfig {
+  ai: {
+    provider: 'gemini' | 'openai';
+    apiKey: string;
+    model?: string;
+    baseUrl?: string;
+  };
+  perception: {
+    enableScreen: boolean;
+    enableOCR: boolean;
+  };
+  storage: {
+    dataDir: string;
+  };
+  observe: {
+    intervalMs: number;
+    changeThreshold: number;
+  };
+}
+
+const BUILTIN_DEFAULTS: CliConfig = {
+  ai: {
+    provider: 'gemini',
+    apiKey: '',
+    model: 'gemini-2.5-flash',
+    baseUrl: undefined,
+  },
+  perception: {
+    enableScreen: true,
+    enableOCR: true,
+  },
+  storage: {
+    // Set lazily in resolveDefaults so we honor $HOME at call time. 
+    dataDir: '',
+  },
+  observe: {
+    intervalMs: 3000,
+    changeThreshold: 0.05,
+  },
+};
+
+export function defaultConfigPath(): string {
+  return path.join(os.homedir(), '.config', 'hawkeye', 'cli.json');
+}
+
+export function defaultDataDir(): string {
+  return process.env.HAWKEYE_DATA_DIR || path.join(os.homedir(), '.hawkeye');
+}
+
+function resolveDefaults(): CliConfig {
+  return {
+    ...BUILTIN_DEFAULTS,
+    storage: { dataDir: defaultDataDir() },
+  };
+}
+
+function readJsonFileSafe(filePath: string): Partial<CliConfig> | null {
+  try {
+    if (!fs.existsSync(filePath)) return null;
+    const raw = fs.readFileSync(filePath, 'utf8');
+    return JSON.parse(raw) as Partial<CliConfig>;
+  } catch (err) {
+    const msg = err instanceof Error ? err.message : String(err);
+    throw new Error(`Failed to parse config at ${filePath}: ${msg}`);
+  }
+}
+
+function envOverrides(): Partial<CliConfig> {
+  const out: Partial<CliConfig> = {};
+
+  const apiKey =
+    process.env.GEMINI_API_KEY ||
+    process.env.GOOGLE_API_KEY ||
+    process.env.OPENAI_API_KEY ||
+    '';
+  const provider: 'gemini' | 'openai' | undefined = process.env.OPENAI_API_KEY
+    ? 'openai'
+    : process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY
+      ? 'gemini'
+      : undefined;
+
+  if (apiKey || provider) {
+    out.ai = {
+      provider: provider ?? 
'gemini',
+      apiKey,
+    };
+  }
+
+  if (process.env.HAWKEYE_DATA_DIR) {
+    out.storage = { dataDir: process.env.HAWKEYE_DATA_DIR };
+  }
+
+  return out;
+}
+
+function deepMerge(
+  base: Record<string, unknown>,
+  patch: Record<string, unknown> | undefined
+): Record<string, unknown> {
+  if (!patch) return base;
+  const out: Record<string, unknown> = { ...base };
+  for (const key of Object.keys(patch)) {
+    const v = patch[key];
+    if (v === undefined) continue;
+    if (
+      v !== null &&
+      typeof v === 'object' &&
+      !Array.isArray(v) &&
+      typeof out[key] === 'object' &&
+      out[key] !== null &&
+      !Array.isArray(out[key])
+    ) {
+      out[key] = deepMerge(
+        out[key] as Record<string, unknown>,
+        v as Record<string, unknown>
+      );
+    } else {
+      out[key] = v;
+    }
+  }
+  return out;
+}
+
+export function loadConfig(overrides?: Partial<CliConfig>): CliConfig {
+  const fileTarget = process.env.HAWKEYE_CONFIG || defaultConfigPath();
+  const fileLayer = readJsonFileSafe(fileTarget) || {};
+  const envLayer = envOverrides();
+
+  let merged: Record<string, unknown> = resolveDefaults() as unknown as Record<
+    string,
+    unknown
+  >;
+  merged = deepMerge(merged, fileLayer as Record<string, unknown>);
+  merged = deepMerge(merged, envLayer as Record<string, unknown>);
+  if (overrides) merged = deepMerge(merged, overrides as Record<string, unknown>);
+
+  const result = merged as unknown as CliConfig;
+  // Final safety: ensure dataDir is always populated.
+  if (!result.storage.dataDir) result.storage.dataDir = defaultDataDir();
+  return result;
+}
+
+const STARTER_CONFIG: CliConfig = {
+  ai: {
+    provider: 'gemini',
+    apiKey: 'YOUR_API_KEY_HERE',
+    model: 'gemini-2.5-flash',
+  },
+  perception: {
+    enableScreen: true,
+    enableOCR: true,
+  },
+  storage: {
+    dataDir: path.join('~', '.hawkeye'),
+  },
+  observe: {
+    intervalMs: 3000,
+    changeThreshold: 0.05,
+  },
+};
+
+/** Writes a starter config and returns the absolute path. Throws if the file already exists. 
*/ +export function writeDefaultConfig(force = false): string { + const target = process.env.HAWKEYE_CONFIG || defaultConfigPath(); + if (fs.existsSync(target) && !force) { + throw new Error( + `Config already exists at ${target}. Re-run with --force to overwrite.` + ); + } + fs.mkdirSync(path.dirname(target), { recursive: true }); + fs.writeFileSync(target, JSON.stringify(STARTER_CONFIG, null, 2) + '\n', 'utf8'); + return target; +} + +/** Translates the simplified CliConfig into the full HawkeyeConfig that core expects. */ +export function buildHawkeyeConfig(cli: CliConfig): HawkeyeConfig { + return { + ai: { + providers: [ + { + type: cli.ai.provider, + apiKey: cli.ai.apiKey, + model: cli.ai.model, + baseUrl: cli.ai.baseUrl, + }, + ], + preferredProvider: cli.ai.provider, + enableFailover: false, + }, + // PerceptionEngineConfig has many more fields, but the engine constructor + // accepts Partial internally. Cast to keep TS happy. + perception: { + enableScreen: cli.perception.enableScreen, + enableOCR: cli.perception.enableOCR, + } as HawkeyeConfig['perception'], + storage: { + database: { + dbPath: path.join(cli.storage.dataDir, 'hawkeye.db'), + }, + }, + // Disable heavy modules by default for the CLI: they pull in native deps and + // are not useful for one-shot perceive/plan/execute runs. + enableBehaviorTracking: false, + enableMemory: false, + enableDashboard: false, + enableWorkflow: false, + enablePlugins: false, + enableAutonomous: false, + enableTaskQueue: false, + autoStartSync: false, + }; +} diff --git a/packages/cli/src/main.ts b/packages/cli/src/main.ts new file mode 100644 index 0000000..dbf2698 --- /dev/null +++ b/packages/cli/src/main.ts @@ -0,0 +1,47 @@ +/** + * Hawkeye CLI entry point. + * + * Wires up commander and dispatches to the per-subcommand modules. 
+ */
+
+import { Command } from 'commander';
+import { setOutputMode } from './output.js';
+import { printError } from './output.js';
+import { registerInit } from './commands/init.js';
+import { registerPerceive } from './commands/perceive.js';
+import { registerPlan } from './commands/plan.js';
+import { registerExecute } from './commands/execute.js';
+import { registerRun } from './commands/run.js';
+import { registerChat } from './commands/chat.js';
+import { registerDaemon } from './commands/daemon.js';
+
+async function main(): Promise<void> {
+  const program = new Command();
+
+  program
+    .name('hawkeye')
+    .description('Hawkeye CLI — perception, planning, and execution from your shell')
+    .version('0.1.0', '-v, --version', 'print the CLI version')
+    .option('--json', 'emit machine-readable NDJSON output instead of pretty text')
+    .hook('preAction', (thisCommand) => {
+      const opts = thisCommand.opts<{ json?: boolean }>();
+      if (opts.json) setOutputMode('json');
+    });
+
+  registerInit(program);
+  registerPerceive(program);
+  registerPlan(program);
+  registerExecute(program);
+  registerRun(program);
+  registerChat(program);
+  registerDaemon(program);
+
+  program.showHelpAfterError();
+
+  await program.parseAsync(process.argv);
+}
+
+main().catch((err) => {
+  printError(err);
+  process.exit(1);
+});
diff --git a/packages/cli/src/output.ts b/packages/cli/src/output.ts
new file mode 100644
index 0000000..bada2fa
--- /dev/null
+++ b/packages/cli/src/output.ts
@@ -0,0 +1,97 @@
+/**
+ * Output formatter. Two modes:
+ * - pretty: ANSI color, human-readable. Default.
+ * - json: one JSON value per write to stdout, machine-readable. 
+ */ + +type OutputMode = 'pretty' | 'json'; + +let mode: OutputMode = 'pretty'; + +const COLORS = { + reset: '\x1b[0m', + bold: '\x1b[1m', + dim: '\x1b[2m', + red: '\x1b[31m', + green: '\x1b[32m', + yellow: '\x1b[33m', + blue: '\x1b[34m', + magenta: '\x1b[35m', + cyan: '\x1b[36m', + gray: '\x1b[90m', +}; + +const isTTY = (): boolean => Boolean(process.stdout.isTTY); + +function color(text: string, code: keyof typeof COLORS): string { + if (!isTTY()) return text; + return `${COLORS[code]}${text}${COLORS.reset}`; +} + +export function setOutputMode(next: OutputMode): void { + mode = next; +} + +export function getOutputMode(): OutputMode { + return mode; +} + +export function printResult(label: string, value: unknown): void { + if (mode === 'json') { + process.stdout.write(JSON.stringify({ label, value }) + '\n'); + return; + } + process.stdout.write(color(`▸ ${label}`, 'cyan') + '\n'); + process.stdout.write(formatPretty(value) + '\n'); +} + +export function printError(err: unknown): void { + const message = err instanceof Error ? err.message : String(err); + if (mode === 'json') { + process.stderr.write( + JSON.stringify({ + error: message, + stack: err instanceof Error ? err.stack : undefined, + }) + '\n' + ); + return; + } + process.stderr.write(color(`✖ ${message}`, 'red') + '\n'); + if (err instanceof Error && err.stack && process.env.DEBUG) { + process.stderr.write(color(err.stack, 'gray') + '\n'); + } +} + +export function printEvent(event: string, data: unknown): void { + if (mode === 'json') { + process.stdout.write( + JSON.stringify({ event, ts: Date.now(), data }) + '\n' + ); + return; + } + const ts = new Date().toISOString(); + process.stdout.write( + `${color(ts, 'gray')} ${color(event, 'magenta')} ${formatPretty(data)}\n` + ); +} + +export function printInfo(message: string): void { + if (mode === 'json') return; // Don't pollute JSON streams with chatter. 
+ process.stdout.write(color(`ℹ ${message}`, 'blue') + '\n'); +} + +export function printSuccess(message: string): void { + if (mode === 'json') return; + process.stdout.write(color(`✓ ${message}`, 'green') + '\n'); +} + +function formatPretty(value: unknown): string { + if (value === undefined) return color('(undefined)', 'gray'); + if (value === null) return color('null', 'gray'); + if (typeof value === 'string') return value; + try { + return JSON.stringify(value, null, 2); + } catch { + return String(value); + } +} diff --git a/packages/cli/tsconfig.json b/packages/cli/tsconfig.json new file mode 100644 index 0000000..eb02d8e --- /dev/null +++ b/packages/cli/tsconfig.json @@ -0,0 +1,12 @@ +{ + "extends": "../../tsconfig.json", + "compilerOptions": { + "outDir": "./dist", + "rootDir": "./src", + "module": "ESNext", + "moduleResolution": "bundler", + "noEmit": true + }, + "include": ["src/**/*"], + "exclude": ["node_modules", "dist"] +} diff --git a/packages/cli/tsup.config.ts b/packages/cli/tsup.config.ts new file mode 100644 index 0000000..35caeca --- /dev/null +++ b/packages/cli/tsup.config.ts @@ -0,0 +1,25 @@ +import { defineConfig } from 'tsup'; + +export default defineConfig({ + entry: ['src/main.ts'], + format: ['esm'], + target: 'node20', + banner: { + js: '#!/usr/bin/env node', + }, + shims: false, + clean: true, + sourcemap: true, + dts: false, + splitting: false, + // @hawkeye/core has heavy native deps (better-sqlite3, screenshot-desktop, etc.) + // Keep it external so Node resolves it from node_modules at runtime. 
+ external: [ + '@hawkeye/core', + 'better-sqlite3', + 'sqlite-vec', + 'screenshot-desktop', + 'node-llama-cpp', + /\.node$/, + ], +}); diff --git a/packages/desktop-tauri/AGENT_INTEGRATION.md b/packages/desktop-tauri/AGENT_INTEGRATION.md new file mode 100644 index 0000000..3746525 --- /dev/null +++ b/packages/desktop-tauri/AGENT_INTEGRATION.md @@ -0,0 +1,196 @@ +# Agent (cua-driver) Integration + +This document describes the **computer-use agent** layer added to Hawkeye's +Tauri build: how it's wired, how to install the dependency binary, how to +test it end-to-end, and what the security model looks like. + +## What got built + +Hawkeye's Tauri app now has a **"hand"** to match its existing **"eye"** +(WebGazer) and **"brain"** (Gemini chat). When the user enables Agent mode +in the chat panel, Gemini receives a tool catalog and can: + +- **screenshot** the desktop +- **list_windows** / **get_window_state** to see what's running +- **click** / **type_text** / **press_key** / **scroll** to act +- **launch_app** to open something + +All actions go through [trycua/cua's `cua-driver`](https://github.com/trycua/cua) +— a Swift daemon that drives native macOS apps **in the background without +stealing focus or moving the cursor** by using private SkyLight SPIs and +focus-without-raise tricks. We talk to it over a Unix socket using the line- +delimited JSON protocol it already exposes for its CLI. 
+ +## Architecture + +``` +┌─────────────────┐ invoke ┌─────────────────────┐ +│ ChatPanel + │ ───────────► │ chat_with_agent │ +│ useAgent hook │ │ (commands/agent_cmd)│ +└─────────────────┘ └─────────┬───────────┘ + ▲ │ + │ agent:tool-call-{start,end} │ + │ Tauri events ▼ +┌───────┴─────────┐ ┌─────────────────────┐ +│ Live tool │ │ run_user_turn │ ← agent/runner.rs +│ stream UI │ │ (loop) │ +└─────────────────┘ └─────┬───────────┬───┘ + │ │ + ┌────▼──┐ ┌────▼──────┐ + │Gemini │ │ CuaDriver │ + │chat_ │ │ Client │ ← agent/cua_driver.rs + │with_ │ └─────┬─────┘ + │tools │ │ Unix socket + └───────┘ │ JSON-line protocol + ▼ + ┌──────────────────────────────────┐ + │ cua-driver daemon (Swift) │ + │ ~/Library/Caches/cua-driver/ │ + │ cua-driver.sock │ + │ → CGEvent / SkyLight / AX │ + │ → 28 MCP tools (we use 8) │ + └──────────────────────────────────┘ +``` + +### New files + +- `src-tauri/src/agent/protocol.rs` — wire types (`DaemonRequest`, `DaemonResponse`, `CallResult`, `ContentBlock`) +- `src-tauri/src/agent/cua_driver.rs` — async Unix-socket client + `DaemonSupervisor` (binary discovery, spawn, health check) +- `src-tauri/src/agent/tools.rs` — curated 8-tool catalog mapped to Gemini `FunctionDeclaration`s +- `src-tauri/src/agent/runner.rs` — tool-use loop (`run_user_turn`), max 8 rounds, emits `agent:tool-call-{start,end}` events +- `src-tauri/src/agent/mod.rs` — module exports +- `src-tauri/src/commands/agent_cmd.rs` — Tauri commands: `get_agent_status`, `start_agent`, `chat_with_agent`, `invoke_cua_tool` +- `src/hooks/useAgent.ts` — React hook with live tool-call streaming +- `src/hooks/useTauri.ts` (extended) — TypeScript types and invoke wrappers + +### Modified files + +- `src-tauri/src/ai/types.rs` — `FunctionDeclaration`, `FunctionCall`, `FunctionResult`, `ToolMessage`, `ToolTurn`; Gemini wire types extended with `tools`, `tool_config`, `function_call`/`function_response` parts +- `src-tauri/src/ai/provider.rs` — `AiProvider::chat_with_tools` (default: 
unsupported error) + `supports_tools()` +- `src-tauri/src/ai/gemini.rs` — full `chat_with_tools` impl with multi-round tool conversation translation +- `src-tauri/src/state.rs` — `ai_client: RwLock>>` (was `Box`); new `agent_supervisor: RwLock>` +- `src-tauri/src/commands/chat_cmd.rs` — wraps providers in `Arc` instead of `Box` +- `src-tauri/src/lib.rs` — registers `agent` module + 4 new commands; spawns supervisor in setup +- `src-tauri/src/events.rs` — `AGENT_TOOL_CALL_{START,END}`, `AGENT_DAEMON_{READY,ERROR}` +- `src/components/ChatPanel.tsx` — Agent mode toggle, tool-call audit trail, live stream + +## Installing cua-driver + +The Hawkeye binary does **not** ship with cua-driver. Install it once: + +```bash +/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/cua-driver/scripts/install.sh)" +``` + +This downloads a signed/notarized release tarball, places `CuaDriver.app` +in `/Applications/`, and symlinks `/usr/local/bin/cua-driver`. After the +first launch, macOS will prompt for **Accessibility** and **Screen +Recording** permissions — both required. + +Override the version with `CUA_DRIVER_VERSION=0.0.5` and override the +binary location with `CUA_DRIVER_BIN=/path/to/cua-driver` if needed. + +To verify: + +```bash +cua-driver --version # prints the build number +cua-driver serve & # starts the daemon +ls ~/Library/Caches/cua-driver/cua-driver.sock # should exist +``` + +## Running Hawkeye + the agent + +```bash +cd packages/desktop-tauri +pnpm tauri:dev +``` + +In the chat panel: + +1. Toggle the **Agent** checkbox at the top. +2. The status badge will show one of: `⚠ driver missing` / `start daemon` / `● ready`. + - If "start daemon", click it to spawn the daemon (or call `start_agent` from devtools). +3. Type a request. Examples: + - `Take a screenshot and describe what's on my screen.` + - `Open Safari.` + - `What apps are currently running?` + - `Click the "Send" button in the focused window.` +4. 
As Gemini decides to call tools, you'll see them stream in real time + under the conversation (`screenshot — Captured 1440x900 PNG`, `click — + Clicked successfully`, etc.). +5. The final assistant text appears once the model emits text-only output. + +## Security model + +- **Allow-list**: Only the 8 curated tool names in `agent::tools::allowed_tool_names()` + can be invoked; anything else returns `{ok: false, error: "not in allow-list"}` + *to the model* (not to the daemon), letting it recover. +- **Round cap**: `MAX_TOOL_ROUNDS = 8`. The loop hard-stops past that limit. +- **Socket permissions**: cua-driver creates the socket with mode `0o600`, + so only the owning user can speak to it. +- **No daemon auth**: the socket is filesystem-permission-gated only. + TCC (Accessibility/Screen Recording) lives at the OS level on the + cua-driver app bundle. +- **Failures degrade**: tool errors become `{ok:false, error:…}` payloads + fed back to the model, not exceptions to the user. The model can then + apologize, retry, or change strategy. + +## Known limitations / future work + +- **OpenAI / local llama.cpp** providers still respond with "tool calling + not supported". Adding it is straightforward: implement + `chat_with_tools` for each via OpenAI's `tools` field / a JSON-formatted + prompt for local models. +- **Curated tool set is static**. We ignore the daemon's `list` and + `describe` methods; we could surface the full 28-tool catalog + dynamically with a richer Gemini schema translation. +- **No conversation persistence**. Both plain chat and agent chat lose + history on reload. Hawkeye memory says this is also true today for the + non-agent path. +- **Image return path**: when the model calls `screenshot`, we attach the + PNG as a *follow-up* user image part (since Gemini doesn't support + inline images inside `function_response`). Works in practice; mention + this in prompts if needed. +- **macOS only**. 
cua-driver is macOS-exclusive (uses Apple + Virtualization-adjacent SkyLight SPIs). Linux/Windows would need a + different driver. + +## Verifying the integration + +```bash +# Backend tests +cd packages/desktop-tauri/src-tauri +cargo test --lib agent:: + +# Expected: 5 passed +# default_socket_under_cache_dir +# encodes_call_request_as_one_line +# decodes_call_success_response +# decodes_error_response +# decodes_screenshot_image_block +``` + +```bash +# Manual smoke (requires cua-driver installed) +cua-driver serve & +sleep 1 + +# From any Hawkeye chat with agent mode on: +"List my open windows" +# → list_windows tool call → assistant reports the windows + +"Take a screenshot" +# → screenshot tool call → assistant describes what's on screen +``` + +## File-level test plan + +| Layer | Test | +|---|---| +| `agent/protocol.rs` | unit tests pass (encode/decode JSON-line protocol) | +| `agent/cua_driver.rs` | `is_running()` returns false with no daemon; `default_path()` resolves under `~/Library/Caches/` | +| `agent/runner.rs` | (manual) tool-use loop runs to completion within `MAX_TOOL_ROUNDS` | +| `ai/gemini.rs::do_chat_with_tools` | (manual) supplies a `tools` block, parses `function_call` parts | +| `commands/agent_cmd.rs` | `get_agent_status` returns sane values; `invoke_cua_tool` blocks unauthorized tools | +| `useAgent.ts` | live tool stream populates from `agent:tool-call-*` events | +| `ChatPanel.tsx` | Agent toggle switches to `chat_with_agent`; clear tool trail rendering | diff --git a/packages/desktop-tauri/src-tauri/Cargo.lock b/packages/desktop-tauri/src-tauri/Cargo.lock index 9309044..1bfc5e5 100644 --- a/packages/desktop-tauri/src-tauri/Cargo.lock +++ b/packages/desktop-tauri/src-tauri/Cargo.lock @@ -246,7 +246,7 @@ dependencies = [ "anyhow", "arrayvec", "log", - "nom", + "nom 8.0.0", "num-rational", "v_frame", ] @@ -272,6 +272,26 @@ version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "bindgen" +version = "0.72.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895" +dependencies = [ + "bitflags 2.11.0", + "cexpr", + "clang-sys", + "itertools 0.13.0", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn 2.0.115", +] + [[package]] name = "bit_field" version = "0.10.3" @@ -471,6 +491,15 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" +[[package]] +name = "cexpr" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fac387a98bb7c37292057cffc56d62ecb629900026402633ae9160df93a8766" +dependencies = [ + "nom 7.1.3", +] + [[package]] name = "cfb" version = "0.7.3" @@ -512,6 +541,57 @@ dependencies = [ "windows-link 0.2.1", ] +[[package]] +name = "clang-sys" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b023947811758c97c59bf9d1c188fd619ad4718dcaa767947df1cadb14f39f4" +dependencies = [ + "glob", + "libc", + "libloading 0.8.9", +] + +[[package]] +name = "clap" +version = "4.5.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2797f34da339ce31042b27d23607e051786132987f595b02ba4f6a6dffb7030a" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24a241312cea5059b13574bb9b3861cabf758b879c15190b37b6d6fd63ab6876" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.55" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a92793da1a46a5f2a02a6f4c46c6496b28c43638adea8306fcb0caa1634f24e5" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "clap_lex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" + [[package]] name = "clipboard-win" version = "5.4.1" @@ -521,6 +601,15 @@ dependencies = [ "error-code", ] +[[package]] +name = "cmake" +version = "0.1.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75443c44cd6b379beb8c5b45d85d0773baf31cce901fe7bb252f4eff3008ef7d" +dependencies = [ + "cc", +] + [[package]] name = "cocoa" version = "0.26.1" @@ -1052,6 +1141,26 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "enumflags2" +version = "0.7.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1027f7680c853e056ebcec683615fb6fbbc07dbaa13b4d5d9442b146ded4ecef" +dependencies = [ + "enumflags2_derive", +] + +[[package]] +name = "enumflags2_derive" +version = "0.7.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67c78a4d8fdf9953a5c9d458f9efe940fd97a0cab0941c075a813ac594733827" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.115", +] + [[package]] name = "env_filter" version = "1.0.0" @@ -1205,6 +1314,15 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" +[[package]] +name = "find_cuda_helper" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9f9e65c593dd01ac77daad909ea4ad17f0d6d1776193fc8ea766356177abdad" +dependencies = [ + "glob", +] + [[package]] name = "fixedbitset" version = "0.5.7" @@ -1768,11 +1886,15 @@ dependencies = [ "async-trait", "base64 0.22.1", "chrono", + "clap", "cocoa", "dirs", + "encoding_rs", "env_logger", "futures-util", + "glob", "image 
0.25.9", + "llama-cpp-2", "log", "objc", "open", @@ -2247,6 +2369,15 @@ version = "1.70.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" +[[package]] +name = "itertools" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + [[package]] name = "itertools" version = "0.14.0" @@ -2532,6 +2663,34 @@ version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" +[[package]] +name = "llama-cpp-2" +version = "0.1.138" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2947ab625c59d1fdf42e61f538c3fa66f43de2f78316971920873f359483d1d8" +dependencies = [ + "encoding_rs", + "enumflags2", + "llama-cpp-sys-2", + "thiserror 2.0.18", + "tracing", + "tracing-core", +] + +[[package]] +name = "llama-cpp-sys-2" +version = "0.1.138" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84a529006bf16af70c7485ba957820dc2bc9467d75697e97970c81d2da73c76f" +dependencies = [ + "bindgen", + "cc", + "cmake", + "find_cuda_helper", + "glob", + "walkdir", +] + [[package]] name = "lock_api" version = "0.4.14" @@ -2651,6 +2810,12 @@ version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "minisign-verify" version = "0.2.4" @@ -2781,6 +2946,16 @@ version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + [[package]] name = "nom" version = "8.0.0" @@ -3813,7 +3988,7 @@ dependencies = [ "built", "cfg-if", "interpolate_name", - "itertools", + "itertools 0.14.0", "libc", "libfuzzer-sys", "log", @@ -4077,6 +4252,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "rustc-hash" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" + [[package]] name = "rustc_version" version = "0.4.1" @@ -5539,9 +5720,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" dependencies = [ "pin-project-lite", + "tracing-attributes", "tracing-core", ] +[[package]] +name = "tracing-attributes" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.115", +] + [[package]] name = "tracing-core" version = "0.1.36" @@ -5549,6 +5742,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" dependencies = [ "once_cell", + "valuable", ] [[package]] @@ -5580,7 +5774,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8765b90061cba6c22b5831f675da109ae5561588290f9fa2317adab2714d5a6" dependencies = [ "memchr", - "nom", + "nom 8.0.0", "petgraph", ] @@ -5733,6 +5927,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "valuable" +version = "0.1.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" + [[package]] name = "vcpkg" version = "0.2.15" diff --git a/packages/desktop-tauri/src-tauri/Cargo.toml b/packages/desktop-tauri/src-tauri/Cargo.toml index 56f83ad..850793f 100644 --- a/packages/desktop-tauri/src-tauri/Cargo.toml +++ b/packages/desktop-tauri/src-tauri/Cargo.toml @@ -12,6 +12,18 @@ rust-version = "1.70" name = "hawkeye_lib" crate-type = ["staticlib", "cdylib", "rlib"] +# The Tauri webview app — same as the auto-discovered default but explicit +# so we can sit alongside `hawkeye-cli` without ambiguity. +[[bin]] +name = "hawkeye-desktop" +path = "src/main.rs" + +# Headless / scriptable CLI that drives the same subsystems (observe loop, +# AI providers, agent runner) without booting Tauri or a webview. +[[bin]] +name = "hawkeye-cli" +path = "src/bin/cli.rs" + [build-dependencies] tauri-build = { version = "2", features = [] } @@ -54,12 +66,20 @@ async-trait = "0.1" log = "0.4" env_logger = "0.11" +# Local LLM inference (llama.cpp Rust bindings, Metal for macOS GPU) +llama-cpp-2 = { version = "0.1", default-features = false, features = ["metal"] } +encoding_rs = "0.8" + # Utilities uuid = { version = "1", features = ["v4"] } chrono = "0.4" base64 = "0.22" open = "5" dirs = "6" +glob = "0.3" + +# CLI argument parsing for the headless `hawkeye-cli` binary. +clap = { version = "4", features = ["derive"] } [target.'cfg(target_os = "macos")'.dependencies] cocoa = "0.26" diff --git a/packages/desktop-tauri/src-tauri/src/agent/cua_driver.rs b/packages/desktop-tauri/src-tauri/src/agent/cua_driver.rs new file mode 100644 index 0000000..8a42ae1 --- /dev/null +++ b/packages/desktop-tauri/src-tauri/src/agent/cua_driver.rs @@ -0,0 +1,233 @@ +//! Async client for the cua-driver daemon. +//! +//! Connects to `~/Library/Caches/cua-driver/cua-driver.sock`, sends a single +//! 
JSON request line, reads a single JSON response line, and closes the +//! socket. This mirrors the CLI's `cua-driver call …` pattern. + +use anyhow::{anyhow, bail, Context, Result}; +use serde_json::Value; +use std::collections::HashMap; +use std::path::PathBuf; +use std::process::Stdio; +use std::time::Duration; +use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader}; +use tokio::net::UnixStream; +use tokio::process::Command; +use tokio::time::timeout; + +use super::protocol::{ + CallResult, DaemonRequest, DaemonResponse, DaemonResult, ToolDescriptor, +}; + +/// Default socket path: `$HOME/Library/Caches/cua-driver/cua-driver.sock`. +pub fn default_socket_path() -> Option { + dirs::cache_dir().map(|p| p.join("cua-driver").join("cua-driver.sock")) +} + +/// Default PID file path. +pub fn default_pid_path() -> Option { + dirs::cache_dir().map(|p| p.join("cua-driver").join("cua-driver.pid")) +} + +/// Resolve the cua-driver binary location. We check, in order: +/// 1. `$CUA_DRIVER_BIN` env override +/// 2. `/usr/local/bin/cua-driver` (install.sh symlink) +/// 3. `/Applications/CuaDriver.app/Contents/MacOS/cua-driver` +pub fn resolve_binary() -> Option { + if let Ok(env) = std::env::var("CUA_DRIVER_BIN") { + let p = PathBuf::from(env); + if p.exists() { + return Some(p); + } + } + let candidates = [ + "/usr/local/bin/cua-driver", + "/Applications/CuaDriver.app/Contents/MacOS/cua-driver", + ]; + candidates.iter().map(PathBuf::from).find(|p| p.exists()) +} + +/// Async client speaking the cua-driver daemon protocol. +#[derive(Debug, Clone)] +pub struct CuaDriverClient { + socket_path: PathBuf, + /// Per-request timeout (connect + send + receive). + request_timeout: Duration, +} + +impl CuaDriverClient { + pub fn new(socket_path: PathBuf) -> Self { + Self { socket_path, request_timeout: Duration::from_secs(30) } + } + + /// Convenience: client at the default socket path. 
+ pub fn default_path() -> Result { + let p = default_socket_path().ok_or_else(|| anyhow!("no $HOME cache dir"))?; + Ok(Self::new(p)) + } + + pub fn with_timeout(mut self, t: Duration) -> Self { + self.request_timeout = t; + self + } + + pub fn socket_path(&self) -> &PathBuf { + &self.socket_path + } + + /// Probe whether the daemon is reachable. Connects with a short timeout + /// and immediately closes on success. + pub async fn is_running(&self) -> bool { + timeout(Duration::from_millis(500), UnixStream::connect(&self.socket_path)) + .await + .map(|r| r.is_ok()) + .unwrap_or(false) + } + + /// Send a request and return the raw daemon response. + pub async fn send(&self, request: &DaemonRequest) -> Result { + timeout(self.request_timeout, self.send_inner(request)) + .await + .map_err(|_| anyhow!("cua-driver request timed out after {:?}", self.request_timeout))? + } + + async fn send_inner(&self, request: &DaemonRequest) -> Result { + let stream = UnixStream::connect(&self.socket_path) + .await + .with_context(|| format!("connect to cua-driver socket {}", self.socket_path.display()))?; + + let (read_half, mut write_half) = stream.into_split(); + + let mut payload = serde_json::to_vec(request)?; + payload.push(b'\n'); + write_half.write_all(&payload).await?; + // Half-close write side so the daemon knows the request is complete + // (it scans for `\n`, so this is belt-and-suspenders). + write_half.flush().await?; + // Drop the writer to half-shutdown — the daemon already has a full + // line and will respond, so this is safe. 
+ drop(write_half); + + let mut reader = BufReader::new(read_half); + let mut line = String::new(); + let n = reader + .read_line(&mut line) + .await + .context("read cua-driver response line")?; + if n == 0 { + bail!("cua-driver closed connection without responding"); + } + let resp: DaemonResponse = serde_json::from_str(line.trim_end_matches('\n')) + .with_context(|| format!("parse cua-driver response: {}", line))?; + Ok(resp) + } + + /// Invoke a tool and unwrap to a `CallResult`. Returns `Err` if the + /// daemon returned `ok=false` or if the result kind isn't `call`. + pub async fn call(&self, tool: &str, args: HashMap) -> Result { + let resp = self.send(&DaemonRequest::call(tool, args)).await?; + if !resp.ok { + let err = resp.error.unwrap_or_else(|| "(no error message)".into()); + bail!("cua-driver tool '{}' failed (exit={:?}): {}", tool, resp.exit_code, err); + } + match resp.result { + Some(DaemonResult::Call(r)) => { + if r.is_error { + bail!("cua-driver tool '{}' reported isError=true: {}", tool, r.text()); + } + Ok(r) + } + other => bail!("cua-driver returned unexpected result kind: {:?}", other), + } + } + + /// `list` method — enumerate available tools. + pub async fn list_tools(&self) -> Result> { + let resp = self.send(&DaemonRequest::list()).await?; + if !resp.ok { + bail!(resp.error.unwrap_or_else(|| "list failed".into())); + } + match resp.result { + Some(DaemonResult::List(t)) => Ok(t), + other => bail!("expected List result, got {:?}", other), + } + } + + /// `describe` — schema for a single tool. + pub async fn describe(&self, tool: &str) -> Result { + let resp = self.send(&DaemonRequest::describe(tool)).await?; + if !resp.ok { + bail!(resp.error.unwrap_or_else(|| "describe failed".into())); + } + match resp.result { + Some(DaemonResult::Describe(t)) => Ok(t), + other => bail!("expected Describe result, got {:?}", other), + } + } +} + +/// Daemon lifecycle helper. 
Spawns `cua-driver serve` in the background if +/// the socket isn't already accepting connections. +pub struct DaemonSupervisor { + binary: Option, + client: CuaDriverClient, +} + +impl DaemonSupervisor { + pub fn new(client: CuaDriverClient) -> Self { + Self { binary: resolve_binary(), client } + } + + pub fn binary_path(&self) -> Option<&PathBuf> { + self.binary.as_ref() + } + + pub async fn ensure_running(&self) -> Result<()> { + if self.client.is_running().await { + return Ok(()); + } + let binary = self + .binary + .as_ref() + .ok_or_else(|| anyhow!( + "cua-driver binary not found. Install via:\n /bin/bash -c \"$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/cua-driver/scripts/install.sh)\"\nor set CUA_DRIVER_BIN env var." + ))?; + + log::info!("[cua-driver] spawning daemon: {} serve", binary.display()); + + // Detach: stdout/stderr to null, no stdin. The daemon writes a PID + // file and listens for SIGINT/SIGTERM for shutdown. + Command::new(binary) + .arg("serve") + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() + .context("spawn cua-driver serve")?; + + // Poll the socket until it accepts a connection (max ~3s). + for attempt in 0..30 { + tokio::time::sleep(Duration::from_millis(100)).await; + if self.client.is_running().await { + log::info!("[cua-driver] daemon ready after {} ms", (attempt + 1) * 100); + return Ok(()); + } + } + bail!("cua-driver daemon failed to start within 3s"); + } + + pub fn client(&self) -> &CuaDriverClient { + &self.client + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn default_socket_under_cache_dir() { + let p = default_socket_path().unwrap(); + assert!(p.ends_with("cua-driver/cua-driver.sock")); + } +} diff --git a/packages/desktop-tauri/src-tauri/src/agent/mod.rs b/packages/desktop-tauri/src-tauri/src/agent/mod.rs new file mode 100644 index 0000000..13cbed6 --- /dev/null +++ b/packages/desktop-tauri/src-tauri/src/agent/mod.rs @@ -0,0 +1,18 @@ +//! 
Agent module — bridges the AI provider to cua-driver for desktop control. +//! +//! Layers: +//! - [`protocol`]: Wire types (`DaemonRequest`, `DaemonResponse`, …) for +//! the cua-driver Unix-socket protocol. +//! - [`cua_driver`]: Async client + daemon supervisor. +//! - [`tools`]: Curated catalog mapped to `FunctionDeclaration`s +//! surfaced to the LLM. +//! - [`runner`]: Tool-use loop orchestrating `chat_with_tools` ↔ +//! cua-driver tool execution. + +pub mod cua_driver; +pub mod protocol; +pub mod runner; +pub mod tools; + +pub use cua_driver::{CuaDriverClient, DaemonSupervisor}; +pub use runner::{run_user_turn, AgentTurnResult, ToolCallRecord, MAX_TOOL_ROUNDS}; diff --git a/packages/desktop-tauri/src-tauri/src/agent/protocol.rs b/packages/desktop-tauri/src-tauri/src/agent/protocol.rs new file mode 100644 index 0000000..e86c175 --- /dev/null +++ b/packages/desktop-tauri/src-tauri/src/agent/protocol.rs @@ -0,0 +1,189 @@ +//! cua-driver daemon wire protocol types. +//! +//! The daemon listens on a Unix domain socket at +//! `~/Library/Caches/cua-driver/cua-driver.sock` (mode 0o600) and speaks a +//! line-delimited JSON protocol — each message is a single JSON object +//! followed by `\n`. This is intentionally simpler than MCP framing. +//! +//! References (Swift sources): `libs/cua-driver/Sources/CuaDriverServer/{DaemonProtocol,DaemonServer}.swift`. + +use serde::{Deserialize, Serialize}; +use serde_json::Value; +use std::collections::HashMap; + +/// Top-level request sent to the daemon. +/// +/// `method` is one of `"call"`, `"list"`, `"describe"`, `"shutdown"`. 
+#[derive(Debug, Clone, Serialize)] +pub struct DaemonRequest { + pub method: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub name: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub args: Option>, +} + +impl DaemonRequest { + pub fn list() -> Self { + Self { method: "list".into(), name: None, args: None } + } + + pub fn describe(tool: impl Into) -> Self { + Self { method: "describe".into(), name: Some(tool.into()), args: None } + } + + pub fn call(tool: impl Into, args: HashMap) -> Self { + Self { method: "call".into(), name: Some(tool.into()), args: Some(args) } + } + + pub fn shutdown() -> Self { + Self { method: "shutdown".into(), name: None, args: None } + } +} + +/// Top-level response from the daemon. +/// +/// On success, `ok=true` and `result` is set. On failure, `ok=false` and +/// `error` carries the message; `exit_code` follows sysexits.h conventions +/// (1=tool error, 64=usage, 65=data, 70=software). +#[derive(Debug, Clone, Deserialize)] +pub struct DaemonResponse { + pub ok: bool, + #[serde(default)] + pub result: Option, + #[serde(default)] + pub error: Option, + #[serde(rename = "exitCode", default)] + pub exit_code: Option, +} + +/// Result discriminator — matches Swift's `DaemonResult` enum. +#[derive(Debug, Clone, Deserialize)] +#[serde(tag = "kind", content = "payload", rename_all = "lowercase")] +pub enum DaemonResult { + Call(CallResult), + List(Vec), + Describe(ToolDescriptor), +} + +/// Result of a tool invocation. Mirrors MCP's `CallTool.Result`: a list of +/// content blocks (text, image, etc.) plus an `isError` flag. +#[derive(Debug, Clone, Deserialize)] +pub struct CallResult { + #[serde(default)] + pub content: Vec, + #[serde(rename = "isError", default)] + pub is_error: bool, +} + +impl CallResult { + /// Concatenate all text blocks for human-readable summaries. 
+ pub fn text(&self) -> String { + self.content + .iter() + .filter_map(|b| match b { + ContentBlock::Text { text } => Some(text.as_str()), + _ => None, + }) + .collect::>() + .join("\n") + } + + /// First image block (e.g. `screenshot` returns inline base64 PNG). + pub fn first_image(&self) -> Option<(&str, &str)> { + self.content.iter().find_map(|b| match b { + ContentBlock::Image { data, mime_type } => Some((data.as_str(), mime_type.as_str())), + _ => None, + }) + } +} + +/// A single content block in a tool result. +#[derive(Debug, Clone, Deserialize)] +#[serde(tag = "type", rename_all = "lowercase")] +pub enum ContentBlock { + Text { + text: String, + }, + Image { + data: String, + #[serde(rename = "mimeType")] + mime_type: String, + }, + /// Forward-compat: unknown block types are preserved as raw JSON. + #[serde(other, deserialize_with = "deserialize_unknown")] + Other, +} + +fn deserialize_unknown<'de, D>(deserializer: D) -> Result<(), D::Error> +where + D: serde::Deserializer<'de>, +{ + serde::de::IgnoredAny::deserialize(deserializer).map(|_| ()) +} + +/// Tool descriptor returned by `list` / `describe`. The full schema is rich; +/// we keep only fields we use for surfacing to the LLM. 
+#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct ToolDescriptor { + pub name: String, + #[serde(default)] + pub description: Option, + #[serde(rename = "inputSchema", default)] + pub input_schema: Option, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn encodes_call_request_as_one_line() { + let mut args = HashMap::new(); + args.insert("pid".into(), Value::from(1234)); + args.insert("x".into(), Value::from(100)); + args.insert("y".into(), Value::from(200)); + let req = DaemonRequest::call("click", args); + let json = serde_json::to_string(&req).unwrap(); + assert!(json.contains("\"method\":\"call\"")); + assert!(json.contains("\"name\":\"click\"")); + assert!(!json.contains('\n')); + } + + #[test] + fn decodes_call_success_response() { + let body = r#"{"ok":true,"result":{"kind":"call","payload":{"content":[{"type":"text","text":"Clicked"}],"isError":false}}}"#; + let resp: DaemonResponse = serde_json::from_str(body).unwrap(); + assert!(resp.ok); + match resp.result.unwrap() { + DaemonResult::Call(r) => { + assert!(!r.is_error); + assert_eq!(r.text(), "Clicked"); + } + _ => panic!("expected Call result"), + } + } + + #[test] + fn decodes_error_response() { + let body = r#"{"ok":false,"error":"Unknown tool","exitCode":64}"#; + let resp: DaemonResponse = serde_json::from_str(body).unwrap(); + assert!(!resp.ok); + assert_eq!(resp.error.as_deref(), Some("Unknown tool")); + assert_eq!(resp.exit_code, Some(64)); + } + + #[test] + fn decodes_screenshot_image_block() { + let body = r#"{"ok":true,"result":{"kind":"call","payload":{"content":[{"type":"image","data":"iVBOR…","mimeType":"image/png"}],"isError":false}}}"#; + let resp: DaemonResponse = serde_json::from_str(body).unwrap(); + match resp.result.unwrap() { + DaemonResult::Call(r) => { + let (data, mime) = r.first_image().unwrap(); + assert!(data.starts_with("iVBOR")); + assert_eq!(mime, "image/png"); + } + _ => panic!(), + } + } +} diff --git 
a/packages/desktop-tauri/src-tauri/src/agent/runner.rs b/packages/desktop-tauri/src-tauri/src/agent/runner.rs new file mode 100644 index 0000000..8040aff --- /dev/null +++ b/packages/desktop-tauri/src-tauri/src/agent/runner.rs @@ -0,0 +1,236 @@ +//! Tool-use orchestration loop. +//! +//! Drives a single user turn through the model + cua-driver until the model +//! emits final text (or we hit the safety cap). Emits events through an +//! [`EventSink`] on each tool call so the host (Tauri UI, CLI stdout, …) +//! can render progress in real time. + +use anyhow::{anyhow, bail, Result}; +use serde::Serialize; +use serde_json::{json, Value}; +use std::collections::HashMap; +use std::sync::Arc; + +use super::cua_driver::CuaDriverClient; +use super::tools::{gemini_function_declarations, is_allowed}; +use crate::ai::types::{ + FunctionCall, FunctionResult, ToolMessage, ToolTurn, UsageInfo, +}; +use crate::ai::AiProvider; +use crate::event_sink::EventSink; +use crate::events; + +/// Maximum number of tool-call rounds in a single user turn. +pub const MAX_TOOL_ROUNDS: usize = 8; + +/// Final outcome of a tool-using turn. +#[derive(Debug, Clone, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct AgentTurnResult { + pub text: String, + pub rounds: usize, + pub tool_calls: Vec, + pub usage: Option, +} + +/// One entry in the per-turn audit log emitted to the frontend. +#[derive(Debug, Clone, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct ToolCallRecord { + pub round: usize, + pub name: String, + pub args: Value, + pub ok: bool, + pub summary: String, +} + +/// Execute a single user turn end-to-end. +/// +/// `history` is the prior conversation (text-only roles); we append the new +/// user input ourselves. `cua_driver` may be `None`, in which case the model +/// will be given an empty tool list and forced to answer textually. 
+pub async fn run_user_turn( + sink: Arc, + provider: Arc, + cua_driver: Option, + history: Vec, + user_input: String, +) -> Result { + if !provider.supports_tools() { + bail!("Provider '{}' does not support tool calling", provider.provider_name()); + } + + let tools = if cua_driver.is_some() { + gemini_function_declarations() + } else { + Vec::new() + }; + + let mut messages = history; + messages.push(ToolMessage::User(user_input)); + + let mut tool_calls: Vec = Vec::new(); + let mut last_usage: Option = None; + + for round in 1..=MAX_TOOL_ROUNDS { + let turn = provider.chat_with_tools(messages.clone(), &tools).await?; + + match turn { + ToolTurn::Text { text, usage } => { + if let Some(u) = usage.clone() { + last_usage = Some(u); + } + return Ok(AgentTurnResult { + text, + rounds: round, + tool_calls, + usage: last_usage, + }); + } + ToolTurn::ToolCalls { calls, usage } => { + if let Some(u) = usage.clone() { + last_usage = Some(u); + } + + if calls.is_empty() { + bail!("model returned empty tool-call list"); + } + + let driver = cua_driver + .as_ref() + .ok_or_else(|| anyhow!("model requested a tool but no cua-driver client is available"))?; + + // Record the model's tool calls in history (Gemini requires + // function_call → function_response symmetry). + messages.push(ToolMessage::AssistantToolCalls(calls.clone())); + + for call in calls { + let record = + execute_tool(sink.as_ref(), driver, &call, round, &mut messages).await; + tool_calls.push(record); + } + } + } + } + + bail!( + "agent exceeded {} tool-call rounds without final answer", + MAX_TOOL_ROUNDS + ) +} + +/// Execute one tool call, append its result to `messages`, and return an +/// audit record. Errors are not propagated — they are reported back to the +/// model as `{ok: false, error: …}` so it can recover. 
+async fn execute_tool( + sink: &dyn EventSink, + driver: &CuaDriverClient, + call: &FunctionCall, + round: usize, + messages: &mut Vec, +) -> ToolCallRecord { + log::info!("[agent] round {} tool call: {} {}", round, call.name, call.args); + + // Notify host that a tool call started. + sink.emit( + events::AGENT_TOOL_CALL_START, + json!({ + "round": round, + "name": call.name, + "args": call.args, + }), + ); + + if !is_allowed(&call.name) { + let err = format!("tool '{}' is not in the allow-list", call.name); + let response = json!({ "ok": false, "error": err }); + messages.push(ToolMessage::ToolResult(FunctionResult { + name: call.name.clone(), + response: response.clone(), + })); + sink.emit( + events::AGENT_TOOL_CALL_END, + json!({ "round": round, "name": call.name, "ok": false, "summary": err }), + ); + return ToolCallRecord { + round, + name: call.name.clone(), + args: call.args.clone(), + ok: false, + summary: err, + }; + } + + let args_map = match call.args.as_object() { + Some(m) => m + .iter() + .map(|(k, v)| (k.clone(), v.clone())) + .collect::>(), + None => HashMap::new(), + }; + + match driver.call(&call.name, args_map).await { + Ok(result) => { + // Build response payload for the model. + let mut payload = json!({ + "ok": true, + "summary": result.text(), + }); + + // For screenshots, also feed the image back so the model can + // actually see the screen. Gemini doesn't support inline images + // inside function_response, so we attach the image as a + // follow-up user image part in the next turn. 
+ let mut attached_image: Option<(String, String)> = None; + if let Some((data, mime)) = result.first_image() { + attached_image = Some((mime.to_string(), data.to_string())); + if let Some(obj) = payload.as_object_mut() { + obj.insert("image".into(), json!({ + "mimeType": mime, + "note": "image attached as follow-up user image part", + })); + } + } + + let summary = result.text(); + messages.push(ToolMessage::ToolResult(FunctionResult { + name: call.name.clone(), + response: payload, + })); + if let Some((mime, data)) = attached_image { + messages.push(ToolMessage::UserImage { mime_type: mime, data }); + } + + sink.emit( + events::AGENT_TOOL_CALL_END, + json!({ "round": round, "name": call.name, "ok": true, "summary": summary }), + ); + ToolCallRecord { + round, + name: call.name.clone(), + args: call.args.clone(), + ok: true, + summary, + } + } + Err(e) => { + let err = e.to_string(); + log::warn!("[agent] tool '{}' failed: {}", call.name, err); + messages.push(ToolMessage::ToolResult(FunctionResult { + name: call.name.clone(), + response: json!({ "ok": false, "error": err }), + })); + sink.emit( + events::AGENT_TOOL_CALL_END, + json!({ "round": round, "name": call.name, "ok": false, "summary": err }), + ); + ToolCallRecord { + round, + name: call.name.clone(), + args: call.args.clone(), + ok: false, + summary: err, + } + } + } +} diff --git a/packages/desktop-tauri/src-tauri/src/agent/tools.rs b/packages/desktop-tauri/src-tauri/src/agent/tools.rs new file mode 100644 index 0000000..4388e3a --- /dev/null +++ b/packages/desktop-tauri/src-tauri/src/agent/tools.rs @@ -0,0 +1,209 @@ +//! Curated catalog of cua-driver tools surfaced to the LLM. +//! +//! cua-driver exposes ~28 MCP tools. Most LLMs pick better tools when the +//! catalog is small, well-described, and consistent. We hand-curate the +//! subset most useful for assistive automation, and translate them to +//! Gemini's `FunctionDeclaration` shape (which is OpenAPI 3.0 JSON Schema). 
+ +use serde_json::{json, Value}; + +use crate::ai::types::FunctionDeclaration; + +/// Build the curated function-declaration list to send to Gemini. +pub fn gemini_function_declarations() -> Vec { + vec![ + screenshot(), + list_windows(), + get_window_state(), + click(), + type_text(), + press_key(), + scroll(), + launch_app(), + ] +} + +/// Allow-list of cua-driver tool names that the LLM may invoke. Anything +/// outside this set is rejected before reaching the daemon. +pub fn allowed_tool_names() -> &'static [&'static str] { + &[ + "screenshot", + "list_windows", + "get_window_state", + "click", + "type_text", + "press_key", + "scroll", + "launch_app", + ] +} + +pub fn is_allowed(tool: &str) -> bool { + allowed_tool_names().contains(&tool) +} + +// --- individual tool declarations ------------------------------------------ + +fn fd(name: &str, description: &str, parameters: Value) -> FunctionDeclaration { + FunctionDeclaration { + name: name.to_string(), + description: description.to_string(), + parameters, + } +} + +fn screenshot() -> FunctionDeclaration { + fd( + "screenshot", + "Capture a PNG screenshot of the current screen or a specific window. Use to see the user's desktop before deciding what to do. The result image is automatically attached to the next turn.", + json!({ + "type": "object", + "properties": { + "window_id": { + "type": "integer", + "description": "Optional CGWindowID. Omit to capture the full primary display." + } + } + }), + ) +} + +fn list_windows() -> FunctionDeclaration { + fd( + "list_windows", + "Enumerate visible windows across all running apps. Returns title, pid, window_id, bounds, and minimized state. Use to find the window you want to interact with.", + json!({ + "type": "object", + "properties": { + "pid": { + "type": "integer", + "description": "Optional: limit to a single process." + }, + "on_screen_only": { + "type": "boolean", + "description": "If true, exclude minimized/hidden windows. Default true." 
+ } + } + }), + ) +} + +fn get_window_state() -> FunctionDeclaration { + fd( + "get_window_state", + "Snapshot a window's accessibility (AX) tree plus a PNG. Returns interactive elements with stable element_index values you can pass to click/type_text without re-resolving coordinates. Required before AX-element-addressed clicks.", + json!({ + "type": "object", + "properties": { + "pid": { "type": "integer", "description": "Process id (from list_windows)." }, + "window_id": { "type": "integer", "description": "Window id (from list_windows)." }, + "query": { + "type": "string", + "description": "Optional substring to filter element titles/roles (e.g., \"button\")." + } + }, + "required": ["pid", "window_id"] + }), + ) +} + +fn click() -> FunctionDeclaration { + fd( + "click", + "Click on a UI element or at a pixel coordinate WITHOUT stealing focus or moving the user's cursor. Two modes: (1) AX-addressed: pass {pid, window_id, element_index} from a recent get_window_state result; (2) pixel-addressed: pass {pid, x, y} where x/y are window-local pixel coordinates from a screenshot.", + json!({ + "type": "object", + "properties": { + "pid": { "type": "integer", "description": "Target process id." }, + "window_id": { "type": "integer", "description": "Target window id (AX mode)." }, + "element_index": { + "type": "integer", + "description": "AX element index from get_window_state (AX mode)." + }, + "x": { + "type": "number", + "description": "Window-local x in screenshot pixels (pixel mode)." + }, + "y": { + "type": "number", + "description": "Window-local y in screenshot pixels (pixel mode)." + }, + "button": { + "type": "string", + "enum": ["left", "right"], + "description": "Mouse button. Default left." + } + }, + "required": ["pid"] + }), + ) +} + +fn type_text() -> FunctionDeclaration { + fd( + "type_text", + "Type a string into the focused text field of a window without raising or stealing focus. If element_index is given, it is focused first. 
Otherwise the current focus inside the window is used.", + json!({ + "type": "object", + "properties": { + "pid": { "type": "integer", "description": "Target process id." }, + "text": { "type": "string", "description": "Text to type." }, + "window_id": { "type": "integer", "description": "Optional target window id." }, + "element_index": { + "type": "integer", + "description": "Optional AX element index to focus before typing." + } + }, + "required": ["pid", "text"] + }), + ) +} + +fn press_key() -> FunctionDeclaration { + fd( + "press_key", + "Send a single keystroke or hotkey combo (e.g., \"cmd+s\", \"return\", \"escape\"). Format: lowercase modifier names joined with '+' followed by the key.", + json!({ + "type": "object", + "properties": { + "pid": { "type": "integer", "description": "Target process id." }, + "keys": { + "type": "string", + "description": "Hotkey string, e.g. \"cmd+s\" or \"return\"." + } + }, + "required": ["pid", "keys"] + }), + ) +} + +fn scroll() -> FunctionDeclaration { + fd( + "scroll", + "Scroll within a window. Positive dy scrolls down (content moves up).", + json!({ + "type": "object", + "properties": { + "pid": { "type": "integer", "description": "Target process id." }, + "window_id": { "type": "integer", "description": "Optional target window id." }, + "dx": { "type": "number", "description": "Horizontal scroll delta in pixels." }, + "dy": { "type": "number", "description": "Vertical scroll delta in pixels." } + }, + "required": ["pid"] + }), + ) +} + +fn launch_app() -> FunctionDeclaration { + fd( + "launch_app", + "Launch a macOS app by bundle id (e.g., com.apple.Safari) or absolute path. Activates if already running.", + json!({ + "type": "object", + "properties": { + "bundle_id": { "type": "string", "description": "App bundle identifier." }, + "path": { "type": "string", "description": "Optional absolute .app path (alternative to bundle_id)." 
} + } + }), + ) +} diff --git a/packages/desktop-tauri/src-tauri/src/ai/gemini.rs b/packages/desktop-tauri/src-tauri/src/ai/gemini.rs index 8e441df..92091c6 100644 --- a/packages/desktop-tauri/src-tauri/src/ai/gemini.rs +++ b/packages/desktop-tauri/src-tauri/src/ai/gemini.rs @@ -45,31 +45,11 @@ impl GeminiClient { max_output_tokens: Some(8192), temperature: Some(0.7), }), + tools: None, + tool_config: None, }; - let url = format!( - "{}/models/{}:generateContent?key={}", - self.base_url, self.model, self.api_key - ); - - let response = self - .client - .post(&url) - .json(&request) - .send() - .await - .map_err(|e| anyhow!("HTTP request failed: {}", e))?; - - if !response.status().is_success() { - let status = response.status(); - let body = response.text().await.unwrap_or_default(); - return Err(anyhow!("Gemini API error ({}): {}", status, body)); - } - - let gemini_response: GeminiResponse = response - .json() - .await - .map_err(|e| anyhow!("Failed to parse response: {}", e))?; + let gemini_response = self.post_generate_content(&request).await?; let text = gemini_response .candidates @@ -113,6 +93,8 @@ impl GeminiClient { mime_type: "image/png".to_string(), data: image_base64.to_string(), }), + function_call: None, + function_response: None, }); } @@ -122,31 +104,11 @@ impl GeminiClient { max_output_tokens: Some(8192), temperature: Some(0.7), }), + tools: None, + tool_config: None, }; - let url = format!( - "{}/models/{}:generateContent?key={}", - self.base_url, self.model, self.api_key - ); - - let response = self - .client - .post(&url) - .json(&request) - .send() - .await - .map_err(|e| anyhow!("HTTP request failed: {}", e))?; - - if !response.status().is_success() { - let status = response.status(); - let body = response.text().await.unwrap_or_default(); - return Err(anyhow!("Gemini API error ({}): {}", status, body)); - } - - let gemini_response: GeminiResponse = response - .json() - .await - .map_err(|e| anyhow!("Failed to parse response: {}", e))?; + let 
gemini_response = self.post_generate_content(&request).await?; let text = gemini_response .candidates @@ -173,12 +135,82 @@ impl GeminiClient { }) } + /// Tool-using single-turn chat. + async fn do_chat_with_tools( + &self, + messages: Vec, + tools: &[FunctionDeclaration], + ) -> Result { + let contents = self.convert_tool_messages(messages); + + let (tools_payload, tool_config) = if tools.is_empty() { + (None, None) + } else { + ( + Some(vec![GeminiTool { + function_declarations: tools.to_vec(), + }]), + Some(GeminiToolConfig { + function_calling_config: GeminiFunctionCallingConfig { + mode: "AUTO".to_string(), + allowed_function_names: None, + }, + }), + ) + }; + + let request = GeminiRequest { + contents, + generation_config: Some(GeminiGenerationConfig { + max_output_tokens: Some(2048), + temperature: Some(0.4), + }), + tools: tools_payload, + tool_config, + }; + + let response = self.post_generate_content(&request).await?; + + let usage = response.usage_metadata.as_ref().map(|u| UsageInfo { + prompt_tokens: u.prompt_token_count.unwrap_or(0), + completion_tokens: u.candidates_token_count.unwrap_or(0), + total_tokens: u.total_token_count.unwrap_or(0), + }); + + let parts = response + .candidates + .as_ref() + .and_then(|c| c.first()) + .and_then(|c| c.content.as_ref()) + .and_then(|c| c.parts.as_ref()) + .cloned() + .unwrap_or_default(); + + // Collect function calls; Gemini may emit multiple in parallel. 
+ let mut calls = Vec::new(); + let mut text_buf = String::new(); + for p in &parts { + if let Some(fc) = &p.function_call { + calls.push(FunctionCall { name: fc.name.clone(), args: fc.args.clone() }); + } + if let Some(t) = &p.text { + if !text_buf.is_empty() { + text_buf.push('\n'); + } + text_buf.push_str(t); + } + } + + if !calls.is_empty() { + Ok(ToolTurn::ToolCalls { calls, usage }) + } else { + Ok(ToolTurn::Text { text: text_buf, usage }) + } + } + /// Validate the API key by making a test request async fn do_validate(&self) -> Result<()> { - let url = format!( - "{}/models?key={}", - self.base_url, self.api_key - ); + let url = format!("{}/models?key={}", self.base_url, self.api_key); let response = self .client @@ -195,6 +227,32 @@ impl GeminiClient { } } + async fn post_generate_content(&self, request: &GeminiRequest) -> Result { + let url = format!( + "{}/models/{}:generateContent?key={}", + self.base_url, self.model, self.api_key + ); + + let response = self + .client + .post(&url) + .json(request) + .send() + .await + .map_err(|e| anyhow!("HTTP request failed: {}", e))?; + + if !response.status().is_success() { + let status = response.status(); + let body = response.text().await.unwrap_or_default(); + return Err(anyhow!("Gemini API error ({}): {}", status, body)); + } + + response + .json::() + .await + .map_err(|e| anyhow!("Failed to parse response: {}", e)) + } + /// Convert ChatMessages to Gemini format fn convert_messages(&self, messages: Vec) -> Vec { let mut contents = Vec::new(); @@ -217,6 +275,8 @@ impl GeminiClient { parts: vec![GeminiPart { text: Some(text), inline_data: None, + function_call: None, + function_response: None, }], }); } @@ -226,6 +286,8 @@ impl GeminiClient { parts: vec![GeminiPart { text: Some(msg.content), inline_data: None, + function_call: None, + function_response: None, }], }); } @@ -236,6 +298,8 @@ impl GeminiClient { parts: vec![GeminiPart { text: Some(msg.content), inline_data: None, + function_call: None, + 
function_response: None, }], }); } @@ -244,6 +308,95 @@ impl GeminiClient { contents } + + /// Convert tool-aware messages to Gemini contents. + fn convert_tool_messages(&self, messages: Vec) -> Vec { + let mut contents: Vec = Vec::new(); + + // Helper: get a mutable handle to the last content if its role matches. + fn append_to_last(contents: &mut Vec, role: &str, parts: Vec) { + if let Some(last) = contents.last_mut() { + if last.role == role { + last.parts.extend(parts); + return; + } + } + contents.push(GeminiContent { role: role.to_string(), parts }); + } + + for msg in messages { + match msg { + ToolMessage::User(text) => { + append_to_last( + &mut contents, + "user", + vec![GeminiPart { + text: Some(text), + inline_data: None, + function_call: None, + function_response: None, + }], + ); + } + ToolMessage::Assistant(text) => { + append_to_last( + &mut contents, + "model", + vec![GeminiPart { + text: Some(text), + inline_data: None, + function_call: None, + function_response: None, + }], + ); + } + ToolMessage::UserImage { mime_type, data } => { + append_to_last( + &mut contents, + "user", + vec![GeminiPart { + text: None, + inline_data: Some(GeminiInlineData { mime_type, data }), + function_call: None, + function_response: None, + }], + ); + } + ToolMessage::AssistantToolCalls(calls) => { + let parts: Vec = calls + .into_iter() + .map(|c| GeminiPart { + text: None, + inline_data: None, + function_call: Some(GeminiFunctionCall { + name: c.name, + args: c.args, + }), + function_response: None, + }) + .collect(); + append_to_last(&mut contents, "model", parts); + } + ToolMessage::ToolResult(r) => { + append_to_last( + &mut contents, + "user", + vec![GeminiPart { + text: None, + inline_data: None, + function_call: None, + function_response: Some(GeminiFunctionResponse { + name: r.name, + response: r.response, + }), + }], + ); + } + } + } + + contents + } } #[async_trait] @@ -260,6 +413,18 @@ impl AiProvider for GeminiClient { self.do_chat_with_vision(messages, 
image_base64).await } + async fn chat_with_tools( + &self, + messages: Vec, + tools: &[FunctionDeclaration], + ) -> Result { + self.do_chat_with_tools(messages, tools).await + } + + fn supports_tools(&self) -> bool { + true + } + async fn validate(&self) -> Result<()> { self.do_validate().await } diff --git a/packages/desktop-tauri/src-tauri/src/ai/mod.rs b/packages/desktop-tauri/src-tauri/src/ai/mod.rs index 36f9efd..c7b95f9 100644 --- a/packages/desktop-tauri/src-tauri/src/ai/mod.rs +++ b/packages/desktop-tauri/src-tauri/src/ai/mod.rs @@ -1,11 +1,13 @@ -//! AI module — multi-provider chat (Gemini, OpenAI-compatible) +//! AI module — multi-provider chat (Gemini, OpenAI-compatible, local llama.cpp) pub mod gemini; +pub mod local; pub mod openai; pub mod provider; pub mod types; pub use gemini::GeminiClient; +pub use local::LocalProvider; pub use openai::OpenAiClient; pub use provider::AiProvider; pub use types::{ChatMessage, ChatResponse}; diff --git a/packages/desktop-tauri/src-tauri/src/ai/provider.rs b/packages/desktop-tauri/src-tauri/src/ai/provider.rs index 5e947a5..d9eb99c 100644 --- a/packages/desktop-tauri/src-tauri/src/ai/provider.rs +++ b/packages/desktop-tauri/src-tauri/src/ai/provider.rs @@ -1,9 +1,9 @@ //! AI provider trait — abstraction over Gemini, OpenAI, etc. -use anyhow::Result; +use anyhow::{anyhow, Result}; use async_trait::async_trait; -use super::types::{ChatMessage, ChatResponse}; +use super::types::{ChatMessage, ChatResponse, FunctionDeclaration, ToolMessage, ToolTurn}; /// Trait for AI chat providers #[async_trait] @@ -18,6 +18,28 @@ pub trait AiProvider: Send + Sync + std::fmt::Debug { image_base64: &str, ) -> Result; + /// Tool-using chat. Returns the next single turn from the model. The + /// caller is responsible for executing any returned tool calls and + /// feeding results back via the next invocation. 
+    ///
+    /// Default implementation returns an unsupported error so existing
+    /// providers (local llama.cpp, OpenAI legacy) compile without change.
+    async fn chat_with_tools(
+        &self,
+        _messages: Vec<ToolMessage>,
+        _tools: &[FunctionDeclaration],
+    ) -> Result<ToolTurn> {
+        Err(anyhow!(
+            "Provider '{}' does not yet support tool calling",
+            self.provider_name()
+        ))
+    }
+
+    /// Whether this provider supports tool calling.
+    fn supports_tools(&self) -> bool {
+        false
+    }
+
     /// Validate the API key / connectivity
     async fn validate(&self) -> Result<()>;
diff --git a/packages/desktop-tauri/src-tauri/src/ai/types.rs b/packages/desktop-tauri/src-tauri/src/ai/types.rs
index f43d939..b104dbf 100644
--- a/packages/desktop-tauri/src-tauri/src/ai/types.rs
+++ b/packages/desktop-tauri/src-tauri/src/ai/types.rs
@@ -1,6 +1,7 @@
 //! Types for AI chat and Gemini API wire format
 
 use serde::{Deserialize, Serialize};
+use serde_json::Value;
 
 // --- Public types (used by commands + frontend) ---
 
@@ -30,6 +31,63 @@ pub struct UsageInfo {
     pub total_tokens: u32,
 }
 
+// --- Tool / function-calling types ---
+
+/// A tool the model may call. Shape mirrors Gemini's `FunctionDeclaration`
+/// but is provider-neutral (OpenAI's "function" parameters has the same
+/// JSON-Schema-flavored body).
+#[derive(Debug, Clone, Serialize)]
+pub struct FunctionDeclaration {
+    pub name: String,
+    pub description: String,
+    /// JSON Schema for the arguments (OpenAPI 3.0 subset).
+    pub parameters: Value,
+}
+
+/// A model-emitted call to a registered tool.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct FunctionCall {
+    pub name: String,
+    /// Arguments object — caller is responsible for shape validation.
+    #[serde(default)]
+    pub args: Value,
+}
+
+/// Result of executing a tool call, returned to the model on the next turn.
+#[derive(Debug, Clone, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct FunctionResult { + pub name: String, + /// Free-form result body — usually `{ "ok": true, "summary": "...", ... }`. + pub response: Value, +} + +/// Result of a single tool-use turn from `chat_with_tools`. Either the model +/// responded with text (terminal) or asked to call one or more tools. +#[derive(Debug, Clone, Serialize)] +#[serde(rename_all = "camelCase", tag = "kind")] +pub enum ToolTurn { + /// Final assistant text. Conversation can stop. + Text { text: String, usage: Option }, + /// Model requested tool invocations. Caller must execute and feed + /// results back via the next call. + ToolCalls { calls: Vec, usage: Option }, +} + +/// Provider-neutral history entry for tool-use conversations. +#[derive(Debug, Clone)] +pub enum ToolMessage { + User(String), + Assistant(String), + /// PNG image (base64 data URL body, not the prefix). Attached to the + /// preceding user turn or, if first, as a fresh user turn. + UserImage { mime_type: String, data: String }, + /// Model previously emitted these tool calls (for replay across turns). + AssistantToolCalls(Vec), + /// Tool execution result fed back to the model. 
+    ToolResult(FunctionResult),
+}
+
 // --- Gemini API wire types ---
 
 #[derive(Debug, Serialize)]
@@ -38,30 +96,71 @@ pub(crate) struct GeminiRequest {
     pub contents: Vec<GeminiContent>,
     #[serde(skip_serializing_if = "Option::is_none")]
     pub generation_config: Option<GeminiGenerationConfig>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub tools: Option<Vec<GeminiTool>>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub tool_config: Option<GeminiToolConfig>,
 }
 
-#[derive(Debug, Serialize)]
+#[derive(Debug, Serialize, Clone)]
 pub(crate) struct GeminiContent {
     pub role: String,
     pub parts: Vec<GeminiPart>,
 }
 
-#[derive(Debug, Serialize)]
+#[derive(Debug, Serialize, Clone)]
 #[serde(rename_all = "camelCase")]
 pub(crate) struct GeminiPart {
     #[serde(skip_serializing_if = "Option::is_none")]
     pub text: Option<String>,
     #[serde(skip_serializing_if = "Option::is_none")]
     pub inline_data: Option<GeminiInlineData>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub function_call: Option<GeminiFunctionCall>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub function_response: Option<GeminiFunctionResponse>,
 }
 
-#[derive(Debug, Serialize)]
+#[derive(Debug, Serialize, Clone)]
 #[serde(rename_all = "camelCase")]
 pub(crate) struct GeminiInlineData {
     pub mime_type: String,
     pub data: String,
 }
 
+#[derive(Debug, Serialize, Clone)]
+pub(crate) struct GeminiFunctionCall {
+    pub name: String,
+    pub args: Value,
+}
+
+#[derive(Debug, Serialize, Clone)]
+pub(crate) struct GeminiFunctionResponse {
+    pub name: String,
+    pub response: Value,
+}
+
+#[derive(Debug, Serialize)]
+pub(crate) struct GeminiTool {
+    #[serde(rename = "functionDeclarations")]
+    pub function_declarations: Vec<FunctionDeclaration>,
+}
+
+#[derive(Debug, Serialize)]
+#[serde(rename_all = "camelCase")]
+pub(crate) struct GeminiToolConfig {
+    pub function_calling_config: GeminiFunctionCallingConfig,
+}
+
+#[derive(Debug, Serialize)]
+#[serde(rename_all = "camelCase")]
+pub(crate) struct GeminiFunctionCallingConfig {
+    /// "AUTO" | "ANY" | "NONE"
+    pub mode: String,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub allowed_function_names: Option<Vec<String>>,
+}
+
 #[derive(Debug,
Serialize)] #[serde(rename_all = "camelCase")] pub(crate) struct GeminiGenerationConfig { @@ -90,9 +189,18 @@ pub(crate) struct GeminiResponseContent { pub parts: Option>, } -#[derive(Debug, Deserialize)] +#[derive(Debug, Clone, Deserialize)] +#[serde(rename_all = "camelCase")] pub(crate) struct GeminiResponsePart { pub text: Option, + pub function_call: Option, +} + +#[derive(Debug, Clone, Deserialize)] +pub(crate) struct GeminiResponseFunctionCall { + pub name: String, + #[serde(default)] + pub args: Value, } #[derive(Debug, Deserialize)] diff --git a/packages/desktop-tauri/src-tauri/src/bin/cli.rs b/packages/desktop-tauri/src-tauri/src/bin/cli.rs new file mode 100644 index 0000000..2e05132 --- /dev/null +++ b/packages/desktop-tauri/src-tauri/src/bin/cli.rs @@ -0,0 +1,173 @@ +//! `hawkeye-cli` — headless command-line entry point for the same Rust +//! subsystems used by the Tauri desktop app. +//! +//! This binary intentionally avoids any Tauri / webview dependency: it +//! drives the observe loop, agent runner, and AI providers through the +//! provider-neutral [`EventSink`] abstraction. + +use std::sync::Arc; + +use clap::{Parser, Subcommand}; + +use hawkeye_lib::{ + agent::{run_user_turn, CuaDriverClient, DaemonSupervisor}, + ai::{AiProvider, ChatMessage, GeminiClient, OpenAiClient}, + config, + event_sink::{EventSink, SharedSink, StdoutSink}, + observe::ObserveLoop, + state::AppState, +}; + +#[derive(Parser)] +#[command(name = "hawkeye-cli", version, about = "Hawkeye headless CLI")] +struct Cli { + #[command(subcommand)] + cmd: Cmd, +} + +#[derive(Subcommand)] +enum Cmd { + /// Print the effective configuration (after env + file resolution). + Config, + + /// Run the continuous observe loop, emitting NDJSON events to stdout. + Observe { + /// Sleep interval in milliseconds between captures. + #[arg(long, default_value_t = 3000)] + interval_ms: u64, + /// Perceptual-hash change threshold (0.0–1.0). Frames below this + /// ratio are skipped without OCR. 
+ #[arg(long, default_value_t = 0.05)] + change_threshold: f64, + }, + + /// One-turn AI chat (no tools). + Chat { + /// User text to send. + text: String, + }, + + /// Tool-using agent turn (requires the cua-driver daemon). + Agent { + /// User prompt. + text: String, + }, + + /// Verify cua-driver daemon connectivity. + AgentStatus, +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + env_logger::init(); + let cli = Cli::parse(); + + // All sub-commands need config + state (cheap to build). + let cfg = config::load_config().unwrap_or_default(); + let state = AppState::new(cfg); + + match cli.cmd { + Cmd::Config => { + let cfg = state.config.read().await; + println!("{}", serde_json::to_string_pretty(&*cfg)?); + } + + Cmd::Observe { + interval_ms, + change_threshold, + } => { + let sink: SharedSink = Arc::new(StdoutSink); + let _loop = ObserveLoop::start(sink, state.clone(), interval_ms, change_threshold); + eprintln!("[hawkeye-cli] observe loop running — Ctrl-C to stop"); + tokio::signal::ctrl_c().await?; + eprintln!("[hawkeye-cli] shutting down"); + } + + Cmd::Chat { text } => { + let provider = build_provider(&state).await?; + let messages = vec![ChatMessage { + role: "user".into(), + content: text, + }]; + let resp = provider.chat(messages).await?; + println!("{}", resp.text); + } + + Cmd::Agent { text } => { + let provider = build_provider(&state).await?; + let driver = CuaDriverClient::default_path()?; + let supervisor = DaemonSupervisor::new(driver.clone()); + supervisor.ensure_running().await?; + + let sink: SharedSink = Arc::new(StdoutSink); + let result = run_user_turn(sink, provider, Some(driver), Vec::new(), text).await?; + + // Tool-call audit on stderr (so callers can pipe stdout = answer) + eprintln!("{}", serde_json::to_string_pretty(&result.tool_calls)?); + println!("{}", result.text); + } + + Cmd::AgentStatus => { + let driver = CuaDriverClient::default_path()?; + let supervisor = DaemonSupervisor::new(driver.clone()); + let running = 
driver.is_running().await;
+            let status = serde_json::json!({
+                "binaryInstalled": supervisor.binary_path().is_some(),
+                "binaryPath": supervisor
+                    .binary_path()
+                    .map(|p| p.display().to_string()),
+                "daemonRunning": running,
+                "socketPath": driver.socket_path().display().to_string(),
+            });
+            println!("{}", serde_json::to_string_pretty(&status)?);
+        }
+    }
+
+    Ok(())
+}
+
+/// Build the configured AI provider. Mirrors the resolution rules used in
+/// `commands::chat_cmd::init_ai`, minus the local-model path which still
+/// requires lifecycle hooks not exposed through this CLI yet.
+async fn build_provider(state: &Arc<AppState>) -> anyhow::Result<Arc<dyn AiProvider>> {
+    let cfg = state.config.read().await;
+    let provider = cfg.ai_provider.clone();
+
+    match provider.as_str() {
+        "openai" => {
+            let key = cfg
+                .openai_api_key
+                .clone()
+                .ok_or_else(|| anyhow::anyhow!("OPENAI_API_KEY missing"))?;
+            Ok(Arc::new(OpenAiClient::new(
+                key,
+                cfg.openai_model.clone(),
+                cfg.openai_base_url.clone(),
+            )))
+        }
+
+        "local" | "llama-cpp" => {
+            anyhow::bail!("local model not supported in CLI yet")
+        }
+
+        // Default + "gemini"
+        _ => {
+            let key = cfg
+                .gemini_api_key
+                .clone()
+                .ok_or_else(|| anyhow::anyhow!("GEMINI_API_KEY missing"))?;
+            Ok(Arc::new(GeminiClient::new(
+                key,
+                cfg.gemini_model.clone(),
+                cfg.gemini_base_url.clone(),
+            )))
+        }
+    }
+}
+
+// Unused-import suppression — `EventSink` is brought into scope as it's the
+// trait that `SharedSink = Arc<dyn EventSink>` uses for method dispatch in
+// downstream code paths called via the loop. Without an explicit reference
+// the compiler may warn under `--no-default-features` profiles.
+#[allow(dead_code)]
+fn _trait_in_scope(_: &dyn EventSink) {}
diff --git a/packages/desktop-tauri/src-tauri/src/commands/agent_cmd.rs b/packages/desktop-tauri/src-tauri/src/commands/agent_cmd.rs
new file mode 100644
index 0000000..1bfb20a
--- /dev/null
+++ b/packages/desktop-tauri/src-tauri/src/commands/agent_cmd.rs
@@ -0,0 +1,152 @@
+//!
Agent commands — cua-driver lifecycle + tool-using chat. + +use std::sync::Arc; +use tauri::{command, AppHandle, Emitter, State}; + +use crate::agent::{run_user_turn, AgentTurnResult, CuaDriverClient}; +use crate::ai::types::{FunctionResult, ToolMessage}; +use crate::ai::ChatMessage; +use crate::event_sink::{SharedSink, TauriSink}; +use crate::events; +use crate::state::AppState; + +/// Status of the cua-driver integration, surfaced to the frontend. +#[derive(Debug, Clone, serde::Serialize)] +#[serde(rename_all = "camelCase")] +pub struct AgentStatus { + pub binary_installed: bool, + pub binary_path: Option, + pub daemon_running: bool, + pub socket_path: String, +} + +/// Returns the current cua-driver status. +#[command] +pub async fn get_agent_status(state: State<'_, Arc>) -> Result { + let supervisor = state.agent_supervisor.read().await; + let supervisor = supervisor + .as_ref() + .ok_or_else(|| "agent supervisor not initialized".to_string())?; + + let daemon_running = supervisor.client().is_running().await; + Ok(AgentStatus { + binary_installed: supervisor.binary_path().is_some(), + binary_path: supervisor.binary_path().map(|p| p.display().to_string()), + daemon_running, + socket_path: supervisor.client().socket_path().display().to_string(), + }) +} + +/// Ensure the cua-driver daemon is running. Spawns it if needed. +#[command] +pub async fn start_agent( + state: State<'_, Arc>, + app: AppHandle, +) -> Result { + let supervisor = state.agent_supervisor.read().await; + let supervisor = supervisor + .as_ref() + .ok_or_else(|| "agent supervisor not initialized".to_string())?; + + match supervisor.ensure_running().await { + Ok(()) => { + let _ = app.emit(events::AGENT_DAEMON_READY, true); + Ok(true) + } + Err(e) => { + let msg = e.to_string(); + log::error!("[agent] start_agent failed: {}", msg); + let _ = app.emit(events::AGENT_DAEMON_ERROR, msg.clone()); + Err(msg) + } + } +} + +/// Tool-using chat. 
Pass the conversation so far (plain `ChatMessage`s) plus +/// the new user input. Returns final text + audit trail of tool calls. +#[command] +pub async fn chat_with_agent( + history: Vec, + user_input: String, + state: State<'_, Arc>, + app: AppHandle, +) -> Result { + // Resolve provider. + let provider = { + let ai = state.ai_client.read().await; + ai.as_ref() + .cloned() + .ok_or_else(|| "AI not initialized. Call init_ai first.".to_string())? + }; + + // Resolve cua-driver client (may be unavailable; that's a soft error). + let driver_client: Option = { + let sup = state.agent_supervisor.read().await; + match sup.as_ref() { + Some(s) if s.client().is_running().await => Some(s.client().clone()), + _ => None, + } + }; + + // Translate plain history into ToolMessages. + let mut tool_history: Vec = Vec::with_capacity(history.len()); + for msg in history { + match msg.role.as_str() { + "user" => tool_history.push(ToolMessage::User(msg.content)), + "assistant" => tool_history.push(ToolMessage::Assistant(msg.content)), + _ => {} // drop system here — handled inside provider for plain chat + } + } + + let sink: SharedSink = state + .event_sink + .read() + .await + .clone() + .unwrap_or_else(|| -> SharedSink { Arc::new(TauriSink::new(app)) }); + + run_user_turn(sink, provider, driver_client, tool_history, user_input) + .await + .map_err(|e| e.to_string()) +} + +// --- Direct passthrough for debugging ------------------------------------- + +/// Manually invoke a single cua-driver tool. Useful for UI buttons, +/// debugging, and unit-testing the bridge without going through the LLM. 
+#[command] +pub async fn invoke_cua_tool( + name: String, + args: serde_json::Value, + state: State<'_, Arc>, +) -> Result { + let sup = state.agent_supervisor.read().await; + let sup = sup + .as_ref() + .ok_or_else(|| "agent supervisor not initialized".to_string())?; + + if !crate::agent::tools::is_allowed(&name) { + return Err(format!("tool '{}' not in allow-list", name)); + } + + let args_map = args + .as_object() + .map(|m| m.iter().map(|(k, v)| (k.clone(), v.clone())).collect()) + .unwrap_or_default(); + + let result = sup + .client() + .call(&name, args_map) + .await + .map_err(|e| e.to_string())?; + + Ok(FunctionResult { + name: name.clone(), + response: serde_json::json!({ + "ok": !result.is_error, + "summary": result.text(), + "hasImage": result.first_image().is_some(), + }), + }) +} + diff --git a/packages/desktop-tauri/src-tauri/src/commands/chat_cmd.rs b/packages/desktop-tauri/src-tauri/src/commands/chat_cmd.rs index 5534ae5..abda347 100644 --- a/packages/desktop-tauri/src-tauri/src/commands/chat_cmd.rs +++ b/packages/desktop-tauri/src-tauri/src/commands/chat_cmd.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use tauri::{command, AppHandle, Emitter, State}; -use crate::ai::{ChatMessage, ChatResponse, GeminiClient, OpenAiClient}; +use crate::ai::{ChatMessage, ChatResponse, GeminiClient, LocalProvider, OpenAiClient}; use crate::events; use crate::state::AppState; @@ -17,7 +17,27 @@ pub async fn init_ai( let provider_type = config.ai_provider.as_str(); - let client: Box = match provider_type { + let client: Arc = match provider_type { + "local" | "llama-cpp" => { + let model_id = match &config.local_model_id { + Some(id) if !id.is_empty() => id.clone(), + _ => { + log::warn!("[AI] No local model ID configured"); + return Ok(false); + } + }; + drop(config); // release read lock before acquiring model_manager lock + + let mgr = state.model_manager.read().await; + let model_path = mgr.model_path(&model_id).ok_or_else(|| { + format!("Local model '{}' not downloaded. 
Download it first.", model_id) + })?; + + let provider = LocalProvider::load(model_path, Some(model_id)) + .map_err(|e| format!("Failed to load local model: {}", e))?; + + Arc::new(provider) + } "openai" => { let api_key = match &config.openai_api_key { Some(key) if !key.is_empty() => key.clone(), @@ -26,7 +46,7 @@ pub async fn init_ai( return Ok(false); } }; - Box::new(OpenAiClient::new( + Arc::new(OpenAiClient::new( api_key, config.openai_model.clone(), config.openai_base_url.clone(), @@ -41,7 +61,7 @@ pub async fn init_ai( return Ok(false); } }; - Box::new(GeminiClient::new( + Arc::new(GeminiClient::new( api_key, config.gemini_model.clone(), config.gemini_base_url.clone(), @@ -70,19 +90,20 @@ pub async fn init_ai( } } -/// Chat with AI +/// Chat with AI (no tools). #[command] pub async fn chat( messages: Vec, state: State<'_, Arc>, ) -> Result { - let ai = state.ai_client.read().await; - - let client = ai - .as_ref() - .ok_or_else(|| "AI not initialized. Call init_ai first.".to_string())?; + let provider = { + let ai = state.ai_client.read().await; + ai.as_ref() + .cloned() + .ok_or_else(|| "AI not initialized. Call init_ai first.".to_string())? + }; - client + provider .chat(messages) .await .map_err(|e| e.to_string()) diff --git a/packages/desktop-tauri/src-tauri/src/commands/mod.rs b/packages/desktop-tauri/src-tauri/src/commands/mod.rs index 5730b6a..49dfd02 100644 --- a/packages/desktop-tauri/src-tauri/src/commands/mod.rs +++ b/packages/desktop-tauri/src-tauri/src/commands/mod.rs @@ -1,6 +1,7 @@ //! 
Command modules — IPC handlers called from frontend via Tauri invoke pub mod adaptive_cmd; +pub mod agent_cmd; pub mod chat_cmd; pub mod config_cmd; pub mod intent_cmd; @@ -15,3 +16,5 @@ pub mod voice_cmd; pub mod gesture_cmd; pub mod updater_cmd; pub mod debug_cmd; +pub mod gaze_cmd; +pub mod training_cmd; diff --git a/packages/desktop-tauri/src-tauri/src/commands/observe_cmd.rs b/packages/desktop-tauri/src-tauri/src/commands/observe_cmd.rs index 26a93b4..71f5e91 100644 --- a/packages/desktop-tauri/src-tauri/src/commands/observe_cmd.rs +++ b/packages/desktop-tauri/src-tauri/src/commands/observe_cmd.rs @@ -4,6 +4,7 @@ use std::sync::Arc; use serde::Serialize; use tauri::{command, AppHandle, State}; +use crate::event_sink::{SharedSink, TauriSink}; use crate::observe::ObserveLoop; use crate::state::{AppState, ObservationResult}; @@ -27,8 +28,17 @@ pub async fn start_observe( return Ok(false); // Already running } + // Prefer the pre-installed sink; fall back to a fresh TauriSink so + // command callers don't have to wait for setup-time initialization. + let sink: SharedSink = state + .event_sink + .read() + .await + .clone() + .unwrap_or_else(|| -> SharedSink { Arc::new(TauriSink::new(app)) }); + let observe = ObserveLoop::start( - app, + sink, Arc::clone(&state), 3000, // 3s interval 0.05, // 5% change threshold diff --git a/packages/desktop-tauri/src-tauri/src/event_sink.rs b/packages/desktop-tauri/src-tauri/src/event_sink.rs new file mode 100644 index 0000000..e5ae2f5 --- /dev/null +++ b/packages/desktop-tauri/src-tauri/src/event_sink.rs @@ -0,0 +1,54 @@ +//! Provider-neutral event emission so non-UI runners (CLI/server) can host +//! observe + agent loops without pulling in Tauri. +//! +//! The Tauri desktop app uses [`TauriSink`] to forward events to the +//! webview frontend; the headless CLI uses [`StdoutSink`] to print NDJSON +//! events to stdout; one-shot commands that don't care about events use +//! [`NoopSink`]. 
+ +use serde_json::Value; +use std::sync::Arc; + +/// Provider-neutral sink for backend → frontend (or backend → stdout) events. +pub trait EventSink: Send + Sync { + fn emit(&self, event: &str, payload: Value); +} + +/// Tauri implementation — forwards to `AppHandle::emit`. +pub struct TauriSink { + handle: tauri::AppHandle, +} + +impl TauriSink { + pub fn new(handle: tauri::AppHandle) -> Self { + Self { handle } + } +} + +impl EventSink for TauriSink { + fn emit(&self, event: &str, payload: Value) { + let _ = tauri::Emitter::emit(&self.handle, event, payload); + } +} + +/// CLI implementation — emits NDJSON to stdout (one JSON object per line). +pub struct StdoutSink; + +impl EventSink for StdoutSink { + fn emit(&self, event: &str, payload: Value) { + println!( + "{}", + serde_json::json!({ "event": event, "data": payload }) + ); + } +} + +/// No-op sink for one-shot commands that don't care about events. +pub struct NoopSink; + +impl EventSink for NoopSink { + fn emit(&self, _: &str, _: Value) {} +} + +/// Convenience alias for the shared trait object. 
+pub type SharedSink = Arc; diff --git a/packages/desktop-tauri/src-tauri/src/events.rs b/packages/desktop-tauri/src-tauri/src/events.rs index 95a6e0e..20cb534 100644 --- a/packages/desktop-tauri/src-tauri/src/events.rs +++ b/packages/desktop-tauri/src-tauri/src/events.rs @@ -38,3 +38,26 @@ pub const DEBUG_CLEARED: &str = "debug:cleared"; /// Status events pub const STATUS_CHANGED: &str = "status:changed"; + +/// Local model events +pub const LOCAL_MODEL_LOADING: &str = "local-model:loading"; +pub const LOCAL_MODEL_READY: &str = "local-model:ready"; +pub const LOCAL_MODEL_ERROR: &str = "local-model:error"; + +/// Training data events +pub const TRAINING_SAMPLE_SAVED: &str = "training:sample-saved"; +pub const TRAINING_EXPORT_COMPLETE: &str = "training:export-complete"; + +/// Gaze ANE events +pub const GAZE_TRAINING_STARTED: &str = "gaze:training-started"; +pub const GAZE_TRAINING_PROGRESS: &str = "gaze:training-progress"; +pub const GAZE_TRAINING_COMPLETE: &str = "gaze:training-complete"; +pub const GAZE_TRAINING_ERROR: &str = "gaze:training-error"; +pub const GAZE_MODEL_READY: &str = "gaze:model-ready"; +pub const GAZE_SAMPLE_ADDED: &str = "gaze:sample-added"; + +/// Agent (cua-driver tool-use) events +pub const AGENT_TOOL_CALL_START: &str = "agent:tool-call-start"; +pub const AGENT_TOOL_CALL_END: &str = "agent:tool-call-end"; +pub const AGENT_DAEMON_READY: &str = "agent:daemon-ready"; +pub const AGENT_DAEMON_ERROR: &str = "agent:daemon-error"; diff --git a/packages/desktop-tauri/src-tauri/src/lib.rs b/packages/desktop-tauri/src-tauri/src/lib.rs index cfed198..0997a55 100644 --- a/packages/desktop-tauri/src-tauri/src/lib.rs +++ b/packages/desktop-tauri/src-tauri/src/lib.rs @@ -1,22 +1,28 @@ //! Hawkeye Desktop - Tauri Backend //! //! This is the Rust backend for Hawkeye Desktop, providing: -//! - AI chat (Gemini, OpenAI-compatible) +//! - AI chat (Gemini, OpenAI-compatible, local llama.cpp with Metal) +//! 
- Local LLM inference via llama-cpp-2 (GGUF models, Apple Metal GPU) +//! - Training data collection for LoRA fine-tuning //! - Screen capture + OCR (macOS Vision API) //! - Smart observe loop with adaptive refresh //! - Menu bar tray panel //! - Configuration persistence -mod ai; -mod commands; -mod config; -mod events; -mod life_tree; -mod models; -mod observe; -mod perception; -mod state; -mod voice; +pub mod agent; +pub mod ai; +pub mod commands; +pub mod config; +pub mod event_sink; +pub mod events; +pub mod gaze; +pub mod life_tree; +pub mod models; +pub mod observe; +pub mod perception; +pub mod state; +pub mod training; +pub mod voice; use tauri::{Emitter, Manager}; use tauri::menu::{MenuBuilder, MenuItemBuilder}; @@ -39,13 +45,50 @@ pub fn run() { // Create and manage shared state let app_state = state::AppState::new(cfg); - app.manage(app_state); + app.manage(app_state.clone()); + + // Install the Tauri event sink so non-UI runners (agent runner, + // observe loop) can emit events through the same handle. + { + let sink: event_sink::SharedSink = std::sync::Arc::new( + event_sink::TauriSink::new(app.handle().clone()), + ); + let state = app_state.clone(); + tauri::async_runtime::spawn(async move { + *state.event_sink.write().await = Some(sink); + }); + } // Initialize perception engine + tauri::async_runtime::spawn(async { + if let Err(e) = perception::init().await { + log::error!("Failed to initialize perception: {}", e); + } + }); + + // Initialize cua-driver supervisor (does NOT auto-spawn the + // daemon — that's user-controlled via start_agent command). 
let handle = app.handle().clone(); tauri::async_runtime::spawn(async move { - if let Err(e) = perception::init(&handle).await { - log::error!("Failed to initialize perception: {}", e); + let state = handle.state::>(); + match agent::CuaDriverClient::default_path() { + Ok(client) => { + let supervisor = agent::DaemonSupervisor::new(client); + if supervisor.binary_path().is_none() { + log::warn!( + "[agent] cua-driver binary not found — desktop control unavailable. \ + Install: /bin/bash -c \"$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/cua-driver/scripts/install.sh)\"" + ); + } else { + log::info!( + "[agent] cua-driver binary at {}", + supervisor.binary_path().unwrap().display() + ); + } + let mut sup = state.agent_supervisor.write().await; + *sup = Some(supervisor); + } + Err(e) => log::error!("[agent] failed to init supervisor: {}", e), } }); @@ -107,8 +150,18 @@ pub fn run() { let state = handle.state::>(); let mut loop_handle = state.observe_loop.write().await; if loop_handle.is_none() { + let sink: event_sink::SharedSink = state + .event_sink + .read() + .await + .clone() + .unwrap_or_else(|| { + std::sync::Arc::new(event_sink::TauriSink::new( + handle.clone(), + )) + }); let obs = observe::ObserveLoop::start( - handle.clone(), + sink, std::sync::Arc::clone(&state), 3000, 0.05, @@ -148,6 +201,11 @@ pub fn run() { // Chat commands::chat_cmd::chat, commands::chat_cmd::init_ai, + // Agent (cua-driver tool-use) + commands::agent_cmd::get_agent_status, + commands::agent_cmd::start_agent, + commands::agent_cmd::chat_with_agent, + commands::agent_cmd::invoke_cua_tool, // Observe commands::observe_cmd::start_observe, commands::observe_cmd::stop_observe, @@ -203,6 +261,18 @@ pub fn run() { commands::debug_cmd::pause_debug, commands::debug_cmd::resume_debug, commands::debug_cmd::clear_debug_events, + // Gaze ANE + commands::gaze_cmd::submit_gaze_sample, + commands::gaze_cmd::trigger_gaze_training, + commands::gaze_cmd::predict_gaze, + 
commands::gaze_cmd::get_gaze_training_status, + commands::gaze_cmd::clear_gaze_model, + commands::gaze_cmd::load_gaze_weights, + // Training data collection + commands::training_cmd::save_training_sample, + commands::training_cmd::rate_training_sample, + commands::training_cmd::get_training_stats, + commands::training_cmd::export_training_data, // Utilities commands::util_cmd::open_url, ]) diff --git a/packages/desktop-tauri/src-tauri/src/observe/loop_runner.rs b/packages/desktop-tauri/src-tauri/src/observe/loop_runner.rs index 741b2a9..e47c777 100644 --- a/packages/desktop-tauri/src-tauri/src/observe/loop_runner.rs +++ b/packages/desktop-tauri/src-tauri/src/observe/loop_runner.rs @@ -3,8 +3,7 @@ use std::sync::Arc; use tokio::sync::watch; -use tauri::{AppHandle, Emitter}; - +use crate::event_sink::EventSink; use crate::events; use crate::observe::change_detector; use crate::perception; @@ -16,9 +15,13 @@ pub struct ObserveLoop { } impl ObserveLoop { - /// Start the observe loop as a background task + /// Start the observe loop as a background task. + /// + /// Events are emitted through the supplied [`EventSink`] so the loop + /// runs identically under Tauri (with `TauriSink`) or under a CLI + /// (with `StdoutSink`). 
pub fn start( - app: AppHandle, + sink: Arc, state: Arc, interval_ms: u64, threshold: f64, @@ -26,7 +29,7 @@ impl ObserveLoop { let (stop_tx, stop_rx) = watch::channel(false); tokio::spawn(async move { - run_loop(app, state, stop_rx, interval_ms, threshold).await; + run_loop(sink, state, stop_rx, interval_ms, threshold).await; }); Self { stop_tx } @@ -39,7 +42,7 @@ impl ObserveLoop { } async fn run_loop( - app: AppHandle, + sink: Arc, state: Arc, mut stop_rx: watch::Receiver, _initial_interval_ms: u64, @@ -61,7 +64,7 @@ async fn run_loop( _ = stop_rx.changed() => { if *stop_rx.borrow() { log::info!("[Observe] Loop stopped by signal"); - let _ = app.emit(events::OBSERVE_STOPPED, ()); + sink.emit(events::OBSERVE_STOPPED, serde_json::Value::Null); return; } } @@ -106,7 +109,10 @@ async fn run_loop( } log::info!("[Observe] Change detected (ratio={:.2})", change_ratio); - let _ = app.emit(events::OBSERVE_CHANGE, change_ratio); + sink.emit( + events::OBSERVE_CHANGE, + serde_json::json!(change_ratio), + ); // Record activity for adaptive refresh { @@ -171,7 +177,9 @@ async fn run_loop( let intents = recognizer.recognize(&input); if !intents.is_empty() { log::debug!("[Observe] Intents: {:?}", intents.iter().map(|i| &i.description).collect::>()); - let _ = app.emit(events::INTENT_RECOGNIZED, &intents); + if let Ok(payload) = serde_json::to_value(&intents) { + sink.emit(events::INTENT_RECOGNIZED, payload); + } } } @@ -193,7 +201,9 @@ async fn run_loop( *last = Some(observation.clone()); } - // Emit to frontend - let _ = app.emit(events::OBSERVE_UPDATE, &observation); + // Emit to host (frontend / stdout) + if let Ok(payload) = serde_json::to_value(&observation) { + sink.emit(events::OBSERVE_UPDATE, payload); + } } } diff --git a/packages/desktop-tauri/src-tauri/src/perception/mod.rs b/packages/desktop-tauri/src-tauri/src/perception/mod.rs index 9dc118a..17b4814 100644 --- a/packages/desktop-tauri/src-tauri/src/perception/mod.rs +++ 
b/packages/desktop-tauri/src-tauri/src/perception/mod.rs @@ -7,12 +7,12 @@ pub mod window; use anyhow::Result; use screenshots::Screen; use std::sync::atomic::{AtomicBool, Ordering}; -use tauri::AppHandle; static INITIALIZED: AtomicBool = AtomicBool::new(false); -/// Initialize the perception engine -pub async fn init(_app: &AppHandle) -> Result<()> { +/// Initialize the perception engine. UI-agnostic — usable from Tauri, +/// CLI, or tests. +pub async fn init() -> Result<()> { if INITIALIZED.swap(true, Ordering::SeqCst) { return Ok(()); } diff --git a/packages/desktop-tauri/src-tauri/src/state.rs b/packages/desktop-tauri/src-tauri/src/state.rs index f76ab2c..1a6e75f 100644 --- a/packages/desktop-tauri/src-tauri/src/state.rs +++ b/packages/desktop-tauri/src-tauri/src/state.rs @@ -3,13 +3,17 @@ use std::sync::Arc; use tokio::sync::RwLock; +use crate::agent::DaemonSupervisor; use crate::ai::AiProvider; use crate::commands::debug_cmd::DebugTimeline; use crate::commands::gesture_cmd::GestureConfig; use crate::config::AppConfig; +use crate::gaze::data_buffer::GazeDataBuffer; +use crate::gaze::inference::GazeModel; use crate::life_tree::LifeTree; use crate::models::ModelManager; use crate::observe::{ActivityLog, AdaptiveRefresh, IntentRecognizer, ObserveLoop}; +use crate::training::TrainingCollector; /// Observation result emitted by the observe loop #[derive(Debug, Clone, serde::Serialize)] @@ -24,7 +28,9 @@ pub struct ObservationResult { /// Shared application state pub struct AppState { - pub ai_client: RwLock>>, + /// Active AI provider. `Arc` so commands and the agent runner can share + /// it without holding the lock for the duration of an HTTP request. 
+ pub ai_client: RwLock>>, pub config: RwLock, pub observe_loop: RwLock>, pub last_observation: RwLock>, @@ -35,9 +41,30 @@ pub struct AppState { pub life_tree: RwLock, pub gesture_config: RwLock, pub debug_timeline: RwLock, + pub gaze_buffer: RwLock, + pub gaze_model: RwLock>, + pub gaze_training_active: RwLock, + pub training_collector: RwLock, + /// cua-driver supervisor — manages the daemon lifecycle and exposes the + /// `CuaDriverClient`. `None` until initialized in `setup`. + pub agent_supervisor: RwLock>, + /// Provider-neutral event sink. Populated in Tauri setup with a + /// `TauriSink`; left as `None` for headless tests / CLI sub-commands + /// that build sinks ad-hoc. + pub event_sink: RwLock>, } impl AppState { + fn load_gaze_buffer() -> GazeDataBuffer { + GazeDataBuffer::default_path() + .and_then(|p| GazeDataBuffer::load(&p).ok()) + .unwrap_or_default() + } + + fn load_gaze_model() -> Option { + GazeModel::default_path().and_then(|p| GazeModel::load(&p).ok()) + } + pub fn new(config: AppConfig) -> Arc { Arc::new(Self { ai_client: RwLock::new(None), @@ -51,6 +78,12 @@ impl AppState { life_tree: RwLock::new(LifeTree::default()), gesture_config: RwLock::new(GestureConfig::default()), debug_timeline: RwLock::new(DebugTimeline::default()), + gaze_buffer: RwLock::new(Self::load_gaze_buffer()), + gaze_model: RwLock::new(Self::load_gaze_model()), + gaze_training_active: RwLock::new(false), + training_collector: RwLock::new(TrainingCollector::default()), + agent_supervisor: RwLock::new(None), + event_sink: RwLock::new(None), }) } } diff --git a/packages/desktop-tauri/src/components/ChatPanel.tsx b/packages/desktop-tauri/src/components/ChatPanel.tsx index 6742ebe..d7a1539 100644 --- a/packages/desktop-tauri/src/components/ChatPanel.tsx +++ b/packages/desktop-tauri/src/components/ChatPanel.tsx @@ -1,28 +1,80 @@ import { useState, useRef, useEffect } from 'react'; import { motion } from 'framer-motion'; import { useChat } from '../hooks/useChat'; +import { 
useAgent, describeToolCall } from '../hooks/useAgent'; +import type { ChatMessage } from '../hooks/useTauri'; +import type { ToolCallRecord } from '../hooks/useTauri'; + +/// One thread entry — either a chat message or the audit trail of an agent +/// turn's tool calls. Rendered inline in the conversation so users see +/// exactly what the agent did. +type ThreadEntry = + | { kind: 'msg'; msg: ChatMessage } + | { kind: 'tools'; calls: ToolCallRecord[] }; export function ChatPanel() { - const { messages, isLoading, error, aiReady, initialize, sendMessage, clearChat } = useChat(); + const chat = useChat(); + const agent = useAgent(); + const [input, setInput] = useState(''); + const [agentMode, setAgentMode] = useState(false); + const [agentEntries, setAgentEntries] = useState([]); const messagesEndRef = useRef(null); const inputRef = useRef(null); // Auto-init AI on mount useEffect(() => { - initialize(); - }, [initialize]); + chat.initialize(); + }, [chat.initialize]); - // Auto-scroll to bottom + // Auto-scroll useEffect(() => { messagesEndRef.current?.scrollIntoView({ behavior: 'smooth' }); - }, [messages]); + }, [chat.messages, agentEntries, agent.inProgress]); + + // Resolve the assembled thread (chat history is canonical, agent entries + // are interleaved by recency so users see in-order). + const thread: ThreadEntry[] = agentMode + ? agentEntries + : chat.messages.map((m) => ({ kind: 'msg' as const, msg: m })); const handleSend = async () => { const text = input.trim(); - if (!text || isLoading) return; + if (!text || chat.isLoading || agent.isLoading) return; setInput(''); - await sendMessage(text); + + if (agentMode) { + // Pre-flight: ensure daemon running. + if (!agent.status?.daemonRunning) { + const ok = await agent.ensureRunning(); + if (!ok) return; + } + + // Build the history we send to the backend (text-only ChatMessages). 
+ const priorMessages: ChatMessage[] = agentEntries + .filter((e): e is { kind: 'msg'; msg: ChatMessage } => e.kind === 'msg') + .map((e) => e.msg); + + // Optimistically show the user message. + const userMsg: ChatMessage = { role: 'user', content: text }; + setAgentEntries((es) => [...es, { kind: 'msg', msg: userMsg }]); + + const result = await agent.runTurn(priorMessages, text); + if (result) { + const updates: ThreadEntry[] = []; + if (result.toolCalls.length > 0) { + updates.push({ kind: 'tools', calls: result.toolCalls }); + } + if (result.text) { + updates.push({ kind: 'msg', msg: { role: 'assistant', content: result.text } }); + } + setAgentEntries((es) => [...es, ...updates]); + } + agent.clearInProgress(); + } else { + await chat.sendMessage(text); + } + inputRef.current?.focus(); }; @@ -33,50 +85,71 @@ export function ChatPanel() { } }; - if (!aiReady) { + const handleClear = () => { + if (agentMode) { + setAgentEntries([]); + agent.clearInProgress(); + } else { + chat.clearChat(); + } + }; + + if (!chat.aiReady) { return (
🤖
No AI configured. Add a Gemini API key in Settings.
-
); } + const isLoading = chat.isLoading || agent.isLoading; + const error = chat.error || agent.error; + return (
+ {/* Mode toggle bar */} +
+ + {agentMode && } +
+ {/* Messages */}
- {messages.length === 0 && ( + {thread.length === 0 && (
- Ask Hawkeye anything... + {agentMode + ? 'Tell the agent what to do (e.g., "open Safari and search for hawkeye")' + : 'Ask Hawkeye anything...'}
)} - {messages.map((msg, i) => ( - -
-
-                {msg.content}
-              
-
-
- ))} + {thread.map((entry, i) => + entry.kind === 'msg' ? ( + + ) : ( + + ) + )} + + {/* Live tool-call stream while waiting on a turn */} + {agent.inProgress.length > 0 && ( + + )} {isLoading && (
@@ -87,9 +160,7 @@ export function ChatPanel() { )} {error && ( -
- {error} -
+
{error}
)}
@@ -101,7 +172,7 @@ export function ChatPanel() { ref={inputRef} type="text" className="form-input flex-1 text-sm" - placeholder="Message..." + placeholder={agentMode ? 'Tell the agent…' : 'Message…'} value={input} onChange={(e) => setInput(e.target.value)} onKeyDown={handleKeyDown} @@ -114,8 +185,8 @@ export function ChatPanel() { > Send - {messages.length > 0 && ( - )} @@ -123,3 +194,107 @@ export function ChatPanel() {
); } + +// --- subcomponents -------------------------------------------------------- + +function MessageBubble({ msg }: { msg: ChatMessage }) { + return ( + +
+
+          {msg.content}
+        
+
+
+ ); +} + +function ToolCallTrail({ calls }: { calls: ToolCallRecord[] }) { + return ( + +
+
+ Agent ran {calls.length} tool{calls.length === 1 ? '' : 's'} +
+ {calls.map((c, i) => ( +
+ {c.ok ? '✓' : '✗'} + {describeToolCall(c)} +
+ ))} +
+
+ ); +} + +function LiveToolStream({ + entries, +}: { + entries: { round: number; name: string; status: 'running' | 'ok' | 'error'; summary?: string }[]; +}) { + return ( + +
+ {entries.map((e, i) => ( +
+ {e.status === 'running' ? ( + + ) : ( + + {e.status === 'ok' ? '✓' : '✗'} + + )} + {e.name} + {e.summary && ( + + — {e.summary} + + )} +
+ ))} +
+
+ ); +} + +function AgentStatusBadge({ agent }: { agent: ReturnType }) { + const status = agent.status; + if (!status) return ; + + if (!status.binaryInstalled) { + return ( + + ⚠ driver missing + + ); + } + if (!status.daemonRunning) { + return ( + + ); + } + return ● ready; +} diff --git a/packages/desktop-tauri/src/hooks/useAgent.ts b/packages/desktop-tauri/src/hooks/useAgent.ts new file mode 100644 index 0000000..ee6d918 --- /dev/null +++ b/packages/desktop-tauri/src/hooks/useAgent.ts @@ -0,0 +1,159 @@ +import { useCallback, useEffect, useRef, useState } from 'react'; +import { listen } from '@tauri-apps/api/event'; +import { + chatWithAgent, + getAgentStatus, + startAgent, + type AgentStatus, + type AgentTurnResult, + type ChatMessage, + type ToolCallRecord, +} from './useTauri'; + +/// A streaming view of an in-progress agent turn — tool calls arrive via +/// Tauri events while we wait for `chatWithAgent` to resolve. +export interface AgentTurnInProgress { + round: number; + name: string; + status: 'running' | 'ok' | 'error'; + summary?: string; + args?: Record; +} + +interface AgentState { + status: AgentStatus | null; + inProgress: AgentTurnInProgress[]; + lastResult: AgentTurnResult | null; + isLoading: boolean; + error: string | null; +} + +/// React hook for tool-using chat against cua-driver. +/// +/// Returns the daemon status, a `runAgentTurn` function that calls +/// `chat_with_agent`, and live-streamed `inProgress` tool-call updates from +/// the Tauri event bus (so the UI can show "screenshot…", "click…", etc. +/// as the agent works). 
+export function useAgent() { + const [state, setState] = useState({ + status: null, + inProgress: [], + lastResult: null, + isLoading: false, + error: null, + }); + + const inProgressRef = useRef(state.inProgress); + inProgressRef.current = state.inProgress; + + // --- subscribe to tool-call events --- + useEffect(() => { + const unsubs: Array<() => void> = []; + + listen<{ round: number; name: string; args: Record }>( + 'agent:tool-call-start', + (e) => { + setState((s) => ({ + ...s, + inProgress: [ + ...s.inProgress, + { + round: e.payload.round, + name: e.payload.name, + args: e.payload.args, + status: 'running', + }, + ], + })); + } + ).then((u) => unsubs.push(u)); + + listen<{ round: number; name: string; ok: boolean; summary: string }>( + 'agent:tool-call-end', + (e) => { + setState((s) => ({ + ...s, + inProgress: s.inProgress.map((t) => + t.round === e.payload.round && t.name === e.payload.name && t.status === 'running' + ? { + ...t, + status: e.payload.ok ? 'ok' : 'error', + summary: e.payload.summary, + } + : t + ), + })); + } + ).then((u) => unsubs.push(u)); + + return () => { + unsubs.forEach((fn) => fn()); + }; + }, []); + + const refreshStatus = useCallback(async () => { + try { + const s = await getAgentStatus(); + setState((st) => ({ ...st, status: s })); + return s; + } catch (e) { + setState((st) => ({ ...st, error: String(e) })); + return null; + } + }, []); + + // Refresh status on mount. 
+ useEffect(() => { + refreshStatus(); + }, [refreshStatus]); + + const ensureRunning = useCallback(async () => { + try { + await startAgent(); + await refreshStatus(); + return true; + } catch (e) { + const msg = String(e); + setState((s) => ({ ...s, error: msg })); + return false; + } + }, [refreshStatus]); + + const runTurn = useCallback( + async (history: ChatMessage[], userInput: string): Promise => { + setState((s) => ({ ...s, isLoading: true, error: null, inProgress: [] })); + try { + const result = await chatWithAgent(history, userInput); + setState((s) => ({ ...s, isLoading: false, lastResult: result })); + return result; + } catch (e) { + setState((s) => ({ ...s, isLoading: false, error: String(e) })); + return null; + } + }, + [] + ); + + const clearInProgress = useCallback(() => { + setState((s) => ({ ...s, inProgress: [] })); + }, []); + + return { + status: state.status, + inProgress: state.inProgress, + lastResult: state.lastResult, + isLoading: state.isLoading, + error: state.error, + refreshStatus, + ensureRunning, + runTurn, + clearInProgress, + }; +} + +/// Convenience: turn a `ToolCallRecord` into a one-line UI label. +export function describeToolCall(rec: ToolCallRecord): string { + const argSnippet = JSON.stringify(rec.args); + const truncated = argSnippet.length > 60 ? 
argSnippet.slice(0, 60) + '…' : argSnippet; + return `${rec.name}(${truncated})`; +} diff --git a/packages/desktop-tauri/src/hooks/useTauri.ts b/packages/desktop-tauri/src/hooks/useTauri.ts index e5d57bb..c0405f4 100644 --- a/packages/desktop-tauri/src/hooks/useTauri.ts +++ b/packages/desktop-tauri/src/hooks/useTauri.ts @@ -116,6 +116,60 @@ export async function chat(messages: ChatMessage[]): Promise { return invoke('chat', { messages }); } +// --- Agent (cua-driver desktop control) --- + +export interface AgentStatus { + binaryInstalled: boolean; + binaryPath?: string; + daemonRunning: boolean; + socketPath: string; +} + +export interface ToolCallRecord { + round: number; + name: string; + args: Record; + ok: boolean; + summary: string; +} + +export interface AgentTurnResult { + text: string; + rounds: number; + toolCalls: ToolCallRecord[]; + usage?: { + promptTokens: number; + completionTokens: number; + totalTokens: number; + }; +} + +/// Inspect cua-driver state — whether binary is installed and daemon is running. +export async function getAgentStatus(): Promise { + return invoke('get_agent_status'); +} + +/// Spawn the cua-driver daemon (no-op if already running). Throws if binary missing. +export async function startAgent(): Promise { + return invoke('start_agent'); +} + +/// Tool-using chat. `history` is the prior conversation, `userInput` is the new message. +export async function chatWithAgent( + history: ChatMessage[], + userInput: string +): Promise { + return invoke('chat_with_agent', { history, userInput }); +} + +/// Direct tool invocation (debugging only — bypasses the LLM). 
+export async function invokeCuaTool( + name: string, + args: Record +): Promise<{ name: string; response: Record }> { + return invoke('invoke_cua_tool', { name, args }); +} + // Observe commands export async function startObserve(): Promise { return invoke('start_observe'); @@ -346,6 +400,7 @@ export interface LifeTreeNode { experimentPhase?: ExperimentPhase; observationCount: number; relatedApps: string[]; + entityIds?: string[]; } export interface TreeStats { @@ -354,6 +409,7 @@ export interface TreeStats { activeTasks: number; experimentsCompleted: number; mostActiveStage?: LifeStage; + entityCount?: number; } export interface LifeTreeSnapshot { @@ -361,6 +417,9 @@ export interface LifeTreeSnapshot { nodes: LifeTreeNode[]; stats: TreeStats; generatedAt: number; + knowledgeEntities?: KnowledgeEntity[]; + knowledgeEdges?: KnowledgeEdge[]; + crossEdges?: KnowledgeCrossEdge[]; } export interface ExperimentProposal { @@ -369,6 +428,36 @@ export interface ExperimentProposal { durationDays: number; } +// Knowledge graph types +export type KnowledgeNodeType = 'person' | 'project' | 'technology' | 'concept' | 'place'; + +export interface KnowledgeEntity { + id: string; + label: string; + type: KnowledgeNodeType; + aliases: string[]; + sourceNodeIds: string[]; + firstSeen: number; + lastSeen: number; + frequency: number; +} + +export interface KnowledgeEdge { + id: string; + sourceEntityId: string; + targetEntityId: string; + relation: string; + strength: number; + sourceNodeIds: string[]; +} + +export interface KnowledgeCrossEdge { + fromNodeId: string; + toNodeId: string; + entityLabel: string; + strength: number; +} + // Life Tree commands export async function getLifeTree(): Promise { return invoke('get_life_tree'); @@ -546,3 +635,52 @@ export async function resumeDebug(): Promise { export async function clearDebugEvents(): Promise { return invoke('clear_debug_events'); } + +// Gaze ANE types +export interface GazeSample { + features: number[]; + targetX: number; + 
targetY: number; + timestamp: number; +} + +export interface GazeTrainingStatus { + sampleCount: number; + newSampleCount: number; + isTraining: boolean; + trainLoss: number | null; + modelReady: boolean; + aneAvailable: boolean; +} + +export interface GazePrediction { + x: number; + y: number; + confidence: number; + latencyUs: number; +} + +// Gaze ANE commands +export async function submitGazeSample(sample: GazeSample): Promise { + return invoke('submit_gaze_sample', { sample }); +} + +export async function triggerGazeTraining(): Promise { + return invoke('trigger_gaze_training'); +} + +export async function predictGaze(features: number[]): Promise { + return invoke('predict_gaze', { features }); +} + +export async function getGazeTrainingStatus(): Promise { + return invoke('get_gaze_training_status'); +} + +export async function clearGazeModel(): Promise { + return invoke('clear_gaze_model'); +} + +export async function loadGazeWeights(): Promise { + return invoke('load_gaze_weights'); +}