From 52b0a34f9fdc1097070140a1c9f58f4f6eb14f2a Mon Sep 17 00:00:00 2001 From: baobaodae <2014596548@qq.com> Date: Wed, 17 Jun 2026 23:27:36 +0800 Subject: [PATCH 1/6] feat: Active Daemon - multi-source AI tool log watcher framework MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Background daemon monitors AI coding tool logs (Claude Code, Aider, Cursor, Continue.dev, generic JSONL) via watchdog filesystem observers - Incremental ETL pipeline: watch → filter → reconstruct → LLM extract → deduplicate → route → viking:// storage - Watcher abstraction with BaseFileWatcher ABC and registry-based factory - SQLite cursor persistence for incremental file reads across restarts - Server integration: GET /api/v1/daemon/status, --with-daemon CLI flag - Web UI: daemon status card on home dashboard with per-watcher metrics - Full test suite (18 test files) and documentation --- Echo2.md | 429 ++++++ docs/daemon/README.md | 61 + docs/daemon/configuration.md | 50 + docs/daemon/multi-source-design.md | 1227 +++++++++++++++++ examples/daemon/docker-compose-daemon.yml | 14 + examples/daemon/start-daemon.bat | 10 + examples/daemon/start-daemon.sh | 12 + openviking/daemon/__init__.py | 7 + .../daemon/conversation_reconstructor.py | 55 + openviking/daemon/cursor_manager.py | 92 ++ openviking/daemon/deduplicator.py | 35 + openviking/daemon/etl_pipeline.py | 64 + openviking/daemon/filters.py | 47 + openviking/daemon/knowledge_extractor.py | 121 ++ openviking/daemon/knowledge_router.py | 56 + openviking/daemon/models.py | 88 ++ openviking/daemon/service.py | 190 +++ openviking/daemon/storage_adapter.py | 155 +++ openviking/daemon/watchers/__init__.py | 27 + openviking/daemon/watchers/aider_watcher.py | 148 ++ .../daemon/watchers/base_file_watcher.py | 192 +++ .../daemon/watchers/claude_code_watcher.py | 60 + .../daemon/watchers/continue_dev_watcher.py | 68 + openviking/daemon/watchers/cursor_watcher.py | 87 ++ 
.../daemon/watchers/generic_jsonl_watcher.py | 104 ++ openviking/daemon/watchers/registry.py | 72 + openviking/server/app.py | 39 + openviking/server/bootstrap.py | 10 +- openviking/server/config.py | 82 +- openviking/server/routers/__init__.py | 2 + openviking/server/routers/daemon.py | 81 ++ tests/daemon/__init__.py | 0 tests/daemon/test_aider_watcher.py | 183 +++ tests/daemon/test_base_file_watcher.py | 163 +++ tests/daemon/test_claude_code_watcher.py | 134 ++ tests/daemon/test_config.py | 85 ++ tests/daemon/test_continue_dev_watcher.py | 193 +++ .../daemon/test_conversation_reconstructor.py | 58 + tests/daemon/test_cursor_manager.py | 61 + tests/daemon/test_cursor_watcher.py | 187 +++ tests/daemon/test_deduplicator.py | 42 + tests/daemon/test_filters.py | 53 + tests/daemon/test_generic_jsonl_watcher.py | 156 +++ tests/daemon/test_integration.py | 189 +++ tests/daemon/test_knowledge_extractor.py | 117 ++ tests/daemon/test_knowledge_router.py | 73 + .../daemon/test_multi_watcher_integration.py | 330 +++++ tests/daemon/test_registry.py | 31 + tests/daemon/test_service_multi.py | 31 + web-studio/src/i18n/locales/en.ts | 15 + web-studio/src/i18n/locales/zh-CN.ts | 15 + .../home/-components/daemon-status-card.tsx | 186 +++ web-studio/src/routes/home/route.tsx | 3 + 53 files changed, 5988 insertions(+), 2 deletions(-) create mode 100644 Echo2.md create mode 100644 docs/daemon/README.md create mode 100644 docs/daemon/configuration.md create mode 100644 docs/daemon/multi-source-design.md create mode 100644 examples/daemon/docker-compose-daemon.yml create mode 100644 examples/daemon/start-daemon.bat create mode 100644 examples/daemon/start-daemon.sh create mode 100644 openviking/daemon/__init__.py create mode 100644 openviking/daemon/conversation_reconstructor.py create mode 100644 openviking/daemon/cursor_manager.py create mode 100644 openviking/daemon/deduplicator.py create mode 100644 openviking/daemon/etl_pipeline.py create mode 100644 openviking/daemon/filters.py 
create mode 100644 openviking/daemon/knowledge_extractor.py create mode 100644 openviking/daemon/knowledge_router.py create mode 100644 openviking/daemon/models.py create mode 100644 openviking/daemon/service.py create mode 100644 openviking/daemon/storage_adapter.py create mode 100644 openviking/daemon/watchers/__init__.py create mode 100644 openviking/daemon/watchers/aider_watcher.py create mode 100644 openviking/daemon/watchers/base_file_watcher.py create mode 100644 openviking/daemon/watchers/claude_code_watcher.py create mode 100644 openviking/daemon/watchers/continue_dev_watcher.py create mode 100644 openviking/daemon/watchers/cursor_watcher.py create mode 100644 openviking/daemon/watchers/generic_jsonl_watcher.py create mode 100644 openviking/daemon/watchers/registry.py create mode 100644 openviking/server/routers/daemon.py create mode 100644 tests/daemon/__init__.py create mode 100644 tests/daemon/test_aider_watcher.py create mode 100644 tests/daemon/test_base_file_watcher.py create mode 100644 tests/daemon/test_claude_code_watcher.py create mode 100644 tests/daemon/test_config.py create mode 100644 tests/daemon/test_continue_dev_watcher.py create mode 100644 tests/daemon/test_conversation_reconstructor.py create mode 100644 tests/daemon/test_cursor_manager.py create mode 100644 tests/daemon/test_cursor_watcher.py create mode 100644 tests/daemon/test_deduplicator.py create mode 100644 tests/daemon/test_filters.py create mode 100644 tests/daemon/test_generic_jsonl_watcher.py create mode 100644 tests/daemon/test_integration.py create mode 100644 tests/daemon/test_knowledge_extractor.py create mode 100644 tests/daemon/test_knowledge_router.py create mode 100644 tests/daemon/test_multi_watcher_integration.py create mode 100644 tests/daemon/test_registry.py create mode 100644 tests/daemon/test_service_multi.py create mode 100644 web-studio/src/routes/home/-components/daemon-status-card.tsx diff --git a/Echo2.md b/Echo2.md new file mode 100644 index 
0000000000..3f18cb473d --- /dev/null +++ b/Echo2.md @@ -0,0 +1,429 @@ +# Echo II: Multi-Source Watcher Extension + +> 从单一 Claude Code 监听到多工具 AI 编码助手日志的统一监听与知识提取框架 + +--- + +## 一、动机与背景 + +Active Daemon 的 v1 实现仅支持 Claude Code JSONL 日志监听。然而现实中,开发者的日常工具栈远不止于此: + +- **Claude Code CLI** — JSONL 日志(已实现) +- **Cursor** — 日志文件 + SQLite 数据库 +- **Aider** — `.aider.chat.history.md` Markdown 文件 +- **GitHub Copilot** — VS Code workspace 日志 +- **Continue.dev** — `~/.continue/` 下的 JSON 日志 +- **Windsurf / Cascade** — 类 Cursor 的日志目录 + +Echo.md 原文已明确提到需要支持 Cursor(SQLite polling)和 OpenCode(rotating text logs),但 v1 仅完成了最简单的 Claude Code 路径。本次扩展的核心目标是:**将 Daemon 从单源监听器升级为多源监听框架**,使开发者可以自由配置任意数量的工具监听路径。 + +--- + +## 二、目标工具可行性分析 + +| 工具 | 存储格式 | 驱动模式 | 可行性 | 优先级 | +|------|----------|----------|--------|--------| +| **Claude Code** | JSONL append-only | 文件增量监听 | ★★★★★ 已实现 | P0 | +| **Aider** | Markdown `.aider.chat.history.md` | 文件增量监听 | ★★★★☆ | P1 | +| **Cursor** | SQLite `state.vscdb` + 日志 | DB polling + 文件监听 | ★★★☆☆ | P1 | +| **Continue.dev** | JSON 日志 | 文件增量监听 | ★★★★☆ | P2 | +| **GitHub Copilot** | VS Code workspace logs | 文件增量监听 | ★★★☆☆ | P2 | +| **Generic JSONL** | 任意 JSONL | 文件增量监听 | ★★★★★ | P1 | + +### 驱动模式分类 + +根据日志格式差异,Watcher 分为两种驱动模式: + +**File-Append Watcher**(文件追加监听) +- 使用 watchdog 监听文件变更事件 +- 通过 FileCursor(byte offset)实现增量读取 +- 适用于:Claude Code、Aider、Continue.dev、Generic JSONL + +**Database Polling Watcher**(数据库轮询监听) +- 定时轮询 SQLite 数据库 +- 通过 timestamp cursor 实现增量查询 +- 适用于:Cursor(state.vscdb) + +--- + +## 三、架构设计 + +### 3.1 核心原则 + +**每个 Watcher 负责将自己的原始事件归一化为统一格式**,下游 ETL pipeline 完全不需要感知工具差异。 + +``` + ┌─ ClaudeCodeWatcher ──┐ + │ │ +AiderWatcher ────────┤ ├──→ normalized events ──→ ETL Pipeline ──→ viking:// + │ (shared queue) │ +GenericJSONLWatcher──┘ │ + │ +CursorWatcher (polling) ────────────────────┘ +``` + +### 3.2 归一化事件格式 (NormalizedEvent) + +所有 Watcher 将原始日志转换为以下统一格式的 dict: + +```python +{ + "role": "user" | "assistant", # 必须 + "content": "...", # 必须,消息正文 + "type": 
"message", # 必须,固定 "message" + "timestamp": "2026-06-16T10:30:00Z", # 可选 + "session_id": "...", # 可选,工具特定 + "project_name": "...", # 可选 + "tool_name": "claude_code", # 必须,来源工具标识 +} +``` + +新增 `tool_name` 字段用于区分来源。现有的 `ConversationReconstructor`、`LowValueFilter`、`KnowledgeExtractor` 均通过 `event.get("role")` / `event.get("content")` 读取,**完全兼容**。 + +### 3.3 BaseWatcher Protocol + +```python +# openviking/daemon/watchers/__init__.py + +from typing import Protocol, runtime_checkable + +@runtime_checkable +class BaseWatcher(Protocol): + """All tool-specific watchers must implement this interface.""" + + @property + def tool_name(self) -> str: ... + + def start(self) -> None: ... + def stop(self) -> None: ... + def flush(self) -> None: ... +``` + +### 3.4 BaseFileWatcher 抽象基类 + +为 File-Append 模式的 Watcher 提取公共逻辑(watchdog Observer + 增量读取 + 游标管理 + 批处理缓冲): + +```python +# openviking/daemon/watchers/base_file_watcher.py + +class BaseFileWatcher(ABC): + """Abstract base for file-append based watchers.""" + + def __init__(self, watch_dir, cursor_manager, batch_callback, + file_pattern, batch_trigger_lines, batch_trigger_seconds): + ... + + @abstractmethod + def parse_line(self, line: str) -> Optional[Dict]: + """Parse a raw line into a raw event dict. Tool-specific.""" + ... + + @abstractmethod + def normalize_event(self, raw_event: Dict) -> Optional[Dict]: + """Convert raw event to NormalizedEvent format. Tool-specific.""" + ... + + def filter_event(self, event: Dict) -> bool: + """Optional: additional tool-specific filtering. Default: True.""" + return True + + # --- 公共逻辑(不需要子类重写) --- + def start(self) -> None: ... # 创建 Observer + Handler + def stop(self) -> None: ... + def flush(self) -> None: ... 
+``` + +子类只需实现 `parse_line()` 和 `normalize_event()` 两个方法。 + +### 3.5 Watcher 注册表 (WatcherRegistry) + +```python +# openviking/daemon/watchers/registry.py + +WATCHER_REGISTRY: Dict[str, Type[BaseWatcher]] = { + "claude_code": ClaudeCodeWatcher, + "aider": AiderWatcher, + "cursor": CursorWatcher, + "continue_dev": ContinueDevWatcher, + "generic_jsonl": GenericJSONLWatcher, +} + +def create_watcher(tool_name: str, **kwargs) -> BaseWatcher: + """Factory function to create a watcher by tool name.""" + cls = WATCHER_REGISTRY.get(tool_name) + if not cls: + raise ValueError(f"Unknown tool: {tool_name}. Available: {list(WATCHER_REGISTRY.keys())}") + return cls(**kwargs) +``` + +--- + +## 四、配置设计 + +### 4.1 新配置结构 + +`DaemonConfig` 从单一 `watch_dir` 演进为支持 `watchers` 列表: + +```python +class WatcherConfig(BaseModel): + tool_name: str # 必填:工具标识 + watch_dir: str # 必填:监听路径 + file_pattern: str = "*.jsonl" # 可选:文件匹配模式 + enabled: bool = True # 可选:是否启用 + batch_trigger_lines: int = Field(50, gt=0) # 可选:批处理行数 + batch_trigger_seconds: int = Field(300, gt=0) # 可选:批处理秒数 + extra: Dict[str, Any] = Field(default_factory=dict) # 可选:工具特定参数 + +class DaemonConfig(BaseModel): + enabled: bool = False + db_path: Optional[str] = None + watchers: List[WatcherConfig] = Field(default_factory=list) + + # 向后兼容:保留旧字段 + watch_dir: Optional[str] = None # deprecated + batch_trigger_lines: int = Field(50, gt=0) # deprecated, 全局默认值 + batch_trigger_seconds: int = Field(300, gt=0) # deprecated, 全局默认值 + + def get_effective_watchers(self) -> List[WatcherConfig]: + """向后兼容:如果 watchers 为空但 watch_dir 存在,自动构造单个 Claude Code watcher.""" + if self.watchers: + return [w for w in self.watchers if w.enabled] + elif self.watch_dir: + return [WatcherConfig( + tool_name="claude_code", + watch_dir=self.watch_dir, + batch_trigger_lines=self.batch_trigger_lines, + batch_trigger_seconds=self.batch_trigger_seconds, + )] + else: + return [WatcherConfig( + tool_name="claude_code", + watch_dir=str(Path.home() / ".claude" / "projects"), + 
batch_trigger_lines=self.batch_trigger_lines, + batch_trigger_seconds=self.batch_trigger_seconds, + )] +``` + +### 4.2 ov.conf 示例 + +```json +{ + "server": { + "daemon": { + "enabled": true, + "db_path": "~/.openviking/daemon/cursors.db", + "watchers": [ + { + "tool_name": "claude_code", + "watch_dir": "~/.claude/projects", + "file_pattern": "*.jsonl" + }, + { + "tool_name": "aider", + "watch_dir": "~/Projects", + "file_pattern": ".aider.chat.history.md" + }, + { + "tool_name": "cursor", + "watch_dir": "~/.cursor", + "extra": {"poll_interval_seconds": 60} + }, + { + "tool_name": "generic_jsonl", + "watch_dir": "~/ai-logs/copilot", + "file_pattern": "*.jsonl", + "extra": {"role_field": "author", "assistant_role_value": "copilot"} + } + ] + } + } +} +``` + +### 4.3 环境变量 + +| 变量 | 说明 | +|------|------| +| `OV_DAEMON_ENABLED` | 全局开关 | +| `OV_DAEMON_WATCHERS` | JSON 格式的 watchers 列表(覆盖 ov.conf) | + +--- + +## 五、实现计划 + +### Batch 1:基础抽象层(3 文件 + 6 tests) + +| 任务 | 文件 | 说明 | +|------|------|------| +| 1.1 | `watchers/__init__.py` | 定义 `BaseWatcher` Protocol | +| 1.2 | `watchers/base_file_watcher.py` | 抽象基类,提取 ClaudeCodeWatcher 的公共逻辑 | +| 1.3 | `watchers/registry.py` | Watcher 注册表 + `create_watcher()` 工厂 | +| 1.4 | `tests/daemon/test_base_file_watcher.py` | BaseFileWatcher 的 parse/normalize/buffer 测试 | +| 1.5 | `tests/daemon/test_registry.py` | 注册表查找、未知工具报错测试 | + +### Batch 2:重构 ClaudeCodeWatcher + GenericJSONLWatcher(3 文件 + 8 tests) + +| 任务 | 文件 | 说明 | +|------|------|------| +| 2.1 | `watchers/claude_code_watcher.py` | 重构为继承 BaseFileWatcher,实现 parse_line/normalize_event | +| 2.2 | `watchers/generic_jsonl_watcher.py` | 通用 JSONL Watcher,支持自定义字段映射 | +| 2.3 | `models.py` | 在 BatchBuffer 中增加 `tool_name` 字段 | +| 2.4 | `tests/daemon/test_claude_code_watcher.py` | 更新已有测试(保持兼容) | +| 2.5 | `tests/daemon/test_generic_jsonl_watcher.py` | GenericJSONLWatcher 的解析/归一化/字段映射测试 | + +### Batch 3:AiderWatcher + CursorWatcher(4 文件 + 10 tests) + +| 任务 | 文件 | 说明 | +|------|------|------| +| 
3.1 | `watchers/aider_watcher.py` | Aider Markdown 历史解析 → 归一化事件 | +| 3.2 | `watchers/cursor_watcher.py` | Cursor SQLite polling + 日志文件监听 | +| 3.3 | `watchers/continue_dev_watcher.py` | Continue.dev JSON 日志监听 | +| 3.4 | `tests/daemon/test_aider_watcher.py` | Markdown 解析、对话重组、增量读取测试 | +| 3.5 | `tests/daemon/test_cursor_watcher.py` | SQLite polling、增量查询测试 | +| 3.6 | `tests/daemon/test_continue_dev_watcher.py` | JSON 日志解析测试 | + +### Batch 4:配置 + 服务层重构(4 文件 + 6 tests) + +| 任务 | 文件 | 说明 | +|------|------|------| +| 4.1 | `server/config.py` | 新增 WatcherConfig,重构 DaemonConfig | +| 4.2 | `daemon/service.py` | 单 watcher → watchers 列表,使用 registry 创建 | +| 4.3 | `daemon/knowledge_router.py` | URI 中包含 tool_name:`viking://skills/{tool_name}/...` | +| 4.4 | `daemon/storage_adapter.py` | Markdown 模板中 Source 字段参数化 | +| 4.5 | `daemon/models.py` | ExtractedKnowledge 增加 `source_tool` 字段 | +| 4.6 | `server/bootstrap.py` | 环境变量 OV_DAEMON_WATCHERS 支持 | +| 4.7 | `tests/daemon/test_config.py` | WatcherConfig 验证 + 向后兼容测试 | +| 4.8 | `tests/daemon/test_service_multi.py` | 多 watcher 启动/停止/路由测试 | + +### Batch 5:API + Web Studio(3 文件 + 0 tests) + +| 任务 | 文件 | 说明 | +|------|------|------| +| 5.1 | `server/routers/daemon.py` | 扩展 status 端点:返回每个 watcher 的状态 | +| 5.2 | `web-studio/.../daemon-status-card.tsx` | 重构为多 watcher 视图:表格 + 状态 | +| 5.3 | `web-studio/.../i18n` | 多 watcher 相关翻译 | + +### Batch 6:集成测试 + 收尾(2 文件) + +| 任务 | 文件 | 说明 | +|------|------|------| +| 6.1 | `tests/daemon/test_multi_watcher_integration.py` | 多 watcher → ETL → storage 全链路 | +| 6.2 | 文档更新 | README、Echo.md 引用更新 | + +--- + +## 六、关键设计决策 + +### 6.1 归一化 vs 分支管道 + +**选择:Watcher 层归一化**,而非 ETL 层分支。 + +理由: +- ETL Pipeline 的四个阶段(Filter → Reconstruct → Extract → Deduplicate)已经设计为通用的 +- 如果在 ETL 层引入工具分支,每个阶段都需要 if/else,复杂度 O(tools × stages) +- Watcher 层归一化后,新增工具只需实现 `parse_line()` + `normalize_event()`,ETL 零修改 + +### 6.2 向后兼容策略 + +旧的 `DaemonConfig` 字段(`watch_dir`、`batch_trigger_lines`、`batch_trigger_seconds`)保留为 
deprecated,`get_effective_watchers()` 方法自动将旧配置转换为单个 `WatcherConfig`。这确保现有用户的 ov.conf 不需要修改。 + +### 6.3 Cursor SQLite Polling + +Cursor 使用 SQLite `state.vscdb` 存储对话历史。与文件监听不同,需要: +- 定时轮询(默认 60 秒) +- 通过 `last_timestamp` 游标实现增量查询 +- 复用 `CursorManager` 存储轮询位置 + +这是一个独立于 watchdog 的驱动模式,但通过实现相同的 `BaseWatcher` Protocol,对 DaemonService 完全透明。 + +### 6.4 GenericJSONLWatcher 字段映射 + +为了支持任意 JSONL 日志(Copilot、自研工具等),GenericJSONLWatcher 支持通过 `extra` 配置自定义字段映射: + +```json +{ + "tool_name": "generic_jsonl", + "watch_dir": "~/ai-logs/my-tool", + "extra": { + "role_field": "author", + "user_role_value": "human", + "assistant_role_value": "ai", + "content_field": "text", + "timestamp_field": "ts" + } +} +``` + +默认值兼容 Claude Code 格式(`role` / `user` / `assistant` / `content` / `timestamp`)。 + +--- + +## 七、API 变更 + +### 7.1 GET /api/v1/daemon/status(增强) + +```json +{ + "enabled": true, + "running": true, + "watchers": [ + { + "tool_name": "claude_code", + "watch_dir": "/Users/xxx/.claude/projects", + "enabled": true, + "running": true, + "cursor_count": 12, + "batch_trigger_lines": 50, + "batch_trigger_seconds": 300 + }, + { + "tool_name": "aider", + "watch_dir": "/Users/xxx/Projects", + "enabled": true, + "running": true, + "cursor_count": 3, + "batch_trigger_lines": 50, + "batch_trigger_seconds": 300 + } + ], + "db_path": "...", + "available_tools": ["claude_code", "aider", "cursor", "continue_dev", "generic_jsonl"] +} +``` + +--- + +## 八、Web Studio 变更 + +Home 页面 DaemonStatusCard 从单状态卡片升级为多 Watcher 表格视图: + +- 顶部:全局状态(enabled/running)+ watcher 数量统计 +- 中部:Watcher 列表表格(工具名、路径、状态、游标数、最后活动) +- 底部:可用工具列表(available_tools) + +每 30 秒自动刷新。 + +--- + +## 九、风险与缓解 + +| 风险 | 影响 | 缓解措施 | +|------|------|----------| +| Cursor SQLite schema 变更 | 解析失败 | 防御性编程 + 版本检测 + 降级到日志文件监听 | +| 多 watcher 并发写入 SQLite cursor DB | 竞态条件 | SQLite WAL mode + 每个 watcher 独立 cursor key | +| Aider Markdown 格式不稳定 | 解析错误 | 宽松正则 + 跳过不可解析段落 + 详细日志 | +| 大量 watcher 导致 ETL 队列拥堵 | 延迟增加 | Queue 大小限制 + 背压 + 优先级队列 | + +--- + +## 十、里程碑 + 
+| 阶段 | 内容 | 预计 commits | +|------|------|-------------| +| Batch 1 | 基础抽象层 | 3-4 | +| Batch 2 | 重构 + GenericJSONL | 4-5 | +| Batch 3 | Aider + Cursor + Continue | 5-6 | +| Batch 4 | 配置 + 服务层 | 4-5 | +| Batch 5 | API + Web Studio | 3-4 | +| Batch 6 | 集成 + 收尾 | 2-3 | +| **总计** | | **21-27 commits** | diff --git a/docs/daemon/README.md b/docs/daemon/README.md new file mode 100644 index 0000000000..a7e5173c1f --- /dev/null +++ b/docs/daemon/README.md @@ -0,0 +1,61 @@ +# OpenViking Active Daemon + +自动监听 Claude Code 会话日志,提取知识并写入 OpenViking 知识库。 + +## 快速开始 + +### 启用 Daemon + +设置环境变量: + +```bash +export OV_DAEMON_ENABLED=true +export OV_DAEMON_WATCH_DIR=~/.claude/projects # 监听目录(可选) +export OV_DAEMON_BATCH_LINES=50 # 批处理触发行数(可选) +export OV_DAEMON_BATCH_SECONDS=300 # 批处理触发秒数(可选) +``` + +### 启动 OpenViking Server + +```bash +openviking serve +``` + +Daemon 会在服务器启动时自动运行(如果 `OV_DAEMON_ENABLED=true`)。 + +## 工作原理 + +1. **文件监听** — 监控 `~/.claude/projects/` 下的 `.jsonl` 文件变化 +2. **增量读取** — 文件游标技术,只处理新增内容 +3. **批量处理** — 累积 50 行或 5 分钟后触发 ETL 管道 +4. **知识提取** — 调用 LLM 过滤噪声,提取有价值的知识 +5. 
**自动存储** — 写入 `viking://skills/`、`viking://memories/`、`viking://resources/` + +## 知识分类 + +| 类型 | 目标路径 | 说明 | +|------|---------|------| +| Skills | `viking://skills/claude_code/<title>.md` | 可复用的操作指南 | +| Memories (有项目) | `viking://memories/projects/<project>/decisions.md` | 项目决策日志 | +| Memories (无项目) | `viking://memories/global/<title>.md` | 全局记忆 | +| Resources | `viking://resources/<tech>/<title>.md` | 参考资源 | + +## 架构 + +``` +Claude Code JSONL → File Watcher → Batch Buffer → Filter → Reconstruct + → LLM Extract → Deduplicate → Route → viking:// Storage +``` + +## 故障排查 + +### Daemon 未启动 +检查日志中是否有 `Active Daemon is disabled` 消息,确认 `OV_DAEMON_ENABLED=true`。 + +### 没有提取到知识 +- 确认 Claude Code 正在写入 JSONL 文件(`~/.claude/projects/` 下有 `.jsonl` 文件) +- 对话内容可能不够有价值(简单问答会被过滤) +- 查看日志中的 ETL 处理信息 + +### 知识写入失败 +检查 OpenViking ResourceService 是否正常运行。 diff --git a/docs/daemon/configuration.md b/docs/daemon/configuration.md new file mode 100644 index 0000000000..169bf341e1 --- /dev/null +++ b/docs/daemon/configuration.md @@ -0,0 +1,50 @@ +# Active Daemon 配置指南 + +## 环境变量 + +| 变量 | 说明 | 默认值 | +|------|------|--------| +| `OV_DAEMON_ENABLED` | 启用 Daemon | `false` | +| `OV_DAEMON_WATCH_DIR` | 监听目录 | `~/.claude/projects` | +| `OV_DAEMON_DB_PATH` | 游标数据库路径 | `~/.qoderworkcn/openviking/daemon_cursors.db` | +| `OV_DAEMON_BATCH_LINES` | 批处理触发行数 | `50` | +| `OV_DAEMON_BATCH_SECONDS` | 批处理触发秒数 | `300` | + +## JSON 配置 (ov.conf) + +```json +{ + "daemon": { + "enabled": true, + "watch_dir": "~/.claude/projects", + "batch_trigger_lines": 50, + "batch_trigger_seconds": 300 + } +} +``` + +## Docker 部署 + +```yaml +services: + openviking: + image: openviking:latest + environment: + - OV_DAEMON_ENABLED=true + - OV_DAEMON_WATCH_DIR=/data/claude-projects + - OV_DAEMON_DB_PATH=/data/daemon.db + volumes: + - ./claude-projects:/data/claude-projects + - ./daemon-data:/data + ports: + - "1933:1933" +``` + +## 日志 + +Daemon 使用 OpenViking 标准日志系统。关键日志: + +- `Claude Code watcher started on ...` — 监听器启动 +- `Flushing batch 
with N events` — 批处理触发 +- `Extracted N knowledge items` — 知识提取完成 +- `Knowledge ingested: viking://...` — 知识写入成功 diff --git a/docs/daemon/multi-source-design.md b/docs/daemon/multi-source-design.md new file mode 100644 index 0000000000..abf3828a09 --- /dev/null +++ b/docs/daemon/multi-source-design.md @@ -0,0 +1,1227 @@ +# Daemon 多源监听拓展设计 + +| 项目 | 信息 | +|-----|------| +| 状态 | `草案` | +| 创建日期 | 2026-06-16 | +| 基线分支 | `feature/active-daemon` | + +--- + +## 概述 + +将 Active Daemon 从单一 Claude Code CLI 监听源拓展为支持多个可配置监听源的通用知识采集框架。核心变更包括:引入 Watcher 抽象层、多源配置模型、Claude Desktop 专用 Watcher、以及 Source-aware 的知识路由。 + +--- + +## 目录 + +- [背景与问题](#背景与问题) +- [目标与非目标](#目标与非目标) +- [架构设计](#架构设计) +- [核心抽象](#核心抽象) +- [配置设计](#配置设计) +- [各 Watcher 实现](#各-watcher-实现) +- [ETL 管道适配](#etl-管道适配) +- [路由与存储适配](#路由与存储适配) +- [文件变更清单](#文件变更清单) +- [分阶段实施](#分阶段实施) +- [风险与权衡](#风险与权衡) +- [验证方案](#验证方案) + +--- + +## 背景与问题 + +### 当前架构 + +Daemon 目前仅支持一个监听源:Claude Code CLI 的 JSONL 会话日志(`~/.claude/projects/`)。架构是单 Watcher 硬编码: + +``` +DaemonService + └── ClaudeCodeWatcher(watch_dir="~/.claude/projects") + └── ClaudeCodeLogHandler → BatchBuffer → asyncio.Queue + ↓ + BatchETLPipeline (Filter → Reconstruct → Extract → Dedup) + ↓ + VikingStorageAdapter → viking://skills/claude_code/* +``` + +### 问题 + +| 问题 | 说明 | +|-----|------| +| 单源硬编码 | `DaemonService` 只实例化一个 `ClaudeCodeWatcher`,无法扩展 | +| 无抽象基类 | `watchers/__init__.py` 为空,没有 Watcher 协议定义 | +| 路由硬编码 | `KnowledgeRouter` 把 skills 统一路由到 `viking://skills/claude_code/`,不区分来源 | +| 模板硬编码 | `VikingStorageAdapter._format_skill()` 固定写 `Source: Claude Code session` | +| 配置扁平 | `DaemonConfig` 只有 `watch_dir`(单路径),无法表达多源 | +| Claude Desktop 未覆盖 | Claude Desktop 的会话存储在 IndexedDB 中(`%APPDATA%\Claude\`),当前完全忽略 | + +### 动机 + +用户可能同时使用多个 AI 编码工具(Claude Code CLI、Claude Desktop、Cursor、Aider 等),每个工具产生不同格式的日志。Daemon 应能从所有这些源中提取知识,统一汇入 OpenViking。 + +--- + +## 目标与非目标 + +### 目标 + +1. **Watcher 抽象**:定义统一接口,新增监听源只需实现一个 Watcher 类 +2. **多路径配置**:支持配置任意数量的监听源,每个源可指定独立路径和参数 +3. 
**Claude Desktop 支持**:提取 Claude Desktop 的会话数据(JSONL + IndexedDB) +4. **Source-aware 路由**:知识路由和存储模板反映数据来源 +5. **向后兼容**:单源配置自动迁移为多源格式,现有行为不变 +6. **共享 ETL**:所有 Watcher 的输出进入同一个 ETL 管道,无需管道层改动 + +### 非目标 + +1. **非 Cursor/Aider/Copilot 支持** —— 本期只做 Claude 生态(CLI + Desktop),架构预留扩展点但不实现 +2. **非实时处理** —— 保持批量处理模型(50 行 / 5 分钟阈值) +3. **不改变 LLM 提取逻辑** —— Prompt 和置信度阈值不变 +4. **不引入新依赖** —— IndexedDB 提取如需要 plyvel,作为 optional dependency + +--- + +## 架构设计 + +### 多源架构总览 + +``` +DaemonService + │ + ├── WatcherRegistry + │ │ + │ ├── ClaudeCodeWatcher (JSONL, ~/.claude/projects/) + │ │ └── ClaudeCodeLogHandler → normalize → ┐ + │ │ │ + │ ├── ClaudeDesktopWatcher (JSONL + IndexedDB) │ + │ │ ├── DesktopJsonlHandler → normalize → ──┤ + │ │ └── IndexedDBPoller → normalize → ──────┤ + │ │ │ + │ └── [未来] GenericJsonlWatcher (自定义路径) │ + │ └── GenericLogHandler → normalize → ────┤ + │ │ + │ ┌───────────────┘ + │ ↓ + │ asyncio.Queue[NormalizedEvent] + │ ↓ + └──────────────────→ BatchETLPipeline (Filter → Reconstruct → Extract → Dedup) + ↓ + SourceAwareRouter → viking://skills/{source}/* + ↓ + VikingStorageAdapter (source-aware templates) +``` + +### 数据流对比 + +``` +┌─ 当前 ───────────────────────────────────────────────────────┐ +│ │ +│ ~/.claude/projects/**/*.jsonl │ +│ ↓ ClaudeCodeWatcher │ +│ {role, content, type, timestamp} │ +│ ↓ Queue │ +│ ETL → viking://skills/claude_code/xxx │ +│ │ +└──────────────────────────────────────────────────────────────┘ + +┌─ 目标 ───────────────────────────────────────────────────────┐ +│ │ +│ Source A: ~/.claude/projects/**/*.jsonl │ +│ ↓ ClaudeCodeWatcher → normalize → { ..., source: │ +│ "claude_code" } │ +│ Source B: %APPDATA%/Claude/... 
(JSONL / IndexedDB) │ +│ ↓ ClaudeDesktopWatcher → normalize → { ..., source: │ +│ "claude_desktop"│ +│ } │ +│ Source C: /custom/path/**/*.jsonl (用户自定义) │ +│ ↓ GenericJsonlWatcher → normalize → { ..., source: │ +│ "custom_xxx" } │ +│ ↓ asyncio.Queue (共享) │ +│ ETL (unchanged) │ +│ ↓ │ +│ SourceAwareRouter → viking://skills/{source}/xxx │ +│ │ +└──────────────────────────────────────────────────────────────┘ +``` + +--- + +## 核心抽象 + +### 1. NormalizedEvent — 统一事件模型 + +在现有 `BatchBuffer` 的 `Dict[str, Any]` 事件基础上,增加 `source` 字段: + +```python +# openviking/daemon/models.py — 新增字段 + +@dataclass +class NormalizedEvent: + """所有 Watcher 产出的标准化事件。""" + role: str # "user" | "assistant" + content: str # 消息正文 + type: str # "message" + timestamp: Optional[str] # ISO-8601 + session_id: Optional[str] # 会话 ID + project_name: Optional[str] # 项目名 + source: str # 来源标识: "claude_code" | "claude_desktop" | "custom:xxx" + + def to_dict(self) -> Dict[str, Any]: + return { + "role": self.role, + "content": self.content, + "type": self.type, + "timestamp": self.timestamp, + "session_id": self.session_id, + "project_name": self.project_name, + "source": self.source, + } +``` + +**为什么新增 NormalizedEvent 而不是复用 Dict**:当前 pipeline 里事件是裸 Dict,各 Watcher 自行构造。增加 `source` 字段后需要类型约束,避免拼写错误导致路由分错。同时 `to_dict()` 保证与现有 `BatchETLPipeline.process_batch(events: List[Dict])` 接口兼容。 + +### 2. 
BaseWatcher — Watcher 抽象基类 + +```python +# openviking/daemon/watchers/base.py — 新文件 + +from abc import ABC, abstractmethod +from typing import Callable, List, Dict, Optional + +from openviking.daemon.cursor_manager import CursorManager + + +class BaseWatcher(ABC): + """所有日志监听器的抽象基类。""" + + def __init__( + self, + paths: List[str], + source_id: str, + cursor_manager: CursorManager, + batch_callback: Callable[[List[Dict]], None], + batch_trigger_lines: int = 50, + batch_trigger_seconds: int = 300, + ): + self.paths = paths + self.source_id = source_id + self.cursor_manager = cursor_manager + self.batch_callback = batch_callback + self.batch_trigger_lines = batch_trigger_lines + self.batch_trigger_seconds = batch_trigger_seconds + + @abstractmethod + def start(self): + """启动监听。可启动一个或多个 watchdog Observer 线程。""" + ... + + @abstractmethod + def stop(self): + """停止所有监听线程。""" + ... + + @abstractmethod + def flush(self): + """强制刷新所有缓冲区。""" + ... + + @property + @abstractmethod + def watcher_type(self) -> str: + """Watcher 类型标识,用于配置解析。""" + ... + + @property + def status(self) -> Dict: + """返回当前状态摘要,供 /daemon/status API 使用。""" + return { + "type": self.watcher_type, + "source_id": self.source_id, + "paths": self.paths, + } +``` + +**设计要点**: +- `paths: List[str]` — 支持多个监听路径(一个 Watcher 可监听多个目录) +- `source_id: str` — 来源标识,注入到 NormalizedEvent.source 和路由路径 +- `cursor_manager` 共享 — 所有 Watcher 共用同一个 SQLite 游标库,通过 file_path 主键天然隔离 +- `batch_callback` 共享 — 所有 Watcher 向同一个 `asyncio.Queue` 投递事件 + +### 3. 
WatcherFactory — 配置驱动的实例化 + +```python +# openviking/daemon/watchers/factory.py — 新文件 + +from typing import Dict, List, Type + +from openviking.daemon.watchers.base import BaseWatcher + + +class WatcherFactory: + """根据配置创建 Watcher 实例。""" + + _registry: Dict[str, Type[BaseWatcher]] = {} + + @classmethod + def register(cls, watcher_type: str, watcher_class: Type[BaseWatcher]): + cls._registry[watcher_type] = watcher_class + + @classmethod + def create( + cls, + source_config: Dict, + cursor_manager, + batch_callback, + default_lines: int = 50, + default_seconds: int = 300, + ) -> BaseWatcher: + watcher_type = source_config["type"] + watcher_class = cls._registry.get(watcher_type) + if not watcher_class: + raise ValueError( + f"Unknown watcher type: {watcher_type}. " + f"Available: {list(cls._registry.keys())}" + ) + return watcher_class( + paths=source_config.get("paths", []), + source_id=source_config.get("source_id", watcher_type), + cursor_manager=cursor_manager, + batch_callback=batch_callback, + batch_trigger_lines=source_config.get( + "batch_trigger_lines", default_lines + ), + batch_trigger_seconds=source_config.get( + "batch_trigger_seconds", default_seconds + ), + # source-specific options + options=source_config.get("options", {}), + ) +``` + +注册时机在 `DaemonService.start()` 中,通过 import 触发: + +```python +# openviking/daemon/watchers/__init__.py + +from openviking.daemon.watchers.factory import WatcherFactory +from openviking.daemon.watchers.claude_code_watcher import ClaudeCodeWatcher +from openviking.daemon.watchers.claude_desktop_watcher import ClaudeDesktopWatcher + +WatcherFactory.register("claude_code", ClaudeCodeWatcher) +WatcherFactory.register("claude_desktop", ClaudeDesktopWatcher) +``` + +--- + +## 配置设计 + +### 新配置模型 + +```python +# openviking/server/config.py — DaemonConfig 重构 + +class SourceConfig(BaseModel): + """单个监听源的配置。""" + type: str # "claude_code" | "claude_desktop" + paths: List[str] # 监听路径列表 + source_id: Optional[str] = None # 来源标识(默认 = 
type) + enabled: bool = True + batch_trigger_lines: Optional[int] = None # 覆盖全局值 + batch_trigger_seconds: Optional[int] = None # 覆盖全局值 + options: Dict[str, Any] = {} # Watcher 特定选项 + +class DaemonConfig(BaseModel): + """Active Daemon 配置(多源)。""" + enabled: bool = False + sources: List[SourceConfig] = [] # 多源配置 + db_path: Optional[str] = None + batch_trigger_lines: int = Field(50, gt=0) + batch_trigger_seconds: int = Field(300, gt=0) + + @classmethod + def from_env(cls) -> "DaemonConfig": + ... # 见下方兼容性设计 +``` + +### ov.conf 示例 + +```json +{ + "daemon": { + "enabled": true, + "db_path": "~/.qoderworkcn/openviking/daemon_cursors.db", + "batch_trigger_lines": 50, + "batch_trigger_seconds": 300, + "sources": [ + { + "type": "claude_code", + "paths": ["~/.claude/projects"], + "source_id": "claude_code" + }, + { + "type": "claude_desktop", + "paths": ["%APPDATA%/Claude"], + "source_id": "claude_desktop", + "options": { + "enable_indexeddb": true, + "indexeddb_path": "IndexedDB/https_claude.ai_0.indexeddb.leveldb" + } + }, + { + "type": "claude_code", + "paths": ["D:/Work/other-claude/projects", "E:/Shared/logs/claude"], + "source_id": "claude_code_work" + } + ] + } +} +``` + +### 向后兼容:单源 → 多源自动迁移 + +旧配置格式仍可使用: + +```json +{ + "daemon": { + "enabled": true, + "watch_dir": "~/.claude/projects" + } +} +``` + +迁移逻辑在 `DaemonConfig` 的 model_validator 中: + +```python +class DaemonConfig(BaseModel): + enabled: bool = False + # 旧字段(兼容) + watch_dir: Optional[str] = None + # 新字段 + sources: List[SourceConfig] = [] + db_path: Optional[str] = None + ... 
+ + @model_validator(mode="after") + def migrate_legacy_config(self): + if self.watch_dir and not self.sources: + self.sources = [ + SourceConfig( + type="claude_code", + paths=[self.watch_dir], + source_id="claude_code", + ) + ] + return self +``` + +### 环境变量兼容 + +```python +@classmethod +def from_env(cls) -> "DaemonConfig": + import os + + # 单源(旧): OV_DAEMON_WATCH_DIR + watch_dir = os.getenv("OV_DAEMON_WATCH_DIR") + + # 多源(新): OV_DAEMON_SOURCES (JSON 数组) + sources_json = os.getenv("OV_DAEMON_SOURCES") + + sources = [] + if sources_json: + sources = [SourceConfig(**s) for s in json.loads(sources_json)] + elif watch_dir: + sources = [SourceConfig( + type="claude_code", + paths=[watch_dir], + source_id="claude_code", + )] + + return cls( + enabled=os.getenv("OV_DAEMON_ENABLED", "false").lower() == "true", + sources=sources, + db_path=os.getenv("OV_DAEMON_DB_PATH"), + batch_trigger_lines=int(os.getenv("OV_DAEMON_BATCH_LINES", "50")), + batch_trigger_seconds=int(os.getenv("OV_DAEMON_BATCH_SECONDS", "300")), + ) +``` + +--- + +## 各 Watcher 实现 + +### 1. 
ClaudeCodeWatcher(重构) + +将现有 `ClaudeCodeWatcher` 从硬编码单路径重构为继承 `BaseWatcher`、支持多路径: + +```python +# openviking/daemon/watchers/claude_code_watcher.py — 重构 + +class ClaudeCodeWatcher(BaseWatcher): + """监听 Claude Code CLI 的 JSONL 会话日志。""" + + watcher_type = "claude_code" + + def __init__(self, paths, source_id, cursor_manager, batch_callback, + batch_trigger_lines=50, batch_trigger_seconds=300, + options=None): + super().__init__(paths, source_id or "claude_code", + cursor_manager, batch_callback, + batch_trigger_lines, batch_trigger_seconds) + self.observers: List[Observer] = [] + self.handlers: List[ClaudeCodeLogHandler] = [] + + def start(self): + for path in self.paths: + expanded = os.path.expanduser(os.path.expandvars(path)) + handler = ClaudeCodeLogHandler( + cursor_manager=self.cursor_manager, + batch_callback=self.batch_callback, + batch_trigger_lines=self.batch_trigger_lines, + batch_trigger_seconds=self.batch_trigger_seconds, + source_id=self.source_id, # 注入到 NormalizedEvent + ) + observer = Observer() + observer.schedule(handler, expanded, recursive=True) + observer.start() + self.observers.append(observer) + self.handlers.append(handler) + logger.info("ClaudeCodeWatcher started on %s (source=%s)", + expanded, self.source_id) + + def stop(self): + for obs in self.observers: + obs.stop() + obs.join() + + def flush(self): + for h in self.handlers: + h.force_flush() +``` + +**ClaudeCodeLogHandler 变更**:在 `_process_file` 产出的事件 Dict 中注入 `"source": self.source_id`。事件过滤逻辑(`_filter_events`)不变。 + +### 2. ClaudeDesktopWatcher(新实现) + +Claude Desktop 的会话数据有两个存储位置: + +| 位置 | 格式 | 内容 | +|-----|------|------| +| `%APPDATA%/Claude/logs/main.log` | 纯文本 | Electron 应用日志,包含 API 调用痕迹 | +| `%APPDATA%/Claude/IndexedDB/` | LevelDB | Chromium IndexedDB,存储 claude.ai Web 会话 | + +**策略:JSONL 子路径扫描 + IndexedDB 轮询双通道** + +```python +# openviking/daemon/watchers/claude_desktop_watcher.py — 新文件 + +class ClaudeDesktopWatcher(BaseWatcher): + """ + 监听 Claude Desktop 的会话数据。 + + 双通道: + 1. 
JSONL 扫描 — 扫描 paths 下的 .jsonl 文件(如果有外部导出) + 2. IndexedDB 轮询 — 读取 LevelDB 中的会话数据 + """ + + watcher_type = "claude_desktop" + + def __init__(self, paths, source_id, cursor_manager, batch_callback, + batch_trigger_lines=50, batch_trigger_seconds=300, + options=None): + super().__init__(paths, source_id or "claude_desktop", + cursor_manager, batch_callback, + batch_trigger_lines, batch_trigger_seconds) + self.options = options or {} + self.enable_indexeddb = self.options.get("enable_indexeddb", False) + self.indexeddb_relpath = self.options.get( + "indexeddb_path", + "IndexedDB/https_claude.ai_0.indexeddb.leveldb" + ) + self.observers: List[Observer] = [] + self.handlers: List[DesktopLogHandler] = [] + self._indexeddb_poller: Optional[IndexedDBPoller] = None + + def start(self): + # 通道 1: JSONL 文件监听 + for path in self.paths: + expanded = os.path.expanduser(os.path.expandvars(path)) + handler = DesktopLogHandler( + cursor_manager=self.cursor_manager, + batch_callback=self.batch_callback, + source_id=self.source_id, + ... + ) + observer = Observer() + observer.schedule(handler, expanded, recursive=True) + observer.start() + self.observers.append(observer) + self.handlers.append(handler) + + # 通道 2: IndexedDB 轮询(可选) + if self.enable_indexeddb: + self._indexeddb_poller = IndexedDBPoller( + base_paths=self.paths, + relpath=self.indexeddb_relpath, + cursor_manager=self.cursor_manager, + batch_callback=self.batch_callback, + source_id=self.source_id, + poll_interval=self.batch_trigger_seconds, + ) + self._indexeddb_poller.start() + + def stop(self): + for obs in self.observers: + obs.stop() + obs.join() + if self._indexeddb_poller: + self._indexeddb_poller.stop() + + def flush(self): + for h in self.handlers: + h.force_flush() + if self._indexeddb_poller: + self._indexeddb_poller.force_flush() +``` + +#### DesktopLogHandler — 纯文本日志解析 + +`main.log` 是 Electron 应用日志,格式如下: + +``` +2026-03-23 12:34:38 [info] Starting app { ... 
} +2026-03-23 12:34:39 [info] API call to /v1/messages { model: "claude-sonnet-4-20250514" } +``` + +DesktopLogHandler 的策略是**仅提取 JSON 负载中包含对话内容的行**,过滤掉纯应用日志: + +```python +class DesktopLogHandler(FileSystemEventHandler): + """解析 Claude Desktop 的 main.log 文件。""" + + # 匹配带 JSON 负载的日志行 + LOG_PATTERN = re.compile( + r"^(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})\s+\[(\w+)\]\s+(.+)$" + ) + + def on_modified(self, event): + if event.is_directory or not event.src_path.endswith(".log"): + return + self._process_file(event.src_path) + + def _process_file(self, file_path): + cursor = self.cursor_manager.get_cursor(file_path) + current_size = os.path.getsize(file_path) + if current_size <= cursor.last_position: + return + + with open(file_path, "r", encoding="utf-8", errors="replace") as f: + f.seek(cursor.last_position) + for line in f: + event = self._parse_log_line(line) + if event: + self.buffer.add_line(event, len(line)) + + new_position = f.tell() + + self.cursor_manager.update_cursor(file_path, new_position) + self._check_batch_trigger() + + def _parse_log_line(self, line) -> Optional[Dict]: + """ + 从日志行中提取对话事件。 + 仅保留包含 user/assistant 消息内容的行。 + """ + match = self.LOG_PATTERN.match(line.strip()) + if not match: + return None + + timestamp, level, message = match.groups() + + # 尝试从 message 中提取 JSON 负载 + try: + # 找到第一个 { 并解析到末尾 + json_start = message.index("{") + payload = json.loads(message[json_start:]) + except (ValueError, json.JSONDecodeError): + return None + + # 仅保留包含对话角色的事件 + role = payload.get("role") + if role not in ("user", "assistant"): + return None + + return { + "role": role, + "content": payload.get("content", ""), + "type": "message", + "timestamp": timestamp, + "session_id": payload.get("sessionId"), + "project_name": payload.get("projectName"), + "source": self.source_id, + } +``` + +#### IndexedDBPoller — LevelDB 轮询提取(可选) + +```python +# openviking/daemon/watchers/indexeddb_poller.py — 新文件 + +class IndexedDBPoller: + """ + 定期轮询 Claude Desktop 的 IndexedDB 
LevelDB 存储。 + 提取新增/更新的会话记录,转换为 NormalizedEvent。 + + 需要 plyvel 库(pip install plyvel)。 + """ + + def __init__(self, base_paths, relpath, cursor_manager, + batch_callback, source_id, poll_interval=300): + self.base_paths = base_paths + self.relpath = relpath + self.cursor_manager = cursor_manager + self.batch_callback = batch_callback + self.source_id = source_id + self.poll_interval = poll_interval + self._thread: Optional[threading.Thread] = None + self._stop_event = threading.Event() + + def start(self): + self._thread = threading.Thread( + target=self._poll_loop, daemon=True + ) + self._thread.start() + logger.info("IndexedDB poller started (interval=%ds)", self.poll_interval) + + def stop(self): + self._stop_event.set() + if self._thread: + self._thread.join(timeout=10) + + def _poll_loop(self): + while not self._stop_event.is_set(): + for base_path in self.base_paths: + db_path = os.path.join( + os.path.expanduser(os.path.expandvars(base_path)), + self.relpath, + ) + if os.path.isdir(db_path): + try: + self._extract_from_leveldb(db_path) + except ImportError: + logger.error( + "plyvel not installed. " + "Run: pip install plyvel" + ) + return # 不重试 + except Exception as e: + logger.error("IndexedDB extraction failed: %s", e) + + self._stop_event.wait(self.poll_interval) + + def _extract_from_leveldb(self, db_path: str): + """ + 从 LevelDB 中提取会话数据。 + + IndexedDB 在 LevelDB 中的 key 格式: + _file_version + <database_id>-<object_store_id>-<index_id>-<key> + + Claude Desktop 的会话数据存储在 object store 中, + key 前缀通常为 conversation 或 message 相关。 + + 由于 schema 未公开文档化,采用启发式提取: + 1. 遍历所有 key-value 对 + 2. 尝试 JSON 解析 value + 3. 
筛选包含 role=user/assistant 的记录 + """ + import plyvel + + cursor_key = f"indexeddb:{db_path}" + cursor = self.cursor_manager.get_cursor(cursor_key) + last_count = cursor.last_position # 复用 last_position 存已处理记录数 + + db = plyvel.DB(db_path, create_if_missing=False) + try: + events = [] + count = 0 + for key, value in db.iterator(): + count += 1 + if count <= last_count: + continue + + try: + record = json.loads(value) + except (json.JSONDecodeError, UnicodeDecodeError): + continue + + event = self._try_extract_event(record) + if event: + events.append(event) + + if events: + self.batch_callback(events) + + self.cursor_manager.update_cursor(cursor_key, count) + finally: + db.close() + + def _try_extract_event(self, record: Dict) -> Optional[Dict]: + """启发式提取:从 JSON 记录中查找对话消息。""" + # 场景 1: 记录本身是消息 + if record.get("role") in ("user", "assistant"): + return { + "role": record["role"], + "content": record.get("content", ""), + "type": "message", + "timestamp": record.get("timestamp"), + "session_id": record.get("conversationId", + record.get("sessionId")), + "project_name": None, + "source": self.source_id, + } + + # 场景 2: 记录包含 messages 数组 + messages = record.get("messages", []) + for msg in messages: + if isinstance(msg, dict) and msg.get("role") in ("user", "assistant"): + return { + "role": msg["role"], + "content": msg.get("content", ""), + "type": "message", + "timestamp": msg.get("timestamp"), + "session_id": record.get("id", + record.get("conversationId")), + "project_name": None, + "source": self.source_id, + } + + return None + + def force_flush(self): + """触发一次即时提取。""" + # 简化实现:设置 wait 为 0 让循环立即执行 + pass +``` + +### 3. GenericJsonlWatcher(预留,本期不实现) + +```python +# openviking/daemon/watchers/generic_jsonl_watcher.py — 预留接口 + +class GenericJsonlWatcher(BaseWatcher): + """ + 通用 JSONL 文件监听器。 + 支持自定义 JSON 字段映射(role_field, content_field 等)。 + 本期不实现,仅注册到 Factory 以便未来扩展。 + """ + watcher_type = "generic_jsonl" + # ... 
+``` + +--- + +## ETL 管道适配 + +### BatchETLPipeline — 最小改动 + +当前 `process_batch(events: List[Dict])` 接收裸 Dict 列表,不需要改接口。 + +需要改动的地方: + +**1. ConversationTurn 增加 source 字段** + +```python +# models.py + +@dataclass +class ConversationTurn: + user_prompt: str + assistant_response: str + session_id: Optional[str] = None + project_name: Optional[str] = None + timestamp: Optional[str] = None + source: Optional[str] = None # 新增 +``` + +**2. ConversationReconstructor 传递 source** + +```python +# conversation_reconstructor.py — reconstruct() 方法 + +def reconstruct(self, events: List[Dict]) -> List[ConversationTurn]: + # ... 现有配对逻辑 ... + turns.append(ConversationTurn( + user_prompt=user_event["content"], + assistant_response=assistant_event["content"], + session_id=user_event.get("session_id"), + project_name=user_event.get("project_name"), + timestamp=user_event.get("timestamp"), + source=user_event.get("source"), # 新增 + )) + return turns +``` + +**3. ExtractedKnowledge 增加 source 字段** + +```python +# models.py + +@dataclass +class ExtractedKnowledge: + status: str + category: str + title: str + content: str + confidence: float = 0.0 + project_name: Optional[str] = None + entity_links: List[str] = field(default_factory=list) + actionable_steps: List[str] = field(default_factory=list) + timestamp: Optional[str] = None + source: Optional[str] = None # 新增 +``` + +**4. 
KnowledgeExtractor 传递 source** + +`extract()` 方法在构造 `ExtractedKnowledge` 时,从 `ConversationTurn.source` 赋值到 `ExtractedKnowledge.source`。LLM prompt 不需要改。 + +--- + +## 路由与存储适配 + +### KnowledgeRouter — Source-aware 路由 + +```python +# knowledge_router.py — 重构 route() + +class KnowledgeRouter: + """Routes knowledge items to viking:// URIs based on category and source.""" + + def route(self, knowledge: ExtractedKnowledge) -> Optional[str]: + category = knowledge.category + source = knowledge.source or "unknown" + project_name = knowledge.project_name + title = self._sanitize_filename(knowledge.title) + + if category == "skills": + # viking://skills/{source}/{title}.md + # 例: viking://skills/claude_code/fix-docker-networking.md + # viking://skills/claude_desktop/design-review-pattern.md + return f"viking://skills/{self._sanitize_filename(source)}/{title}.md" + + elif category == "memories": + if project_name: + safe_project = self._sanitize_filename(project_name) + return f"viking://memories/projects/{safe_project}/decisions.md" + else: + return f"viking://memories/global/{title}.md" + + elif category == "resources": + entity_links = knowledge.entity_links + tech_stack = ( + self._sanitize_filename(entity_links[0]) + if entity_links else "general" + ) + return f"viking://resources/{tech_stack}/{title}.md" + + else: + logger.warning("Unknown category: %s", category) + return None +``` + +**变更影响**:原来 skills 统一写入 `viking://skills/claude_code/`,现在按 source 分目录。已有数据不受影响(路径不变),新数据会写入 `viking://skills/claude_desktop/` 等子目录。 + +### VikingStorageAdapter — Source-aware 模板 + +```python +# storage_adapter.py — 修改 _format_skill() 和 _format_resource() + +@staticmethod +def _format_skill(knowledge: ExtractedKnowledge, timestamp: str) -> str: + source_label = knowledge.source or "Claude Code" + # 将 source_id 映射为人类可读名称 + SOURCE_LABELS = { + "claude_code": "Claude Code CLI", + "claude_desktop": "Claude Desktop", + } + label = SOURCE_LABELS.get(source_label, source_label) + + steps = "" + if 
knowledge.actionable_steps: + steps = "\n## Steps\n\n" + for i, step in enumerate(knowledge.actionable_steps, 1): + steps += f"{i}. {step}\n" + else: + steps = "\n## Steps\n\n(No specific steps)\n" + + return f"""# {knowledge.title} + +**Extracted**: {timestamp} +**Source**: {label} session +**Confidence**: {knowledge.confidence} + +## Content + +{knowledge.content} +{steps} +--- +*Auto-generated by OpenViking Active Daemon* +""" +``` + +`_format_resource()` 同理,将 `Source: Claude Code session` 替换为动态 source label。 + +### REST API 扩展 + +```python +# routers/daemon.py — 扩展 status 响应 + +@router.get("/api/v1/daemon/status") +async def get_daemon_status(): + daemon = get_daemon_service() + if not daemon: + return {"enabled": False} + + return { + "enabled": True, + "running": daemon.is_running, + "sources": [w.status for w in daemon.watchers], # 多源状态 + "db_path": daemon.db_path, + "batch_trigger_lines": daemon.batch_trigger_lines, + "batch_trigger_seconds": daemon.batch_trigger_seconds, + "cursor_count": daemon.cursor_manager.count() if daemon.cursor_manager else 0, + } +``` + +--- + +## DaemonService 重构 + +```python +# service.py — 多源版本 + +class DaemonService: + + def __init__(self, resource_service, config: DaemonConfig): + self.resource_service = resource_service + self.config = config + self.db_path = config.db_path or str( + Path.home() / ".qoderworkcn" / "openviking" / "daemon_cursors.db" + ) + self.batch_trigger_lines = config.batch_trigger_lines + self.batch_trigger_seconds = config.batch_trigger_seconds + + self.cursor_manager: Optional[CursorManager] = None + self.watchers: List[BaseWatcher] = [] # 从单个变为列表 + self.etl_pipeline: Optional[BatchETLPipeline] = None + self.storage_adapter: Optional[VikingStorageAdapter] = None + + self._running = False + self._etl_task: Optional[asyncio.Task] = None + self._batch_queue: asyncio.Queue = asyncio.Queue() + + async def start(self): + logger.info("Starting OpenViking Active Daemon (multi-source)...") + + 
self.cursor_manager = CursorManager(self.db_path) + self.etl_pipeline = BatchETLPipeline() + self.storage_adapter = VikingStorageAdapter(self.resource_service) + + # 确保 Watcher 类型已注册 + import openviking.daemon.watchers # noqa: F401 触发注册 + + # 为每个 source 创建 Watcher + for source_config in self.config.sources: + if not source_config.enabled: + continue + + # 展开路径中的环境变量和 ~ + expanded_paths = [ + os.path.expanduser(os.path.expandvars(p)) + for p in source_config.paths + ] + + # 确保至少一个路径存在 + for p in expanded_paths: + Path(p).mkdir(parents=True, exist_ok=True) + + watcher = WatcherFactory.create( + source_config=source_config.model_dump(), + cursor_manager=self.cursor_manager, + batch_callback=self._enqueue_batch, + default_lines=self.batch_trigger_lines, + default_seconds=self.batch_trigger_seconds, + ) + watcher.start() + self.watchers.append(watcher) + + self._etl_task = asyncio.create_task(self._etl_loop()) + self._running = True + logger.info("Daemon started with %d source(s)", len(self.watchers)) + + async def stop(self): + self._running = False + for watcher in self.watchers: + watcher.stop() + if self._etl_task: + await self._batch_queue.put(None) + try: + await asyncio.wait_for(self._etl_task, timeout=10) + except asyncio.TimeoutError: + self._etl_task.cancel() + logger.info("Daemon stopped") + + async def flush(self): + for watcher in self.watchers: + watcher.flush() +``` + +--- + +## 文件变更清单 + +| 文件路径 | 操作 | 说明 | +|---------|------|------| +| `openviking/daemon/models.py` | 修改 | 新增 `NormalizedEvent`,`ConversationTurn` 和 `ExtractedKnowledge` 增加 `source` 字段 | +| `openviking/daemon/watchers/base.py` | **新建** | `BaseWatcher` 抽象基类 | +| `openviking/daemon/watchers/factory.py` | **新建** | `WatcherFactory` 配置驱动实例化 | +| `openviking/daemon/watchers/__init__.py` | 修改 | 注册所有 Watcher 类型 | +| `openviking/daemon/watchers/claude_code_watcher.py` | 修改 | 继承 `BaseWatcher`,支持多路径,注入 `source_id` | +| `openviking/daemon/watchers/claude_desktop_watcher.py` | **新建** | 
`ClaudeDesktopWatcher` + `DesktopLogHandler` | +| `openviking/daemon/watchers/indexeddb_poller.py` | **新建** | IndexedDB LevelDB 轮询提取器(可选) | +| `openviking/daemon/service.py` | 修改 | 多 Watcher 管理,配置驱动初始化 | +| `openviking/daemon/conversation_reconstructor.py` | 修改 | 传递 `source` 到 `ConversationTurn` | +| `openviking/daemon/knowledge_extractor.py` | 修改 | 传递 `source` 到 `ExtractedKnowledge` | +| `openviking/daemon/knowledge_router.py` | 修改 | Source-aware URI 路由 | +| `openviking/daemon/storage_adapter.py` | 修改 | Source-aware Markdown 模板 | +| `openviking/server/config.py` | 修改 | 新增 `SourceConfig`,重构 `DaemonConfig`,兼容迁移 | +| `openviking/server/routers/daemon.py` | 修改 | 扩展 status API 返回多源状态 | +| `openviking/server/app.py` | 修改 | 适配新的 `DaemonService(config=...)` 签名 | +| `tests/daemon/test_multi_source.py` | **新建** | 多源配置、Factory、路由测试 | +| `tests/daemon/test_claude_desktop_watcher.py` | **新建** | Desktop Watcher 单元测试 | +| `tests/daemon/test_indexeddb_poller.py` | **新建** | IndexedDB 提取测试(mock LevelDB) | +| `docs/daemon/configuration.md` | 修改 | 新增多源配置文档 | + +--- + +## 分阶段实施 + +### Phase 1: 抽象层与多源框架(Day 1-2) + +| 任务 | 文件 | 验收标准 | +|-----|------|---------| +| 创建 `BaseWatcher` ABC | `watchers/base.py` | 抽象方法定义完整,`status` 属性可用 | +| 创建 `WatcherFactory` | `watchers/factory.py` | `register()` + `create()` 工作正常 | +| 重构 `ClaudeCodeWatcher` | `watchers/claude_code_watcher.py` | 继承 `BaseWatcher`,支持 `paths: List[str]`,注入 `source`,现有测试仍通过 | +| 新增 `NormalizedEvent` | `models.py` | `to_dict()` 输出与旧 Dict 格式兼容 | +| 重构 `DaemonConfig` | `server/config.py` | 旧配置自动迁移,新配置正常解析 | +| 重构 `DaemonService` | `service.py` | 多 Watcher 列表管理,ETL 循环不变 | +| Source-aware 路由 | `knowledge_router.py` | skills 路由包含 source 前缀 | +| Source-aware 模板 | `storage_adapter.py` | Markdown 模板显示正确 source | +| 传递 source 到 Turn/Knowledge | `reconstructor.py`, `extractor.py` | `source` 字段端到端传递 | +| 集成测试 | `tests/daemon/test_multi_source.py` | 模拟双源配置,验证两条数据链路 | + +**Phase 1 里程碑**:配置多个 `claude_code` 类型源(不同路径),两个 Watcher 各自监听、共享 
ETL、路由到不同 `viking://skills/{source_id}/` 路径。 + +### Phase 2: Claude Desktop Watcher(Day 3-4) + +| 任务 | 文件 | 验收标准 | +|-----|------|---------| +| 实现 `ClaudeDesktopWatcher` | `watchers/claude_desktop_watcher.py` | JSONL 通道可用 | +| 实现 `DesktopLogHandler` | 同上 | 正确解析 `main.log` 格式,过滤非对话行 | +| 单元测试 | `tests/daemon/test_claude_desktop_watcher.py` | 覆盖日志解析、过滤、缓冲触发 | +| REST API 扩展 | `routers/daemon.py` | `/daemon/status` 返回多源状态 | +| 文档更新 | `docs/daemon/configuration.md` | 多源配置示例完整 | + +**Phase 2 里程碑**:配置 `claude_desktop` 源后,能解析 Desktop 日志文件并提取知识。 + +### Phase 3: IndexedDB 提取(Day 5,可选) + +| 任务 | 文件 | 验收标准 | +|-----|------|---------| +| 实现 `IndexedDBPoller` | `watchers/indexeddb_poller.py` | 能从 LevelDB 提取会话 | +| plyvel 可选依赖 | `pyproject.toml` | 未安装时优雅降级(日志警告) | +| 单元测试 | `tests/daemon/test_indexeddb_poller.py` | mock LevelDB 数据,验证提取逻辑 | +| 端到端验证 | — | 实际 Claude Desktop IndexedDB 数据提取成功 | + +**Phase 3 里程碑**:Claude Desktop 的 IndexedDB 会话数据可被自动提取和入库。 + +--- + +## 风险与权衡 + +| 风险 | 影响 | 缓解方案 | +|-----|------|---------| +| Claude Desktop IndexedDB schema 未公开 | 提取逻辑可能在版本更新后失效 | 启发式提取 + 异常兜底 + 日志告警 | +| `main.log` 不含完整对话内容 | Desktop Watcher 可能提取不到知识 | IndexedDB 作为补充通道;纯 JSONL 作为 fallback | +| plyvel 在 Windows 上安装困难 | IndexedDB 提取不可用 | 设为 optional dependency,未安装时跳过 | +| 多 Watcher 共享 CursorManager 的并发写入 | SQLite 锁争用 | CursorManager 已使用线程安全连接;每个 Watcher 操作不同 file_path 主键 | +| 多源事件到达顺序不确定 | 同一会话的 user/assistant 可能被分到不同 batch | ConversationReconstructor 已有排序逻辑;共享 Queue 保证 FIFO | +| source_id 冲突(两个源用同一 ID) | 路由路径碰撞 | 配置校验:启动时检查 source_id 唯一性 | +| 旧配置迁移出错 | 现有部署启动失败 | `model_validator` 中 try-except 兜底,日志警告 | + +--- + +## 验证方案 + +### 单元测试 + +```bash +# Phase 1 验收 +python -m pytest tests/daemon/test_multi_source.py -v +# 预期:多源 Factory 创建、配置迁移、Source-aware 路由 全部通过 + +# Phase 2 验收 +python -m pytest tests/daemon/test_claude_desktop_watcher.py -v +# 预期:日志解析、过滤规则、缓冲触发 全部通过 + +# 回归测试 +python -m pytest tests/daemon/ -v +# 预期:所有现有测试(test_claude_code_watcher, test_cursor_manager, test_integration)仍通过 
+``` + +### 集成测试场景 + +**场景 A:双 Claude Code 源** + +```json +{ + "sources": [ + {"type": "claude_code", "paths": ["~/.claude/projects"], "source_id": "cc_main"}, + {"type": "claude_code", "paths": ["D:/Work/claude-logs"], "source_id": "cc_work"} + ] +} +``` + +验证:两个目录的 JSONL 文件各自被监听,知识分别路由到 `viking://skills/cc_main/` 和 `viking://skills/cc_work/`。 + +**场景 B:CLI + Desktop 混合** + +```json +{ + "sources": [ + {"type": "claude_code", "paths": ["~/.claude/projects"]}, + {"type": "claude_desktop", "paths": ["%APPDATA%/Claude"]} + ] +} +``` + +验证:两个源各自正常采集,CLI 源走 JSONL 通道,Desktop 源走 log 解析通道,两者共享 ETL 管道。 + +### 手动验证步骤 + +1. 启动 Daemon:`openviking-server --with-daemon --config test-config.json` +2. 向 `~/.claude/projects` 下写入测试 JSONL → 观察 `cc_main` 路由 +3. 向 `%APPDATA%/Claude/logs/` 下追加模拟 `main.log` 行 → 观察 `claude_desktop` 路由 +4. 调用 `GET /api/v1/daemon/status` → 确认两个 source 状态正确 +5. 检查 `viking://skills/` 下的目录结构 → 确认按 source 分目录 + +--- + +## 一句话总结 + +通过引入 BaseWatcher 抽象层 + WatcherFactory + SourceConfig 配置模型,将 Daemon 从单源硬编码升级为多源可配置框架,在保持 ETL 管道不变的前提下支持 Claude Code CLI、Claude Desktop、以及未来任意 JSONL 日志源的知识采集。 diff --git a/examples/daemon/docker-compose-daemon.yml b/examples/daemon/docker-compose-daemon.yml new file mode 100644 index 0000000000..81dba7604b --- /dev/null +++ b/examples/daemon/docker-compose-daemon.yml @@ -0,0 +1,14 @@ +version: '3.8' + +services: + openviking: + image: openviking:latest + environment: + - OV_DAEMON_ENABLED=true + - OV_DAEMON_WATCH_DIR=/data/claude-projects + - OV_DAEMON_DB_PATH=/data/daemon.db + volumes: + - ./claude-projects:/data/claude-projects + - ./daemon-data:/data + ports: + - "1933:1933" diff --git a/examples/daemon/start-daemon.bat b/examples/daemon/start-daemon.bat new file mode 100644 index 0000000000..3e06e7e327 --- /dev/null +++ b/examples/daemon/start-daemon.bat @@ -0,0 +1,10 @@ +@echo off +REM Start OpenViking with Active Daemon enabled on Windows + +set OV_DAEMON_ENABLED=true +set OV_DAEMON_WATCH_DIR=%USERPROFILE%\.claude\projects +set 
class ConversationReconstructor:
    """Reconstructs paired conversation turns from chronological events."""

    def reconstruct(self, events: List[Dict]) -> List[ConversationTurn]:
        """
        Pair user prompts with assistant responses.

        Events are ordered by their ``timestamp`` value; events without a
        usable timestamp sort first and keep their relative input order
        (``sorted`` is stable). Each assistant event is paired with the most
        recent user prompt that has not been paired yet. A newer user prompt
        replaces an older unanswered one, and assistant events that arrive
        with no pending prompt are dropped.

        Args:
            events: Event dicts. Keys read here are ``role``, ``content``,
                ``timestamp``, ``session_id``, ``project_name`` and
                ``tool_name``.

        Returns:
            One ConversationTurn per user/assistant pair. Turn metadata is
            taken from the user event of the pair.
        """
        turns: List[ConversationTurn] = []
        current_user_prompt = None
        current_metadata: Dict = {}

        # `or ""` instead of a .get() default: a key that is present but set
        # to None would otherwise be compared against string timestamps and
        # make sorted() raise TypeError for the whole batch.
        sorted_events = sorted(events, key=lambda e: e.get("timestamp") or "")

        for event in sorted_events:
            role = event.get("role")
            content = event.get("content", "")

            if role == "user":
                current_user_prompt = content
                current_metadata = {
                    "session_id": event.get("session_id"),
                    "project_name": event.get("project_name"),
                    "timestamp": event.get("timestamp"),
                    "source_tool": event.get("tool_name"),
                }
            elif role == "assistant" and current_user_prompt:
                turns.append(
                    ConversationTurn(
                        user_prompt=current_user_prompt,
                        assistant_response=content,
                        session_id=current_metadata.get("session_id"),
                        project_name=current_metadata.get("project_name"),
                        timestamp=current_metadata.get("timestamp"),
                        source_tool=current_metadata.get("source_tool"),
                    )
                )
                # A prompt is consumed by its first answer; later assistant
                # events need a new user prompt to form a turn.
                current_user_prompt = None
                current_metadata = {}

        logger.info("Reconstructed %d conversation turns from %d events", len(turns), len(events))
        return turns
class CursorManager:
    """Manages file cursor state with SQLite persistence."""

    def __init__(self, db_path: str):
        """Remember the database location and make sure the schema exists."""
        self.db_path = db_path
        self._init_db()

    def _init_db(self):
        """Create the parent directory and the ``file_cursors`` table if missing."""
        Path(self.db_path).parent.mkdir(parents=True, exist_ok=True)

        db = sqlite3.connect(self.db_path)
        try:
            db.execute("""
                CREATE TABLE IF NOT EXISTS file_cursors (
                    file_path TEXT PRIMARY KEY,
                    last_position INTEGER NOT NULL DEFAULT 0,
                    last_read_time REAL NOT NULL DEFAULT 0.0,
                    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)
            db.commit()
        finally:
            db.close()

    def get_cursor(self, file_path: str) -> FileCursor:
        """Return the stored cursor for ``file_path``, or a default FileCursor if none exists."""
        db = sqlite3.connect(self.db_path)
        try:
            record = db.execute(
                "SELECT last_position, last_read_time FROM file_cursors WHERE file_path = ?",
                (file_path,),
            ).fetchone()
        finally:
            db.close()

        # No row yet: hand back a cursor built from the path alone.
        if not record:
            return FileCursor(file_path=file_path)

        position, read_time = record
        return FileCursor(
            file_path=file_path,
            last_position=position,
            last_read_time=read_time,
        )

    def update_cursor(self, file_path: str, position: int):
        """Store ``position`` for ``file_path`` together with the current wall-clock time."""
        now = time.time()
        db = sqlite3.connect(self.db_path)
        try:
            db.execute(
                """
                INSERT OR REPLACE INTO file_cursors (file_path, last_position, last_read_time)
                VALUES (?, ?, ?)
                """,
                (file_path, position, now),
            )
            db.commit()
        finally:
            db.close()

    def get_all_cursors(self) -> Dict[str, FileCursor]:
        """Return every tracked cursor, keyed by file path."""
        db = sqlite3.connect(self.db_path)
        try:
            records = db.execute(
                "SELECT file_path, last_position, last_read_time FROM file_cursors"
            ).fetchall()
        finally:
            db.close()

        cursors: Dict[str, FileCursor] = {}
        for path, position, read_time in records:
            cursors[path] = FileCursor(
                file_path=path,
                last_position=position,
                last_read_time=read_time,
            )
        return cursors


# --- openviking/daemon/deduplicator.py ---------------------------------------


class KnowledgeDeduplicator:
    """Deduplicates knowledge items based on MD5 content hash."""

    def __init__(self):
        # Hex digests of every content string seen since creation or clear().
        self.seen_hashes: Set[str] = set()

    def is_duplicate(self, knowledge: ExtractedKnowledge) -> bool:
        """Return True if identical content was seen before; otherwise record it and return False."""
        digest = hashlib.md5(knowledge.content.encode("utf-8")).hexdigest()

        if digest not in self.seen_hashes:
            # First sighting: remember it so later copies are rejected.
            self.seen_hashes.add(digest)
            return False

        logger.debug("Duplicate knowledge skipped: %s", knowledge.title)
        return True

    def clear(self):
        """Forget all previously seen content hashes."""
        self.seen_hashes.clear()
class BatchETLPipeline:
    """Orchestrates the full ETL flow from raw events to structured knowledge."""

    def __init__(self, vlm_config=None):
        """
        Build the pipeline stages.

        Args:
            vlm_config: Passed through unchanged to KnowledgeExtractor.
        """
        self.filter = LowValueFilter()
        self.reconstructor = ConversationReconstructor()
        self.extractor = KnowledgeExtractor(vlm_config=vlm_config)
        self.deduplicator = KnowledgeDeduplicator()

    async def process_batch(self, events: List[Dict]) -> List[ExtractedKnowledge]:
        """
        Process a batch of raw conversation events.

        Flow: events -> filter -> reconstruct -> extract (parallel) -> deduplicate

        Args:
            events: Raw event dicts as produced by the watchers.

        Returns:
            Knowledge items that were extracted and are not duplicates.
            Turns whose extraction failed or was cancelled are logged and
            skipped; they never abort the rest of the batch.
        """
        logger.info("Processing batch with %d events", len(events))

        # Step 1: Filter low-value content
        filtered_events = self.filter.apply(events)
        logger.info("After filtering: %d events", len(filtered_events))

        if not filtered_events:
            return []

        # Step 2: Reconstruct conversation turns
        turns = self.reconstructor.reconstruct(filtered_events)
        logger.info("Reconstructed %d conversation turns", len(turns))

        if not turns:
            return []

        # Step 3: Extract knowledge in parallel
        tasks = [self.extractor.extract(turn) for turn in turns]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        # Step 4: Filter errors and deduplicate
        extracted: List[ExtractedKnowledge] = []
        for result in results:
            # BaseException, not Exception: with return_exceptions=True a
            # cancelled child is reported as an asyncio.CancelledError
            # result, which is not an Exception subclass. Letting it through
            # would call is_duplicate() on an exception object.
            if isinstance(result, BaseException):
                logger.error("Extraction failed: %s", result)
                continue

            if result is not None and not self.deduplicator.is_duplicate(result):
                extracted.append(result)

        logger.info("Extracted %d knowledge items from batch", len(extracted))
        return extracted
class LowValueFilter:
    """Filters out low-value conversations using regex rules."""

    # Regexes matched (case-insensitively) against the start of the stripped
    # content; any hit drops the event.
    NOISE_PATTERNS = [
        r"^npm\s+(install|update|remove)",
        r"^yarn\s+(add|remove)",
        r"^pip\s+(install|uninstall)",
        r"^git\s+(commit|push|pull|merge)",
        r"^(SyntaxError|TypeError|ImportError|ModuleNotFoundError)",
        r"^Retry\s+\d+/",
        r"^Loading\.+",
        r"^(format|indent|align)\s+(this|the)\s+code",
    ]

    # Events whose stripped content is shorter than this are dropped.
    MIN_CONTENT_LENGTH = 20

    def apply(self, events: List[Dict]) -> List[Dict]:
        """
        Apply filtering rules to a list of events.

        An event is kept only if its ``content`` is a string that, once
        stripped, is at least MIN_CONTENT_LENGTH characters long and does not
        start with any NOISE_PATTERNS entry.

        Args:
            events: Event dicts; only the ``content`` key is inspected.

        Returns:
            The surviving events, in input order (same dict objects).
        """
        # Compile once per call instead of resolving each pattern for every
        # event. Built from self.NOISE_PATTERNS here so that subclass or
        # instance overrides of the list are still honoured.
        noise_patterns = [re.compile(p, re.IGNORECASE) for p in self.NOISE_PATTERNS]

        filtered = []

        for event in events:
            raw_content = event.get("content")

            # Missing, None or non-text content cannot be judged by the text
            # rules below; skip it rather than crash the whole batch on
            # .strip().
            if not isinstance(raw_content, str):
                continue

            content = raw_content.strip()

            # Rule 1: too short
            if len(content) < self.MIN_CONTENT_LENGTH:
                continue

            # Rule 2: noise pattern match
            if any(pattern.match(content) for pattern in noise_patterns):
                continue

            filtered.append(event)

        logger.debug("Filtered %d events down to %d", len(events), len(filtered))
        return filtered
class KnowledgeExtractor:
    """Extracts structured knowledge from conversation turns using LLM.

    The model is asked for a JSON object (see ``KNOWLEDGE_EXTRACTION_PROMPT``).
    Because that JSON is model output, every field is coerced defensively
    before it is used to build an ``ExtractedKnowledge``.
    """

    # Extractions the model itself rates below this confidence are discarded.
    MIN_CONFIDENCE = 0.6
    # Upper bound applied to the model-supplied title.
    MAX_TITLE_LENGTH = 50

    def __init__(self, vlm_config=None):
        """
        Args:
            vlm_config: Optional VLMConfig instance. If None, uses OpenViking's global config.
                        Pass a mock for testing.
        """
        self._vlm_config = vlm_config

    def _get_vlm(self):
        """Lazy-load the VLM config from OpenViking if not provided."""
        if self._vlm_config is None:
            self._vlm_config = get_openviking_config().vlm
        return self._vlm_config

    async def extract(self, turn: "ConversationTurn") -> Optional["ExtractedKnowledge"]:
        """Extract knowledge from a conversation turn.

        Args:
            turn: The user/assistant exchange to analyze.

        Returns:
            An ``ExtractedKnowledge`` when the model reports status
            ``"EXTRACTED"`` with confidence of at least ``MIN_CONFIDENCE``.
            Returns ``None`` otherwise, including on any error (errors are
            logged, never raised).
        """
        try:
            prompt = KNOWLEDGE_EXTRACTION_PROMPT.format(
                user_prompt=turn.user_prompt,
                assistant_response=turn.assistant_response,
            )
            response = await self._call_llm(prompt)

            if not isinstance(response, dict) or response.get("status") != "EXTRACTED":
                return None

            # Models sometimes emit the number as a string ("0.8") or null.
            # Comparing that directly raised and dropped valid knowledge.
            confidence = self._coerce_confidence(response.get("confidence"))
            if confidence < self.MIN_CONFIDENCE:
                return None

            return ExtractedKnowledge(
                status=response["status"],
                category=response["category"],
                title=str(response.get("title") or "")[: self.MAX_TITLE_LENGTH],
                content=self._clean_content(response.get("content")),
                confidence=confidence,
                project_name=response.get("project_name"),
                entity_links=self._as_str_list(response.get("entity_links")),
                actionable_steps=self._as_str_list(response.get("actionable_steps")),
                timestamp=turn.timestamp,
                source_tool=turn.source_tool,
            )

        except Exception as e:
            logger.error("Error extracting knowledge: %s", e)
            return None

    async def _call_llm(self, prompt: str) -> Optional[Dict]:
        """Call VLM via get_completion_async and parse JSON response.

        Returns the parsed response, or ``None`` when the call or the parsing
        fails (the failure is logged).
        """
        try:
            vlm = self._get_vlm()
            # get_completion_async returns str when no tools are provided
            raw_response = await vlm.get_completion_async(prompt=prompt)

            # parse_json_from_response handles markdown code block stripping,
            # JSON extraction, and json_repair fallback
            return parse_json_from_response(raw_response)

        except Exception as e:
            logger.error("LLM call failed: %s", e)
            return None

    @staticmethod
    def _coerce_confidence(value) -> float:
        """Return *value* as a float, or 0.0 when it is missing, non-numeric or NaN."""
        try:
            confidence = float(value)
        except (TypeError, ValueError):
            return 0.0
        # NaN compares False against any threshold and would slip through.
        return confidence if confidence == confidence else 0.0

    @staticmethod
    def _as_str_list(value) -> list:
        """Normalize a model-supplied list field to a list of non-blank strings.

        ``None`` becomes ``[]`` and a bare string becomes a one-item list, so
        that indexing the result yields a whole tag rather than one character.
        """
        if value is None:
            return []
        if isinstance(value, str):
            return [value] if value.strip() else []
        if isinstance(value, (list, tuple)):
            return [str(item) for item in value if item is not None and str(item).strip()]
        return []

    @staticmethod
    def _clean_content(content: str) -> str:
        """Remove fenced code blocks from content and trim whitespace.

        ``None`` yields an empty string; other non-string values are
        stringified first.
        """
        import re

        if content is None:
            return ""
        content = re.sub(r"```.*?```", "", str(content), flags=re.DOTALL)
        return content.strip()
@dataclass
class BatchBuffer:
    """Buffer for accumulating events before batch processing.

    ``created_at`` records when the oldest buffered line was added, so that
    callers can flush on age as well as on size. It is 0.0 while the buffer
    is empty.
    """
    lines: List[Dict[str, Any]] = field(default_factory=list)
    byte_count: int = 0
    created_at: float = 0.0

    def add_line(self, line: Dict[str, Any], byte_size: int):
        """Append *line* and add *byte_size* to the running byte total."""
        import time

        # Stamp the batch when its first line arrives. Nothing else sets this
        # field, and an unset value makes any age-based trigger unreachable.
        if self.created_at <= 0:
            self.created_at = time.time()
        self.lines.append(line)
        self.byte_count += byte_size

    def is_empty(self) -> bool:
        """Return True when no lines are buffered."""
        return len(self.lines) == 0

    def clear(self):
        """Drop all buffered lines and reset the counters and the age stamp."""
        self.lines.clear()
        self.byte_count = 0
        self.created_at = 0.0
class DaemonService:
    """
    OpenViking Active Daemon main service.
    Monitors multiple AI tool logs and extracts knowledge into viking:// storage.

    Watchers call ``_enqueue_batch`` to hand over batches of events. A single
    asyncio task (``_etl_loop``) consumes those batches, runs the ETL pipeline
    and writes the results through the storage adapter.
    """

    def __init__(
        self,
        resource_service,
        watcher_configs: Optional[List] = None,
        db_path: Optional[str] = None,
        # Backward-compatible single watcher args
        watch_dir: Optional[str] = None,
        batch_trigger_lines: int = 50,
        batch_trigger_seconds: int = 300,
    ):
        """
        Args:
            resource_service: Service handed to the storage adapter.
            watcher_configs: Watcher configurations. When empty or None, a
                single ``claude_code`` watcher is configured from the
                backward-compatible arguments below.
            db_path: Path of the cursor database. Defaults to a file under
                the user's home directory.
            watch_dir: Directory for the fallback ``claude_code`` watcher.
            batch_trigger_lines: Line-count flush threshold for the fallback watcher.
            batch_trigger_seconds: Age flush threshold for the fallback watcher.
        """
        self.resource_service = resource_service

        home = Path.home()
        self.db_path = db_path or str(
            home / ".qoderworkcn" / "openviking" / "daemon_cursors.db"
        )

        self.batch_trigger_lines = batch_trigger_lines
        self.batch_trigger_seconds = batch_trigger_seconds

        # Build watcher config list
        if watcher_configs:
            self._watcher_configs = watcher_configs
        else:
            # Backward compatible: single claude_code watcher
            from openviking.server.config import WatcherConfig
            wd = watch_dir or str(home / ".claude" / "projects")
            self._watcher_configs = [WatcherConfig(
                tool_name="claude_code",
                watch_dir=wd,
                batch_trigger_lines=batch_trigger_lines,
                batch_trigger_seconds=batch_trigger_seconds,
            )]

        # Components
        self.cursor_manager: Optional[CursorManager] = None
        self.watchers: List[BaseWatcher] = []
        self.etl_pipeline: Optional[BatchETLPipeline] = None
        self.storage_adapter: Optional[VikingStorageAdapter] = None

        self._running = False
        self._etl_task: Optional[asyncio.Task] = None
        self._batch_queue: asyncio.Queue = asyncio.Queue()
        # Event loop that owns the queue; captured in start() so that watcher
        # threads can hand batches over safely.
        self._loop: Optional[asyncio.AbstractEventLoop] = None

    async def start(self):
        """Start the Daemon service with all configured watchers."""
        logger.info("Starting OpenViking Active Daemon...")

        self._loop = asyncio.get_running_loop()

        # Ensure db directory exists before anything tries to open the database.
        Path(self.db_path).parent.mkdir(parents=True, exist_ok=True)

        self.cursor_manager = CursorManager(self.db_path)
        self.etl_pipeline = BatchETLPipeline()
        self.storage_adapter = VikingStorageAdapter(self.resource_service)

        # Start ETL loop. The flag is set first so the loop cannot observe a
        # "not running" state on its first iteration.
        self._running = True
        self._etl_task = asyncio.create_task(self._etl_loop())

        # Create and start each watcher
        for wc in self._watcher_configs:
            watch_dir = os.path.expanduser(wc.watch_dir)
            Path(watch_dir).mkdir(parents=True, exist_ok=True)

            try:
                watcher = create_watcher(
                    tool_name=wc.tool_name,
                    watch_dir=watch_dir,
                    cursor_manager=self.cursor_manager,
                    batch_callback=self._enqueue_batch,
                    file_pattern=wc.file_pattern,
                    batch_trigger_lines=wc.batch_trigger_lines,
                    batch_trigger_seconds=wc.batch_trigger_seconds,
                    extra=wc.extra,
                )
                watcher.start()
                self.watchers.append(watcher)
                logger.info("Watcher started: %s -> %s", wc.tool_name, watch_dir)
            except Exception as e:
                logger.warning("Failed to start watcher %s: %s", wc.tool_name, e)

        logger.info("Daemon started with %d watcher(s)", len(self.watchers))

    async def stop(self):
        """Stop all watchers, drain queued batches, and stop the ETL loop."""
        logger.info("Stopping OpenViking Active Daemon...")

        self._running = False

        for watcher in self.watchers:
            try:
                watcher.stop()
            except Exception as e:
                logger.warning("Error stopping watcher: %s", e)
            try:
                # Hand over whatever is still buffered so it gets a chance to
                # be processed before shutdown.
                watcher.flush()
            except Exception as e:
                logger.warning("Error flushing watcher: %s", e)

        if self._etl_task:
            # Let hand-offs scheduled by watcher threads reach the queue
            # before the sentinel, otherwise they would sit behind it.
            await asyncio.sleep(0)
            await self._batch_queue.put(None)
            try:
                await asyncio.wait_for(self._etl_task, timeout=10)
            except asyncio.TimeoutError:
                self._etl_task.cancel()
            # A repeated stop() must not enqueue another sentinel that a
            # later start() would read as an immediate shutdown request.
            self._etl_task = None

        logger.info("Daemon stopped")

    def _enqueue_batch(self, events):
        """Sync callback from watcher thread - puts events onto async queue.

        asyncio queues are not thread-safe, so once the service has a loop
        the put is scheduled on that loop. Before ``start()`` there is no
        loop to hand over to and the batch is queued directly.
        """
        try:
            loop = self._loop
            if loop is not None and not loop.is_closed():
                loop.call_soon_threadsafe(self._batch_queue.put_nowait, events)
            else:
                self._batch_queue.put_nowait(events)
        except Exception as e:
            logger.error("Failed to enqueue batch: %s", e)

    async def _etl_loop(self):
        """Background loop that processes batches from the queue.

        Runs while the service is running, then keeps going until the queue
        is empty or the ``None`` sentinel from ``stop()`` is reached, so
        batches queued before shutdown are not dropped.
        """
        logger.info("ETL processing loop started")

        while self._running or not self._batch_queue.empty():
            try:
                events = await asyncio.wait_for(
                    self._batch_queue.get(), timeout=5.0
                )
            except asyncio.TimeoutError:
                continue

            if events is None:
                break

            try:
                extracted = await self.etl_pipeline.process_batch(events)
                if not extracted:
                    logger.info("No knowledge extracted from batch")
                    continue

                for knowledge in extracted:
                    try:
                        from openviking.server.identity import RequestContext, Role
                        from openviking_cli.session.user_id import UserIdentifier

                        ctx = RequestContext(
                            user=UserIdentifier.the_default_user(),
                            role=Role.ROOT,
                        )
                        success = await self.storage_adapter.write_knowledge(
                            knowledge, ctx
                        )
                        if success:
                            logger.info("Successfully wrote: %s", knowledge.title)
                        else:
                            logger.warning("Failed to write: %s", knowledge.title)
                    except Exception as e:
                        logger.error("Error writing knowledge: %s", e)

            except Exception as e:
                logger.error("Error in ETL processing: %s", e, exc_info=True)

        logger.info("ETL processing loop stopped")

    async def flush(self):
        """Force flush all watchers' buffers."""
        for watcher in self.watchers:
            watcher.flush()
        logger.info("Manual flush triggered for %d watchers", len(self.watchers))

    @property
    def is_running(self) -> bool:
        """Whether the service has been started and not yet stopped."""
        return self._running
+ + Args: + knowledge: The extracted knowledge to write + ctx: OpenViking RequestContext + + Returns: + True if successful, False otherwise + """ + uri = self.router.route(knowledge) + if not uri: + logger.warning("Cannot route knowledge: %s", knowledge.title) + return False + + temp_file = self._generate_temp_markdown(knowledge) + + try: + await self.resource_service.add_resource( + path=temp_file, + ctx=ctx, + to=uri, + build_index=True, + summarize=False, + ) + logger.info("Knowledge ingested: %s", uri) + return True + + except Exception as e: + logger.error("Failed to ingest knowledge to %s: %s", uri, e) + return False + + finally: + if os.path.exists(temp_file): + os.unlink(temp_file) + + def _generate_temp_markdown(self, knowledge: ExtractedKnowledge) -> str: + """Generate a temporary Markdown file for the knowledge item.""" + fd, path = tempfile.mkstemp(suffix=".md") + content = self._format_content(knowledge) + + with os.fdopen(fd, "w", encoding="utf-8") as f: + f.write(content) + + return path + + def _format_content(self, knowledge: ExtractedKnowledge) -> str: + """Format knowledge content as Markdown based on category.""" + timestamp = datetime.now().isoformat() + + if knowledge.category == "skills": + return self._format_skill(knowledge, timestamp) + elif knowledge.category == "memories": + return self._format_memory(knowledge, timestamp) + else: + return self._format_resource(knowledge, timestamp) + + @staticmethod + def _format_skill(knowledge: ExtractedKnowledge, timestamp: str) -> str: + """Format a skill knowledge item.""" + steps = "" + if knowledge.actionable_steps: + steps = "\n## Steps\n\n" + for i, step in enumerate(knowledge.actionable_steps, 1): + steps += f"{i}. 
{step}\n" + else: + steps = "\n## Steps\n\n(No specific steps)\n" + + source = knowledge.source_tool or "Claude Code" + + return f"""# {knowledge.title} + +**Extracted**: {timestamp} +**Source**: {source} session +**Confidence**: {knowledge.confidence} + +## Content + +{knowledge.content} +{steps} +--- +*Auto-generated by OpenViking Active Daemon* +""" + + @staticmethod + def _format_memory(knowledge: ExtractedKnowledge, timestamp: str) -> str: + """Format a memory knowledge item (append-friendly).""" + ts = "" + if knowledge.timestamp: + try: + ts = datetime.fromisoformat(knowledge.timestamp).strftime("%Y-%m-%d %H:%M") + except ValueError: + ts = timestamp + else: + ts = timestamp + + tags = ", ".join(knowledge.entity_links) if knowledge.entity_links else "none" + + return f"""## [{ts}] {knowledge.title} + +{knowledge.content} + +**Tags**: {tags} + +--- + +""" + + @staticmethod + def _format_resource(knowledge: ExtractedKnowledge, timestamp: str) -> str: + """Format a resource knowledge item.""" + tech = knowledge.entity_links[0] if knowledge.entity_links else "general" + source = knowledge.source_tool or "Claude Code" + + return f"""# {knowledge.title} + +**Category**: {tech} +**Extracted**: {timestamp} +**Source**: {source} session + +## Content + +{knowledge.content} + +--- +*Auto-generated by OpenViking Active Daemon* +""" diff --git a/openviking/daemon/watchers/__init__.py b/openviking/daemon/watchers/__init__.py new file mode 100644 index 0000000000..450f68faf4 --- /dev/null +++ b/openviking/daemon/watchers/__init__.py @@ -0,0 +1,27 @@ +""" +Watcher abstractions for OpenViking Active Daemon. +Provides BaseWatcher protocol and watcher registry for multi-tool support. +""" +from typing import Protocol, runtime_checkable + + +@runtime_checkable +class BaseWatcher(Protocol): + """Protocol that all tool-specific watchers must implement.""" + + @property + def tool_name(self) -> str: + """Return the identifier for this watcher's tool (e.g. 
@register_watcher("aider")
class AiderWatcher(BaseFileWatcher):
    """Watches Aider .aider.chat.history.md files.

    The history is multi-line markdown, so this watcher overrides
    ``_process_file`` and parses whole role blocks instead of using the
    per-line ``parse_line`` / ``normalize_event`` hooks.
    """

    # Regex patterns
    HEADER_RE = re.compile(r'^# aider chat started at (.+)$')
    PROJECT_RE = re.compile(r'^> (.+)$')
    USER_RE = re.compile(r'^#### user:\s*$')
    ASSISTANT_RE = re.compile(r'^#### assistant:\s*$')

    def __init__(self, watch_dir, cursor_manager, batch_callback,
                 file_pattern=".aider.chat.history.md",
                 batch_trigger_lines=50, batch_trigger_seconds=300, **kwargs):
        """Create the watcher; extra keyword arguments are accepted and ignored."""
        super().__init__(
            watch_dir=watch_dir,
            cursor_manager=cursor_manager,
            batch_callback=batch_callback,
            file_pattern=file_pattern,
            batch_trigger_lines=batch_trigger_lines,
            batch_trigger_seconds=batch_trigger_seconds,
        )

    @property
    def tool_name(self) -> str:
        return "aider"

    def matches_file_pattern(self, file_path: str) -> bool:
        """Match .aider.chat.history.md files."""
        filename = os.path.basename(file_path)
        return filename == ".aider.chat.history.md"

    def parse_line(self, line: str) -> Optional[Dict]:
        """Not used - Aider uses multi-line parsing via _process_file override."""
        return None

    def normalize_event(self, raw_event: Dict) -> Optional[Dict]:
        """Not used directly - _process_file creates normalized events."""
        return raw_event

    def _process_file(self, file_path: str):
        """
        Override: Parse Aider's multi-line markdown format.
        Extracts user/assistant conversation blocks from .aider.chat.history.md.

        The file is read in binary mode so the stored cursor is an exact byte
        offset. Re-encoding decoded text undercounts CRLF line endings and
        miscounts bytes that were replaced during decoding, which makes the
        cursor drift.
        """
        try:
            cursor = self.cursor_manager.get_cursor(file_path)
            file_size = os.path.getsize(file_path)

            start = cursor.last_position
            if file_size < start:
                # The file shrank (truncated or recreated), so the stored
                # offset no longer points into it. Start over from the top.
                start = 0
            elif file_size == start:
                return

            with open(file_path, "rb") as f:
                f.seek(start)
                raw = f.read()

            if not raw:
                return

            new_position = start + len(raw)
            new_content = raw.decode("utf-8", errors="replace")

            # Parse conversation blocks
            # NOTE(review): role state is not carried between reads, so a
            # message whose text arrives after its "#### role:" marker was
            # already consumed is dropped - confirm whether that matters.
            events = self._parse_aider_content(new_content)

            for event in events:
                event["tool_name"] = self.tool_name
                byte_size = len(event.get("content", "").encode("utf-8"))
                # Stamp the batch on its first event so the age-based
                # trigger can fire.
                if self._buffer.created_at <= 0:
                    self._buffer.created_at = time.time()
                self._buffer.add_line(event, byte_size)

            self.cursor_manager.update_cursor(file_path, new_position)
            self._check_batch_trigger()

        except Exception as e:
            logger.error("[%s] Error processing %s: %s", self.tool_name, file_path, e)

    def _parse_aider_content(self, content: str) -> List[Dict]:
        """Parse Aider markdown content into normalized events.

        A block starts at a ``#### user:`` or ``#### assistant:`` marker and
        runs until the next marker. Header and ``> `` lines update the
        timestamp and project name and are not part of any block. Blocks
        with no lines are skipped.
        """
        events = []

        current_role = None
        current_content = []
        current_timestamp = None
        project_name = None

        def close_block():
            # Emit the block collected so far, if it has any lines.
            if current_role and current_content:
                events.append(self._make_event(
                    current_role, current_content, current_timestamp, project_name))

        for line in content.splitlines():
            # Check for timestamp header
            header_match = self.HEADER_RE.match(line)
            if header_match:
                current_timestamp = header_match.group(1).strip()
                continue

            # Check for project path
            project_match = self.PROJECT_RE.match(line)
            if project_match:
                project_name = project_match.group(1).strip()
                continue

            # Check for role markers
            if self.USER_RE.match(line):
                next_role = "user"
            elif self.ASSISTANT_RE.match(line):
                next_role = "assistant"
            else:
                next_role = None

            if next_role is not None:
                close_block()
                current_role = next_role
                current_content = []
                continue

            # Accumulate content for current role
            if current_role:
                current_content.append(line)

        # Flush last block
        close_block()

        return events

    def _make_event(self, role: str, content_lines: List[str],
                    timestamp: Optional[str], project_name: Optional[str]) -> Dict:
        """Create a normalized event dict from parsed content."""
        content = "\n".join(content_lines).strip()
        return {
            "role": role,
            "content": content,
            "type": "message",
            "timestamp": timestamp,
            "project_name": project_name,
        }
class BaseFileWatcher(ABC):
    """
    Abstract base for file-append based watchers.

    Subclasses must implement:
    - tool_name (property): Return tool identifier string
    - parse_line(line): Parse a raw text line into a raw event dict (or None)
    - normalize_event(raw_event): Convert raw event to normalized dict (or None to skip)

    Optional overrides:
    - filter_event(event): Additional filtering. Return True to keep, False to skip.
    - matches_file_pattern(path): Custom file matching logic.
    """

    def __init__(
        self,
        watch_dir: str,
        cursor_manager: "CursorManager",
        batch_callback: Callable[[List[Dict]], None],
        file_pattern: str = "*.jsonl",
        batch_trigger_lines: int = 50,
        batch_trigger_seconds: int = 300,
    ):
        """
        Args:
            watch_dir: Directory to watch recursively; ``~`` is expanded.
            cursor_manager: Supplies and stores the per-file read offset.
            batch_callback: Called with the list of buffered events on flush.
            file_pattern: File name pattern, see ``matches_file_pattern``.
            batch_trigger_lines: Flush once this many events are buffered.
            batch_trigger_seconds: Flush once the oldest buffered event is this old.
        """
        self.watch_dir = os.path.expanduser(watch_dir)
        self.cursor_manager = cursor_manager
        self.batch_callback = batch_callback
        self.file_pattern = file_pattern
        self.batch_trigger_lines = batch_trigger_lines
        self.batch_trigger_seconds = batch_trigger_seconds

        self._buffer = BatchBuffer()
        self._observer: Optional[Observer] = None
        self._handler: Optional[_FileHandler] = None

    @property
    @abstractmethod
    def tool_name(self) -> str:
        """Return tool identifier (e.g. 'claude_code', 'aider')."""
        ...

    @abstractmethod
    def parse_line(self, line: str) -> Optional[Dict]:
        """Parse a raw text line into a raw event dict. Return None to skip."""
        ...

    @abstractmethod
    def normalize_event(self, raw_event: Dict) -> Optional[Dict]:
        """
        Convert a raw event dict to normalized format.
        Normalized format must have at minimum: role, content, type, tool_name.
        Return None to skip this event.
        """
        ...

    def filter_event(self, event: Dict) -> bool:
        """Additional filtering. Override for tool-specific rules. Default: keep all."""
        return True

    def matches_file_pattern(self, file_path: str) -> bool:
        """Check if file matches the watcher's file pattern.

        ``*.ext`` matches by suffix and a pattern starting with ``.`` keeps
        its historical prefix semantics. Any other pattern matches on
        equality or as a case-sensitive shell glob, so patterns such as
        ``session-*.jsonl`` work.
        """
        filename = os.path.basename(file_path)
        pattern = self.file_pattern
        if pattern.startswith("*."):
            return filename.endswith(pattern[1:])
        if pattern.startswith("."):
            return filename.startswith(pattern) or filename == pattern.lstrip(".")
        if filename == pattern:
            return True
        import fnmatch

        return fnmatch.fnmatchcase(filename, pattern)

    def start(self) -> None:
        """Start the watchdog Observer."""
        self._handler = _FileHandler(self)
        self._observer = Observer()
        self._observer.schedule(self._handler, self.watch_dir, recursive=True)
        self._observer.daemon = True
        self._observer.start()
        logger.info("[%s] Watcher started on %s", self.tool_name, self.watch_dir)

    def stop(self) -> None:
        """Stop the watchdog Observer."""
        if self._observer:
            self._observer.stop()
            self._observer.join(timeout=5)
        logger.info("[%s] Watcher stopped", self.tool_name)

    def flush(self) -> None:
        """Force flush the buffer."""
        self._flush_buffer()

    def _line_to_event(self, line: str, file_path: str):
        """Run one line through parse -> normalize -> filter.

        Returns ``(parsed, event)``. ``parsed`` is False when ``parse_line``
        rejected the line or raised. ``event`` is the normalized dict with
        ``tool_name`` set, or None when the line yields no event. Errors are
        logged and contained so one bad line cannot block the whole file.
        """
        try:
            raw_event = self.parse_line(line)
        except Exception as e:
            logger.warning("[%s] Unparseable line in %s: %s", self.tool_name, file_path, e)
            return False, None
        if raw_event is None:
            return False, None

        try:
            event = self.normalize_event(raw_event)
            if event is None or not self.filter_event(event):
                return True, None
            # Ensure tool_name is set
            event["tool_name"] = self.tool_name
        except Exception as e:
            logger.warning("[%s] Skipping line in %s: %s", self.tool_name, file_path, e)
            return True, None
        return True, event

    def _process_file(self, file_path: str):
        """Read new content from file using cursor, parse, normalize, buffer.

        The file is read in binary mode and the cursor advances by the exact
        number of bytes consumed. A trailing fragment with no newline that
        does not parse yet is left unread, so a record caught mid-write is
        picked up whole on the next event. If the file is smaller than the
        stored offset it is re-read from the start.
        """
        try:
            cursor = self.cursor_manager.get_cursor(file_path)
            file_size = os.path.getsize(file_path)

            start = cursor.last_position
            if file_size < start:
                # Truncated or recreated: the stored offset is meaningless.
                logger.info("[%s] %s shrank, re-reading from start", self.tool_name, file_path)
                start = 0
            elif file_size == start:
                return

            with open(file_path, "rb") as f:
                f.seek(start)
                raw = f.read()

            chunks = raw.splitlines(keepends=True)
            last_index = len(chunks) - 1
            consumed = 0

            for index, chunk in enumerate(chunks):
                line = chunk.decode("utf-8", errors="replace").strip()
                # Only the final chunk can be an unterminated fragment.
                complete = index < last_index or chunk.endswith(b"\n")

                parsed, event = (False, None)
                if line:
                    parsed, event = self._line_to_event(line, file_path)

                if not complete and not parsed:
                    # Probably still being written; retry on the next event.
                    break

                consumed += len(chunk)
                if event is None:
                    continue

                # Stamp the batch on its first event so the age-based
                # trigger can fire.
                if self._buffer.created_at <= 0:
                    self._buffer.created_at = time.time()
                byte_size = len(line.encode("utf-8"))
                self._buffer.add_line(event, byte_size)

            if consumed:
                self.cursor_manager.update_cursor(file_path, start + consumed)

            self._check_batch_trigger()

        except Exception as e:
            logger.error("[%s] Error processing file %s: %s", self.tool_name, file_path, e)

    def _check_batch_trigger(self):
        """Check if batch trigger conditions are met."""
        if self._buffer.is_empty():
            return

        line_count = len(self._buffer.lines)
        age = time.time() - self._buffer.created_at if self._buffer.created_at > 0 else 0

        if line_count >= self.batch_trigger_lines or age >= self.batch_trigger_seconds:
            self._flush_buffer()

    def _flush_buffer(self):
        """Flush buffered events via batch_callback."""
        if self._buffer.is_empty():
            return

        events = self._buffer.lines.copy()
        self._buffer.clear()

        logger.info("[%s] Flushing %d events", self.tool_name, len(events))
        try:
            self.batch_callback(events)
        except Exception as e:
            logger.error("[%s] Batch callback failed: %s", self.tool_name, e)
@register_watcher("claude_code")
class ClaudeCodeWatcher(BaseFileWatcher):
    """Watches Claude Code JSONL log files and extracts conversation events."""

    @property
    def tool_name(self) -> str:
        return "claude_code"

    def __init__(self, watch_dir, cursor_manager, batch_callback,
                 batch_trigger_lines=50, batch_trigger_seconds=300, **kwargs):
        """Create the watcher.

        The file pattern is fixed to ``*.jsonl``; extra keyword arguments
        (including any ``file_pattern``) are accepted and ignored.
        """
        super().__init__(
            watch_dir=watch_dir,
            cursor_manager=cursor_manager,
            batch_callback=batch_callback,
            file_pattern="*.jsonl",
            batch_trigger_lines=batch_trigger_lines,
            batch_trigger_seconds=batch_trigger_seconds,
        )

    def parse_line(self, line: str) -> Optional[Dict]:
        """Parse a JSONL line into a raw event dict.

        Returns None for empty lines, invalid JSON, and valid JSON that is
        not an object.
        """
        if not line:
            return None
        try:
            data = json.loads(line)
        except (json.JSONDecodeError, ValueError):
            return None
        # A line may legally hold an array or scalar. normalize_event calls
        # .get() on the result, so only objects may pass through.
        if not isinstance(data, dict):
            return None
        return data

    def normalize_event(self, raw_event: Dict) -> Optional[Dict]:
        """Normalize Claude Code event to common format.

        Keeps only events whose ``role`` is ``user`` or ``assistant`` and
        whose ``type`` is either absent/empty or ``message``.
        """
        role = raw_event.get("role")
        event_type = raw_event.get("type", "")

        if role not in ("user", "assistant"):
            return None
        if event_type and event_type != "message":
            return None

        return {
            "role": role,
            "content": raw_event.get("content", ""),
            "type": "message",
            "timestamp": raw_event.get("timestamp"),
            "session_id": raw_event.get("session_id"),
            "project_name": raw_event.get("project_name"),
        }
+""" +import json +from typing import Dict, List, Optional + +from openviking.daemon.watchers.base_file_watcher import BaseFileWatcher +from openviking.daemon.watchers.registry import register_watcher +from openviking_cli.utils.logger import get_logger + +logger = get_logger(__name__) + + +@register_watcher("continue_dev") +class ContinueDevWatcher(BaseFileWatcher): + """Watches Continue.dev JSON log files.""" + + def __init__(self, watch_dir, cursor_manager, batch_callback, + file_pattern="*.json", + batch_trigger_lines=50, batch_trigger_seconds=300, **kwargs): + super().__init__( + watch_dir=watch_dir, + cursor_manager=cursor_manager, + batch_callback=batch_callback, + file_pattern=file_pattern, + batch_trigger_lines=batch_trigger_lines, + batch_trigger_seconds=batch_trigger_seconds, + ) + + @property + def tool_name(self) -> str: + return "continue_dev" + + def parse_line(self, line: str) -> Optional[Dict]: + """Parse a Continue.dev JSON log line.""" + if not line: + return None + try: + data = json.loads(line) + if not isinstance(data, dict): + return None + return data + except (json.JSONDecodeError, ValueError): + return None + + def normalize_event(self, raw_event: Dict) -> Optional[Dict]: + """ + Normalize Continue.dev event. 
+ Continue.dev uses format: + {"role": "user"|"assistant", "content": "...", "timestamp": "..."} + """ + role = raw_event.get("role", "") + content = raw_event.get("content", "") + + if role not in ("user", "assistant"): + return None + if not content: + return None + + return { + "role": role, + "content": content, + "type": "message", + "timestamp": raw_event.get("timestamp"), + "session_id": raw_event.get("sessionId") or raw_event.get("session_id"), + "project_name": raw_event.get("workspaceDirectory"), + } diff --git a/openviking/daemon/watchers/cursor_watcher.py b/openviking/daemon/watchers/cursor_watcher.py new file mode 100644 index 0000000000..6224fc67d7 --- /dev/null +++ b/openviking/daemon/watchers/cursor_watcher.py @@ -0,0 +1,87 @@ +""" +Cursor IDE log watcher. +Monitors Cursor log files (JSON format) for AI conversation events. +""" +import json +from typing import Dict, List, Optional + +from openviking.daemon.watchers.base_file_watcher import BaseFileWatcher +from openviking.daemon.watchers.registry import register_watcher +from openviking_cli.utils.logger import get_logger + +logger = get_logger(__name__) + + +@register_watcher("cursor") +class CursorWatcher(BaseFileWatcher): + """ + Watches Cursor IDE log files. + + Cursor stores AI conversations in log files under its workspace storage. + The format varies by version, so we implement flexible parsing. 
def normalize_event(self, raw_event: Dict) -> Optional[Dict]:
    """
    Normalize a raw Cursor log event into the common message dict.

    Only the ``role`` key decides whether an event is kept:
    - "user", "human", "human_turn"          -> role "user"
    - "assistant", "ai", "ai_response", "bot" -> role "assistant"

    The text is taken from the first non-empty of ``message``, ``content``
    and ``text``. Events with any other (or missing) role, or with no text,
    return None. Note that an event carrying only an ``event`` key (e.g.
    {"event": "ai_response", "content": "..."}) has no ``role`` and is
    therefore dropped.

    Args:
        raw_event: Parsed JSON object from a Cursor log line.

    Returns:
        Dict with ``role``, ``content``, ``type``, ``timestamp`` and
        ``session_id``, or None when the event is not a usable chat message.
    """
    # Tuples rather than a dict/set lookup: an unhashable ``role`` value
    # (e.g. a list) must simply fail to match instead of raising TypeError.
    role_aliases = (
        ("user", ("user", "human", "human_turn")),
        ("assistant", ("assistant", "ai", "ai_response", "bot")),
    )

    raw_role = raw_event.get("role", "")
    for normalized_role, aliases in role_aliases:
        if raw_role not in aliases:
            continue

        content = (
            raw_event.get("message")
            or raw_event.get("content")
            or raw_event.get("text")
            or ""
        )
        if not content:
            # The alias groups are disjoint, so no other group can match.
            return None

        return {
            "role": normalized_role,
            "content": content,
            "type": "message",
            "timestamp": raw_event.get("timestamp") or raw_event.get("ts"),
            "session_id": raw_event.get("conversationId") or raw_event.get("session_id"),
        }

    return None
@register_watcher("generic_jsonl")
class GenericJSONLWatcher(BaseFileWatcher):
    """
    Generic JSONL watcher with customizable field mapping.

    Mapping options are supplied through the ``extra`` dict and override the
    entries of ``DEFAULTS`` (other keyword arguments are accepted but ignored):
    - role_field (str): JSON key for role. Default: "role"
    - user_role_value (str): Value that indicates user. Default: "user"
    - assistant_role_value (str): Value that indicates assistant. Default: "assistant"
    - content_field (str): JSON key for content. Default: "content"
    - timestamp_field (str): JSON key for timestamp. Default: "timestamp"
    - session_id_field (str): JSON key for session_id. Default: "session_id"
    - project_name_field (str): JSON key for project_name. Default: "project_name"
    - type_field (str): JSON key for event type. Default: "type"
    - message_type_value (str): Value that indicates a message event. Default: "message"
    """

    # Default field mappings
    DEFAULTS = {
        "role_field": "role",
        "user_role_value": "user",
        "assistant_role_value": "assistant",
        "content_field": "content",
        "timestamp_field": "timestamp",
        "session_id_field": "session_id",
        "project_name_field": "project_name",
        "type_field": "type",
        "message_type_value": "message",
    }

    def __init__(self, watch_dir, cursor_manager, batch_callback,
                 file_pattern="*.jsonl",
                 batch_trigger_lines=50, batch_trigger_seconds=300,
                 extra=None, **kwargs):
        super().__init__(
            watch_dir=watch_dir,
            cursor_manager=cursor_manager,
            batch_callback=batch_callback,
            file_pattern=file_pattern,
            batch_trigger_lines=batch_trigger_lines,
            batch_trigger_seconds=batch_trigger_seconds,
        )
        self.extra = extra or {}
        # Entries in ``extra`` win over the class-level defaults.
        self.mapping = {**self.DEFAULTS, **self.extra}

    @property
    def tool_name(self) -> str:
        """Identifier of the tool this watcher handles."""
        return "generic_jsonl"

    def parse_line(self, line: str) -> Optional[Dict]:
        """
        Decode one JSONL line.

        Returns the decoded object when it is a dict. Returns None for an
        empty line, invalid JSON, or a JSON value that is not an object
        (list, string, number, ...): ``normalize_event`` calls ``.get`` on the
        result, so anything but a dict must be rejected here.
        """
        if not line:
            return None
        try:
            data = json.loads(line)
        except (json.JSONDecodeError, ValueError):
            return None
        return data if isinstance(data, dict) else None

    def normalize_event(self, raw_event: Dict) -> Optional[Dict]:
        """
        Translate a raw record into the normalized message dict using
        ``self.mapping``.

        Returns None when the role value matches neither the configured user
        nor assistant value, when the content is empty, or when the record
        carries the configured type field with a value other than the
        configured message type.
        """
        m = self.mapping

        role_field = m["role_field"]
        raw_role = raw_event.get(role_field, "")

        # Map raw role to normalized role
        if raw_role == m["user_role_value"]:
            role = "user"
        elif raw_role == m["assistant_role_value"]:
            role = "assistant"
        else:
            return None

        content = raw_event.get(m["content_field"], "")
        if not content:
            return None

        # Optional type check: only enforced when a type field is configured
        # AND the record actually contains that key.
        type_field = m.get("type_field")
        message_type = m.get("message_type_value")
        if type_field and type_field in raw_event:
            if raw_event[type_field] != message_type:
                return None

        return {
            "role": role,
            "content": content,
            "type": "message",
            "timestamp": raw_event.get(m["timestamp_field"]),
            "session_id": raw_event.get(m["session_id_field"]),
            "project_name": raw_event.get(m["project_name_field"]),
        }
# Registry mapping tool names to watcher classes
_WATCHER_REGISTRY: Dict[str, Type] = {}

# Built-in watchers as (tool name, module path, class name), in registration order.
_BUILTIN_WATCHERS = (
    ("claude_code", "openviking.daemon.watchers.claude_code_watcher", "ClaudeCodeWatcher"),
    ("generic_jsonl", "openviking.daemon.watchers.generic_jsonl_watcher", "GenericJSONLWatcher"),
    ("aider", "openviking.daemon.watchers.aider_watcher", "AiderWatcher"),
    ("cursor", "openviking.daemon.watchers.cursor_watcher", "CursorWatcher"),
    ("continue_dev", "openviking.daemon.watchers.continue_dev_watcher", "ContinueDevWatcher"),
)

# ImportErrors hit while loading built-ins, keyed by tool name. Consulted only
# when a requested tool is missing, so the root cause is not lost.
_BUILTIN_IMPORT_ERRORS: Dict[str, ImportError] = {}


def register_watcher(tool_name: str):
    """Decorator to register a watcher class for a tool name.

    A later registration under the same name replaces the earlier one.
    """
    def decorator(cls):
        _WATCHER_REGISTRY[tool_name] = cls
        return cls
    return decorator


def create_watcher(tool_name: str, **kwargs) -> "BaseWatcher":
    """Factory: create a watcher instance by tool name.

    Args:
        tool_name: Name the watcher class was registered under.
        **kwargs: Passed unchanged to the watcher class constructor.

    Raises:
        ValueError: If no watcher is registered under ``tool_name``. When the
            name is a built-in whose module failed to import, that ImportError
            is attached as the exception's ``__cause__``.
    """
    cls = _WATCHER_REGISTRY.get(tool_name)
    if cls is None:
        available = list(_WATCHER_REGISTRY.keys())
        import_error = _BUILTIN_IMPORT_ERRORS.get(tool_name)
        raise ValueError(
            f"Unknown watcher tool: '{tool_name}'. Available: {available}"
        ) from import_error
    return cls(**kwargs)


def list_available_watchers() -> list:
    """Return list of registered watcher tool names."""
    return list(_WATCHER_REGISTRY.keys())


def _register_builtins():
    """Register built-in watchers. Called lazily to avoid import cycles.

    Does nothing when the registry already has entries. A built-in whose
    module cannot be imported is skipped (as before), but the ImportError is
    kept in ``_BUILTIN_IMPORT_ERRORS`` instead of being discarded.
    """
    import importlib

    if _WATCHER_REGISTRY:
        return

    for tool_name, module_path, class_name in _BUILTIN_WATCHERS:
        try:
            module = importlib.import_module(module_path)
        except ImportError as exc:
            _BUILTIN_IMPORT_ERRORS[tool_name] = exc
            continue

        watcher_cls = getattr(module, class_name, None)
        if watcher_cls is None:
            # Module exists but does not (yet) expose the class, e.g. it is
            # still mid-import in a cycle. ``from module import Class`` would
            # have raised ImportError here, so treat it the same way.
            _BUILTIN_IMPORT_ERRORS[tool_name] = ImportError(
                f"cannot import name {class_name!r} from {module_path!r}"
            )
            continue

        _WATCHER_REGISTRY[tool_name] = watcher_cls


_register_builtins()
batch_trigger_seconds=daemon_config.batch_trigger_seconds, + ) + await daemon_service.start() + from openviking.server.routers.daemon import set_daemon_service + set_daemon_service(daemon_service) + logger.info("Active Daemon started with %d watcher(s)", len(daemon_service.watchers)) + except Exception as e: + logger.warning("Failed to start Active Daemon: %s", e) + # Initialize tracing and OTLP log export from server.observability. from openviking.telemetry import tracer_module @@ -290,6 +318,16 @@ async def _oauth_gc_loop(store) -> None: # noqa: ANN001 await _initialize_runtime_state(app, service, config) yield + # Stop Active Daemon on shutdown + if daemon_service is not None: + try: + from openviking.server.routers.daemon import set_daemon_service + set_daemon_service(None) + await daemon_service.stop() + logger.info("Active Daemon stopped") + except Exception as e: + logger.warning("Failed to stop Active Daemon: %s", e) + # Cleanup from openviking.metrics.global_api import shutdown_metrics_async from openviking.observability.usage_audit import shutdown_usage_audit @@ -539,6 +577,7 @@ async def general_error_handler(request: Request, exc: Exception): app.include_router(watches_router) app.include_router(webdav_router) app.include_router(bot_router, prefix="/bot/v1") + app.include_router(daemon_router) # OAuth 2.1: when enabled, mount the official MCP SDK auth routes # (DCR / authorize / token / metadata) plus our authorize page + consent / diff --git a/openviking/server/bootstrap.py b/openviking/server/bootstrap.py index 7fa0228e25..c48aff8964 100644 --- a/openviking/server/bootstrap.py +++ b/openviking/server/bootstrap.py @@ -17,7 +17,7 @@ import uvicorn from openviking.server.app import create_app -from openviking.server.config import load_server_config +from openviking.server.config import DaemonConfig, load_server_config from openviking_cli.utils.config import OPENVIKING_CONFIG_ENV from openviking_cli.utils.config.config_loader import resolve_config_path 
from openviking_cli.utils.config.consts import ( @@ -169,6 +169,12 @@ def main(): dest="with_bot", help="Enable Bot API proxy to Vikingbot (requires Vikingbot running)", ) + parser.add_argument( + "--with-daemon", + action="store_true", + dest="with_daemon", + help="Enable Active Daemon for automatic knowledge extraction from AI tool logs", + ) parser.add_argument( "--bot-port", type=int, @@ -243,6 +249,8 @@ def main(): config.workers = args.workers if args.with_bot: config.with_bot = True + if args.with_daemon: + config.daemon.enabled = True # Configure logging for Uvicorn configure_uvicorn_logging() diff --git a/openviking/server/config.py b/openviking/server/config.py index 88943f8f4a..4a6becdd8a 100644 --- a/openviking/server/config.py +++ b/openviking/server/config.py @@ -3,7 +3,8 @@ """Server configuration for OpenViking HTTP Server.""" import sys -from typing import Dict, List, Literal, Optional +from pathlib import Path +from typing import Any, Dict, List, Literal, Optional from pydantic import BaseModel, Field, ValidationError @@ -156,6 +157,84 @@ class ToolOutputExternalizationConfig(BaseModel): model_config = {"extra": "forbid"} +class WatcherConfig(BaseModel): + """Configuration for a single tool watcher.""" + + tool_name: str + watch_dir: str + file_pattern: str = "*.jsonl" + enabled: bool = True + batch_trigger_lines: int = Field(50, gt=0) + batch_trigger_seconds: int = Field(300, gt=0) + extra: Dict[str, Any] = Field(default_factory=dict) + + model_config = {"extra": "forbid"} + + +class DaemonConfig(BaseModel): + """Configuration for OpenViking Active Daemon.""" + + enabled: bool = False + db_path: Optional[str] = None + watchers: List[WatcherConfig] = Field(default_factory=list) + + # Backward-compatible deprecated fields + watch_dir: Optional[str] = None + batch_trigger_lines: int = Field(50, gt=0) + batch_trigger_seconds: int = Field(300, gt=0) + + model_config = {"extra": "forbid"} + + def get_effective_watchers(self) -> List[WatcherConfig]: + 
""" + Return list of enabled watchers. + If watchers list is empty but watch_dir is set, create a single claude_code watcher + for backward compatibility. + """ + if self.watchers: + return [w for w in self.watchers if w.enabled] + elif self.watch_dir: + return [WatcherConfig( + tool_name="claude_code", + watch_dir=self.watch_dir, + batch_trigger_lines=self.batch_trigger_lines, + batch_trigger_seconds=self.batch_trigger_seconds, + )] + else: + return [WatcherConfig( + tool_name="claude_code", + watch_dir=str(Path.home() / ".claude" / "projects"), + batch_trigger_lines=self.batch_trigger_lines, + batch_trigger_seconds=self.batch_trigger_seconds, + )] + + @classmethod + def from_env(cls) -> "DaemonConfig": + """Load configuration from OV_DAEMON_* environment variables.""" + import os + import json as json_lib + + enabled = os.getenv("OV_DAEMON_ENABLED", "false").lower() == "true" + watchers_json = os.getenv("OV_DAEMON_WATCHERS") + + watchers = [] + if watchers_json: + try: + raw = json_lib.loads(watchers_json) + watchers = [WatcherConfig(**w) for w in raw] + except (json_lib.JSONDecodeError, ValueError): + pass + + return cls( + enabled=enabled, + watch_dir=os.getenv("OV_DAEMON_WATCH_DIR"), + db_path=os.getenv("OV_DAEMON_DB_PATH"), + batch_trigger_lines=int(os.getenv("OV_DAEMON_BATCH_LINES", "50")), + batch_trigger_seconds=int(os.getenv("OV_DAEMON_BATCH_SECONDS", "300")), + watchers=watchers, + ) + + class ServerConfig(BaseModel): host: str = "127.0.0.1" port: int = 1933 @@ -180,6 +259,7 @@ class ServerConfig(BaseModel): tool_output_externalization: ToolOutputExternalizationConfig = Field( default_factory=ToolOutputExternalizationConfig ) + daemon: DaemonConfig = Field(default_factory=DaemonConfig) model_config = {"extra": "forbid"} diff --git a/openviking/server/routers/__init__.py b/openviking/server/routers/__init__.py index ded23d4892..2d7e98903a 100644 --- a/openviking/server/routers/__init__.py +++ b/openviking/server/routers/__init__.py @@ -7,6 +7,7 @@ from 
@router.get("/status")
async def get_daemon_status() -> Dict[str, Any]:
    """
    Get multi-watcher daemon status.

    Returns:
        {
            "enabled": bool,
            "running": bool,
            "watchers": [...],
            "available_tools": [...],
            "db_path": str | null
        }

    When no daemon service is registered, ``watchers`` is empty and
    ``enabled`` / ``db_path`` come from the OV_DAEMON_* environment variables.
    Each watcher entry carries ``cursor_count``, which is the service-wide
    number of cursors (the cursor manager is shared), not a per-watcher figure.
    """
    from openviking.daemon.watchers.registry import list_available_watchers

    if _daemon_service is None:
        # Daemon not running — return config-based fallback
        # NOTE(review): this reads only the environment; a daemon enabled via
        # server config or --with-daemon is not reflected here — confirm intended.
        from openviking.server.config import DaemonConfig

        config = DaemonConfig.from_env()
        return {
            "enabled": config.enabled,
            "running": False,
            "watchers": [],
            "available_tools": list_available_watchers(),
            "db_path": config.db_path,
        }

    svc = _daemon_service

    # The count does not depend on the watcher, so read it once up front
    # rather than once per loop iteration.
    cursor_count = 0
    try:
        if svc.cursor_manager:
            cursor_count = len(svc.cursor_manager.get_all_cursors())
    except Exception:
        # Status must stay available even if the cursor store is unreadable.
        logger.debug("Could not read daemon cursor count", exc_info=True)

    watcher_statuses: List[Dict[str, Any]] = []
    for i, watcher in enumerate(svc.watchers):
        # Watchers and their configs are paired by position; a watcher without
        # a matching config reports None for the config-derived fields.
        wc = svc._watcher_configs[i] if i < len(svc._watcher_configs) else None

        watcher_statuses.append({
            "tool_name": watcher.tool_name,
            "watch_dir": wc.watch_dir if wc else None,
            "file_pattern": wc.file_pattern if wc else None,
            "enabled": True,
            "running": True,
            "cursor_count": cursor_count,
            "batch_trigger_lines": wc.batch_trigger_lines if wc else None,
            "batch_trigger_seconds": wc.batch_trigger_seconds if wc else None,
        })

    return {
        "enabled": True,
        "running": svc.is_running,
        "watchers": watcher_statuses,
        "available_tools": list_available_watchers(),
        "db_path": svc.db_path,
    }
= {} + self.updates = [] + + def get_cursor(self, file_path): + from openviking.daemon.models import FileCursor + return self.cursors.get(file_path, FileCursor(file_path=file_path)) + + def update_cursor(self, file_path, position): + from openviking.daemon.models import FileCursor + self.cursors[file_path] = FileCursor( + file_path=file_path, + last_position=position, + last_read_time=time.time(), + ) + self.updates.append((file_path, position)) + + +def _make_watcher(tmp_path, batch_trigger_lines=50, batch_trigger_seconds=300): + batches = [] + cursor_mgr = FakeCursorManager() + watcher = AiderWatcher( + watch_dir=str(tmp_path), + cursor_manager=cursor_mgr, + batch_callback=lambda events: batches.append(events), + batch_trigger_lines=batch_trigger_lines, + batch_trigger_seconds=batch_trigger_seconds, + ) + return watcher, batches, cursor_mgr + + +def test_matches_file_pattern(tmp_path): + w, _, _ = _make_watcher(tmp_path) + assert w.matches_file_pattern("/project/.aider.chat.history.md") + assert not w.matches_file_pattern("/project/other.md") + assert not w.matches_file_pattern("/project/aider.chat.history.md") + + +def test_parse_single_user_assistant_block(tmp_path): + w, _, _ = _make_watcher(tmp_path) + content = ( + "# aider chat started at 2024-01-15 10:30:00\n" + "\n" + "> /path/to/project\n" + "\n" + "#### user:\n" + "How do I implement a REST API in Flask?\n" + "\n" + "#### assistant:\n" + "Here's how to create a basic Flask REST API:\n" + "Use Flask and add routes.\n" + ) + events = w._parse_aider_content(content) + assert len(events) == 2 + assert events[0]["role"] == "user" + assert events[0]["content"] == "How do I implement a REST API in Flask?" 
+ assert events[1]["role"] == "assistant" + assert "Flask REST API" in events[1]["content"] + + +def test_parse_multiple_blocks(tmp_path): + w, _, _ = _make_watcher(tmp_path) + content = ( + "# aider chat started at 2024-01-15 10:30:00\n" + "> /my/project\n" + "#### user:\n" + "First question\n" + "#### assistant:\n" + "First answer\n" + "#### user:\n" + "Second question\n" + "#### assistant:\n" + "Second answer\n" + ) + events = w._parse_aider_content(content) + assert len(events) == 4 + assert events[0]["role"] == "user" + assert events[0]["content"] == "First question" + assert events[1]["role"] == "assistant" + assert events[1]["content"] == "First answer" + assert events[2]["role"] == "user" + assert events[2]["content"] == "Second question" + assert events[3]["role"] == "assistant" + assert events[3]["content"] == "Second answer" + + +def test_timestamp_and_project_extraction(tmp_path): + w, _, _ = _make_watcher(tmp_path) + content = ( + "# aider chat started at 2024-01-15 10:30:00\n" + "> /home/user/myproject\n" + "#### user:\n" + "Hello\n" + ) + events = w._parse_aider_content(content) + assert len(events) == 1 + assert events[0]["timestamp"] == "2024-01-15 10:30:00" + assert events[0]["project_name"] == "/home/user/myproject" + + +def test_empty_content_handling(tmp_path): + w, _, _ = _make_watcher(tmp_path) + events = w._parse_aider_content("") + assert events == [] + + events = w._parse_aider_content("# aider chat started at 2024-01-15 10:30:00\n") + assert events == [] + + +def test_multiline_content(tmp_path): + w, _, _ = _make_watcher(tmp_path) + content = ( + "#### user:\n" + "Line one\n" + "Line two\n" + "Line three\n" + ) + events = w._parse_aider_content(content) + assert len(events) == 1 + assert events[0]["content"] == "Line one\nLine two\nLine three" + + +def test_process_file_integration(tmp_path): + w, batches, _ = _make_watcher(tmp_path, batch_trigger_lines=2) + + test_file = tmp_path / ".aider.chat.history.md" + test_file.write_text( + "# 
aider chat started at 2024-01-15 10:30:00\n" + "> /project\n" + "#### user:\n" + "Hello\n" + "#### assistant:\n" + "Hi there\n", + encoding="utf-8", + ) + + w._process_file(str(test_file)) + + assert len(batches) == 1 + assert len(batches[0]) == 2 + assert batches[0][0]["role"] == "user" + assert batches[0][0]["tool_name"] == "aider" + assert batches[0][1]["role"] == "assistant" + + +def test_incremental_read(tmp_path): + w, batches, _ = _make_watcher(tmp_path, batch_trigger_lines=100) + + test_file = tmp_path / ".aider.chat.history.md" + test_file.write_text( + "#### user:\n" + "First message\n", + encoding="utf-8", + ) + w._process_file(str(test_file)) + + # Append more content + with open(str(test_file), "a", encoding="utf-8") as f: + f.write( + "#### assistant:\n" + "Response\n" + ) + w._process_file(str(test_file)) + + w.flush() + assert len(batches) == 1 + assert len(batches[0]) == 2 + assert batches[0][0]["role"] == "user" + assert batches[0][1]["role"] == "assistant" + + +def test_tool_name(tmp_path): + w, _, _ = _make_watcher(tmp_path) + assert w.tool_name == "aider" diff --git a/tests/daemon/test_base_file_watcher.py b/tests/daemon/test_base_file_watcher.py new file mode 100644 index 0000000000..badefba21f --- /dev/null +++ b/tests/daemon/test_base_file_watcher.py @@ -0,0 +1,163 @@ +"""Tests for BaseFileWatcher abstraction.""" +import os +import time +from typing import Dict, List, Optional +from unittest.mock import MagicMock + +from openviking.daemon.watchers.base_file_watcher import BaseFileWatcher + + +class ConcreteWatcher(BaseFileWatcher): + """Concrete implementation for testing.""" + + @property + def tool_name(self) -> str: + return "test_tool" + + def parse_line(self, line: str) -> Optional[Dict]: + import json + try: + return json.loads(line) + except (json.JSONDecodeError, ValueError): + return None + + def normalize_event(self, raw_event: Dict) -> Optional[Dict]: + role = raw_event.get("role") + content = raw_event.get("content", "") + if 
role not in ("user", "assistant"): + return None + return { + "role": role, + "content": content, + "type": "message", + "timestamp": raw_event.get("timestamp"), + } + + +class FakeCursorManager: + def __init__(self): + self.cursors = {} + + def get_cursor(self, file_path): + from openviking.daemon.models import FileCursor + return self.cursors.get(file_path, FileCursor(file_path=file_path)) + + def update_cursor(self, file_path, position): + from openviking.daemon.models import FileCursor + self.cursors[file_path] = FileCursor( + file_path=file_path, + last_position=position, + last_read_time=time.time(), + ) + + +def _make_watcher(tmp_path, batch_trigger_lines=50, batch_trigger_seconds=300): + batches = [] + cursor_mgr = FakeCursorManager() + watcher = ConcreteWatcher( + watch_dir=str(tmp_path), + cursor_manager=cursor_mgr, + batch_callback=lambda events: batches.append(events), + file_pattern="*.jsonl", + batch_trigger_lines=batch_trigger_lines, + batch_trigger_seconds=batch_trigger_seconds, + ) + return watcher, batches, cursor_mgr + + +def test_matches_file_pattern_jsonl(tmp_path): + w, _, _ = _make_watcher(tmp_path) + assert w.matches_file_pattern("/foo/bar.jsonl") + assert not w.matches_file_pattern("/foo/bar.txt") + + +def test_matches_file_pattern_exact(tmp_path): + w, _, _ = _make_watcher(tmp_path) + w.file_pattern = "history.md" + assert w.matches_file_pattern("/foo/history.md") + assert not w.matches_file_pattern("/foo/other.md") + + +def test_process_file_parses_and_normalizes(tmp_path): + w, batches, _ = _make_watcher(tmp_path, batch_trigger_lines=2) + + # Create a test file + test_file = tmp_path / "test.jsonl" + test_file.write_text( + '{"role": "user", "content": "hello"}\n' + '{"role": "assistant", "content": "hi there"}\n' + ) + + # Process it + w._process_file(str(test_file)) + + # Should have flushed (2 lines >= batch_trigger_lines=2) + assert len(batches) == 1 + assert len(batches[0]) == 2 + assert batches[0][0]["role"] == "user" + assert 
batches[0][0]["tool_name"] == "test_tool" + assert batches[0][1]["role"] == "assistant" + + +def test_process_file_skips_invalid_lines(tmp_path): + w, batches, _ = _make_watcher(tmp_path, batch_trigger_lines=100) + + test_file = tmp_path / "test.jsonl" + test_file.write_text( + '{"role": "user", "content": "hello"}\n' + 'not valid json\n' + '{"role": "tool", "content": "skipped"}\n' + ) + + w._process_file(str(test_file)) + w.flush() + + assert len(batches) == 1 + assert len(batches[0]) == 1 # only user message, tool role is filtered + assert batches[0][0]["role"] == "user" + + +def test_incremental_read_via_cursor(tmp_path): + w, batches, cm = _make_watcher(tmp_path, batch_trigger_lines=100) + + test_file = tmp_path / "test.jsonl" + test_file.write_text('{"role": "user", "content": "first"}\n') + w._process_file(str(test_file)) + + # Append more content + with open(str(test_file), "a", encoding="utf-8") as f: + f.write('{"role": "assistant", "content": "second"}\n') + w._process_file(str(test_file)) + + w.flush() + assert len(batches) == 1 + assert len(batches[0]) == 2 + + +def test_filter_event_override(tmp_path): + class FilteredWatcher(ConcreteWatcher): + def filter_event(self, event): + return "skip" not in event.get("content", "") + + batches = [] + w = FilteredWatcher( + watch_dir=str(tmp_path), + cursor_manager=FakeCursorManager(), + batch_callback=lambda events: batches.append(events), + file_pattern="*.jsonl", + batch_trigger_lines=100, + batch_trigger_seconds=300, + ) + + test_file = tmp_path / "test.jsonl" + test_file.write_text( + '{"role": "user", "content": "keep this"}\n' + '{"role": "user", "content": "skip this please"}\n' + ) + + w._process_file(str(test_file)) + w.flush() + + assert len(batches) == 1 + assert len(batches[0]) == 1 + assert batches[0][0]["content"] == "keep this" diff --git a/tests/daemon/test_claude_code_watcher.py b/tests/daemon/test_claude_code_watcher.py new file mode 100644 index 0000000000..a31b03e21a --- /dev/null +++ 
class FakeCursorManager:
    """In-memory stand-in for the cursor manager, used only by these tests."""

    def __init__(self):
        # updates: every (file_path, position) pair passed to update_cursor.
        # cursors: file path -> stored cursor object.
        self.updates = []
        self.cursors = {}

    def get_cursor(self, file_path):
        """Return the stored cursor for file_path, or a new FileCursor for it."""
        from openviking.daemon.models import FileCursor

        fresh_cursor = FileCursor(file_path=file_path)
        return self.cursors.get(file_path, fresh_cursor)

    def update_cursor(self, file_path, position):
        """Record the call; the cursor table itself is left untouched."""
        call = (file_path, position)
        self.updates.append(call)
test_normalize_event_assistant_message(): + watcher, _, _ = _make_watcher() + raw = {"role": "assistant", "type": "message", "content": "AI answer"} + result = watcher.normalize_event(raw) + assert result is not None + assert result["role"] == "assistant" + assert result["content"] == "AI answer" + + +def test_normalize_event_excludes_system_role(): + watcher, _, _ = _make_watcher() + raw = {"role": "system", "type": "message", "content": "System msg"} + assert watcher.normalize_event(raw) is None + + +def test_normalize_event_excludes_tool_call(): + watcher, _, _ = _make_watcher() + raw = {"role": "assistant", "type": "tool_call", "content": "call"} + assert watcher.normalize_event(raw) is None + + +def test_normalize_event_excludes_tool_result(): + watcher, _, _ = _make_watcher() + raw = {"role": "assistant", "type": "tool_result", "content": "result"} + assert watcher.normalize_event(raw) is None + + +def test_filter_event_keeps_messages(): + watcher, _, _ = _make_watcher() + event = {"role": "user", "type": "message", "content": "Hello", "tool_name": "claude_code"} + assert watcher.filter_event(event) is True + + +def test_process_file(): + """Test that _process_file reads, parses, normalizes, and buffers events.""" + watcher, batches, cursor_mgr = _make_watcher(batch_trigger_lines=2) + + with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False, encoding="utf-8") as f: + f.write(json.dumps({"role": "user", "type": "message", "content": "Hello"}) + "\n") + f.write(json.dumps({"role": "assistant", "type": "message", "content": "Hi"}) + "\n") + f.write(json.dumps({"role": "system", "type": "message", "content": "ignored"}) + "\n") + tmp_path = f.name + + try: + watcher._process_file(tmp_path) + # batch_trigger_lines=2, so 2 valid events should trigger flush + assert len(batches) == 1 + assert len(batches[0]) == 2 + assert batches[0][0]["role"] == "user" + assert batches[0][1]["role"] == "assistant" + assert all(e["tool_name"] == "claude_code" for e 
in batches[0]) + # Cursor should have been updated + assert len(cursor_mgr.updates) == 1 + assert cursor_mgr.updates[0][0] == tmp_path + finally: + os.unlink(tmp_path) + + +def test_force_flush_empty_buffer(): + watcher, batches, _ = _make_watcher() + watcher.flush() + assert len(batches) == 0 + + +def test_force_flush_with_data(): + watcher, batches, _ = _make_watcher() + watcher._buffer.add_line({"role": "user", "content": "test", "tool_name": "claude_code"}, 10) + watcher.flush() + assert len(batches) == 1 diff --git a/tests/daemon/test_config.py b/tests/daemon/test_config.py new file mode 100644 index 0000000000..3118d45b61 --- /dev/null +++ b/tests/daemon/test_config.py @@ -0,0 +1,85 @@ +"""Tests for WatcherConfig and DaemonConfig multi-watcher support.""" +import pytest +from openviking.server.config import WatcherConfig, DaemonConfig + + +def test_watcher_config_defaults(): + wc = WatcherConfig(tool_name="test", watch_dir="/tmp/test") + assert wc.file_pattern == "*.jsonl" + assert wc.enabled is True + assert wc.batch_trigger_lines == 50 + assert wc.extra == {} + + +def test_watcher_config_custom(): + wc = WatcherConfig( + tool_name="aider", + watch_dir="~/Projects", + file_pattern=".aider.chat.history.md", + batch_trigger_lines=100, + extra={"key": "value"}, + ) + assert wc.tool_name == "aider" + assert wc.file_pattern == ".aider.chat.history.md" + assert wc.extra == {"key": "value"} + + +def test_watcher_config_forbid_extra(): + with pytest.raises(ValueError): + WatcherConfig(tool_name="test", watch_dir="/tmp", unknown_field="x") + + +def test_daemon_config_get_effective_watchers_explicit(): + cfg = DaemonConfig( + enabled=True, + watchers=[ + WatcherConfig(tool_name="claude_code", watch_dir="/a"), + WatcherConfig(tool_name="aider", watch_dir="/b"), + ], + ) + effective = cfg.get_effective_watchers() + assert len(effective) == 2 + assert effective[0].tool_name == "claude_code" + assert effective[1].tool_name == "aider" + + +def 
test_daemon_config_get_effective_watchers_disabled_filtered(): + cfg = DaemonConfig( + enabled=True, + watchers=[ + WatcherConfig(tool_name="claude_code", watch_dir="/a"), + WatcherConfig(tool_name="aider", watch_dir="/b", enabled=False), + ], + ) + effective = cfg.get_effective_watchers() + assert len(effective) == 1 + assert effective[0].tool_name == "claude_code" + + +def test_daemon_config_backward_compat_watch_dir(): + cfg = DaemonConfig(enabled=True, watch_dir="~/.claude/projects") + effective = cfg.get_effective_watchers() + assert len(effective) == 1 + assert effective[0].tool_name == "claude_code" + assert effective[0].watch_dir == "~/.claude/projects" + + +def test_daemon_config_backward_compat_default(): + cfg = DaemonConfig(enabled=True) + effective = cfg.get_effective_watchers() + assert len(effective) == 1 + assert effective[0].tool_name == "claude_code" + + +def test_daemon_config_from_env_watchers(): + import os + os.environ["OV_DAEMON_ENABLED"] = "true" + os.environ["OV_DAEMON_WATCHERS"] = '[{"tool_name": "aider", "watch_dir": "/tmp"}]' + try: + cfg = DaemonConfig.from_env() + assert cfg.enabled is True + assert len(cfg.watchers) == 1 + assert cfg.watchers[0].tool_name == "aider" + finally: + os.environ.pop("OV_DAEMON_ENABLED", None) + os.environ.pop("OV_DAEMON_WATCHERS", None) diff --git a/tests/daemon/test_continue_dev_watcher.py b/tests/daemon/test_continue_dev_watcher.py new file mode 100644 index 0000000000..2a6e01dc14 --- /dev/null +++ b/tests/daemon/test_continue_dev_watcher.py @@ -0,0 +1,193 @@ +"""Tests for ContinueDevWatcher parsing and normalization logic.""" +import time +from typing import Dict, Optional + +from openviking.daemon.watchers.continue_dev_watcher import ContinueDevWatcher + + +class FakeCursorManager: + """Minimal stub for testing.""" + def __init__(self): + self.cursors = {} + self.updates = [] + + def get_cursor(self, file_path): + from openviking.daemon.models import FileCursor + return self.cursors.get(file_path, 
FileCursor(file_path=file_path)) + + def update_cursor(self, file_path, position): + from openviking.daemon.models import FileCursor + self.cursors[file_path] = FileCursor( + file_path=file_path, + last_position=position, + last_read_time=time.time(), + ) + self.updates.append((file_path, position)) + + +def _make_watcher(tmp_path, batch_trigger_lines=50, batch_trigger_seconds=300): + batches = [] + cursor_mgr = FakeCursorManager() + watcher = ContinueDevWatcher( + watch_dir=str(tmp_path), + cursor_manager=cursor_mgr, + batch_callback=lambda events: batches.append(events), + batch_trigger_lines=batch_trigger_lines, + batch_trigger_seconds=batch_trigger_seconds, + ) + return watcher, batches, cursor_mgr + + +def test_parse_line_valid_json(tmp_path): + w, _, _ = _make_watcher(tmp_path) + line = '{"role": "user", "content": "Hello"}' + result = w.parse_line(line) + assert result is not None + assert result["role"] == "user" + assert result["content"] == "Hello" + + +def test_parse_line_invalid_json(tmp_path): + w, _, _ = _make_watcher(tmp_path) + assert w.parse_line("not valid json") is None + assert w.parse_line("") is None + assert w.parse_line(" ") is None + + +def test_parse_line_non_dict(tmp_path): + w, _, _ = _make_watcher(tmp_path) + assert w.parse_line('"just a string"') is None + assert w.parse_line("[1, 2, 3]") is None + assert w.parse_line("42") is None + + +def test_normalize_standard_format(tmp_path): + w, _, _ = _make_watcher(tmp_path) + raw = {"role": "user", "content": "How do I sort a list?"} + event = w.normalize_event(raw) + assert event is not None + assert event["role"] == "user" + assert event["content"] == "How do I sort a list?" 
+ assert event["type"] == "message" + + +def test_normalize_assistant(tmp_path): + w, _, _ = _make_watcher(tmp_path) + raw = {"role": "assistant", "content": "Use the sorted() function."} + event = w.normalize_event(raw) + assert event is not None + assert event["role"] == "assistant" + assert event["content"] == "Use the sorted() function." + + +def test_normalize_missing_content_filtered(tmp_path): + w, _, _ = _make_watcher(tmp_path) + # Empty content + raw = {"role": "user", "content": ""} + assert w.normalize_event(raw) is None + + # Missing content key entirely + raw = {"role": "user"} + assert w.normalize_event(raw) is None + + +def test_normalize_invalid_role_filtered(tmp_path): + w, _, _ = _make_watcher(tmp_path) + raw = {"role": "system", "content": "System message"} + assert w.normalize_event(raw) is None + + raw = {"role": "tool", "content": "Tool output"} + assert w.normalize_event(raw) is None + + raw = {"content": "No role at all"} + assert w.normalize_event(raw) is None + + +def test_normalize_session_id_mapping(tmp_path): + w, _, _ = _make_watcher(tmp_path) + # sessionId field + raw = {"role": "user", "content": "test", "sessionId": "session-abc-123"} + event = w.normalize_event(raw) + assert event is not None + assert event["session_id"] == "session-abc-123" + + # session_id field (alternative) + raw = {"role": "user", "content": "test", "session_id": "session-xyz-789"} + event = w.normalize_event(raw) + assert event is not None + assert event["session_id"] == "session-xyz-789" + + +def test_normalize_workspace_directory_to_project_name(tmp_path): + w, _, _ = _make_watcher(tmp_path) + raw = { + "role": "user", + "content": "test", + "workspaceDirectory": "/home/user/my-project", + } + event = w.normalize_event(raw) + assert event is not None + assert event["project_name"] == "/home/user/my-project" + + +def test_normalize_timestamp(tmp_path): + w, _, _ = _make_watcher(tmp_path) + raw = {"role": "user", "content": "test", "timestamp": 
"2024-01-15T10:30:00Z"} + event = w.normalize_event(raw) + assert event is not None + assert event["timestamp"] == "2024-01-15T10:30:00Z" + + +def test_normalize_optional_fields_none(tmp_path): + w, _, _ = _make_watcher(tmp_path) + raw = {"role": "user", "content": "minimal event"} + event = w.normalize_event(raw) + assert event is not None + assert event["timestamp"] is None + assert event["session_id"] is None + assert event["project_name"] is None + + +def test_tool_name(tmp_path): + w, _, _ = _make_watcher(tmp_path) + assert w.tool_name == "continue_dev" + + +def test_process_file_integration(tmp_path): + w, batches, _ = _make_watcher(tmp_path, batch_trigger_lines=2) + + test_file = tmp_path / "continue.json" + test_file.write_text( + '{"role": "user", "content": "Hello"}\n' + '{"role": "assistant", "content": "Hi there"}\n', + encoding="utf-8", + ) + + w._process_file(str(test_file)) + + assert len(batches) == 1 + assert len(batches[0]) == 2 + assert batches[0][0]["role"] == "user" + assert batches[0][0]["tool_name"] == "continue_dev" + assert batches[0][1]["role"] == "assistant" + + +def test_process_file_filters_non_chat(tmp_path): + w, batches, _ = _make_watcher(tmp_path, batch_trigger_lines=100) + + test_file = tmp_path / "continue.json" + test_file.write_text( + '{"role": "user", "content": "Hello"}\n' + '{"role": "system", "content": "System prompt"}\n' + '{"role": "assistant", "content": "Hi"}\n' + 'not json at all\n', + encoding="utf-8", + ) + + w._process_file(str(test_file)) + w.flush() + + assert len(batches) == 1 + assert len(batches[0]) == 2 # only user + assistant, system filtered + assert batches[0][0]["role"] == "user" + assert batches[0][1]["role"] == "assistant" diff --git a/tests/daemon/test_conversation_reconstructor.py b/tests/daemon/test_conversation_reconstructor.py new file mode 100644 index 0000000000..b8c588855d --- /dev/null +++ b/tests/daemon/test_conversation_reconstructor.py @@ -0,0 +1,58 @@ +"""Tests for 
ConversationReconstructor.""" +from openviking.daemon.conversation_reconstructor import ConversationReconstructor + + +def test_reconstruct_simple_conversation(): + r = ConversationReconstructor() + events = [ + {"role": "user", "content": "How to configure PostgreSQL?", "timestamp": "2026-06-15T10:00:00Z"}, + {"role": "assistant", "content": "Edit postgresql.conf", "timestamp": "2026-06-15T10:00:01Z"}, + ] + turns = r.reconstruct(events) + assert len(turns) == 1 + assert turns[0].user_prompt == "How to configure PostgreSQL?" + assert turns[0].assistant_response == "Edit postgresql.conf" + + +def test_skip_orphaned_assistant(): + r = ConversationReconstructor() + events = [ + {"role": "assistant", "content": "Orphan answer", "timestamp": "2026-06-15T10:00:00Z"}, + ] + turns = r.reconstruct(events) + assert len(turns) == 0 + + +def test_multiple_turns(): + r = ConversationReconstructor() + events = [ + {"role": "user", "content": "Q1", "timestamp": "2026-06-15T10:00:00Z"}, + {"role": "assistant", "content": "A1", "timestamp": "2026-06-15T10:00:01Z"}, + {"role": "user", "content": "Q2", "timestamp": "2026-06-15T10:00:02Z"}, + {"role": "assistant", "content": "A2", "timestamp": "2026-06-15T10:00:03Z"}, + ] + turns = r.reconstruct(events) + assert len(turns) == 2 + assert turns[0].user_prompt == "Q1" + assert turns[1].user_prompt == "Q2" + + +def test_unpaired_user_prompt(): + r = ConversationReconstructor() + events = [ + {"role": "user", "content": "No answer", "timestamp": "2026-06-15T10:00:00Z"}, + ] + turns = r.reconstruct(events) + assert len(turns) == 0 + + +def test_preserves_metadata(): + r = ConversationReconstructor() + events = [ + {"role": "user", "content": "Q", "timestamp": "2026-06-15T10:00:00Z", "session_id": "s1", "project_name": "proj"}, + {"role": "assistant", "content": "A", "timestamp": "2026-06-15T10:00:01Z"}, + ] + turns = r.reconstruct(events) + assert turns[0].session_id == "s1" + assert turns[0].project_name == "proj" + assert 
turns[0].timestamp == "2026-06-15T10:00:00Z" diff --git a/tests/daemon/test_cursor_manager.py b/tests/daemon/test_cursor_manager.py new file mode 100644 index 0000000000..375df56d09 --- /dev/null +++ b/tests/daemon/test_cursor_manager.py @@ -0,0 +1,61 @@ +"""Tests for CursorManager.""" +import os +import tempfile + +import pytest + +from openviking.daemon.cursor_manager import CursorManager + + +@pytest.fixture +def temp_db(): + fd, path = tempfile.mkstemp(suffix=".db") + os.close(fd) + yield path + if os.path.exists(path): + os.unlink(path) + + +def test_default_cursor(temp_db): + manager = CursorManager(temp_db) + cursor = manager.get_cursor("/path/to/file.jsonl") + assert cursor.last_position == 0 + assert cursor.last_read_time == 0.0 + + +def test_save_and_load_cursor(temp_db): + manager = CursorManager(temp_db) + manager.update_cursor("/path/to/file.jsonl", 1024) + + cursor = manager.get_cursor("/path/to/file.jsonl") + assert cursor.last_position == 1024 + assert cursor.last_read_time > 0 + + +def test_persist_across_instances(temp_db): + manager1 = CursorManager(temp_db) + manager1.update_cursor("/path/to/file.jsonl", 2048) + + manager2 = CursorManager(temp_db) + cursor = manager2.get_cursor("/path/to/file.jsonl") + assert cursor.last_position == 2048 + + +def test_get_all_cursors(temp_db): + manager = CursorManager(temp_db) + manager.update_cursor("/path/file1.jsonl", 100) + manager.update_cursor("/path/file2.jsonl", 200) + + cursors = manager.get_all_cursors() + assert len(cursors) == 2 + assert cursors["/path/file1.jsonl"].last_position == 100 + assert cursors["/path/file2.jsonl"].last_position == 200 + + +def test_update_existing_cursor(temp_db): + manager = CursorManager(temp_db) + manager.update_cursor("/path/file.jsonl", 100) + manager.update_cursor("/path/file.jsonl", 500) + + cursor = manager.get_cursor("/path/file.jsonl") + assert cursor.last_position == 500 diff --git a/tests/daemon/test_cursor_watcher.py b/tests/daemon/test_cursor_watcher.py new 
file mode 100644 index 0000000000..e3982608f4 --- /dev/null +++ b/tests/daemon/test_cursor_watcher.py @@ -0,0 +1,187 @@ +"""Tests for CursorWatcher parsing and normalization logic.""" +import time +from typing import Dict, Optional + +from openviking.daemon.watchers.cursor_watcher import CursorWatcher + + +class FakeCursorManager: + """Minimal stub for testing.""" + def __init__(self): + self.cursors = {} + self.updates = [] + + def get_cursor(self, file_path): + from openviking.daemon.models import FileCursor + return self.cursors.get(file_path, FileCursor(file_path=file_path)) + + def update_cursor(self, file_path, position): + from openviking.daemon.models import FileCursor + self.cursors[file_path] = FileCursor( + file_path=file_path, + last_position=position, + last_read_time=time.time(), + ) + self.updates.append((file_path, position)) + + +def _make_watcher(tmp_path, batch_trigger_lines=50, batch_trigger_seconds=300): + batches = [] + cursor_mgr = FakeCursorManager() + watcher = CursorWatcher( + watch_dir=str(tmp_path), + cursor_manager=cursor_mgr, + batch_callback=lambda events: batches.append(events), + batch_trigger_lines=batch_trigger_lines, + batch_trigger_seconds=batch_trigger_seconds, + ) + return watcher, batches, cursor_mgr + + +def test_parse_line_valid_json(tmp_path): + w, _, _ = _make_watcher(tmp_path) + line = '{"type": "chat", "role": "user", "message": "Hello"}' + result = w.parse_line(line) + assert result is not None + assert result["role"] == "user" + assert result["message"] == "Hello" + + +def test_parse_line_invalid_json(tmp_path): + w, _, _ = _make_watcher(tmp_path) + assert w.parse_line("not valid json") is None + assert w.parse_line("") is None + assert w.parse_line(" ") is None + + +def test_parse_line_non_dict(tmp_path): + w, _, _ = _make_watcher(tmp_path) + assert w.parse_line('"just a string"') is None + assert w.parse_line("[1, 2, 3]") is None + assert w.parse_line("42") is None + + +def 
test_normalize_standard_chat_format(tmp_path): + w, _, _ = _make_watcher(tmp_path) + raw = {"type": "chat", "role": "user", "message": "How do I sort a list?"} + event = w.normalize_event(raw) + assert event is not None + assert event["role"] == "user" + assert event["content"] == "How do I sort a list?" + assert event["type"] == "message" + + +def test_normalize_human_role(tmp_path): + w, _, _ = _make_watcher(tmp_path) + raw = {"role": "human", "content": "What is Python?"} + event = w.normalize_event(raw) + assert event is not None + assert event["role"] == "user" + assert event["content"] == "What is Python?" + + +def test_normalize_human_turn_role(tmp_path): + w, _, _ = _make_watcher(tmp_path) + raw = {"role": "human_turn", "text": "Explain recursion"} + event = w.normalize_event(raw) + assert event is not None + assert event["role"] == "user" + assert event["content"] == "Explain recursion" + + +def test_normalize_ai_response(tmp_path): + w, _, _ = _make_watcher(tmp_path) + raw = {"role": "ai_response", "text": "Recursion is when a function calls itself."} + event = w.normalize_event(raw) + assert event is not None + assert event["role"] == "assistant" + assert event["content"] == "Recursion is when a function calls itself." + + +def test_normalize_assistant_role(tmp_path): + w, _, _ = _make_watcher(tmp_path) + raw = {"role": "assistant", "message": "Here is the answer."} + event = w.normalize_event(raw) + assert event is not None + assert event["role"] == "assistant" + assert event["content"] == "Here is the answer." 
+ + +def test_normalize_ai_role(tmp_path): + w, _, _ = _make_watcher(tmp_path) + raw = {"role": "ai", "content": "AI generated response"} + event = w.normalize_event(raw) + assert event is not None + assert event["role"] == "assistant" + assert event["content"] == "AI generated response" + + +def test_normalize_bot_role(tmp_path): + w, _, _ = _make_watcher(tmp_path) + raw = {"role": "bot", "message": "Bot reply"} + event = w.normalize_event(raw) + assert event is not None + assert event["role"] == "assistant" + assert event["content"] == "Bot reply" + + +def test_normalize_non_chat_event_filtered(tmp_path): + w, _, _ = _make_watcher(tmp_path) + # System message should be filtered + raw = {"role": "system", "content": "System prompt"} + assert w.normalize_event(raw) is None + + # No role + raw = {"type": "info", "content": "some log"} + assert w.normalize_event(raw) is None + + # Empty content + raw = {"role": "user", "message": ""} + assert w.normalize_event(raw) is None + + +def test_normalize_timestamp_fields(tmp_path): + w, _, _ = _make_watcher(tmp_path) + raw = {"role": "user", "message": "test", "timestamp": "2024-01-15T10:30:00Z"} + event = w.normalize_event(raw) + assert event["timestamp"] == "2024-01-15T10:30:00Z" + + # Also check ts field + raw = {"role": "user", "message": "test", "ts": "2024-01-15T11:00:00Z"} + event = w.normalize_event(raw) + assert event["timestamp"] == "2024-01-15T11:00:00Z" + + +def test_normalize_session_id_mapping(tmp_path): + w, _, _ = _make_watcher(tmp_path) + raw = {"role": "user", "message": "test", "conversationId": "conv-123"} + event = w.normalize_event(raw) + assert event["session_id"] == "conv-123" + + raw = {"role": "user", "message": "test", "session_id": "sess-456"} + event = w.normalize_event(raw) + assert event["session_id"] == "sess-456" + + +def test_tool_name(tmp_path): + w, _, _ = _make_watcher(tmp_path) + assert w.tool_name == "cursor" + + +def test_process_file_integration(tmp_path): + w, batches, _ = 
_make_watcher(tmp_path, batch_trigger_lines=2) + + test_file = tmp_path / "cursor.log" + test_file.write_text( + '{"role": "user", "message": "Hello"}\n' + '{"role": "assistant", "message": "Hi there"}\n', + encoding="utf-8", + ) + + w._process_file(str(test_file)) + + assert len(batches) == 1 + assert len(batches[0]) == 2 + assert batches[0][0]["role"] == "user" + assert batches[0][0]["tool_name"] == "cursor" + assert batches[0][1]["role"] == "assistant" diff --git a/tests/daemon/test_deduplicator.py b/tests/daemon/test_deduplicator.py new file mode 100644 index 0000000000..569c14a3fb --- /dev/null +++ b/tests/daemon/test_deduplicator.py @@ -0,0 +1,42 @@ +"""Tests for KnowledgeDeduplicator.""" +from openviking.daemon.deduplicator import KnowledgeDeduplicator +from openviking.daemon.models import ExtractedKnowledge + + +def _make_knowledge(title: str, content: str) -> ExtractedKnowledge: + return ExtractedKnowledge( + status="EXTRACTED", + category="memories", + title=title, + content=content, + ) + + +def test_first_occurrence_not_duplicate(): + dedup = KnowledgeDeduplicator() + k = _make_knowledge("Test", "Some unique content") + assert not dedup.is_duplicate(k) + + +def test_same_content_is_duplicate(): + dedup = KnowledgeDeduplicator() + k1 = _make_knowledge("Title A", "Same content") + k2 = _make_knowledge("Title B", "Same content") + assert not dedup.is_duplicate(k1) + assert dedup.is_duplicate(k2) + + +def test_different_content_not_duplicate(): + dedup = KnowledgeDeduplicator() + k1 = _make_knowledge("A", "Content 1") + k2 = _make_knowledge("B", "Content 2") + assert not dedup.is_duplicate(k1) + assert not dedup.is_duplicate(k2) + + +def test_clear_resets_cache(): + dedup = KnowledgeDeduplicator() + k = _make_knowledge("Test", "Content") + dedup.is_duplicate(k) + dedup.clear() + assert not dedup.is_duplicate(k) diff --git a/tests/daemon/test_filters.py b/tests/daemon/test_filters.py new file mode 100644 index 0000000000..52d0c07d50 --- /dev/null +++ 
b/tests/daemon/test_filters.py @@ -0,0 +1,53 @@ +"""Tests for LowValueFilter.""" +from openviking.daemon.filters import LowValueFilter + + +def test_filter_short_content(): + f = LowValueFilter() + events = [ + {"content": "Short"}, + {"content": "This is a longer meaningful conversation about architecture"}, + ] + filtered = f.apply(events) + assert len(filtered) == 1 + + +def test_filter_noise_patterns(): + f = LowValueFilter() + events = [ + {"content": "npm install lodash --save"}, + {"content": "git commit -m 'fix bug'"}, + {"content": "Let's discuss the architecture design pattern for the new module"}, + ] + filtered = f.apply(events) + assert len(filtered) == 1 + assert "architecture" in filtered[0]["content"] + + +def test_filter_pip_install(): + f = LowValueFilter() + events = [ + {"content": "pip install requests library for HTTP calls"}, + ] + filtered = f.apply(events) + assert len(filtered) == 0 + + +def test_preserves_valid_content(): + f = LowValueFilter() + events = [ + {"content": "We decided to use PostgreSQL instead of MySQL for better JSON support"}, + {"content": "The memory leak was caused by unclosed database connections"}, + ] + filtered = f.apply(events) + assert len(filtered) == 2 + + +def test_empty_content_filtered(): + f = LowValueFilter() + events = [ + {"content": ""}, + {"content": " "}, + ] + filtered = f.apply(events) + assert len(filtered) == 0 diff --git a/tests/daemon/test_generic_jsonl_watcher.py b/tests/daemon/test_generic_jsonl_watcher.py new file mode 100644 index 0000000000..38707cba98 --- /dev/null +++ b/tests/daemon/test_generic_jsonl_watcher.py @@ -0,0 +1,156 @@ +"""Tests for GenericJSONLWatcher with default and custom field mappings.""" +import json +import os +import tempfile + +from openviking.daemon.watchers.generic_jsonl_watcher import GenericJSONLWatcher + + +class FakeCursorManager: + """Minimal stub for testing.""" + def __init__(self): + self.cursors = {} + self.updates = [] + + def get_cursor(self, file_path): 
+ from openviking.daemon.models import FileCursor + return self.cursors.get(file_path, FileCursor(file_path=file_path)) + + def update_cursor(self, file_path, position): + self.updates.append((file_path, position)) + + +def _make_watcher(extra=None, batch_trigger_lines=50, batch_trigger_seconds=300): + batches = [] + cursor_mgr = FakeCursorManager() + watcher = GenericJSONLWatcher( + watch_dir=tempfile.gettempdir(), + cursor_manager=cursor_mgr, + batch_callback=lambda lines: batches.append(lines), + batch_trigger_lines=batch_trigger_lines, + batch_trigger_seconds=batch_trigger_seconds, + extra=extra, + ) + return watcher, batches, cursor_mgr + + +# -- parse_line tests -- + +def test_parse_valid_jsonl(): + watcher, _, _ = _make_watcher() + line = '{"role": "user", "content": "Hello"}' + result = watcher.parse_line(line) + assert result is not None + assert result["role"] == "user" + + +def test_parse_invalid_jsonl(): + watcher, _, _ = _make_watcher() + assert watcher.parse_line("not json") is None + assert watcher.parse_line("") is None + + +# -- normalize_event with default mapping -- + +def test_normalize_event_default_user(): + watcher, _, _ = _make_watcher() + raw = {"role": "user", "content": "Hello", "type": "message"} + result = watcher.normalize_event(raw) + assert result is not None + assert result["role"] == "user" + assert result["content"] == "Hello" + assert result["type"] == "message" + + +def test_normalize_event_default_assistant(): + watcher, _, _ = _make_watcher() + raw = {"role": "assistant", "content": "Response", "type": "message"} + result = watcher.normalize_event(raw) + assert result is not None + assert result["role"] == "assistant" + + +def test_normalize_event_excludes_unknown_role(): + watcher, _, _ = _make_watcher() + raw = {"role": "system", "content": "System msg", "type": "message"} + assert watcher.normalize_event(raw) is None + + +def test_normalize_event_excludes_empty_content(): + watcher, _, _ = _make_watcher() + raw = {"role": 
"user", "content": "", "type": "message"} + assert watcher.normalize_event(raw) is None + + +def test_normalize_event_excludes_wrong_type(): + watcher, _, _ = _make_watcher() + raw = {"role": "user", "content": "Hello", "type": "tool_call"} + assert watcher.normalize_event(raw) is None + + +def test_normalize_event_passes_when_no_type_field(): + """When the type field is absent, the type check is skipped.""" + watcher, _, _ = _make_watcher() + raw = {"role": "user", "content": "Hello"} + result = watcher.normalize_event(raw) + assert result is not None + assert result["role"] == "user" + + +# -- normalize_event with custom field mapping -- + +def test_normalize_event_custom_role_field(): + extra = {"role_field": "speaker", "user_role_value": "human", "assistant_role_value": "bot"} + watcher, _, _ = _make_watcher(extra=extra) + raw = {"speaker": "human", "content": "Hi", "type": "message"} + result = watcher.normalize_event(raw) + assert result is not None + assert result["role"] == "user" + + +def test_normalize_event_custom_content_field(): + extra = {"content_field": "text"} + watcher, _, _ = _make_watcher(extra=extra) + raw = {"role": "user", "text": "Custom content", "type": "message"} + result = watcher.normalize_event(raw) + assert result is not None + assert result["content"] == "Custom content" + + +def test_normalize_event_custom_type_field(): + extra = {"type_field": "event_type", "message_type_value": "chat"} + watcher, _, _ = _make_watcher(extra=extra) + raw = {"role": "user", "content": "Hello", "event_type": "chat"} + result = watcher.normalize_event(raw) + assert result is not None + + raw2 = {"role": "user", "content": "Hello", "event_type": "tool_use"} + assert watcher.normalize_event(raw2) is None + + +# -- _process_file integration -- + +def test_process_file(): + watcher, batches, cursor_mgr = _make_watcher(batch_trigger_lines=2) + + with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False, encoding="utf-8") as f: + 
f.write(json.dumps({"role": "user", "content": "Hello", "type": "message"}) + "\n") + f.write(json.dumps({"role": "assistant", "content": "Hi", "type": "message"}) + "\n") + f.write(json.dumps({"role": "system", "content": "ignored", "type": "message"}) + "\n") + tmp_path = f.name + + try: + watcher._process_file(tmp_path) + assert len(batches) == 1 + assert len(batches[0]) == 2 + assert batches[0][0]["role"] == "user" + assert batches[0][1]["role"] == "assistant" + assert all(e["tool_name"] == "generic_jsonl" for e in batches[0]) + assert len(cursor_mgr.updates) == 1 + finally: + os.unlink(tmp_path) + + +def test_tool_name(): + watcher, _, _ = _make_watcher() + assert watcher.tool_name == "generic_jsonl" diff --git a/tests/daemon/test_integration.py b/tests/daemon/test_integration.py new file mode 100644 index 0000000000..c4b497092e --- /dev/null +++ b/tests/daemon/test_integration.py @@ -0,0 +1,189 @@ +"""Integration tests for OpenViking Active Daemon.""" +import json +import os +import tempfile +from pathlib import Path +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from openviking.daemon.cursor_manager import CursorManager +from openviking.daemon.deduplicator import KnowledgeDeduplicator +from openviking.daemon.etl_pipeline import BatchETLPipeline +from openviking.daemon.filters import LowValueFilter +from openviking.daemon.conversation_reconstructor import ConversationReconstructor +from openviking.daemon.knowledge_router import KnowledgeRouter +from openviking.daemon.storage_adapter import VikingStorageAdapter +from openviking.daemon.models import ExtractedKnowledge + + +@pytest.fixture +def temp_watch_dir(): + with tempfile.TemporaryDirectory() as tmpdir: + yield Path(tmpdir) + + +@pytest.fixture +def temp_db(): + fd, path = tempfile.mkstemp(suffix=".db") + os.close(fd) + yield path + if os.path.exists(path): + os.unlink(path) + + +def test_jsonl_file_created_and_readable(temp_watch_dir): + """Verify we can create and read JSONL files in 
the watch directory.""" + jsonl_file = temp_watch_dir / "test_session.jsonl" + + events = [ + { + "timestamp": "2026-06-15T10:00:00Z", + "role": "user", + "content": "How to configure PostgreSQL for high availability?", + "type": "message", + }, + { + "timestamp": "2026-06-15T10:00:01Z", + "role": "assistant", + "content": "Edit postgresql.conf, set max_connections=100 and enable WAL archiving", + "type": "message", + }, + ] + + with open(jsonl_file, "w", encoding="utf-8") as f: + for event in events: + f.write(json.dumps(event) + "\n") + + assert jsonl_file.exists() + lines = jsonl_file.read_text(encoding="utf-8").strip().split("\n") + assert len(lines) == 2 + + parsed = [json.loads(line) for line in lines] + assert parsed[0]["role"] == "user" + assert parsed[1]["role"] == "assistant" + + +def test_cursor_tracks_jsonl_progress(temp_watch_dir, temp_db): + """Verify CursorManager correctly tracks incremental reads.""" + jsonl_file = temp_watch_dir / "session.jsonl" + + # Write first batch + with open(jsonl_file, "w", encoding="utf-8") as f: + f.write(json.dumps({"role": "user", "content": "First question", "type": "message"}) + "\n") + + cursor_mgr = CursorManager(temp_db) + cursor = cursor_mgr.get_cursor(str(jsonl_file)) + assert cursor.last_position == 0 + + # Read the file + with open(jsonl_file, "r", encoding="utf-8") as f: + f.seek(cursor.last_position) + lines = f.readlines() + new_position = f.tell() + + assert len(lines) == 1 + assert new_position > 0 + + cursor_mgr.update_cursor(str(jsonl_file), new_position) + + # Verify cursor persisted + cursor2 = cursor_mgr.get_cursor(str(jsonl_file)) + assert cursor2.last_position == new_position + + # Append more data + with open(jsonl_file, "a", encoding="utf-8") as f: + f.write(json.dumps({"role": "assistant", "content": "Answer", "type": "message"}) + "\n") + + # Read incrementally + with open(jsonl_file, "r", encoding="utf-8") as f: + f.seek(cursor2.last_position) + new_lines = f.readlines() + + assert 
len(new_lines) == 1 + assert json.loads(new_lines[0])["role"] == "assistant" + + +def test_filter_and_reconstruct_pipeline(): + """Verify the filter -> reconstruct pipeline works end-to-end.""" + events = [ + {"role": "user", "content": "Hi", "type": "message", "timestamp": "2026-06-15T10:00:00Z"}, # Too short + {"role": "user", "content": "npm install lodash --save-dev", "type": "message", "timestamp": "2026-06-15T10:00:01Z"}, # Noise + {"role": "user", "content": "How should we handle database migrations in production?", "type": "message", "timestamp": "2026-06-15T10:00:02Z"}, + {"role": "assistant", "content": "Use Alembic with versioned migration scripts and run them in a CI pipeline before deployment", "type": "message", "timestamp": "2026-06-15T10:00:03Z"}, + ] + + # Step 1: Filter + filt = LowValueFilter() + filtered = filt.apply(events) + assert len(filtered) == 2 # "Hi" and "npm install" removed + + # Step 2: Reconstruct + reconstructor = ConversationReconstructor() + turns = reconstructor.reconstruct(filtered) + assert len(turns) == 1 + assert "database migrations" in turns[0].user_prompt + assert "Alembic" in turns[0].assistant_response + + +def test_knowledge_router_all_categories(): + """Verify router handles all categories correctly.""" + router = KnowledgeRouter() + + skill = ExtractedKnowledge(status="EXTRACTED", category="skills", title="PG Config", content="...", source_tool="claude_code") + assert "skills/claude_code" in router.route(skill) + + mem_proj = ExtractedKnowledge(status="EXTRACTED", category="memories", title="Arch", content="...", project_name="myapp") + assert "memories/projects/myapp" in router.route(mem_proj) + + mem_global = ExtractedKnowledge(status="EXTRACTED", category="memories", title="General", content="...") + assert "memories/global" in router.route(mem_global) + + resource = ExtractedKnowledge(status="EXTRACTED", category="resources", title="Redis", content="...", entity_links=["Redis"]) + assert "resources/Redis" in 
router.route(resource) + + +def test_deduplicator_prevents_duplicates(): + """Verify deduplicator blocks duplicate knowledge.""" + dedup = KnowledgeDeduplicator() + + k1 = ExtractedKnowledge(status="EXTRACTED", category="memories", title="A", content="Same content here") + k2 = ExtractedKnowledge(status="EXTRACTED", category="memories", title="B", content="Same content here") + k3 = ExtractedKnowledge(status="EXTRACTED", category="memories", title="C", content="Different content") + + assert not dedup.is_duplicate(k1) + assert dedup.is_duplicate(k2) + assert not dedup.is_duplicate(k3) + + +def test_storage_adapter_formats_content(): + """Verify storage adapter generates correct Markdown for each category.""" + mock_service = MagicMock() + adapter = VikingStorageAdapter(mock_service) + + skill = ExtractedKnowledge( + status="EXTRACTED", category="skills", title="Test Skill", + content="Do this thing", confidence=0.9, + actionable_steps=["Step 1", "Step 2"], + ) + content = adapter._format_content(skill) + assert "Test Skill" in content + assert "Step 1" in content + assert "Step 2" in content + + memory = ExtractedKnowledge( + status="EXTRACTED", category="memories", title="Decision", + content="We chose X", entity_links=["tag1"], + timestamp="2026-06-15T10:00:00Z", + ) + content = adapter._format_content(memory) + assert "Decision" in content + assert "tag1" in content + + resource = ExtractedKnowledge( + status="EXTRACTED", category="resources", title="Guide", + content="Reference material", entity_links=["Docker"], + ) + content = adapter._format_content(resource) + assert "Guide" in content + assert "Docker" in content diff --git a/tests/daemon/test_knowledge_extractor.py b/tests/daemon/test_knowledge_extractor.py new file mode 100644 index 0000000000..905cc440d3 --- /dev/null +++ b/tests/daemon/test_knowledge_extractor.py @@ -0,0 +1,117 @@ +"""Tests for KnowledgeExtractor.""" +import json +from typing import Optional + +import pytest + +from 
openviking.daemon.knowledge_extractor import KnowledgeExtractor +from openviking.daemon.models import ConversationTurn, ExtractedKnowledge + + +class MockVLMConfig: + """Mock VLM config that returns canned string responses via get_completion_async.""" + + def __init__(self, response_text: str): + self.response_text = response_text + + async def get_completion_async(self, prompt: str = "", **kwargs) -> str: + return self.response_text + + +def _make_turn(user="How to configure PostgreSQL?", assistant="Edit postgresql.conf"): + return ConversationTurn( + user_prompt=user, + assistant_response=assistant, + timestamp="2026-06-15T10:00:00Z", + ) + + +@pytest.mark.asyncio +async def test_extract_valid_knowledge(): + llm_response = json.dumps({ + "status": "EXTRACTED", + "category": "skills", + "confidence": 0.9, + "title": "PostgreSQL Config", + "content": "Configure PostgreSQL by editing postgresql.conf", + "project_name": "my-project", + "entity_links": ["PostgreSQL"], + "actionable_steps": ["Edit postgresql.conf"], + }) + extractor = KnowledgeExtractor(vlm_config=MockVLMConfig(llm_response)) + result = await extractor.extract(_make_turn()) + + assert result is not None + assert result.status == "EXTRACTED" + assert result.category == "skills" + assert result.confidence == 0.9 + + +@pytest.mark.asyncio +async def test_extract_ignored_status(): + llm_response = json.dumps({ + "status": "IGNORED", + "category": "memories", + "confidence": 0.3, + "title": "Typo fix", + "content": "Fixed typo", + }) + extractor = KnowledgeExtractor(vlm_config=MockVLMConfig(llm_response)) + result = await extractor.extract(_make_turn()) + + assert result is None + + +@pytest.mark.asyncio +async def test_extract_low_confidence(): + llm_response = json.dumps({ + "status": "EXTRACTED", + "category": "memories", + "confidence": 0.4, + "title": "Low confidence", + "content": "Some content", + }) + extractor = KnowledgeExtractor(vlm_config=MockVLMConfig(llm_response)) + result = await 
extractor.extract(_make_turn()) + + assert result is None + + +@pytest.mark.asyncio +async def test_extract_invalid_json(): + extractor = KnowledgeExtractor(vlm_config=MockVLMConfig("not valid json")) + result = await extractor.extract(_make_turn()) + + assert result is None + + +@pytest.mark.asyncio +async def test_extract_cleans_markdown(): + llm_response = json.dumps({ + "status": "EXTRACTED", + "category": "memories", + "confidence": 0.8, + "title": "Test", + "content": "```python\nsome code\n```\nActual content here", + }) + extractor = KnowledgeExtractor(vlm_config=MockVLMConfig(llm_response)) + result = await extractor.extract(_make_turn()) + + assert result is not None + assert "```" not in result.content + + +@pytest.mark.asyncio +async def test_title_truncated(): + llm_response = json.dumps({ + "status": "EXTRACTED", + "category": "memories", + "confidence": 0.8, + "title": "A" * 100, + "content": "Content", + }) + extractor = KnowledgeExtractor(vlm_config=MockVLMConfig(llm_response)) + result = await extractor.extract(_make_turn()) + + assert result is not None + assert len(result.title) <= 50 diff --git a/tests/daemon/test_knowledge_router.py b/tests/daemon/test_knowledge_router.py new file mode 100644 index 0000000000..a045dc4a5a --- /dev/null +++ b/tests/daemon/test_knowledge_router.py @@ -0,0 +1,73 @@ +"""Tests for KnowledgeRouter.""" +from openviking.daemon.knowledge_router import KnowledgeRouter +from openviking.daemon.models import ExtractedKnowledge + + +def _make(**kwargs) -> ExtractedKnowledge: + defaults = { + "status": "EXTRACTED", + "category": "memories", + "title": "Test Title", + "content": "Some content", + } + defaults.update(kwargs) + return ExtractedKnowledge(**defaults) + + +def test_route_skills(): + router = KnowledgeRouter() + k = _make(category="skills", title="PostgreSQL Config") + uri = router.route(k) + assert uri.startswith("viking://skills/general/") + assert uri.endswith(".md") + + +def test_route_skills_with_source_tool(): 
+ router = KnowledgeRouter() + k = _make(category="skills", title="PostgreSQL Config", source_tool="claude_code") + uri = router.route(k) + assert uri.startswith("viking://skills/claude_code/") + assert uri.endswith(".md") + + +def test_route_memories_with_project(): + router = KnowledgeRouter() + k = _make(category="memories", title="Arch Decision", project_name="my-project") + uri = router.route(k) + assert "my-project" in uri + assert uri.endswith("decisions.md") + + +def test_route_memories_global(): + router = KnowledgeRouter() + k = _make(category="memories", title="Global Memory") + uri = router.route(k) + assert uri.startswith("viking://memories/global/") + + +def test_route_resources(): + router = KnowledgeRouter() + k = _make(category="resources", title="Redis Guide", entity_links=["Redis"]) + uri = router.route(k) + assert "Redis" in uri + assert uri.endswith(".md") + + +def test_route_resources_no_tags(): + router = KnowledgeRouter() + k = _make(category="resources", title="General", entity_links=[]) + uri = router.route(k) + assert "general" in uri + + +def test_route_unknown_category(): + router = KnowledgeRouter() + k = _make(category="unknown") + uri = router.route(k) + assert uri is None + + +def test_sanitize_filename(): + router = KnowledgeRouter() + assert router._sanitize_filename('file<>:name') == 'file___name' + assert router._sanitize_filename("a" * 100) == "a" * 50 diff --git a/tests/daemon/test_multi_watcher_integration.py b/tests/daemon/test_multi_watcher_integration.py new file mode 100644 index 0000000000..8623a9e304 --- /dev/null +++ b/tests/daemon/test_multi_watcher_integration.py @@ -0,0 +1,330 @@ +""" +Integration tests for multi-watcher daemon pipeline. +Tests the full flow: multiple watchers -> normalized events -> ETL compatibility. 
+""" +import os +import time +import pytest +from pathlib import Path + +from openviking.daemon.watchers.registry import create_watcher, list_available_watchers +from openviking.daemon.watchers import BaseWatcher + + +class FakeCursorManager: + """In-memory cursor manager for testing.""" + + def __init__(self): + self.cursors = {} + + def get_cursor(self, file_path): + from openviking.daemon.models import FileCursor + return self.cursors.get(file_path, FileCursor(file_path=file_path)) + + def update_cursor(self, file_path, position): + from openviking.daemon.models import FileCursor + self.cursors[file_path] = FileCursor( + file_path=file_path, + last_position=position, + last_read_time=time.time(), + ) + + +# --- Registry Tests --- + +def test_all_watchers_registered(): + """All 5 built-in watchers should be registered.""" + available = list_available_watchers() + expected = {"claude_code", "generic_jsonl", "aider", "cursor", "continue_dev"} + assert expected.issubset(set(available)), f"Missing: {expected - set(available)}" + + +def test_create_all_watchers(): + """Verify all registered watchers can be instantiated via factory.""" + batches = [] + cm = FakeCursorManager() + + file_patterns = { + "claude_code": "*.jsonl", + "generic_jsonl": "*.jsonl", + "aider": ".aider.chat.history.md", + "cursor": "*.log", + "continue_dev": "*.json", + } + + for tool_name in list_available_watchers(): + watcher = create_watcher( + tool_name=tool_name, + watch_dir="/tmp/test", + cursor_manager=cm, + batch_callback=lambda e: batches.append(e), + file_pattern=file_patterns.get(tool_name, "*.jsonl"), + ) + assert isinstance(watcher, BaseWatcher) + assert watcher.tool_name == tool_name + + +def test_create_unknown_watcher_raises(): + """Unknown tool name should raise ValueError.""" + with pytest.raises(ValueError, match="Unknown watcher tool"): + create_watcher( + tool_name="nonexistent", + watch_dir="/tmp", + cursor_manager=FakeCursorManager(), + batch_callback=lambda e: None, + ) + 
+ +# --- Multi-Watcher Normalization Tests --- + +def test_claude_code_events_have_tool_name(tmp_path): + """Claude Code events should include tool_name='claude_code'.""" + batches = [] + cm = FakeCursorManager() + watcher = create_watcher( + tool_name="claude_code", + watch_dir=str(tmp_path), + cursor_manager=cm, + batch_callback=lambda e: batches.append(e), + batch_trigger_lines=2, + ) + + test_file = tmp_path / "session.jsonl" + test_file.write_text( + '{"role": "user", "content": "Hello from CC"}\n' + '{"role": "assistant", "content": "Hi from CC"}\n' + ) + watcher._process_file(str(test_file)) + + assert len(batches) == 1 + assert all(e["tool_name"] == "claude_code" for e in batches[0]) + + +def test_generic_jsonl_custom_mapping(tmp_path): + """GenericJSONL with custom field mapping should normalize correctly.""" + batches = [] + cm = FakeCursorManager() + watcher = create_watcher( + tool_name="generic_jsonl", + watch_dir=str(tmp_path), + cursor_manager=cm, + batch_callback=lambda e: batches.append(e), + batch_trigger_lines=1, + extra={ + "role_field": "author", + "user_role_value": "human", + "assistant_role_value": "ai", + "content_field": "text", + }, + ) + + test_file = tmp_path / "custom.jsonl" + test_file.write_text('{"author": "human", "text": "Custom format test"}\n') + watcher._process_file(str(test_file)) + + assert len(batches) == 1 + assert batches[0][0]["role"] == "user" + assert batches[0][0]["content"] == "Custom format test" + assert batches[0][0]["tool_name"] == "generic_jsonl" + + +def test_aider_multiline_parsing(tmp_path): + """Aider markdown format should produce user+assistant events.""" + batches = [] + cm = FakeCursorManager() + watcher = create_watcher( + tool_name="aider", + watch_dir=str(tmp_path), + cursor_manager=cm, + batch_callback=lambda e: batches.append(e), + batch_trigger_lines=100, + ) + + test_file = tmp_path / ".aider.chat.history.md" + test_file.write_text( + "# aider chat started at 2026-01-15 10:30:00\n\n" + "> 
/my/project\n\n" + "#### user:\n" + "How do I use fastapi?\n\n" + "#### assistant:\n" + "FastAPI is great for building APIs.\n" + ) + watcher._process_file(str(test_file)) + watcher.flush() + + assert len(batches) == 1 + events = batches[0] + assert len(events) == 2 + assert events[0]["role"] == "user" + assert events[0]["content"] == "How do I use fastapi?" + assert events[0]["tool_name"] == "aider" + assert events[1]["role"] == "assistant" + + +def test_cursor_log_parsing(tmp_path): + """Cursor log format should be normalized correctly.""" + batches = [] + cm = FakeCursorManager() + watcher = create_watcher( + tool_name="cursor", + watch_dir=str(tmp_path), + cursor_manager=cm, + batch_callback=lambda e: batches.append(e), + file_pattern="*.log", + batch_trigger_lines=1, + ) + + test_file = tmp_path / "cursor.log" + test_file.write_text( + '{"type": "chat", "role": "user", "message": "Fix this bug"}\n' + ) + watcher._process_file(str(test_file)) + + assert len(batches) == 1 + assert batches[0][0]["role"] == "user" + assert batches[0][0]["content"] == "Fix this bug" + assert batches[0][0]["tool_name"] == "cursor" + + +def test_continue_dev_log_parsing(tmp_path): + """Continue.dev JSON format should be normalized.""" + batches = [] + cm = FakeCursorManager() + watcher = create_watcher( + tool_name="continue_dev", + watch_dir=str(tmp_path), + cursor_manager=cm, + batch_callback=lambda e: batches.append(e), + file_pattern="*.json", + batch_trigger_lines=1, + ) + + test_file = tmp_path / "continue.json" + test_file.write_text( + '{"role": "user", "content": "Explain this code", "sessionId": "abc123"}\n' + ) + watcher._process_file(str(test_file)) + + assert len(batches) == 1 + assert batches[0][0]["role"] == "user" + assert batches[0][0]["tool_name"] == "continue_dev" + assert batches[0][0]["session_id"] == "abc123" + + +# --- Cross-Watcher ETL Compatibility --- + +def test_normalized_events_compatible_with_reconstructor(): + """Events from all watchers should work 
with ConversationReconstructor.""" + from openviking.daemon.conversation_reconstructor import ConversationReconstructor + + events = [ + {"role": "user", "content": "Question from CC", "tool_name": "claude_code", + "timestamp": "2026-01-15T10:00:00Z", "session_id": "s1", "project_name": "proj"}, + {"role": "assistant", "content": "Answer from CC", "tool_name": "claude_code", + "timestamp": "2026-01-15T10:00:01Z", "session_id": "s1", "project_name": "proj"}, + {"role": "user", "content": "Question from Aider", "tool_name": "aider", + "timestamp": "2026-01-15T10:00:02Z"}, + {"role": "assistant", "content": "Answer from Aider", "tool_name": "aider", + "timestamp": "2026-01-15T10:00:03Z"}, + ] + + reconstructor = ConversationReconstructor() + turns = reconstructor.reconstruct(events) + + assert len(turns) == 2 + assert turns[0].user_prompt == "Question from CC" + assert turns[1].user_prompt == "Question from Aider" + + +def test_normalized_events_compatible_with_filter(): + """Events from all watchers should work with LowValueFilter.""" + from openviking.daemon.filters import LowValueFilter + + events = [ + {"role": "user", "content": "A meaningful question about architecture", + "tool_name": "claude_code"}, + {"role": "user", "content": "npm install express", + "tool_name": "cursor"}, + {"role": "assistant", "content": "Here is a detailed explanation of the design pattern", + "tool_name": "aider"}, + ] + + f = LowValueFilter() + filtered = f.apply(events) + + assert len(filtered) == 2 + assert all("npm install" not in e["content"] for e in filtered) + + +def test_source_tool_propagated_through_pipeline(): + """source_tool should flow from events through ConversationTurn.""" + from openviking.daemon.conversation_reconstructor import ConversationReconstructor + + events = [ + {"role": "user", "content": "How to use FastAPI?", "tool_name": "cursor", + "timestamp": "2026-01-15T10:00:00Z"}, + {"role": "assistant", "content": "Install FastAPI with pip...", "tool_name": 
"cursor", + "timestamp": "2026-01-15T10:00:01Z"}, + ] + + reconstructor = ConversationReconstructor() + turns = reconstructor.reconstruct(events) + + assert len(turns) == 1 + assert turns[0].source_tool == "cursor" + + +def test_multi_watcher_config_effective_watchers(): + """DaemonConfig.get_effective_watchers() should handle all cases.""" + from openviking.server.config import WatcherConfig, DaemonConfig + + # Explicit watchers list + cfg = DaemonConfig( + enabled=True, + watchers=[ + WatcherConfig(tool_name="claude_code", watch_dir="/a"), + WatcherConfig(tool_name="aider", watch_dir="/b"), + WatcherConfig(tool_name="cursor", watch_dir="/c", enabled=False), + ], + ) + effective = cfg.get_effective_watchers() + assert len(effective) == 2 # disabled watcher filtered out + assert effective[0].tool_name == "claude_code" + assert effective[1].tool_name == "aider" + + # Backward compat: watch_dir only + cfg2 = DaemonConfig(enabled=True, watch_dir="~/.claude/projects") + effective2 = cfg2.get_effective_watchers() + assert len(effective2) == 1 + assert effective2[0].tool_name == "claude_code" + + # Default fallback + cfg3 = DaemonConfig(enabled=True) + effective3 = cfg3.get_effective_watchers() + assert len(effective3) == 1 + assert effective3[0].tool_name == "claude_code" + + +def test_knowledge_router_uses_source_tool(): + """KnowledgeRouter should use source_tool in URI path.""" + from openviking.daemon.knowledge_router import KnowledgeRouter + from openviking.daemon.models import ExtractedKnowledge + + router = KnowledgeRouter() + + # With source_tool + k1 = ExtractedKnowledge( + status="EXTRACTED", category="skills", title="FastAPI Tips", + content="...", confidence=0.9, source_tool="cursor", + ) + uri1 = router.route(k1) + assert "cursor" in uri1 + assert "claude_code" not in uri1 + + # Without source_tool (fallback) + k2 = ExtractedKnowledge( + status="EXTRACTED", category="skills", title="Python Tips", + content="...", confidence=0.9, + ) + uri2 = 
router.route(k2) + assert "general" in uri2 diff --git a/tests/daemon/test_registry.py b/tests/daemon/test_registry.py new file mode 100644 index 0000000000..50bd2f77bc --- /dev/null +++ b/tests/daemon/test_registry.py @@ -0,0 +1,31 @@ +"""Tests for watcher registry.""" +import pytest +from openviking.daemon.watchers.registry import ( + create_watcher, + list_available_watchers, + _WATCHER_REGISTRY, + register_watcher, +) + + +def test_list_available_includes_claude_code(): + assert "claude_code" in list_available_watchers() + + +def test_create_watcher_unknown_raises(): + with pytest.raises(ValueError, match="Unknown watcher tool"): + create_watcher("nonexistent_tool", watch_dir="/tmp", cursor_manager=None, + batch_callback=lambda x: None) + + +def test_register_watcher_decorator(): + class FakeWatcher: + pass + + @register_watcher("test_tool_xyz") + class Decorated: + pass + + assert "test_tool_xyz" in _WATCHER_REGISTRY + # cleanup + del _WATCHER_REGISTRY["test_tool_xyz"] diff --git a/tests/daemon/test_service_multi.py b/tests/daemon/test_service_multi.py new file mode 100644 index 0000000000..60fe888eff --- /dev/null +++ b/tests/daemon/test_service_multi.py @@ -0,0 +1,31 @@ +"""Tests for multi-watcher DaemonService.""" +import pytest +from unittest.mock import MagicMock +from openviking.server.config import WatcherConfig + + +def test_service_stores_watcher_configs(): + """DaemonService should accept and store watcher_configs.""" + from openviking.daemon.service import DaemonService + configs = [ + WatcherConfig(tool_name="claude_code", watch_dir="/tmp/cc"), + WatcherConfig(tool_name="aider", watch_dir="/tmp/aider"), + ] + svc = DaemonService( + resource_service=MagicMock(), + watcher_configs=configs, + db_path="/tmp/test.db", + ) + assert len(svc._watcher_configs) == 2 + + +def test_service_backward_compat_single_dir(): + """DaemonService should create single claude_code config from watch_dir.""" + from openviking.daemon.service import DaemonService + svc = 
DaemonService( + resource_service=MagicMock(), + watch_dir="/tmp/cc", + db_path="/tmp/test.db", + ) + assert len(svc._watcher_configs) == 1 + assert svc._watcher_configs[0].tool_name == "claude_code" diff --git a/web-studio/src/i18n/locales/en.ts b/web-studio/src/i18n/locales/en.ts index 7326f436a8..f824efa947 100644 --- a/web-studio/src/i18n/locales/en.ts +++ b/web-studio/src/i18n/locales/en.ts @@ -301,6 +301,21 @@ const en = { usageDisabled: 'Usage/Audit is not initialized, so live usage stats are unavailable.', }, + daemon: { + status: 'Active Daemon', + enabled: 'Enabled', + disabled: 'Disabled', + running: 'Running', + stopped: 'Stopped', + watchDir: 'Watch Directory', + batchLines: 'Batch Lines', + batchSeconds: 'Batch Seconds', + cursorCount: 'Tracked Files', + lastFlush: 'Last Flush', + watcherCount: 'watcher(s) active', + activeWatchers: 'Active Watchers', + availableTools: 'Available Tools', + }, operations: { page: { placeholder: 'Operations dashboard is under construction.', diff --git a/web-studio/src/i18n/locales/zh-CN.ts b/web-studio/src/i18n/locales/zh-CN.ts index 031b077cb0..f056f9da53 100644 --- a/web-studio/src/i18n/locales/zh-CN.ts +++ b/web-studio/src/i18n/locales/zh-CN.ts @@ -298,6 +298,21 @@ const zhCN = { }, usageDisabled: 'Usage/Audit 未初始化,暂无实时统计。', }, + daemon: { + status: '主动守护进程', + enabled: '已启用', + disabled: '已禁用', + running: '运行中', + stopped: '已停止', + watchDir: '监听目录', + batchLines: '批处理行数', + batchSeconds: '批处理间隔(秒)', + cursorCount: '追踪文件数', + lastFlush: '上次刷新', + watcherCount: '个监听器运行中', + activeWatchers: '活跃监听器', + availableTools: '可用工具', + }, operations: { page: { placeholder: '运维面板能力尚未接入。', diff --git a/web-studio/src/routes/home/-components/daemon-status-card.tsx b/web-studio/src/routes/home/-components/daemon-status-card.tsx new file mode 100644 index 0000000000..a7dfe7984f --- /dev/null +++ b/web-studio/src/routes/home/-components/daemon-status-card.tsx @@ -0,0 +1,186 @@ +import { useQuery } from '@tanstack/react-query'; 
+import { Activity, Clock, Database, FileText, Monitor, ToggleLeft, ToggleRight } from 'lucide-react'; +import { useTranslation } from 'react-i18next'; +import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card'; +import { Badge } from '@/components/ui/badge'; +import { Skeleton } from '@/components/ui/skeleton'; +import { ovClient } from '@/lib/ov-client/client'; + +interface WatcherStatus { + tool_name: string; + watch_dir: string | null; + file_pattern: string | null; + enabled: boolean; + running: boolean; + cursor_count: number; + batch_trigger_lines: number | null; + batch_trigger_seconds: number | null; +} + +interface DaemonStatus { + enabled: boolean; + running: boolean; + watchers: WatcherStatus[]; + available_tools: string[]; + db_path: string | null; +} + +async function fetchDaemonStatus(): Promise<DaemonStatus> { + const response = await ovClient.instance.get('/api/v1/daemon/status'); + return response.data as DaemonStatus; +} + +function ToolBadge({ toolName }: { toolName: string }) { + const colors: Record<string, string> = { + claude_code: 'bg-orange-500/10 text-orange-600 dark:text-orange-400 border-orange-200 dark:border-orange-800', + generic_jsonl: 'bg-blue-500/10 text-blue-600 dark:text-blue-400 border-blue-200 dark:border-blue-800', + aider: 'bg-green-500/10 text-green-600 dark:text-green-400 border-green-200 dark:border-green-800', + cursor: 'bg-purple-500/10 text-purple-600 dark:text-purple-400 border-purple-200 dark:border-purple-800', + continue_dev: 'bg-cyan-500/10 text-cyan-600 dark:text-cyan-400 border-cyan-200 dark:border-cyan-800', + }; + return ( + <span className={`inline-flex items-center rounded-md px-2 py-0.5 text-xs font-medium border ${colors[toolName] || 'bg-gray-100 text-gray-600 border-gray-200'}`}> + {toolName} + </span> + ); +} + +export function DaemonStatusCard() { + const { t } = useTranslation(); + + const { data, isLoading, error } = useQuery({ + queryKey: ['daemon-status'], + queryFn: 
fetchDaemonStatus, + refetchInterval: 30000, + }); + + if (isLoading) { + return ( + <Card> + <CardHeader> + <CardTitle className="flex items-center gap-2"> + <Activity className="h-4 w-4" /> + <Skeleton className="h-5 w-32" /> + </CardTitle> + </CardHeader> + <CardContent> + <Skeleton className="h-4 w-full mb-2" /> + <Skeleton className="h-4 w-3/4" /> + </CardContent> + </Card> + ); + } + + if (error || !data) { + return ( + <Card> + <CardHeader> + <CardTitle className="flex items-center gap-2 text-muted-foreground"> + <Activity className="h-4 w-4" /> + {t('daemon.status')} + </CardTitle> + </CardHeader> + <CardContent> + <p className="text-sm text-muted-foreground">Failed to load daemon status</p> + </CardContent> + </Card> + ); + } + + const statusColor = data.enabled && data.running + ? 'bg-green-500' + : data.enabled + ? 'bg-yellow-500' + : 'bg-gray-400'; + + const statusLabel = data.enabled && data.running + ? t('daemon.running') + : data.enabled + ? t('daemon.stopped') + : t('daemon.disabled'); + + return ( + <Card> + <CardHeader> + <CardTitle className="flex items-center justify-between"> + <div className="flex items-center gap-2"> + <Activity className="h-4 w-4" /> + {t('daemon.status')} + </div> + <Badge variant={data.enabled ? 'default' : 'secondary'} className={statusColor}> + {statusLabel} + </Badge> + </CardTitle> + </CardHeader> + <CardContent className="space-y-4"> + {/* Global Status */} + <div className="flex items-center justify-between text-sm"> + <span className="text-muted-foreground flex items-center gap-1"> + {data.enabled ? 
<ToggleRight className="h-4 w-4" /> : <ToggleLeft className="h-4 w-4" />} + {t('daemon.enabled')} + </span> + <span className="font-medium"> + {data.watchers.length} {t('daemon.watcherCount')} + </span> + </div> + + {/* Watcher List */} + {data.watchers.length > 0 && ( + <div className="space-y-2"> + <div className="text-xs font-medium text-muted-foreground uppercase tracking-wider"> + {t('daemon.activeWatchers')} + </div> + {data.watchers.map((w, i) => ( + <div + key={`${w.tool_name}-${i}`} + className="flex items-center justify-between rounded-md border px-3 py-2" + > + <div className="flex items-center gap-2 min-w-0"> + <ToolBadge toolName={w.tool_name} /> + {w.watch_dir && ( + <span className="font-mono text-xs text-muted-foreground truncate max-w-[160px]" title={w.watch_dir}> + {w.watch_dir.split(/[\\/]/).pop() || w.watch_dir} + </span> + )} + </div> + <div className="flex items-center gap-3 text-xs text-muted-foreground shrink-0 ml-2"> + <span className="flex items-center gap-1"> + <Database className="h-3 w-3" /> + {w.cursor_count} + </span> + {w.batch_trigger_lines && ( + <span className="flex items-center gap-1"> + <FileText className="h-3 w-3" /> + {w.batch_trigger_lines} + </span> + )} + {w.batch_trigger_seconds && ( + <span className="flex items-center gap-1"> + <Clock className="h-3 w-3" /> + {w.batch_trigger_seconds}s + </span> + )} + </div> + </div> + ))} + </div> + )} + + {/* Available Tools */} + {data.available_tools.length > 0 && ( + <div className="pt-2 border-t"> + <div className="text-xs font-medium text-muted-foreground mb-1.5"> + <Monitor className="h-3 w-3 inline mr-1" /> + {t('daemon.availableTools')} + </div> + <div className="flex flex-wrap gap-1"> + {data.available_tools.map((tool) => ( + <ToolBadge key={tool} toolName={tool} /> + ))} + </div> + </div> + )} + </CardContent> + </Card> + ); +} diff --git a/web-studio/src/routes/home/route.tsx b/web-studio/src/routes/home/route.tsx index a99aefa00d..fae60d7e7d 100644 --- 
a/web-studio/src/routes/home/route.tsx +++ b/web-studio/src/routes/home/route.tsx @@ -3,6 +3,7 @@ import { useQuery } from '@tanstack/react-query' import { createFileRoute } from '@tanstack/react-router' import { ContextCommitsPanel } from './-components/context-commits-panel' +import { DaemonStatusCard } from './-components/daemon-status-card' import { ContextDataPanel, TodayRetrievalsPanel, @@ -90,6 +91,8 @@ function HomePage() { /> </div> + <DaemonStatusCard /> + <TokenTrendPanel data={tokenSeries.data} isError={tokenSeries.isError} From 9623ca983e46a6cca9f231353c6177f730ac29cd Mon Sep 17 00:00:00 2001 From: baobaodae <2014596548@qq.com> Date: Fri, 19 Jun 2026 16:39:28 +0800 Subject: [PATCH 2/6] fix(daemon): fix ETL pipeline not flushing events and invalid URI routes Root cause: BatchBuffer.created_at was never set in add_line(), so the time-based trigger never fired and events stayed buffered indefinitely. Additional fixes: - Thread-safe enqueue from watchdog thread via loop.call_soon_threadsafe() - _flush_buffer() now calls callback before clearing buffer (prevents data loss) - KnowledgeRouter uses valid viking://resources/ scope (skills/memories were invalid) - Non-ASCII titles sanitized via sha256 hash to produce valid URI paths - VLM extraction concurrency limited to 2 via semaphore - ClaudeCodeWatcher rewrite: handle nested message.content (text blocks, tool_use) - Project name derived from file path via _post_normalize hook - All exception handlers now include exc_info=True for stack traces - 148 tests passing --- docs/daemon/Watcher.md | 32 ++ openviking/daemon/etl_pipeline.py | 12 +- openviking/daemon/knowledge_router.py | 24 +- openviking/daemon/models.py | 3 + openviking/daemon/service.py | 20 +- .../daemon/watchers/base_file_watcher.py | 23 +- .../daemon/watchers/claude_code_watcher.py | 79 ++++- tests/daemon/test_claude_code_watcher.py | 295 ++++++++++++++++-- tests/daemon/test_integration.py | 2 +- tests/daemon/test_knowledge_router.py | 14 +- 
.../daemon/test_multi_watcher_integration.py | 4 +- 11 files changed, 448 insertions(+), 60 deletions(-) create mode 100644 docs/daemon/Watcher.md diff --git a/docs/daemon/Watcher.md b/docs/daemon/Watcher.md new file mode 100644 index 0000000000..5b672d74f2 --- /dev/null +++ b/docs/daemon/Watcher.md @@ -0,0 +1,32 @@ +# Daemon Watcher 排查计划 + +## 状态:148 tests pass,服务器在 1966 端口运行中,daemon 已处理 3 个文件 + +## 待完成 + +### 1. 确认端到端写入 +目标:验证知识是否真正写入 `viking://resources/...` + +方法: +- 服务器日志当前输出到后台进程 stdout,无法查看。需要改为写文件:在 `ov.conf` 中加 `"log": {"level": "INFO", "output": "D:\\Develop\\ov-test\\server.log"}` +- 杀掉当前服务器,清 cursor DB,重启 +- touch JSONL 触发处理,等 ETL 跑完 +- 查看 `server.log` 中的写入日志(成功/失败) + +预期日志链路: +``` +[claude_code] Flushing N events +_enqueue_batch: received N events +ETL loop: dequeued N events +Processing batch with N events → After filtering: X → Reconstructed Y turns → Extracted Z items +Knowledge ingested: viking://resources/... ← 关键确认点 +``` + +### 2. 根据结果处理 +- 写入成功 → 提交 commit 和 PR +- 写入失败 → 根据日志修 `storage_adapter.py`(可能是 `resource_service.add_resource()` 调用参数或 URI 格式问题) + +### 3. 
提交 PR +- 分支基于 `upstream/main` +- 只包含 daemon 相关文件变更 +- PR 目标:`volcengine/OpenViking:main` diff --git a/openviking/daemon/etl_pipeline.py b/openviking/daemon/etl_pipeline.py index 35a47315f8..b0f35d58d1 100644 --- a/openviking/daemon/etl_pipeline.py +++ b/openviking/daemon/etl_pipeline.py @@ -46,15 +46,21 @@ async def process_batch(self, events: List[Dict]) -> List[ExtractedKnowledge]: if not turns: return [] - # Step 3: Extract knowledge in parallel - tasks = [self.extractor.extract(turn) for turn in turns] + # Step 3: Extract knowledge with limited concurrency (2 parallel VLM calls max) + sem = asyncio.Semaphore(2) + + async def _guarded_extract(turn): + async with sem: + return await self.extractor.extract(turn) + + tasks = [_guarded_extract(turn) for turn in turns] results = await asyncio.gather(*tasks, return_exceptions=True) # Step 4: Filter errors and deduplicate extracted: List[ExtractedKnowledge] = [] for result in results: if isinstance(result, Exception): - logger.error("Extraction failed: %s", result) + logger.error("Extraction failed: %s", result, exc_info=result) continue if result is not None and not self.deduplicator.is_duplicate(result): diff --git a/openviking/daemon/knowledge_router.py b/openviking/daemon/knowledge_router.py index cd043d905a..e2adcb112f 100644 --- a/openviking/daemon/knowledge_router.py +++ b/openviking/daemon/knowledge_router.py @@ -1,6 +1,7 @@ """ Route extracted knowledge to appropriate viking:// URIs based on category and project. """ +import hashlib import re from typing import Optional @@ -17,10 +18,10 @@ def route(self, knowledge: ExtractedKnowledge) -> Optional[str]: """ Determine the target URI for a knowledge item. 
- Routing rules: - - skills -> viking://skills/<source_tool>/<title>.md - - memories (with project) -> viking://memories/projects/<project>/decisions.md - - memories (no project) -> viking://memories/global/<title>.md + Routing rules (all under resources/ scope): + - skills -> viking://resources/skills/<source_tool>/<title>.md + - memories (with project) -> viking://resources/memories/<project>/decisions.md + - memories (no project) -> viking://resources/memories/global/<title>.md - resources -> viking://resources/<tech_stack>/<title>.md """ category = knowledge.category @@ -30,14 +31,14 @@ def route(self, knowledge: ExtractedKnowledge) -> Optional[str]: if category == "skills": safe_source = self._sanitize_filename(source) - return f"viking://skills/{safe_source}/{title}.md" + return f"viking://resources/skills/{safe_source}/{title}.md" elif category == "memories": if project_name: safe_project = self._sanitize_filename(project_name) - return f"viking://memories/projects/{safe_project}/decisions.md" + return f"viking://resources/memories/{safe_project}/decisions.md" else: - return f"viking://memories/global/{title}.md" + return f"viking://resources/memories/global/{title}.md" elif category == "resources": entity_links = knowledge.entity_links @@ -50,7 +51,12 @@ def route(self, knowledge: ExtractedKnowledge) -> Optional[str]: @staticmethod def _sanitize_filename(name: str) -> str: - """Sanitize a string for use as a filename.""" + """Sanitize a string for use as a filename. 
Non-ASCII names are replaced with a short hash.""" sanitized = re.sub(r'[<>:"/\\|?*]', '_', name) sanitized = sanitized.strip().replace(' ', '_') - return sanitized[:50] + sanitized = sanitized[:50] + # If any non-ASCII characters remain, use a truncated hash instead + if not sanitized.isascii(): + name_hash = hashlib.sha256(name.encode("utf-8")).hexdigest()[:16] + sanitized = name_hash + return sanitized diff --git a/openviking/daemon/models.py b/openviking/daemon/models.py index f409762b53..ea88df8953 100644 --- a/openviking/daemon/models.py +++ b/openviking/daemon/models.py @@ -1,6 +1,7 @@ """ Data models for OpenViking Active Daemon. """ +import time from dataclasses import dataclass, field from typing import Any, Dict, List, Optional @@ -36,6 +37,8 @@ class BatchBuffer: created_at: float = 0.0 def add_line(self, line: Dict[str, Any], byte_size: int): + if self.is_empty(): + self.created_at = time.time() self.lines.append(line) self.byte_count += byte_size diff --git a/openviking/daemon/service.py b/openviking/daemon/service.py index 8ca1c1e2d1..bd6bc8bebe 100644 --- a/openviking/daemon/service.py +++ b/openviking/daemon/service.py @@ -66,10 +66,12 @@ def __init__( self._running = False self._etl_task: Optional[asyncio.Task] = None self._batch_queue: asyncio.Queue = asyncio.Queue() + self._loop: Optional[asyncio.AbstractEventLoop] = None async def start(self): """Start the Daemon service with all configured watchers.""" logger.info("Starting OpenViking Active Daemon...") + self._loop = asyncio.get_running_loop() self.cursor_manager = CursorManager(self.db_path) self.etl_pipeline = BatchETLPipeline() @@ -101,7 +103,7 @@ async def start(self): self.watchers.append(watcher) logger.info("Watcher started: %s -> %s", wc.tool_name, watch_dir) except Exception as e: - logger.warning("Failed to start watcher %s: %s", wc.tool_name, e) + logger.warning("Failed to start watcher %s: %s", wc.tool_name, e, exc_info=True) self._running = True logger.info("Daemon started with 
%d watcher(s)", len(self.watchers)) @@ -128,11 +130,16 @@ async def stop(self): logger.info("Daemon stopped") def _enqueue_batch(self, events): - """Sync callback from watcher thread - puts events onto async queue.""" + """Sync callback from watcher thread - thread-safe enqueue onto async queue.""" try: - self._batch_queue.put_nowait(events) + logger.info("_enqueue_batch: received %d events, queue size before: %d", + len(events), self._batch_queue.qsize()) + if self._loop is not None and self._loop.is_running(): + self._loop.call_soon_threadsafe(self._batch_queue.put_nowait, events) + else: + self._batch_queue.put_nowait(events) except Exception as e: - logger.error("Failed to enqueue batch: %s", e) + logger.error("Failed to enqueue batch: %s", e, exc_info=True) async def _etl_loop(self): """Background loop that processes batches from the queue.""" @@ -149,6 +156,9 @@ async def _etl_loop(self): if events is None: break + logger.info("ETL loop: dequeued %d events, queue size after: %d", + len(events), self._batch_queue.qsize()) + try: extracted = await self.etl_pipeline.process_batch(events) if not extracted: @@ -172,7 +182,7 @@ async def _etl_loop(self): else: logger.warning("Failed to write: %s", knowledge.title) except Exception as e: - logger.error("Error writing knowledge: %s", e) + logger.error("Error writing knowledge: %s", e, exc_info=True) except Exception as e: logger.error("Error in ETL processing: %s", e, exc_info=True) diff --git a/openviking/daemon/watchers/base_file_watcher.py b/openviking/daemon/watchers/base_file_watcher.py index 0e6cce1e0f..1f3e71147d 100644 --- a/openviking/daemon/watchers/base_file_watcher.py +++ b/openviking/daemon/watchers/base_file_watcher.py @@ -93,6 +93,13 @@ def filter_event(self, event: Dict) -> bool: """Additional filtering. Override for tool-specific rules. 
Default: keep all.""" return True + def _post_normalize(self, event: Dict, file_path: str) -> Dict: + """Hook for subclasses to enrich normalized events with file-path context. + Called after normalize_event() and before filter_event(). + Default: return event unchanged. + """ + return event + def matches_file_pattern(self, file_path: str) -> bool: """Check if file matches the watcher's file pattern.""" filename = os.path.basename(file_path) @@ -137,6 +144,10 @@ def _process_file(self, file_path: str): new_position = cursor.last_position + len(new_content.encode("utf-8")) + logger.info("[%s] Processing %s: %d bytes new content from pos %d", + self.tool_name, file_path, len(new_content.encode("utf-8")), cursor.last_position) + + event_count = 0 for line in new_content.splitlines(): line = line.strip() if not line: @@ -150,6 +161,8 @@ def _process_file(self, file_path: str): if normalized is None: continue + normalized = self._post_normalize(normalized, file_path) + if not self.filter_event(normalized): continue @@ -158,13 +171,15 @@ def _process_file(self, file_path: str): byte_size = len(line.encode("utf-8")) self._buffer.add_line(normalized, byte_size) + event_count += 1 + logger.info("[%s] Added %d events to buffer (total: %d, trigger at %d)", + self.tool_name, event_count, len(self._buffer.lines), self.batch_trigger_lines) self.cursor_manager.update_cursor(file_path, new_position) - self._check_batch_trigger() except Exception as e: - logger.error("[%s] Error processing file %s: %s", self.tool_name, file_path, e) + logger.error("[%s] Error processing file %s: %s", self.tool_name, file_path, e, exc_info=True) def _check_batch_trigger(self): """Check if batch trigger conditions are met.""" @@ -183,10 +198,10 @@ def _flush_buffer(self): return events = self._buffer.lines.copy() - self._buffer.clear() logger.info("[%s] Flushing %d events", self.tool_name, len(events)) try: self.batch_callback(events) + self._buffer.clear() except Exception as e: - logger.error("[%s] 
Batch callback failed: %s", self.tool_name, e) + logger.error("[%s] Batch callback failed: %s", self.tool_name, e, exc_info=True) diff --git a/openviking/daemon/watchers/claude_code_watcher.py b/openviking/daemon/watchers/claude_code_watcher.py index 4174d559b7..88ffae8c87 100644 --- a/openviking/daemon/watchers/claude_code_watcher.py +++ b/openviking/daemon/watchers/claude_code_watcher.py @@ -1,8 +1,17 @@ """ Claude Code JSONL log watcher. Monitors ~/.claude/projects/<project>/<session>.jsonl files. + +Real log format (verified against 288 sessions, 29541 lines): +- Top-level "type": "user" | "assistant" | "attachment" | "queue-operation" | "system" | ... +- "role" and "content" are nested inside "message" object +- "message.content" can be a plain string OR an array of content blocks + (e.g. [{"type": "text", "text": "..."}, {"type": "tool_use", ...}]) +- Session ID is at top-level "sessionId" (camelCase, no underscore) +- Project name is NOT in the JSON — derived from file path """ import json +import os from typing import Dict, List, Optional from openviking.daemon.watchers.base_file_watcher import BaseFileWatcher @@ -12,6 +21,26 @@ logger = get_logger(__name__) +def _extract_text_from_content(content) -> str: + """Extract plain text from Claude Code message content. + + content can be: + - str: a plain text message + - list: array of content blocks (text, tool_use, thinking, tool_result) + Returns concatenated text from all "text" blocks, or "" if none. 
+ """ + if isinstance(content, str): + return content + if isinstance(content, list): + text_parts = [] + for block in content: + if isinstance(block, dict): + if block.get("type") == "text" and "text" in block: + text_parts.append(block["text"]) + return "\n".join(text_parts) + return "" + + @register_watcher("claude_code") class ClaudeCodeWatcher(BaseFileWatcher): """Watches Claude Code JSONL log files and extracts conversation events.""" @@ -41,20 +70,58 @@ def parse_line(self, line: str) -> Optional[Dict]: return None def normalize_event(self, raw_event: Dict) -> Optional[Dict]: - """Normalize Claude Code event to common format.""" - role = raw_event.get("role") + """Normalize Claude Code event to common format. + + Only extracts events where top-level type is "user" or "assistant" + and message.role is "user" or "assistant". + Skips tool_use-only assistant messages (no text content). + """ event_type = raw_event.get("type", "") + # Only process conversation messages + if event_type not in ("user", "assistant"): + return None + + msg = raw_event.get("message") + if not isinstance(msg, dict): + return None + + role = msg.get("role") if role not in ("user", "assistant"): return None - if event_type and event_type != "message": + + # Extract text content — skip if empty (e.g. 
tool_use-only messages) + raw_content = msg.get("content", "") + content = _extract_text_from_content(raw_content) + if not content: return None + # Skip tool_result messages (type="user" but content is tool_result array) + if isinstance(raw_content, list): + has_tool_result = any( + isinstance(b, dict) and b.get("type") == "tool_result" + for b in raw_content + ) + if has_tool_result: + return None + return { "role": role, - "content": raw_event.get("content", ""), + "content": content, "type": "message", "timestamp": raw_event.get("timestamp"), - "session_id": raw_event.get("session_id"), - "project_name": raw_event.get("project_name"), + "session_id": raw_event.get("sessionId"), + "project_name": None, # injected by _post_normalize } + + def _post_normalize(self, event: Dict, file_path: str) -> Dict: + """Derive project_name from file path: ~/.claude/projects/<project>/<session>.jsonl""" + if not event.get("project_name"): + parts = file_path.replace("\\", "/").split("/") + try: + projects_idx = parts.index("projects") + if projects_idx + 1 < len(parts) - 1: + event["project_name"] = parts[projects_idx + 1] + except ValueError: + pass + return event diff --git a/tests/daemon/test_claude_code_watcher.py b/tests/daemon/test_claude_code_watcher.py index a31b03e21a..201670f045 100644 --- a/tests/daemon/test_claude_code_watcher.py +++ b/tests/daemon/test_claude_code_watcher.py @@ -1,9 +1,19 @@ -"""Tests for ClaudeCodeWatcher parsing and filtering logic.""" +"""Tests for ClaudeCodeWatcher parsing and filtering logic. + +All test fixtures use the REAL Claude Code JSONL format: +- Top-level "type": "user" | "assistant" | ... 
+- "role" and "content" nested inside "message" object +- "sessionId" at top level (camelCase) +- "content" can be string or array of content blocks +""" import json import os import tempfile -from openviking.daemon.watchers.claude_code_watcher import ClaudeCodeWatcher +from openviking.daemon.watchers.claude_code_watcher import ( + ClaudeCodeWatcher, + _extract_text_from_content, +) class FakeCursorManager: @@ -33,18 +43,156 @@ def _make_watcher(batch_trigger_lines=50, batch_trigger_seconds=300): return watcher, batches, cursor_mgr +# --- Fixtures matching REAL Claude Code JSONL format --- + +def _make_user_event(content="Hello", session_id="test-session-001"): + """Create a realistic user event matching real Claude Code logs.""" + return { + "type": "user", + "message": {"role": "user", "content": content}, + "uuid": "user-uuid-001", + "timestamp": "2026-06-15T10:30:00.000Z", + "sessionId": session_id, + } + + +def _make_assistant_event(content="AI answer", session_id="test-session-001"): + """Create a realistic assistant event with string content.""" + return { + "type": "assistant", + "message": { + "role": "assistant", + "content": content, + }, + "uuid": "asst-uuid-001", + "timestamp": "2026-06-15T10:30:01.000Z", + "sessionId": session_id, + } + + +def _make_assistant_event_blocks(blocks, session_id="test-session-001"): + """Create an assistant event with array content blocks.""" + return { + "type": "assistant", + "message": { + "role": "assistant", + "content": blocks, + }, + "uuid": "asst-uuid-002", + "timestamp": "2026-06-15T10:30:02.000Z", + "sessionId": session_id, + } + + +def _make_queue_event(): + """Create a queue-operation event (should be skipped).""" + return { + "type": "queue-operation", + "operation": "enqueue", + "timestamp": "2026-06-15T10:29:59.000Z", + "sessionId": "test-session-001", + "content": "Some prompt", + } + + +def _make_system_event(): + """Create a system event (should be skipped).""" + return { + "type": "system", + "subtype": 
"stop_hook_summary", + "uuid": "sys-uuid-001", + "timestamp": "2026-06-15T10:30:03.000Z", + "sessionId": "test-session-001", + } + + +def _make_attachment_event(): + """Create an attachment event (should be skipped).""" + return { + "type": "attachment", + "attachment": {"type": "hook_success"}, + "uuid": "att-uuid-001", + "timestamp": "2026-06-15T10:29:58.000Z", + "sessionId": "test-session-001", + } + + +def _make_tool_result_user_event(): + """Create a user event carrying tool_result (should be skipped).""" + return { + "type": "user", + "message": { + "role": "user", + "content": [ + { + "type": "tool_result", + "tool_use_id": "call_001", + "content": "output here", + "is_error": False, + } + ], + }, + "uuid": "tr-uuid-001", + "timestamp": "2026-06-15T10:30:04.000Z", + "sessionId": "test-session-001", + } + + +# --- Tests --- + def test_tool_name(): watcher, _, _ = _make_watcher() assert watcher.tool_name == "claude_code" +# --- _extract_text_from_content helper --- + +def test_extract_text_from_string(): + assert _extract_text_from_content("hello world") == "hello world" + + +def test_extract_text_from_array_with_text(): + blocks = [ + {"type": "thinking", "thinking": "let me think..."}, + {"type": "text", "text": "Here is the answer."}, + {"type": "tool_use", "name": "Bash", "input": {"command": "ls"}}, + ] + assert _extract_text_from_content(blocks) == "Here is the answer." 
+ + +def test_extract_text_from_array_multiple_texts(): + blocks = [ + {"type": "text", "text": "Part 1"}, + {"type": "text", "text": "Part 2"}, + ] + assert _extract_text_from_content(blocks) == "Part 1\nPart 2" + + +def test_extract_text_from_array_tool_only(): + blocks = [ + {"type": "tool_use", "name": "Read", "input": {"file_path": "/x"}}, + ] + assert _extract_text_from_content(blocks) == "" + + +def test_extract_text_from_none(): + assert _extract_text_from_content(None) == "" + + +def test_extract_text_from_empty_list(): + assert _extract_text_from_content([]) == "" + + +# --- parse_line --- + def test_parse_valid_jsonl_line(): watcher, _, _ = _make_watcher() - line = '{"timestamp": "2026-06-15T10:30:00Z", "role": "user", "content": "Hello", "type": "message"}' - event = watcher.parse_line(line) - assert event is not None - assert event["role"] == "user" - assert event["content"] == "Hello" + event = _make_user_event("Hello") + line = json.dumps(event) + result = watcher.parse_line(line) + assert result is not None + assert result["type"] == "user" + assert result["message"]["role"] == "user" def test_parse_invalid_line(): @@ -53,72 +201,165 @@ def test_parse_invalid_line(): assert watcher.parse_line("") is None -def test_normalize_event_user_message(): +# --- normalize_event --- + +def test_normalize_user_string_message(): watcher, _, _ = _make_watcher() - raw = {"role": "user", "type": "message", "content": "Hello", "timestamp": "2026-06-15T10:30:00Z"} + raw = _make_user_event("Hello world") result = watcher.normalize_event(raw) assert result is not None assert result["role"] == "user" + assert result["content"] == "Hello world" assert result["type"] == "message" - assert result["content"] == "Hello" + assert result["session_id"] == "test-session-001" + + +def test_normalize_assistant_string_message(): + watcher, _, _ = _make_watcher() + raw = _make_assistant_event("AI response here") + result = watcher.normalize_event(raw) + assert result is not None + 
assert result["role"] == "assistant" + assert result["content"] == "AI response here" -def test_normalize_event_assistant_message(): +def test_normalize_assistant_text_block(): + """Assistant with text content block should be extracted.""" watcher, _, _ = _make_watcher() - raw = {"role": "assistant", "type": "message", "content": "AI answer"} + raw = _make_assistant_event_blocks([ + {"type": "thinking", "thinking": "internal reasoning"}, + {"type": "text", "text": "The answer is 42."}, + ]) result = watcher.normalize_event(raw) assert result is not None assert result["role"] == "assistant" - assert result["content"] == "AI answer" + assert result["content"] == "The answer is 42." + assert "internal reasoning" not in result["content"] -def test_normalize_event_excludes_system_role(): +def test_normalize_assistant_tool_use_only_skipped(): + """Assistant message with only tool_use blocks (no text) should be skipped.""" watcher, _, _ = _make_watcher() - raw = {"role": "system", "type": "message", "content": "System msg"} + raw = _make_assistant_event_blocks([ + {"type": "tool_use", "name": "Bash", "input": {"command": "ls"}}, + ]) + result = watcher.normalize_event(raw) + assert result is None + + +def test_normalize_tool_result_user_skipped(): + """User events carrying tool_result should be skipped.""" + watcher, _, _ = _make_watcher() + raw = _make_tool_result_user_event() + result = watcher.normalize_event(raw) + assert result is None + + +def test_normalize_queue_operation_skipped(): + watcher, _, _ = _make_watcher() + raw = _make_queue_event() assert watcher.normalize_event(raw) is None -def test_normalize_event_excludes_tool_call(): +def test_normalize_system_event_skipped(): watcher, _, _ = _make_watcher() - raw = {"role": "assistant", "type": "tool_call", "content": "call"} + raw = _make_system_event() assert watcher.normalize_event(raw) is None -def test_normalize_event_excludes_tool_result(): +def test_normalize_attachment_event_skipped(): watcher, _, _ = 
_make_watcher() - raw = {"role": "assistant", "type": "tool_result", "content": "result"} + raw = _make_attachment_event() assert watcher.normalize_event(raw) is None +def test_normalize_no_message_field_skipped(): + """Events with type=user/assistant but no message dict should be skipped.""" + watcher, _, _ = _make_watcher() + assert watcher.normalize_event({"type": "user"}) is None + assert watcher.normalize_event({"type": "assistant", "message": "not a dict"}) is None + + +# --- _post_normalize (project_name from path) --- + +def test_post_normalize_injects_project_name(): + watcher, _, _ = _make_watcher() + event = {"role": "user", "content": "test", "project_name": None} + path = "C:/Users/test/.claude/projects/D--Develop-MyProject/abc123.jsonl" + result = watcher._post_normalize(event, path) + assert result["project_name"] == "D--Develop-MyProject" + + +def test_post_normalize_windows_backslash(): + watcher, _, _ = _make_watcher() + event = {"role": "user", "content": "test", "project_name": None} + path = "C:\\Users\\test\\.claude\\projects\\D--Develop-OpenViking\\session.jsonl" + result = watcher._post_normalize(event, path) + assert result["project_name"] == "D--Develop-OpenViking" + + +def test_post_normalize_preserves_existing_project_name(): + watcher, _, _ = _make_watcher() + event = {"role": "user", "content": "test", "project_name": "already-set"} + path = "C:/Users/test/.claude/projects/D--Develop-Other/abc.jsonl" + result = watcher._post_normalize(event, path) + assert result["project_name"] == "already-set" + + +# --- filter_event --- + def test_filter_event_keeps_messages(): watcher, _, _ = _make_watcher() event = {"role": "user", "type": "message", "content": "Hello", "tool_name": "claude_code"} assert watcher.filter_event(event) is True +# --- _process_file (integration of parse + normalize + buffer) --- + def test_process_file(): """Test that _process_file reads, parses, normalizes, and buffers events.""" watcher, batches, cursor_mgr = 
_make_watcher(batch_trigger_lines=2) - with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False, encoding="utf-8") as f: - f.write(json.dumps({"role": "user", "type": "message", "content": "Hello"}) + "\n") - f.write(json.dumps({"role": "assistant", "type": "message", "content": "Hi"}) + "\n") - f.write(json.dumps({"role": "system", "type": "message", "content": "ignored"}) + "\n") - tmp_path = f.name + # Build a realistic session file path + tmp_dir = tempfile.mkdtemp() + project_dir = os.path.join(tmp_dir, "projects", "D--Develop-TestProj") + os.makedirs(project_dir) + session_file = os.path.join(project_dir, "sess-001.jsonl") + + with open(session_file, "w", encoding="utf-8") as f: + # queue-operation → should be skipped + f.write(json.dumps(_make_queue_event()) + "\n") + # user message → should be extracted + f.write(json.dumps(_make_user_event("Hello")) + "\n") + # attachment → should be skipped + f.write(json.dumps(_make_attachment_event()) + "\n") + # assistant text reply → should be extracted + f.write(json.dumps(_make_assistant_event("Hi there")) + "\n") + # system event → should be skipped + f.write(json.dumps(_make_system_event()) + "\n") + # tool_result user → should be skipped + f.write(json.dumps(_make_tool_result_user_event()) + "\n") try: - watcher._process_file(tmp_path) + watcher._process_file(session_file) # batch_trigger_lines=2, so 2 valid events should trigger flush assert len(batches) == 1 assert len(batches[0]) == 2 assert batches[0][0]["role"] == "user" + assert batches[0][0]["content"] == "Hello" assert batches[0][1]["role"] == "assistant" + assert batches[0][1]["content"] == "Hi there" assert all(e["tool_name"] == "claude_code" for e in batches[0]) + # project_name should be derived from path + assert batches[0][0]["project_name"] == "D--Develop-TestProj" # Cursor should have been updated assert len(cursor_mgr.updates) == 1 - assert cursor_mgr.updates[0][0] == tmp_path + assert cursor_mgr.updates[0][0] == session_file 
finally: - os.unlink(tmp_path) + os.unlink(session_file) + os.rmdir(project_dir) + os.rmdir(os.path.join(tmp_dir, "projects")) + os.rmdir(tmp_dir) def test_force_flush_empty_buffer(): diff --git a/tests/daemon/test_integration.py b/tests/daemon/test_integration.py index c4b497092e..ee62964a01 100644 --- a/tests/daemon/test_integration.py +++ b/tests/daemon/test_integration.py @@ -134,7 +134,7 @@ def test_knowledge_router_all_categories(): assert "skills/claude_code" in router.route(skill) mem_proj = ExtractedKnowledge(status="EXTRACTED", category="memories", title="Arch", content="...", project_name="myapp") - assert "memories/projects/myapp" in router.route(mem_proj) + assert "memories/myapp" in router.route(mem_proj) mem_global = ExtractedKnowledge(status="EXTRACTED", category="memories", title="General", content="...") assert "memories/global" in router.route(mem_global) diff --git a/tests/daemon/test_knowledge_router.py b/tests/daemon/test_knowledge_router.py index a045dc4a5a..739703321b 100644 --- a/tests/daemon/test_knowledge_router.py +++ b/tests/daemon/test_knowledge_router.py @@ -18,7 +18,7 @@ def test_route_skills(): router = KnowledgeRouter() k = _make(category="skills", title="PostgreSQL Config") uri = router.route(k) - assert uri.startswith("viking://skills/general/") + assert uri.startswith("viking://resources/skills/general/") assert uri.endswith(".md") @@ -26,7 +26,7 @@ def test_route_skills_with_source_tool(): router = KnowledgeRouter() k = _make(category="skills", title="PostgreSQL Config", source_tool="claude_code") uri = router.route(k) - assert uri.startswith("viking://skills/claude_code/") + assert uri.startswith("viking://resources/skills/claude_code/") assert uri.endswith(".md") @@ -42,7 +42,7 @@ def test_route_memories_global(): router = KnowledgeRouter() k = _make(category="memories", title="Global Memory") uri = router.route(k) - assert uri.startswith("viking://memories/global/") + assert 
uri.startswith("viking://resources/memories/global/") def test_route_resources(): @@ -71,3 +71,11 @@ def test_sanitize_filename(): router = KnowledgeRouter() assert router._sanitize_filename('file<>:name') == 'file___name' assert router._sanitize_filename("a" * 100) == "a" * 50 + + +def test_sanitize_filename_non_ascii(): + router = KnowledgeRouter() + result = router._sanitize_filename("中文标题测试") + assert len(result) == 16 + assert result.isascii() + assert all(c in "0123456789abcdef" for c in result) diff --git a/tests/daemon/test_multi_watcher_integration.py b/tests/daemon/test_multi_watcher_integration.py index 8623a9e304..d3a797ffa0 100644 --- a/tests/daemon/test_multi_watcher_integration.py +++ b/tests/daemon/test_multi_watcher_integration.py @@ -91,8 +91,8 @@ def test_claude_code_events_have_tool_name(tmp_path): test_file = tmp_path / "session.jsonl" test_file.write_text( - '{"role": "user", "content": "Hello from CC"}\n' - '{"role": "assistant", "content": "Hi from CC"}\n' + '{"type": "user", "message": {"role": "user", "content": "Hello from CC"}}\n' + '{"type": "assistant", "message": {"role": "assistant", "content": "Hi from CC"}}\n' ) watcher._process_file(str(test_file)) From 0136fbca4bea1ecc539bd7c126f751e6cd7cfd46 Mon Sep 17 00:00:00 2001 From: baobaodae <2014596548@qq.com> Date: Sat, 20 Jun 2026 11:52:17 +0800 Subject: [PATCH 3/6] feat(daemon): add BasePollingWatcher and CursorDBWatcher for SQLite-based AI tool monitoring Extend daemon watcher framework to support SQLite database sources alongside existing JSONL file watchers. BasePollingWatcher uses Thread + Event.wait() polling instead of watchdog Observer, with cursor advancement on all raw events (including filtered) to avoid infinite re-query loops. 
CursorDBWatcher monitors Cursor IDE's dual-SQLite storage: - Global DB (cursorDiskKV) for bubbleId:* conversation entries - type 1=user, 2=assistant; filters empty text (tool calls/streaming) Includes 28 new tests (12 polling base + 16 cursor_db) and 3 integration tests. --- docs/daemon/db-watcher-plan.md | 752 ++++++++++++++++++ .../daemon/watchers/base_polling_watcher.py | 205 +++++ .../daemon/watchers/cursor_db_watcher.py | 193 +++++ openviking/daemon/watchers/registry.py | 6 + tests/daemon/test_base_polling_watcher.py | 313 ++++++++ tests/daemon/test_cursor_db_watcher.py | 422 ++++++++++ .../daemon/test_multi_watcher_integration.py | 59 +- 7 files changed, 1948 insertions(+), 2 deletions(-) create mode 100644 docs/daemon/db-watcher-plan.md create mode 100644 openviking/daemon/watchers/base_polling_watcher.py create mode 100644 openviking/daemon/watchers/cursor_db_watcher.py create mode 100644 tests/daemon/test_base_polling_watcher.py create mode 100644 tests/daemon/test_cursor_db_watcher.py diff --git a/docs/daemon/db-watcher-plan.md b/docs/daemon/db-watcher-plan.md new file mode 100644 index 0000000000..49ae875c41 --- /dev/null +++ b/docs/daemon/db-watcher-plan.md @@ -0,0 +1,752 @@ +## Daemon 数据库源工具适配方案(v2 — 详细执行计划) + +### 一、现状分析 + +#### 1.1 当前架构全景 + +``` + BaseWatcher Protocol + (tool_name, start, stop, flush) + │ + BaseFileWatcher (ABC) + ┌─────────┼──────────────────────┐ + │ │ │ + watchdog CursorManager BatchBuffer + Observer (byte offset) (lines + time) + │ │ │ + └─────────┴──────────┬───────────┘ + │ + _process_file() + parse_line() → normalize_event() → _post_normalize() → filter_event() + │ + batch_callback → DaemonService._enqueue_batch() + │ + asyncio.Queue → _etl_loop() → BatchETLPipeline + │ + Filter → Reconstruct → Extract(LLM) → Dedup + │ + VikingStorageAdapter → viking://resources/ +``` + +#### 1.2 现有 5 个文件 Watcher + +| Watcher | tool_name | 文件格式 | 继承 | 特殊点 | +|---------|-----------|----------|------|--------| +| ClaudeCodeWatcher | 
`claude_code` | `*.jsonl` | BaseFileWatcher | `_post_normalize` 从路径推导 project_name | +| CursorWatcher | `cursor` | `*.log` | BaseFileWatcher | 灵活 role 映射(user/human/human_turn) | +| AiderWatcher | `aider` | `.aider.chat.history.md` | BaseFileWatcher | **覆盖** `_process_file`,多行 Markdown 解析 | +| ContinueDevWatcher | `continue_dev` | `*.json` | BaseFileWatcher | 标准 JSON 行格式 | +| GenericJSONLWatcher | `generic_jsonl` | `*.jsonl` | BaseFileWatcher | `extra` dict 自定义字段映射 | + +#### 1.3 关键接口约束 + +- **BaseWatcher Protocol**(`watchers/__init__.py`):4 个方法,天然支持非文件型 watcher +- **Registry**(`registry.py`):`create_watcher(tool_name, **kwargs)` 工厂模式,已有 `extra` 参数传递链 +- **CursorManager**(`cursor_manager.py`):`file_path TEXT PK` + `last_position INTEGER`,后者是通用整数——文件 watcher 存字节偏移,DB watcher 可存 rowid/timestamp +- **DaemonService**(`service.py`):`_enqueue_batch` 已用 `loop.call_soon_threadsafe`,线程安全——polling thread 可直接调用 +- **WatcherConfig**(`config.py`):已有 `extra: Dict[str, Any]` 字段,无需改 config schema + +#### 1.4 不需要改动的部分 + +- `BatchBuffer` / `BatchETLPipeline` / `ConversationReconstructor` / `KnowledgeExtractor` / `KnowledgeDeduplicator` / `VikingStorageAdapter` / `KnowledgeRouter` ——全部与数据源无关,只消费 normalized events +- `WatcherConfig` / `DaemonConfig` —— schema 已够用 +- `DaemonService.start()` / `_etl_loop()` —— 已支持多 watcher 并行 + +--- + +### 二、需要适配的数据库源工具(含格式调研) + +#### 2.1 Cursor(P0)— 双库架构 + +**⚠️ 关键发现:Cursor 使用两个独立的 SQLite 数据库** + +| 数据库 | 路径 (Windows) | 表名 | 用途 | +|--------|---------------|------|------| +| 工作区 DB | `%APPDATA%\Cursor\User\workspaceStorage\<hash>\state.vscdb` | `ItemTable`(PascalCase) | 会话元数据 | +| 全局 DB | `%APPDATA%\Cursor\User\globalStorage\state.vscdb` | `cursorDiskKV`(camelCase) | 对话内容 | + +**踩坑点:表名大小写不同!混用会报 `no such table` 错误。** + +**工作区 DB — 会话元数据**: +```sql +-- 查询所有会话列表 +SELECT value FROM ItemTable WHERE [key] = 'composer.composerData'; +``` +返回 JSON,结构为 `{ "allComposers": [ { "id": "<composerId>", "createdAt": "...", ... 
} ] }` + +**全局 DB — 对话内容(Bubble 数据)**: +```sql +-- 查询某会话的所有消息 +SELECT [key], value FROM cursorDiskKV WHERE [key] LIKE 'bubbleId:<composerId>:%' ORDER BY rowid ASC; + +-- 查询所有 composerId +SELECT DISTINCT SUBSTR([key], 10, INSTR(SUBSTR([key], 10), ':') - 1) +FROM cursorDiskKV WHERE [key] LIKE 'bubbleId:%'; +``` + +**Key 格式**:`bubbleId:<composerId>:<bubbleId>`(冒号分隔三段) + +**Value JSON 结构(单条消息)**: +```json +{ + "_v": 3, // schema 版本号 + "type": 1, // 1=用户消息, 2=助手消息 + "text": "用户的问题内容", // 对话文本 + "createdAt": "2026-...", // 时间戳 + "isAgentic": false, // 是否为 Agent 模式 + "toolResults": [], // 工具调用结果 + "codeBlocks": [], // 代码块 + "allThinkingBlocks": [ // AI 思考过程(仅 type=2) + { "thinking": "..." } + ] +} +``` + +**解析注意事项**: +1. 流式传输会产生 `text` 为空的中间态 assistant 消息,必须过滤 +2. `_v` 版本号可能随更新递增,遇到未知版本应仅打 warning +3. 工作区路径有 `file:///` 前缀且 URL 编码,需解码 +4. 项目文件夹删除后,工作区 DB 丢失但全局 DB 数据仍在("孤立对话") +5. 数据物理隔离在两个 DB,必须先从工作区获取 composerId,再跨库查询全局内容 + +#### 2.2 Trae(P1)— 单库架构 + +| 路径 (Windows) | 表名 | Key 模式 | +|---------------|------|----------| +| `%APPDATA%\Trae\User\workspaceStorage\<hash>\state.vscdb` | `ItemTable` | `memento/icube-ai-ng-chat-storage` 或 `memento/icube-ai-agent-storage` | + +**Value JSON 结构**: +```json +{ + "list": [ + { + "messages": [ + { + "role": "user", + "content": "用户的问题", + "timestamp": "...", + "status": "active" // "deleted" 的需跳过 + }, + { + "role": "assistant", + "content": "", // 注意:可能为空! 
+ "agentTaskContent": { // 实际内容可能在这里 + "proposalText": "实际回复内容", + "proposalReasoningContent": "推理过程" + } + } + ] + } + ] +} +``` + +**踩坑点**:助手消息的 `content` 字段经常为空,核心内容在 `agentTaskContent` 中。提取优先级:`content` > `agentTaskContent.proposalText` > `finish` 工具结果。 + +#### 2.3 Windsurf(P2) + +- 存储格式:类似 VS Code 的 SQLite `state.vscdb` +- 路径:`%APPDATA%\Windsurf\User\` 下 +- Cascade 会话文件:`~/.codeium/windsurf/cascade/` +- **具体表名和 key 模式尚无公开文档**,需实际 dump 确认 +- 使用 WAL 模式(存在 `-wal` 和 `-shm` 文件) + +#### 2.4 GitHub Copilot(P2) + +**⚠️ 关键发现:Copilot 不用 SQLite,用的是 JSON/JSONL 文件** + +| 路径 | 格式 | 说明 | +|------|------|------| +| VS Code `workspaceStorage/<hash>/chatSessions/` | `.json` + `.jsonl` | 项目级 | +| VS Code `globalStorage/chatSessions/` | `.json` + `.jsonl` | 全局 | + +**JSON/JSONL 结构**: +- `.json` 文件包含 `sessionId`、`timestamp`、`customTitle` +- `.jsonl` 文件首行 `kind: 0` 为完整快照,后续行为 UI 补丁(应忽略) +- 快照内 `requests` 数组嵌套用户输入和助手响应 +- 响应通过 `kind` 字段区分(文本/思考/工具调用) + +→ 这意味着 GitHub Copilot 更适合用现有的 `BaseFileWatcher`(文件追加模式),而非 `BasePollingWatcher`。需要新建一个 `CopilotChatWatcher(BaseFileWatcher)` 子类。 + +#### 2.5 优先级排序 + +| 优先级 | 工具 | Watcher 类型 | 理由 | +|--------|------|-------------|------| +| **P0** | Cursor | BasePollingWatcher(双库 SQLite) | 用户量最大,格式已调研 | +| P1 | Trae | BasePollingWatcher(单库 SQLite) | 格式已明确,VS Code fork | +| P1 | Windsurf | BasePollingWatcher(SQLite) | 与 Cursor/Trae 类似,需 dump 确认 | +| P2 | GitHub Copilot | **BaseFileWatcher**(JSON/JSONL 文件) | 非 SQLite!用文件追加模式 | +| P3 | Warp Terminal | BasePollingWatcher | 命令历史,低优先级 | +| P4 | Tabnine | BasePollingWatcher | JSON+SQLite 混合,低优先级 | + +--- + +### 三、架构设计 + +#### 3.1 新增 BasePollingWatcher 基类 + +**文件**:`openviking/daemon/watchers/base_polling_watcher.py` + +不继承 `BaseFileWatcher`,直接实现 `BaseWatcher` Protocol。 + +```python +class BasePollingWatcher(ABC): + """Base class for database/API-based watchers that use periodic polling.""" + + def __init__( + self, + tool_name: str, + watch_dir: str, # DB 文件所在目录(或 DB 文件路径) + cursor_manager: CursorManager, + 
batch_callback: Callable[[List[Dict]], None], + poll_interval: int = 30, # 轮询间隔(秒) + batch_trigger_lines: int = 50, + batch_trigger_seconds: int = 300, + extra: Optional[Dict] = None, + **kwargs, # 兼容 create_watcher 工厂的其他参数 + ): + self._tool_name = tool_name + self.watch_dir = os.path.expanduser(watch_dir) + self.cursor_manager = cursor_manager + self.batch_callback = batch_callback + self.poll_interval = poll_interval + self.extra = extra or {} + + self._buffer = BatchBuffer() + self.batch_trigger_lines = batch_trigger_lines + self.batch_trigger_seconds = batch_trigger_seconds + self._poll_thread: Optional[Thread] = None + self._stop_event = threading.Event() + + # ─── BaseWatcher Protocol ─── + @property + def tool_name(self) -> str: + return self._tool_name + + def start(self) -> None: + self._stop_event.clear() + self._poll_thread = Thread(target=self._poll_loop, daemon=True, name=f"poll-{self._tool_name}") + self._poll_thread.start() + logger.info("[%s] Polling watcher started (interval=%ds)", self._tool_name, self.poll_interval) + + def stop(self) -> None: + self._stop_event.set() + if self._poll_thread: + self._poll_thread.join(timeout=5) + logger.info("[%s] Polling watcher stopped", self._tool_name) + + def flush(self) -> None: + self._flush_buffer() + + # ─── 子类必须实现 ─── + @abstractmethod + def query_new_events(self, last_cursor: int) -> List[Dict]: + """查询数据源中 cursor 之后的新事件。 + Args: + last_cursor: 上次处理到的位置(rowid/timestamp/offset) + Returns: + raw event dicts,每个必须包含 '_cursor_position' 字段 + """ + ... + + @abstractmethod + def normalize_event(self, raw_event: Dict) -> Optional[Dict]: + """将 raw DB row 转换为标准 normalized event dict。 + 返回 None 表示跳过。 + 输出格式同文件 watcher:{role, content, type, timestamp, session_id, project_name} + """ + ... 
+ + # ─── 可选覆盖 ─── + def filter_event(self, event: Dict) -> bool: + """额外过滤。默认保留全部。""" + return True + + def resolve_db_path(self) -> Optional[str]: + """解析 DB 文件完整路径。 + 默认:在 watch_dir 下查找 extra['db_filename']。 + 子类可覆盖以实现更复杂的发现逻辑。 + """ + db_filename = self.extra.get("db_filename", "state.vscdb") + candidate = os.path.join(self.watch_dir, db_filename) + if os.path.exists(candidate): + return candidate + return None + + # ─── 内部方法 ─── + def _poll_loop(self): + """主轮询循环。""" + # cursor key 使用 watch_dir(与文件 watcher 的 file_path 区分) + cursor_key = self.watch_dir + + while not self._stop_event.is_set(): + try: + db_path = self.resolve_db_path() + if db_path is None: + logger.debug("[%s] DB not found, retrying...", self._tool_name) + self._stop_event.wait(self.poll_interval) + continue + + cursor = self.cursor_manager.get_cursor(cursor_key) + raw_events = self.query_new_events(cursor.last_position) + + if raw_events: + new_position = cursor.last_position + for raw in raw_events: + normalized = self.normalize_event(raw) + if normalized is None: + continue + if not self.filter_event(normalized): + continue + normalized["tool_name"] = self._tool_name + self._buffer.add_line(normalized, byte_size=0) + + # 追踪最大 cursor position + pos = raw.get("_cursor_position", 0) + if pos > new_position: + new_position = pos + + # 更新 cursor + if new_position > cursor.last_position: + self.cursor_manager.update_cursor(cursor_key, new_position) + + self._check_batch_trigger() + + except Exception as e: + logger.error("[%s] Poll error: %s", self._tool_name, e, exc_info=True) + + self._stop_event.wait(self.poll_interval) + + def _check_batch_trigger(self): + if self._buffer.is_empty(): + return + line_count = len(self._buffer.lines) + age = time.time() - self._buffer.created_at if self._buffer.created_at > 0 else 0 + if line_count >= self.batch_trigger_lines or age >= self.batch_trigger_seconds: + self._flush_buffer() + + def _flush_buffer(self): + if self._buffer.is_empty(): + return + events = 
self._buffer.lines.copy() + logger.info("[%s] Flushing %d events", self._tool_name, len(events)) + try: + self.batch_callback(events) + self._buffer.clear() + except Exception as e: + logger.error("[%s] Batch callback failed: %s", self._tool_name, e, exc_info=True) +``` + +**关键设计决策:** + +| 决策点 | 方案 | 理由 | +|--------|------|------| +| 轮询机制 | `Thread` + `Event.wait(interval)` | 与 watchdog Observer 平级,DaemonService 已处理线程安全 | +| cursor key | 使用 `watch_dir` 字符串 | 与文件 watcher 的 `file_path` 互不冲突,CursorManager 无需改动 | +| DB 连接管理 | 子类在 `query_new_events` 内自行 open/close | SQLite `?mode=ro` 只读,避免锁冲突;CursorManager 已有 per-call connect 模式 | +| `_cursor_position` 约定 | raw event dict 中必须携带 | 让子类灵活定义位置语义(rowid/timestamp/offset),基类只取 max | +| `BatchBuffer` 复用 | 直接用现有实现 | `byte_size=0`(DB 无字节偏移概念),time trigger 正常工作 | +| `extra` 参数 | 传递 `poll_interval`、`db_filename` 等 | 利用已有 WatcherConfig.extra 字段 | + +**注意:`BatchBuffer` 与 trigger 值存储** + +当前 `BatchBuffer` dataclass 不接受 trigger 参数——`batch_trigger_lines` 和 `batch_trigger_seconds` 由 `BaseFileWatcher` 作为实例属性持有,在 `_check_batch_trigger()` 中读取。 + +`BasePollingWatcher` 采用同样模式:`self.batch_trigger_lines` / `self.batch_trigger_seconds` 存在 watcher 实例上,`BatchBuffer()` 无参构造。与 `BaseFileWatcher` 完全一致,零改动。 + +#### 3.2 CursorDBWatcher 实现(P0)— 基于真实格式调研 + +**文件**:`openviking/daemon/watchers/cursor_db_watcher.py` + +Cursor 使用**双库架构**:工作区 DB 存会话元数据,全局 DB 存对话内容。CursorDBWatcher 的 `watch_dir` 指向 Cursor 用户数据根目录(如 `%APPDATA%\Cursor\User`),内部同时访问两个 state.vscdb。 + +```python +@register_watcher("cursor_db") +class CursorDBWatcher(BasePollingWatcher): + """Watches Cursor IDE's dual-SQLite storage for AI conversations. 
+ + Architecture: + - Workspace DB: workspaceStorage/<hash>/state.vscdb → ItemTable → composer.composerData (session metadata) + - Global DB: globalStorage/state.vscdb → cursorDiskKV → bubbleId:<composerId>:<bubbleId> (message content) + """ + + def __init__(self, watch_dir, cursor_manager, batch_callback, + poll_interval=30, batch_trigger_lines=50, batch_trigger_seconds=300, + extra=None, **kwargs): + super().__init__( + tool_name="cursor_db", + watch_dir=watch_dir, # e.g. %APPDATA%\Cursor\User + cursor_manager=cursor_manager, + batch_callback=batch_callback, + poll_interval=poll_interval, + batch_trigger_lines=batch_trigger_lines, + batch_trigger_seconds=batch_trigger_seconds, + extra=extra, + ) + self._global_db_path = os.path.join(self.watch_dir, "globalStorage", "state.vscdb") + self._workspace_storage_dir = os.path.join(self.watch_dir, "workspaceStorage") + + def resolve_db_path(self) -> Optional[str]: + """返回全局 DB 路径(主要数据源)。""" + if os.path.exists(self._global_db_path): + return self._global_db_path + return None + + def _discover_composer_ids(self) -> List[str]: + """扫描所有工作区 DB,收集 composerId 列表。 + 用于关联全局 DB 中的 bubble 数据。 + """ + composer_ids = [] + if not os.path.isdir(self._workspace_storage_dir): + return composer_ids + + for ws_hash in os.listdir(self._workspace_storage_dir): + ws_db = os.path.join(self._workspace_storage_dir, ws_hash, "state.vscdb") + if not os.path.exists(ws_db): + continue + try: + conn = sqlite3.connect(f"file:{ws_db}?mode=ro", uri=True) + try: + row = conn.execute( + "SELECT value FROM ItemTable WHERE [key] = 'composer.composerData'" + ).fetchone() + if row and row[0]: + data = json.loads(row[0]) + all_composers = data.get("allComposers", []) + for c in all_composers: + cid = c.get("id") + if cid: + composer_ids.append(cid) + finally: + conn.close() + except Exception: + continue + return composer_ids + + def query_new_events(self, last_cursor: int) -> List[Dict]: + """从全局 DB 的 cursorDiskKV 表查询新 bubble 数据。 + + 策略:直接扫描所有 bubbleId:* 
key(rowid > last_cursor), + 不依赖工作区 DB 的 composerId 列表(支持发现孤立对话)。 + """ + db_path = self.resolve_db_path() + if not db_path: + return [] + + try: + conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True) + conn.execute("PRAGMA busy_timeout = 3000") + try: + rows = conn.execute( + "SELECT rowid, [key], value FROM cursorDiskKV " + "WHERE rowid > ? AND [key] LIKE 'bubbleId:%' " + "ORDER BY rowid ASC LIMIT 500", + (last_cursor,) + ).fetchall() + + events = [] + for rowid, key, value in rows: + # 解析 key: bubbleId:<composerId>:<bubbleId> + parts = key.split(":", 2) + composer_id = parts[1] if len(parts) >= 3 else None + + try: + parsed_value = json.loads(value) if isinstance(value, str) else value + except (json.JSONDecodeError, TypeError): + continue + + events.append({ + "rowid": rowid, + "key": key, + "value": parsed_value, + "composer_id": composer_id, + "_cursor_position": rowid, + }) + return events + finally: + conn.close() + except sqlite3.OperationalError as e: + logger.warning("[cursor_db] SQLite error (DB may be locked): %s", e) + return [] + + def normalize_event(self, raw_event: Dict) -> Optional[Dict]: + """解析 Cursor bubble 格式。 + + Value JSON: + - _v: schema version (currently 3) + - type: 1=user, 2=assistant + - text: message content + - createdAt: timestamp + - allThinkingBlocks: AI reasoning (assistant only) + """ + value = raw_event.get("value") + if not isinstance(value, dict): + return None + + # Schema version check — warn but don't crash + schema_version = value.get("_v", 0) + if schema_version > 3: + logger.debug("[cursor_db] Unknown bubble schema v%d", schema_version) + + # type: 1=user, 2=assistant + bubble_type = value.get("type") + if bubble_type == 1: + role = "user" + elif bubble_type == 2: + role = "assistant" + else: + return None + + # text: message content + content = value.get("text", "") + if not content or not content.strip(): + return None # 过滤流式传输的空壳消息 + + return { + "role": role, + "content": content, + "type": "message", + 
"timestamp": value.get("createdAt"), + "session_id": raw_event.get("composer_id"), + } + + def filter_event(self, event: Dict) -> bool: + """过滤过短内容。""" + content = event.get("content", "") + return len(content.strip()) >= 10 +``` + +**与原计划的关键差异**: + +| 原假设 | 实际格式 | +|--------|----------| +| 单库 `ItemTable` + `WHERE key LIKE '%chat%'` | 双库:`ItemTable`(元数据)+ `cursorDiskKV`(内容) | +| value 有 `role` 字段 | `type` 字段:1=用户, 2=助手 | +| value 有 `message`/`content` 字段 | `text` 字段 | +| `session_id` 在 value 内 | 从 key 的冒号分隔中解析(`composerId`) | +| 无 schema 版本 | `_v` 字段标识版本(当前主流 v3) | + +**调研验证命令**(在用户机器上确认): +```bash +# 确认全局 DB 表名 +sqlite3 "%APPDATA%/Cursor/User/globalStorage/state.vscdb" ".tables" +# 预期输出: cursorDiskKV + +# 查看 bubble 数据样例 +sqlite3 "%APPDATA%/Cursor/User/globalStorage/state.vscdb" \ + "SELECT [key], value FROM cursorDiskKV WHERE [key] LIKE 'bubbleId:%' LIMIT 3" +``` + +#### 3.3 WindsurfDBWatcher(P2,待调研) + +**⚠️ Windsurf 的具体表名和 key 模式尚无公开文档。** Phase 4 需先 dump 真实 state.vscdb 确认: +- 是否使用 `cursorDiskKV` 还是 `ItemTable` +- bubble key 模式是否与 Cursor 相同 +- 是否有独特的 Cascade 会话格式 + +如果格式与 Cursor 高度相似,可继承 CursorDBWatcher: + +```python +@register_watcher("windsurf_db") +class WindsurfDBWatcher(CursorDBWatcher): + """Windsurf — format TBD after dump analysis.""" + + @property + def tool_name(self) -> str: + return "windsurf_db" +``` + +如果格式差异大,则独立实现。 + +#### 3.4 Registry 更新 + +在 `_register_builtins()` 中添加: + +```python +try: + from openviking.daemon.watchers.cursor_db_watcher import CursorDBWatcher + _WATCHER_REGISTRY["cursor_db"] = CursorDBWatcher +except ImportError: + pass + +try: + from openviking.daemon.watchers.trae_db_watcher import TraeDBWatcher + _WATCHER_REGISTRY["trae_db"] = TraeDBWatcher +except ImportError: + pass + +try: + from openviking.daemon.watchers.windsurf_db_watcher import WindsurfDBWatcher + _WATCHER_REGISTRY["windsurf_db"] = WindsurfDBWatcher +except ImportError: + pass +``` + +#### 3.5 WatcherConfig 示例 + +```json +{ + "server": { + "daemon": { + "enabled": 
true, + "watchers": [ + { + "tool_name": "claude_code", + "watch_dir": "C:\\Users\\xxx\\.claude\\projects", + "batch_trigger_lines": 5 + }, + { + "tool_name": "cursor_db", + "watch_dir": "C:\\Users\\xxx\\AppData\\Roaming\\Cursor\\User", + "batch_trigger_lines": 5, + "extra": { + "poll_interval": 30 + } + }, + { + "tool_name": "trae_db", + "watch_dir": "C:\\Users\\xxx\\AppData\\Roaming\\Trae\\User", + "batch_trigger_lines": 5, + "extra": { + "poll_interval": 30 + } + }, + { + "tool_name": "windsurf_db", + "watch_dir": "C:\\Users\\xxx\\AppData\\Roaming\\Windsurf\\User", + "batch_trigger_lines": 5, + "extra": { + "poll_interval": 30 + } + } + ] + } + } +} +``` + +**注意**:`watch_dir` 现在指向各 IDE 的 `User` 根目录(而非 `globalStorage`),因为 CursorDBWatcher 内部需要同时访问 `workspaceStorage/` 和 `globalStorage/` 两个子目录。 + +#### 3.6 CursorManager 兼容性 + +**无需改动**。现有表结构完全适用: + +```sql +CREATE TABLE IF NOT EXISTS file_cursors ( + file_path TEXT PRIMARY KEY, -- DB watcher: watch_dir 路径 + last_position INTEGER NOT NULL, -- DB watcher: rowid / timestamp + last_read_time REAL NOT NULL, + updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP +) +``` + +文件 watcher 和 DB watcher 以 `file_path` 字段值(文件路径 vs 目录路径)自然区分,互不干扰。 + +#### 3.7 DaemonService 兼容性 + +**无需改动**。`service.py` 的 `start()` 方法已经: +- 通过 `create_watcher(**kwargs)` 工厂创建 watcher,自动适配 BasePollingWatcher +- 通过 `_enqueue_batch` → `call_soon_threadsafe` 处理来自 polling thread 的回调 +- 通过 `watcher.start()` / `stop()` / `flush()` 统一生命周期管理 + +唯一需要确认:`DaemonService.start()` 中 `Path(watch_dir).mkdir(parents=True, exist_ok=True)` 对 DB watcher 是否安全——DB watcher 的 watch_dir 是已存在的 Cursor 安装目录,`mkdir(exist_ok=True)` 不会出错。✓ + +--- + +### 四、实施任务清单 + +#### Phase 1:基础设施(BasePollingWatcher + 测试框架) + +| # | 任务 | 文件 | 预估 | +|---|------|------|------| +| 1.1 | 创建 `BasePollingWatcher` 基类 | `watchers/base_polling_watcher.py` | ~120 行 | +| 1.2 | 创建 `test_base_polling_watcher.py` 单元测试 | `tests/daemon/` | mock poll loop / buffer / flush / Protocol 验证 | + +#### Phase 
2:CursorDBWatcher(P0 核心) + +| # | 任务 | 文件 | 预估 | +|---|------|------|------| +| 2.1 | ~~调研~~:dump 真实 Cursor state.vscdb 结构 | — | **已完成**(见 §2.1) | +| 2.2 | 创建 `CursorDBWatcher`(双库架构、cursorDiskKV、bubbleId 解析) | `watchers/cursor_db_watcher.py` | ~150 行 | +| 2.3 | 单元测试 `test_cursor_db_watcher.py` | `tests/daemon/` | 创建临时双库 SQLite → mock bubble 数据 → normalize 验证 | +| 2.4 | 集成测试:完整 poll 周期 → batch_callback 验证 | 同上 | | +| 2.5 | Registry 注册 `cursor_db` | `watchers/registry.py` | 5 行 | + +#### Phase 3:TraeDBWatcher(P1) + +| # | 任务 | 文件 | 预估 | +|---|------|------|------| +| 3.1 | 创建 `TraeDBWatcher`(单库 ItemTable、`memento/icube-ai-ng-chat-storage` key) | `watchers/trae_db_watcher.py` | ~120 行 | +| 3.2 | `normalize_event`:处理 `content` 为空的 fallback(`agentTaskContent.proposalText`) | 同上 | | +| 3.3 | 单元测试 + Registry 注册 `trae_db` | `tests/daemon/test_trae_db_watcher.py` | | + +#### Phase 4:WindsurfDBWatcher(P2,需先 dump 确认格式) + +| # | 任务 | 文件 | 预估 | +|---|------|------|------| +| 4.1 | **调研**:dump 真实 Windsurf state.vscdb 确认表名和 key 模式 | 手动 | 可能继承 CursorDBWatcher | +| 4.2 | 创建 `WindsurfDBWatcher` | `watchers/windsurf_db_watcher.py` | ~30 行(若格式同 Cursor) | +| 4.3 | 单元测试 + Registry 注册 `windsurf_db` | | | + +#### Phase 5:集成验证 + +| # | 任务 | 文件 | 预估 | +|---|------|------|------| +| 5.1 | 更新 `test_multi_watcher_integration.py`:加入 DB watcher 测试 | `tests/daemon/` | | +| 5.2 | 端到端:真实 Cursor 对话 → watcher → ETL → viking:// | 手动验证 | | +| 5.3 | 更新 `test_all_watchers_registered` 验证新增 watcher | | 2 行 | + +--- + +### 五、风险点与应对 + +| 风险 | 影响 | 应对 | +|------|------|------| +| Cursor 锁住 SQLite(WAL mode) | `query_new_events` 读超时 | `?mode=ro` + `PRAGMA busy_timeout=3000`;读失败时跳过本轮 | +| Cursor 更新 `cursorDiskKV` 格式 / `_v` 版本升级 | normalize 失效 | normalize_event 已含 `_v` 版本检查;加 debug 日志记录未知格式 | +| watch_dir 不存在或 DB 文件被删 | resolve_db_path 返回 None | _poll_loop 已有 None 检查,优雅跳过 | +| polling thread 与 asyncio event loop 竞争 | batch_callback 死锁 | DaemonService._enqueue_batch 已用 call_soon_threadsafe,无改动 | +| CursorManager 的 
`file_path` 字段语义混淆 | 调试困难 | 在 BasePollingWatcher 文档注释中明确说明 cursor_key = watch_dir | +| Trae 助手消息 `content` 为空 | 丢失有效内容 | 三级 fallback:`content` → `agentTaskContent.proposalText` → `finish` 工具结果 | +| Windsurf 格式与 Cursor 不同 | 不能简单继承 | Phase 4 调研确认后再决定实现方式 | + +--- + +### 六、验证计划 + +每个新 DB watcher 的三级验证: + +1. **单元测试**:创建临时 SQLite → 写入 mock 数据(模拟真实 bubble/Trae 格式)→ 调用 `query_new_events(0)` → 验证返回值 → 调用 `normalize_event` → 验证 normalized 格式 +2. **集成测试**:`BasePollingWatcher.start()` → 等待 1-2 个 poll 周期 → 验证 `batch_callback` 被调用且 events 格式正确 → `stop()` +3. **端到端**:真实工具产生对话 → watcher 检测 → ETL pipeline → 知识写入 `viking://resources/` + +--- + +### 七、文件变更清单 + +| 操作 | 文件路径 | +|------|----------| +| **新增** | `openviking/daemon/watchers/base_polling_watcher.py` | +| **新增** | `openviking/daemon/watchers/cursor_db_watcher.py`(P0) | +| **新增** | `openviking/daemon/watchers/trae_db_watcher.py`(P1) | +| **新增** | `openviking/daemon/watchers/windsurf_db_watcher.py`(P2) | +| **新增** | `tests/daemon/test_base_polling_watcher.py` | +| **新增** | `tests/daemon/test_cursor_db_watcher.py` | +| **新增** | `tests/daemon/test_trae_db_watcher.py` | +| **修改** | `openviking/daemon/watchers/registry.py` — 添加 cursor_db / trae_db / windsurf_db 注册 | +| **修改** | `tests/daemon/test_multi_watcher_integration.py` — 添加 DB watcher 测试用例 | +| **不改** | `models.py` / `cursor_manager.py` / `service.py` / `etl_pipeline.py` / `config.py` / `base_file_watcher.py` | + +--- + +### 八、参考资料 + +| 来源 | 链接 | +|------|------| +| Cursor state.vscdb 解析踩坑记 | https://article.juejin.cn/post/7640063917021167625 | +| Cursor 对话导入:解析 SQLite 里的宝藏 | https://juejin.cn/post/7640053666326741030 | +| 5 种 AI 对话数据格式全解析 | https://blog.csdn.net/2201_75708499/article/details/161789991 | +| Codex CLI / Trae / Copilot 数据源接入 | https://juejin.cn/post/7640357289836281882 | +| AI编程助手数据提取终极指南 | https://www.xugj520.cn/archives/ai-coding-assistant-data-extraction-2.html | +| Trae 对话记录导出脚本 | https://juejin.cn/post/7472786501598101523 | +| windsurf-monitor 
class BasePollingWatcher(ABC):
    """Base class for database/API-based watchers that use periodic polling.

    Exposes the watcher surface (``tool_name``, ``start``, ``stop``,
    ``flush``) without inheriting BaseFileWatcher. A daemon ``Thread`` runs
    the poll loop and sleeps on a ``threading.Event`` so that ``stop()`` can
    wake it immediately instead of waiting out the interval.

    The persisted cursor is keyed by ``watch_dir`` (not by a file path), and
    its position is whatever the subclass reports in ``_cursor_position``.

    Subclasses must implement:
    - query_new_events(last_cursor): Query data source for new events
    - normalize_event(raw_event): Convert raw event to normalized format

    Optional overrides:
    - filter_event(event): Additional filtering
    - resolve_db_path(): Custom DB file discovery logic
    """

    def __init__(
        self,
        tool_name: str,
        watch_dir: str,
        cursor_manager: CursorManager,
        batch_callback: Callable[[List[Dict]], None],
        poll_interval: int = 30,
        batch_trigger_lines: int = 50,
        batch_trigger_seconds: int = 300,
        extra: Optional[Dict] = None,
        **kwargs,
    ):
        """Store configuration; no thread is started until ``start()``.

        Args:
            tool_name: Identifier attached to every buffered event.
            watch_dir: Directory holding the data source; ``~`` is expanded.
                Also used as the cursor key.
            cursor_manager: Provides ``get_cursor`` / ``update_cursor``.
            batch_callback: Called with the list of buffered events on flush.
            poll_interval: Seconds to wait between poll cycles.
            batch_trigger_lines: Flush once this many events are buffered.
            batch_trigger_seconds: Flush once the buffer is this old.
            extra: Free-form options; ``db_filename`` is read by the default
                ``resolve_db_path``.
            **kwargs: Accepted and ignored so a shared factory can pass
                arguments meant for other watcher types.
        """
        self._tool_name = tool_name
        self.watch_dir = os.path.expanduser(watch_dir)
        self.cursor_manager = cursor_manager
        self.batch_callback = batch_callback
        self.poll_interval = poll_interval
        self.extra = extra or {}

        self._buffer = BatchBuffer()
        self.batch_trigger_lines = batch_trigger_lines
        self.batch_trigger_seconds = batch_trigger_seconds
        self._poll_thread: Optional[Thread] = None
        self._stop_event = threading.Event()

    # --- BaseWatcher Protocol ---

    @property
    def tool_name(self) -> str:
        """Name of the tool this watcher reports events for."""
        return self._tool_name

    def start(self) -> None:
        """Start the polling thread; a no-op if it is already running."""
        # A second start() would otherwise spawn another thread feeding the
        # same buffer and cursor.
        if self._poll_thread is not None and self._poll_thread.is_alive():
            logger.debug("[%s] Polling watcher already running", self.tool_name)
            return

        self._stop_event.clear()
        self._poll_thread = Thread(
            target=self._poll_loop, daemon=True, name=f"poll-{self.tool_name}"
        )
        self._poll_thread.start()
        logger.info(
            "[%s] Polling watcher started (interval=%ds, dir=%s)",
            self.tool_name, self.poll_interval, self.watch_dir,
        )

    def stop(self) -> None:
        """Signal the poll loop to exit and wait up to 5 seconds for it."""
        self._stop_event.set()
        thread = self._poll_thread
        if thread:
            thread.join(timeout=5)
            if thread.is_alive():
                # Do not claim a clean stop when the join timed out.
                logger.warning(
                    "[%s] Polling thread did not stop within 5s", self.tool_name
                )
                return
        logger.info("[%s] Polling watcher stopped", self.tool_name)

    def flush(self) -> None:
        """Deliver any buffered events to ``batch_callback`` immediately."""
        self._flush_buffer()

    # --- Subclasses must implement ---

    @abstractmethod
    def query_new_events(self, last_cursor: int) -> List[Dict]:
        """Query data source for events newer than last_cursor.

        Args:
            last_cursor: Last processed position (rowid/timestamp/offset)

        Returns:
            List of raw event dicts. Each MUST include '_cursor_position' field.
        """
        ...

    @abstractmethod
    def normalize_event(self, raw_event: Dict) -> Optional[Dict]:
        """Convert raw event to normalized format.

        Returns None to skip. Output must have at minimum:
        {role, content, type, timestamp, session_id}
        """
        ...

    # --- Optional overrides ---

    def filter_event(self, event: Dict) -> bool:
        """Additional filtering. Return True to keep, False to skip."""
        return True

    def resolve_db_path(self) -> Optional[str]:
        """Resolve DB file path. Default: watch_dir/extra['db_filename'].

        Falls back to ``state.vscdb`` when ``db_filename`` is not configured.
        Returns None when the file does not exist.
        Subclasses can override for more complex discovery.
        """
        db_filename = self.extra.get("db_filename", "state.vscdb")
        candidate = os.path.join(self.watch_dir, db_filename)
        if os.path.exists(candidate):
            return candidate
        return None

    # --- Internal ---

    def _poll_loop(self):
        """Main polling loop. Runs in a daemon thread until ``stop()``."""
        cursor_key = self.watch_dir

        while not self._stop_event.is_set():
            try:
                self._poll_once(cursor_key)
            except Exception as e:
                # Keep the thread alive; the next cycle retries from the
                # last persisted cursor.
                logger.error(
                    "[%s] Poll error: %s", self.tool_name, e, exc_info=True
                )

            self._stop_event.wait(self.poll_interval)

    def _poll_once(self, cursor_key: str) -> None:
        """Run one poll cycle: query, buffer, persist cursor, check triggers."""
        db_path = self.resolve_db_path()
        if db_path is None:
            logger.debug("[%s] DB not found, retrying...", self.tool_name)
            return

        cursor = self.cursor_manager.get_cursor(cursor_key)
        raw_events = self.query_new_events(cursor.last_position)

        if raw_events:
            new_position = self._buffer_events(raw_events, cursor.last_position)
            if new_position > cursor.last_position:
                self.cursor_manager.update_cursor(cursor_key, new_position)

        self._check_batch_trigger()

    def _buffer_events(self, raw_events: List[Dict], start_position: int) -> int:
        """Normalize, filter and buffer events; return the highest position seen."""
        new_position = start_position
        for raw in raw_events:
            # Each event is isolated: one malformed row must not abort the
            # cycle, because rows already buffered would then be queried and
            # buffered again on every following poll.
            try:
                # Always advance cursor for every raw event seen, even if
                # normalize/filter drops it. Otherwise filtered rows would
                # be re-queried on every poll cycle.
                pos = raw.get("_cursor_position", 0)
                if pos > new_position:
                    new_position = pos

                normalized = self.normalize_event(raw)
                if normalized is None or not self.filter_event(normalized):
                    continue

                # Use the property so subclass overrides of tool_name apply.
                normalized["tool_name"] = self.tool_name
                self._buffer.add_line(normalized, byte_size=0)
            except Exception as e:
                logger.warning(
                    "[%s] Skipping malformed event: %s",
                    self.tool_name, e, exc_info=True,
                )
        return new_position

    def _check_batch_trigger(self):
        """Flush when the buffer is large enough or old enough."""
        if self._buffer.is_empty():
            return

        line_count = len(self._buffer.lines)
        age = (
            time.time() - self._buffer.created_at
            if self._buffer.created_at > 0
            else 0
        )

        if (
            line_count >= self.batch_trigger_lines
            or age >= self.batch_trigger_seconds
        ):
            self._flush_buffer()

    def _flush_buffer(self):
        """Flush buffered events via batch_callback.

        The buffer is cleared only after the callback returns, so events
        stay buffered for the next flush if the callback raises.
        """
        if self._buffer.is_empty():
            return

        events = self._buffer.lines.copy()
        logger.info("[%s] Flushing %d events", self.tool_name, len(events))
        try:
            self.batch_callback(events)
            self._buffer.clear()
        except Exception as e:
            logger.error(
                "[%s] Batch callback failed: %s", self.tool_name, e, exc_info=True
            )
@register_watcher("cursor_db")
class CursorDBWatcher(BasePollingWatcher):
    """Watches Cursor IDE's dual-SQLite storage for AI conversations.

    Architecture:
    - Workspace DB: workspaceStorage/<hash>/state.vscdb -> ItemTable -> composer.composerData
    - Global DB: globalStorage/state.vscdb -> cursorDiskKV -> bubbleId:<composerId>:<bubbleId>

    watch_dir should point to the Cursor User root:
    - Windows: %APPDATA%\\Cursor\\User
    - macOS: ~/Library/Application Support/Cursor/User
    - Linux: ~/.config/Cursor/User
    """

    def __init__(self, watch_dir, cursor_manager, batch_callback,
                 poll_interval=30, batch_trigger_lines=50, batch_trigger_seconds=300,
                 extra=None, **kwargs):
        """Configure the watcher and derive both DB locations from watch_dir.

        Extra keyword arguments are accepted and ignored.
        """
        super().__init__(
            tool_name="cursor_db",
            watch_dir=watch_dir,
            cursor_manager=cursor_manager,
            batch_callback=batch_callback,
            poll_interval=poll_interval,
            batch_trigger_lines=batch_trigger_lines,
            batch_trigger_seconds=batch_trigger_seconds,
            extra=extra,
        )
        self._global_db_path = os.path.join(
            self.watch_dir, "globalStorage", "state.vscdb"
        )
        self._workspace_storage_dir = os.path.join(
            self.watch_dir, "workspaceStorage"
        )

    @property
    def tool_name(self) -> str:
        """Registry name of this watcher."""
        return "cursor_db"

    def resolve_db_path(self) -> Optional[str]:
        """Return global DB path (primary data source), or None if absent."""
        if os.path.exists(self._global_db_path):
            return self._global_db_path
        return None

    def _discover_composer_ids(self) -> List[str]:
        """Scan all workspace DBs to collect composerId list.

        Useful for correlating bubble data in global DB. Best effort: a
        workspace DB that cannot be read or parsed is skipped.
        """
        composer_ids: List[str] = []
        if not os.path.isdir(self._workspace_storage_dir):
            return composer_ids

        for ws_hash in os.listdir(self._workspace_storage_dir):
            ws_db = os.path.join(
                self._workspace_storage_dir, ws_hash, "state.vscdb"
            )
            if not os.path.exists(ws_db):
                continue
            try:
                conn = sqlite3.connect(f"file:{ws_db}?mode=ro", uri=True)
                try:
                    row = conn.execute(
                        "SELECT value FROM ItemTable "
                        "WHERE [key] = 'composer.composerData'"
                    ).fetchone()
                finally:
                    conn.close()
                if row and row[0]:
                    data = json.loads(row[0])
                    for c in data.get("allComposers", []):
                        cid = c.get("id")
                        if cid:
                            composer_ids.append(cid)
            except Exception as e:
                # Deliberately tolerant, but leave a trace for debugging.
                logger.debug(
                    "[cursor_db] Skipping workspace DB %s: %s", ws_db, e
                )
                continue
        return composer_ids

    def query_new_events(self, last_cursor: int) -> List[Dict]:
        """Query global DB cursorDiskKV for new bubble data.

        Strategy: scan all bubbleId:* keys (rowid > last_cursor), at most 500
        rows per call. Does NOT depend on workspace DB composerId list
        (supports orphan conversations).

        Every fetched row yields an event carrying ``_cursor_position``, even
        when its value cannot be decoded (``value`` is then None and
        ``normalize_event`` drops it). Omitting such rows would keep the
        cursor from moving past them.

        Returns:
            Raw event dicts with keys rowid, key, value, composer_id and
            _cursor_position; an empty list if the DB is missing or SQLite
            reports an operational error.
        """
        db_path = self.resolve_db_path()
        if not db_path:
            return []

        try:
            conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
        except sqlite3.OperationalError as e:
            logger.warning("[cursor_db] SQLite error (DB may be locked): %s", e)
            return []

        # Everything after connect() runs under finally so the handle is
        # closed even when the PRAGMA or the query fails.
        try:
            conn.execute("PRAGMA busy_timeout = 3000")
            rows = conn.execute(
                "SELECT rowid, [key], value FROM cursorDiskKV "
                "WHERE rowid > ? AND [key] LIKE 'bubbleId:%' "
                "ORDER BY rowid ASC LIMIT 500",
                (last_cursor,),
            ).fetchall()
        except sqlite3.OperationalError as e:
            logger.warning("[cursor_db] SQLite error (DB may be locked): %s", e)
            return []
        finally:
            conn.close()

        return [self._row_to_event(rowid, key, value) for rowid, key, value in rows]

    @staticmethod
    def _row_to_event(rowid, key, value) -> Dict:
        """Build one raw event dict from a cursorDiskKV row."""
        # Parse key: bubbleId:<composerId>:<bubbleId>
        parts = key.split(":", 2)
        composer_id = parts[1] if len(parts) >= 3 else None

        try:
            # The value may come back as text or as a BLOB (bytes);
            # json.loads accepts both.
            if isinstance(value, (str, bytes, bytearray)):
                parsed_value = json.loads(value)
            else:
                parsed_value = value
        except (ValueError, TypeError):
            # JSONDecodeError and UnicodeDecodeError are both ValueError.
            parsed_value = None

        return {
            "rowid": rowid,
            "key": key,
            "value": parsed_value,
            "composer_id": composer_id,
            "_cursor_position": rowid,
        }

    def normalize_event(self, raw_event: Dict) -> Optional[Dict]:
        """Parse Cursor bubble format.

        Value JSON fields:
        - _v: schema version (currently 3)
        - type: 1=user, 2=assistant
        - text: message content
        - createdAt: timestamp
        - allThinkingBlocks: AI reasoning (assistant only)

        Returns None for non-dict values, unknown bubble types, and missing,
        empty or non-string text.
        """
        value = raw_event.get("value")
        if not isinstance(value, dict):
            return None

        # Schema version check - warn but don't crash
        schema_version = value.get("_v", 0)
        if isinstance(schema_version, int) and schema_version > 3:
            logger.debug(
                "[cursor_db] Unknown bubble schema v%d", schema_version
            )

        # type: 1=user, 2=assistant
        bubble_type = value.get("type")
        if bubble_type == 1:
            role = "user"
        elif bubble_type == 2:
            role = "assistant"
        else:
            return None

        # text: message content
        content = value.get("text", "")
        if not isinstance(content, str) or not content.strip():
            return None  # Filter empty streaming artifacts

        return {
            "role": role,
            "content": content,
            "type": "message",
            "timestamp": value.get("createdAt"),
            "session_id": raw_event.get("composer_id"),
        }

    def filter_event(self, event: Dict) -> bool:
        """Keep only events whose stripped content has at least 10 characters."""
        content = event.get("content", "")
        return len(content.strip()) >= 10
class ConcretePollingWatcher(BasePollingWatcher):
    """Minimal BasePollingWatcher subclass backed by an in-memory event list."""

    def __init__(self, mock_events=None, **kwargs):
        super().__init__(**kwargs)
        # Events served by query_new_events, and a counter of how many
        # times the poll loop asked for them.
        self._mock_events = mock_events if mock_events else []
        self._query_count = 0

    def query_new_events(self, last_cursor: int) -> List[Dict]:
        """Return the mock events positioned after ``last_cursor``."""
        self._query_count += 1
        fresh = []
        for event in self._mock_events:
            if event.get("_cursor_position", 0) > last_cursor:
                fresh.append(event)
        return fresh

    def normalize_event(self, raw_event: Dict) -> Optional[Dict]:
        """Map a raw mock event to the normalized shape; None if role or content is missing."""
        role = raw_event.get("role")
        content = raw_event.get("content", "")
        if not (role and content):
            return None
        normalized = {}
        normalized["role"] = role
        normalized["content"] = content
        normalized["type"] = "message"
        normalized["timestamp"] = raw_event.get("timestamp")
        normalized["session_id"] = raw_event.get("session_id")
        return normalized
def test_batch_trigger_by_time():
    """An aged buffer is flushed even though the line threshold is not reached."""
    flushed = []
    watcher = ConcretePollingWatcher(
        tool_name="test",
        watch_dir="/tmp",
        cursor_manager=FakeCursorManager(),
        batch_callback=flushed.append,
        batch_trigger_lines=100,  # far above the single buffered line
        batch_trigger_seconds=1,  # short enough for the aged buffer to exceed
    )

    watcher._buffer.add_line({"role": "user", "content": "old msg"}, byte_size=0)
    # Backdate the buffer instead of sleeping.
    five_seconds_ago = time.time() - 5
    watcher._buffer.created_at = five_seconds_ago

    watcher._check_batch_trigger()
    assert len(flushed) == 1
assert all(e["tool_name"] == "test" for e in batches[0]) + assert batches[0][0]["role"] == "user" + assert batches[0][1]["role"] == "assistant" + + +def test_poll_loop_updates_cursor(tmp_path): + """Poll loop should update cursor after processing events.""" + cm = FakeCursorManager() + events = [ + {"role": "user", "content": "msg", "_cursor_position": 42}, + ] + + w = ConcretePollingWatcher( + mock_events=events, + tool_name="test", + watch_dir=str(tmp_path), + cursor_manager=cm, + batch_callback=lambda e: None, + poll_interval=1, + batch_trigger_lines=100, + ) + w.resolve_db_path = lambda: str(tmp_path / "fake.db") + + w.start() + time.sleep(1.5) + w.stop() + + cursor = cm.get_cursor(str(tmp_path)) + assert cursor.last_position == 42 + + +def test_poll_loop_skips_when_db_not_found(tmp_path): + """Poll loop should gracefully skip when DB doesn't exist.""" + cm = FakeCursorManager() + w = ConcretePollingWatcher( + tool_name="test", + watch_dir=str(tmp_path / "nonexistent"), + cursor_manager=cm, + batch_callback=lambda e: None, + poll_interval=1, + ) + # Default resolve_db_path returns None for nonexistent dir + + w.start() + time.sleep(1.5) + w.stop() + # Should not raise — just skip gracefully + + +def test_filter_event_skips_unwanted(tmp_path): + """filter_event returning False should skip the event.""" + batches = [] + cm = FakeCursorManager() + + events = [ + {"role": "user", "content": "keep this", "_cursor_position": 1}, + {"role": "user", "content": "skip", "_cursor_position": 2}, + ] + + class FilteringWatcher(ConcretePollingWatcher): + def filter_event(self, event): + return event["content"] != "skip" + + w = FilteringWatcher( + mock_events=events, + tool_name="test", + watch_dir=str(tmp_path), + cursor_manager=cm, + batch_callback=lambda e: batches.append(e), + poll_interval=1, + batch_trigger_lines=10, + ) + w.resolve_db_path = lambda: str(tmp_path / "fake.db") + + w.start() + time.sleep(1.5) + w.stop() + w.flush() + + all_events = [e for batch in 
batches for e in batch] + assert len(all_events) == 1 + assert all_events[0]["content"] == "keep this" + + +# --- Callback failure resilience --- + +def test_callback_failure_does_not_crash(tmp_path): + """batch_callback failure should be caught, not crash the watcher.""" + cm = FakeCursorManager() + events = [{"role": "user", "content": "msg", "_cursor_position": 1}] + + def failing_callback(e): + raise RuntimeError("simulated failure") + + w = ConcretePollingWatcher( + mock_events=events, + tool_name="test", + watch_dir=str(tmp_path), + cursor_manager=cm, + batch_callback=failing_callback, + poll_interval=1, + batch_trigger_lines=1, + ) + w.resolve_db_path = lambda: str(tmp_path / "fake.db") + + w.start() + time.sleep(2) + w.stop() + # Should not raise — error is logged and caught diff --git a/tests/daemon/test_cursor_db_watcher.py b/tests/daemon/test_cursor_db_watcher.py new file mode 100644 index 0000000000..1042e1fa92 --- /dev/null +++ b/tests/daemon/test_cursor_db_watcher.py @@ -0,0 +1,422 @@ +""" +Unit tests for CursorDBWatcher. +Tests dual-SQLite architecture: global DB (cursorDiskKV) + workspace DB (ItemTable). +Uses temporary SQLite databases to simulate real Cursor storage. +""" +import json +import os +import sqlite3 +import time +import pytest + +from openviking.daemon.watchers.cursor_db_watcher import CursorDBWatcher +from openviking.daemon.watchers import BaseWatcher +from openviking.daemon.models import FileCursor + + +class FakeCursorManager: + """In-memory cursor manager for testing.""" + + def __init__(self): + self.cursors = {} + + def get_cursor(self, file_path): + return self.cursors.get(file_path, FileCursor(file_path=file_path)) + + def update_cursor(self, file_path, position): + self.cursors[file_path] = FileCursor( + file_path=file_path, + last_position=position, + last_read_time=time.time(), + ) + + +def _create_global_db(db_path, bubbles): + """Create a mock global state.vscdb with cursorDiskKV table. 
+ + Args: + db_path: Path for the SQLite file + bubbles: List of (key, value_dict) tuples + """ + os.makedirs(os.path.dirname(db_path), exist_ok=True) + conn = sqlite3.connect(db_path) + conn.execute(""" + CREATE TABLE IF NOT EXISTS cursorDiskKV ( + [key] TEXT PRIMARY KEY, + value TEXT + ) + """) + for key, value in bubbles: + conn.execute( + "INSERT INTO cursorDiskKV ([key], value) VALUES (?, ?)", + (key, json.dumps(value)), + ) + conn.commit() + conn.close() + + +def _create_workspace_db(db_path, composers): + """Create a mock workspace state.vscdb with ItemTable. + + Args: + db_path: Path for the SQLite file + composers: List of composer dicts with 'id' field + """ + os.makedirs(os.path.dirname(db_path), exist_ok=True) + conn = sqlite3.connect(db_path) + conn.execute(""" + CREATE TABLE IF NOT EXISTS ItemTable ( + [key] TEXT PRIMARY KEY, + value TEXT + ) + """) + composer_data = {"allComposers": composers} + conn.execute( + "INSERT INTO ItemTable ([key], value) VALUES (?, ?)", + ("composer.composerData", json.dumps(composer_data)), + ) + conn.commit() + conn.close() + + +def _make_cursor_user_dir(tmp_path): + """Create a mock Cursor User directory structure.""" + user_dir = tmp_path / "Cursor" / "User" + user_dir.mkdir(parents=True, exist_ok=True) + return user_dir + + +# --- Protocol --- + +def test_implements_base_watcher(): + cm = FakeCursorManager() + w = CursorDBWatcher( + watch_dir="/tmp/fake", + cursor_manager=cm, + batch_callback=lambda e: None, + ) + assert isinstance(w, BaseWatcher) + assert w.tool_name == "cursor_db" + + +# --- resolve_db_path --- + +def test_resolve_db_path_found(tmp_path): + user_dir = _make_cursor_user_dir(tmp_path) + global_db = user_dir / "globalStorage" / "state.vscdb" + _create_global_db(str(global_db), []) + + cm = FakeCursorManager() + w = CursorDBWatcher( + watch_dir=str(user_dir), + cursor_manager=cm, + batch_callback=lambda e: None, + ) + assert w.resolve_db_path() == str(global_db) + + +def 
test_resolve_db_path_not_found(tmp_path): + user_dir = tmp_path / "empty" + user_dir.mkdir() + + cm = FakeCursorManager() + w = CursorDBWatcher( + watch_dir=str(user_dir), + cursor_manager=cm, + batch_callback=lambda e: None, + ) + assert w.resolve_db_path() is None + + +# --- query_new_events --- + +def test_query_new_events_empty_db(tmp_path): + user_dir = _make_cursor_user_dir(tmp_path) + global_db = user_dir / "globalStorage" / "state.vscdb" + _create_global_db(str(global_db), []) + + cm = FakeCursorManager() + w = CursorDBWatcher( + watch_dir=str(user_dir), + cursor_manager=cm, + batch_callback=lambda e: None, + ) + events = w.query_new_events(0) + assert events == [] + + +def test_query_new_events_returns_bubbles(tmp_path): + user_dir = _make_cursor_user_dir(tmp_path) + global_db = user_dir / "globalStorage" / "state.vscdb" + + bubbles = [ + ("bubbleId:comp1:bub1", { + "_v": 3, "type": 1, "text": "Hello Cursor", + "createdAt": "2026-06-20T10:00:00Z", + }), + ("bubbleId:comp1:bub2", { + "_v": 3, "type": 2, "text": "Hi! 
How can I help?", + "createdAt": "2026-06-20T10:00:01Z", + }), + ("nonBubbleKey", {"some": "other data"}), + ] + _create_global_db(str(global_db), bubbles) + + cm = FakeCursorManager() + w = CursorDBWatcher( + watch_dir=str(user_dir), + cursor_manager=cm, + batch_callback=lambda e: None, + ) + events = w.query_new_events(0) + + # Should only return bubbleId:* keys, not nonBubbleKey + assert len(events) == 2 + assert all(e["key"].startswith("bubbleId:") for e in events) + assert events[0]["composer_id"] == "comp1" + assert events[0]["_cursor_position"] > 0 + + +def test_query_new_events_respects_cursor(tmp_path): + user_dir = _make_cursor_user_dir(tmp_path) + global_db = user_dir / "globalStorage" / "state.vscdb" + + bubbles = [ + ("bubbleId:c1:b1", {"_v": 3, "type": 1, "text": "old msg", "createdAt": "t1"}), + ("bubbleId:c1:b2", {"_v": 3, "type": 2, "text": "new msg", "createdAt": "t2"}), + ] + _create_global_db(str(global_db), bubbles) + + cm = FakeCursorManager() + w = CursorDBWatcher( + watch_dir=str(user_dir), + cursor_manager=cm, + batch_callback=lambda e: None, + ) + + # First query: get all + all_events = w.query_new_events(0) + assert len(all_events) == 2 + + # Second query: use last rowid as cursor + last_rowid = all_events[-1]["rowid"] + new_events = w.query_new_events(last_rowid) + assert len(new_events) == 0 + + +# --- normalize_event --- + +def test_normalize_user_message(): + cm = FakeCursorManager() + w = CursorDBWatcher( + watch_dir="/tmp/fake", + cursor_manager=cm, + batch_callback=lambda e: None, + ) + + raw = { + "value": { + "_v": 3, + "type": 1, + "text": "How do I use FastAPI?", + "createdAt": "2026-06-20T10:00:00Z", + }, + "composer_id": "comp-abc", + "_cursor_position": 5, + } + result = w.normalize_event(raw) + + assert result is not None + assert result["role"] == "user" + assert result["content"] == "How do I use FastAPI?" 
+ assert result["type"] == "message" + assert result["timestamp"] == "2026-06-20T10:00:00Z" + assert result["session_id"] == "comp-abc" + + +def test_normalize_assistant_message(): + cm = FakeCursorManager() + w = CursorDBWatcher( + watch_dir="/tmp/fake", + cursor_manager=cm, + batch_callback=lambda e: None, + ) + + raw = { + "value": { + "_v": 3, + "type": 2, + "text": "FastAPI is a modern Python framework.", + "createdAt": "2026-06-20T10:00:01Z", + "allThinkingBlocks": [{"thinking": "Let me think..."}], + }, + "composer_id": "comp-abc", + "_cursor_position": 6, + } + result = w.normalize_event(raw) + + assert result is not None + assert result["role"] == "assistant" + assert result["content"] == "FastAPI is a modern Python framework." + + +def test_normalize_skips_empty_text(): + """Streaming artifacts with empty text should be skipped.""" + cm = FakeCursorManager() + w = CursorDBWatcher( + watch_dir="/tmp/fake", + cursor_manager=cm, + batch_callback=lambda e: None, + ) + + raw = { + "value": {"_v": 3, "type": 2, "text": "", "createdAt": "t"}, + "composer_id": "c1", + "_cursor_position": 7, + } + assert w.normalize_event(raw) is None + + +def test_normalize_skips_unknown_type(): + cm = FakeCursorManager() + w = CursorDBWatcher( + watch_dir="/tmp/fake", + cursor_manager=cm, + batch_callback=lambda e: None, + ) + + raw = { + "value": {"_v": 3, "type": 99, "text": "unknown", "createdAt": "t"}, + "composer_id": "c1", + "_cursor_position": 8, + } + assert w.normalize_event(raw) is None + + +def test_normalize_skips_non_dict_value(): + cm = FakeCursorManager() + w = CursorDBWatcher( + watch_dir="/tmp/fake", + cursor_manager=cm, + batch_callback=lambda e: None, + ) + + raw = {"value": "not a dict", "composer_id": "c1", "_cursor_position": 9} + assert w.normalize_event(raw) is None + + +def test_normalize_future_schema_version(): + """Future _v values should produce a debug log, not crash.""" + cm = FakeCursorManager() + w = CursorDBWatcher( + watch_dir="/tmp/fake", + 
cursor_manager=cm, + batch_callback=lambda e: None, + ) + + raw = { + "value": {"_v": 99, "type": 1, "text": "future format", "createdAt": "t"}, + "composer_id": "c1", + "_cursor_position": 10, + } + result = w.normalize_event(raw) + assert result is not None # Still processes with warning + + +# --- filter_event --- + +def test_filter_short_content(): + cm = FakeCursorManager() + w = CursorDBWatcher( + watch_dir="/tmp/fake", + cursor_manager=cm, + batch_callback=lambda e: None, + ) + assert w.filter_event({"content": "ok"}) is False # < 10 chars + assert w.filter_event({"content": "This is long enough content"}) is True + + +# --- _discover_composer_ids --- + +def test_discover_composer_ids(tmp_path): + user_dir = _make_cursor_user_dir(tmp_path) + + # Create two workspace DBs + ws1_db = user_dir / "workspaceStorage" / "hash1" / "state.vscdb" + _create_workspace_db(str(ws1_db), [ + {"id": "comp-1", "createdAt": "t1"}, + {"id": "comp-2", "createdAt": "t2"}, + ]) + + ws2_db = user_dir / "workspaceStorage" / "hash2" / "state.vscdb" + _create_workspace_db(str(ws2_db), [ + {"id": "comp-3", "createdAt": "t3"}, + ]) + + cm = FakeCursorManager() + w = CursorDBWatcher( + watch_dir=str(user_dir), + cursor_manager=cm, + batch_callback=lambda e: None, + ) + ids = w._discover_composer_ids() + assert set(ids) == {"comp-1", "comp-2", "comp-3"} + + +def test_discover_composer_ids_no_workspaces(tmp_path): + user_dir = _make_cursor_user_dir(tmp_path) + cm = FakeCursorManager() + w = CursorDBWatcher( + watch_dir=str(user_dir), + cursor_manager=cm, + batch_callback=lambda e: None, + ) + ids = w._discover_composer_ids() + assert ids == [] + + +# --- Integration: full poll cycle --- + +def test_full_poll_cycle(tmp_path): + """End-to-end: create mock DBs -> start watcher -> verify batch_callback.""" + batches = [] + user_dir = _make_cursor_user_dir(tmp_path) + global_db = user_dir / "globalStorage" / "state.vscdb" + + bubbles = [ + ("bubbleId:comp1:b1", { + "_v": 3, "type": 1, + "text": 
"How do I implement a binary search in Python?", + "createdAt": "2026-06-20T10:00:00Z", + }), + ("bubbleId:comp1:b2", { + "_v": 3, "type": 2, + "text": "Here is a binary search implementation using iterative approach...", + "createdAt": "2026-06-20T10:00:01Z", + }), + ] + _create_global_db(str(global_db), bubbles) + + cm = FakeCursorManager() + w = CursorDBWatcher( + watch_dir=str(user_dir), + cursor_manager=cm, + batch_callback=lambda e: batches.append(e), + poll_interval=1, + batch_trigger_lines=2, + ) + + w.start() + time.sleep(2.5) + w.stop() + + assert len(batches) >= 1 + events = batches[0] + assert len(events) == 2 + assert events[0]["role"] == "user" + assert events[0]["tool_name"] == "cursor_db" + assert events[1]["role"] == "assistant" + + # Cursor should be updated + cursor = cm.get_cursor(str(user_dir)) + assert cursor.last_position > 0 diff --git a/tests/daemon/test_multi_watcher_integration.py b/tests/daemon/test_multi_watcher_integration.py index d3a797ffa0..ad641c39d9 100644 --- a/tests/daemon/test_multi_watcher_integration.py +++ b/tests/daemon/test_multi_watcher_integration.py @@ -33,9 +33,9 @@ def update_cursor(self, file_path, position): # --- Registry Tests --- def test_all_watchers_registered(): - """All 5 built-in watchers should be registered.""" + """All 6 built-in watchers should be registered.""" available = list_available_watchers() - expected = {"claude_code", "generic_jsonl", "aider", "cursor", "continue_dev"} + expected = {"claude_code", "generic_jsonl", "aider", "cursor", "continue_dev", "cursor_db"} assert expected.issubset(set(available)), f"Missing: {expected - set(available)}" @@ -328,3 +328,58 @@ def test_knowledge_router_uses_source_tool(): ) uri2 = router.route(k2) assert "general" in uri2 + + +# --- CursorDBWatcher Integration --- + +def test_cursor_db_watcher_via_factory(tmp_path): + """cursor_db watcher should be creatable via factory and satisfy Protocol.""" + batches = [] + cm = FakeCursorManager() + + watcher = 
create_watcher( + tool_name="cursor_db", + watch_dir=str(tmp_path), + cursor_manager=cm, + batch_callback=lambda e: batches.append(e), + poll_interval=60, + ) + assert isinstance(watcher, BaseWatcher) + assert watcher.tool_name == "cursor_db" + + +def test_cursor_db_normalize_compatible_with_reconstructor(): + """Events from cursor_db watcher should work with ConversationReconstructor.""" + from openviking.daemon.conversation_reconstructor import ConversationReconstructor + + events = [ + {"role": "user", "content": "How to use Cursor effectively?", + "tool_name": "cursor_db", "timestamp": "2026-06-20T10:00:00Z", + "session_id": "comp-123"}, + {"role": "assistant", "content": "Here are some tips for using Cursor...", + "tool_name": "cursor_db", "timestamp": "2026-06-20T10:00:01Z", + "session_id": "comp-123"}, + ] + + reconstructor = ConversationReconstructor() + turns = reconstructor.reconstruct(events) + + assert len(turns) == 1 + assert turns[0].user_prompt == "How to use Cursor effectively?" 
+ assert turns[0].source_tool == "cursor_db" + + +def test_cursor_db_events_compatible_with_filter(): + """Events from cursor_db should work with LowValueFilter.""" + from openviking.daemon.filters import LowValueFilter + + events = [ + {"role": "user", "content": "A meaningful question about architecture design", + "tool_name": "cursor_db"}, + {"role": "assistant", "content": "Here is a detailed explanation of the pattern", + "tool_name": "cursor_db"}, + ] + + f = LowValueFilter() + filtered = f.apply(events) + assert len(filtered) == 2 From f6f260f1c4b0fd37ab388cc7e0849fd6a11e4da0 Mon Sep 17 00:00:00 2001 From: baobaodae <2014596548@qq.com> Date: Sat, 20 Jun 2026 12:37:31 +0800 Subject: [PATCH 4/6] docs(daemon): update db-watcher plan with E2E results and cursor fix - Mark Phase 1-2 as complete (BasePollingWatcher + CursorDBWatcher) - Add section 4.1: real data E2E validation results against state.vscdb - Add section 4.2: cursor advancement bug fix documentation - Document _discover_composer_ids real data finding (cursorDiskKV, not ItemTable) - Update task checklist and file change list with status --- docs/daemon/db-watcher-plan.md | 167 ++++++++++++++++++++++++++------- 1 file changed, 132 insertions(+), 35 deletions(-) diff --git a/docs/daemon/db-watcher-plan.md b/docs/daemon/db-watcher-plan.md index 49ae875c41..2e07e7fc9c 100644 --- a/docs/daemon/db-watcher-plan.md +++ b/docs/daemon/db-watcher-plan.md @@ -293,6 +293,13 @@ class BasePollingWatcher(ABC): if raw_events: new_position = cursor.last_position for raw in raw_events: + # ⚠️ 关键:先推进 cursor,再做 normalize/filter。 + # 否则被过滤掉的条目(如空文本工具调用)如果恰好是最高 rowid, + # 会导致 cursor 永远不越过它,每次 poll 都重复查出。 + pos = raw.get("_cursor_position", 0) + if pos > new_position: + new_position = pos + normalized = self.normalize_event(raw) if normalized is None: continue @@ -301,11 +308,6 @@ class BasePollingWatcher(ABC): normalized["tool_name"] = self._tool_name self._buffer.add_line(normalized, byte_size=0) - # 追踪最大 cursor position - 
pos = raw.get("_cursor_position", 0) - if pos > new_position: - new_position = pos - # 更新 cursor if new_position > cursor.last_position: self.cursor_manager.update_cursor(cursor_key, new_position) @@ -343,6 +345,7 @@ class BasePollingWatcher(ABC): |--------|------|------| | 轮询机制 | `Thread` + `Event.wait(interval)` | 与 watchdog Observer 平级,DaemonService 已处理线程安全 | | cursor key | 使用 `watch_dir` 字符串 | 与文件 watcher 的 `file_path` 互不冲突,CursorManager 无需改动 | +| cursor 推进时机 | **先推进再 normalize/filter** | ⚠️ 实测发现:被过滤掉的条目(如空文本工具调用)如果是最高 rowid,不先推进会导致无限重复查询 | | DB 连接管理 | 子类在 `query_new_events` 内自行 open/close | SQLite `?mode=ro` 只读,避免锁冲突;CursorManager 已有 per-call connect 模式 | | `_cursor_position` 约定 | raw event dict 中必须携带 | 让子类灵活定义位置语义(rowid/timestamp/offset),基类只取 max | | `BatchBuffer` 复用 | 直接用现有实现 | `byte_size=0`(DB 无字节偏移概念),time trigger 正常工作 | @@ -537,6 +540,36 @@ sqlite3 "%APPDATA%/Cursor/User/globalStorage/state.vscdb" \ "SELECT [key], value FROM cursorDiskKV WHERE [key] LIKE 'bubbleId:%' LIMIT 3" ``` +**⚠️ `_discover_composer_ids` 待修正(实测发现)**: + +真实数据验证发现 `composer.composerData` **不在** 工作区 ItemTable 中,而是以 `composerData:<uuid>` 的形式存在全局 DB 的 `cursorDiskKV` 表中(13 条记录)。工作区 ItemTable 只有 `composer.composerHeaders`。 + +当前实现(查 workspace ItemTable)不会报错但收集不到数据。主流程不依赖此辅助方法(直接扫描 bubbleId:*),但如需使用应改为: + +```python +def _discover_composer_ids(self) -> List[str]: + """从全局 DB cursorDiskKV 查询 composerData:* keys。""" + composer_ids = [] + db_path = self.resolve_db_path() + if not db_path: + return composer_ids + try: + conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True) + try: + rows = conn.execute( + "SELECT [key] FROM cursorDiskKV WHERE [key] LIKE 'composerData:%'" + ).fetchall() + for (key,) in rows: + cid = key.split(":", 1)[1] + if cid: + composer_ids.append(cid) + finally: + conn.close() + except Exception: + pass + return composer_ids +``` + #### 3.3 WindsurfDBWatcher(P2,待调研) **⚠️ Windsurf 的具体表名和 key 模式尚无公开文档。** Phase 4 需先 dump 真实 state.vscdb 确认: @@ -655,24 +688,26 @@ CREATE 
TABLE IF NOT EXISTS file_cursors ( ### 四、实施任务清单 -#### Phase 1:基础设施(BasePollingWatcher + 测试框架) +#### Phase 1:基础设施(BasePollingWatcher + 测试框架)— ✅ 已完成 -| # | 任务 | 文件 | 预估 | +| # | 任务 | 文件 | 状态 | |---|------|------|------| -| 1.1 | 创建 `BasePollingWatcher` 基类 | `watchers/base_polling_watcher.py` | ~120 行 | -| 1.2 | 创建 `test_base_polling_watcher.py` 单元测试 | `tests/daemon/` | mock poll loop / buffer / flush / Protocol 验证 | +| 1.1 | 创建 `BasePollingWatcher` 基类 | `watchers/base_polling_watcher.py` | ✅ ~170 行 | +| 1.2 | 创建 `test_base_polling_watcher.py` 单元测试 | `tests/daemon/` | ✅ 12 tests passing | -#### Phase 2:CursorDBWatcher(P0 核心) +#### Phase 2:CursorDBWatcher(P0 核心)— ✅ 已完成 -| # | 任务 | 文件 | 预估 | +| # | 任务 | 文件 | 状态 | |---|------|------|------| -| 2.1 | ~~调研~~:dump 真实 Cursor state.vscdb 结构 | — | **已完成**(见 §2.1) | -| 2.2 | 创建 `CursorDBWatcher`(双库架构、cursorDiskKV、bubbleId 解析) | `watchers/cursor_db_watcher.py` | ~150 行 | -| 2.3 | 单元测试 `test_cursor_db_watcher.py` | `tests/daemon/` | 创建临时双库 SQLite → mock bubble 数据 → normalize 验证 | -| 2.4 | 集成测试:完整 poll 周期 → batch_callback 验证 | 同上 | | -| 2.5 | Registry 注册 `cursor_db` | `watchers/registry.py` | 5 行 | +| 2.1 | dump 真实 Cursor state.vscdb 结构 | — | ✅ 见 §2.1 + §4.1 | +| 2.2 | 创建 `CursorDBWatcher`(双库架构、cursorDiskKV、bubbleId 解析) | `watchers/cursor_db_watcher.py` | ✅ ~170 行 | +| 2.3 | 单元测试 `test_cursor_db_watcher.py` | `tests/daemon/` | ✅ 16 tests passing | +| 2.4 | 集成测试:完整 poll 周期 → batch_callback 验证 | 同上 | ✅ `test_full_poll_cycle` | +| 2.5 | Registry 注册 `cursor_db` | `watchers/registry.py` | ✅ | +| 2.6 | **E2E 验证**:真实 state.vscdb 端到端 | — | ✅ 见 §4.1 | +| 2.7 | **Bug 修复**:cursor 推进时机(先于 normalize) | `base_polling_watcher.py` | ✅ 见 §4.2 | -#### Phase 3:TraeDBWatcher(P1) +#### Phase 3:TraeDBWatcher(P1)— 🔲 待实施 | # | 任务 | 文件 | 预估 | |---|------|------|------| @@ -680,7 +715,7 @@ CREATE TABLE IF NOT EXISTS file_cursors ( | 3.2 | `normalize_event`:处理 `content` 为空的 fallback(`agentTaskContent.proposalText`) | 同上 | | | 3.3 | 单元测试 + Registry 注册 
`trae_db` | `tests/daemon/test_trae_db_watcher.py` | | -#### Phase 4:WindsurfDBWatcher(P2,需先 dump 确认格式) +#### Phase 4:WindsurfDBWatcher(P2,需先 dump 确认格式)— 🔲 待实施 | # | 任务 | 文件 | 预估 | |---|------|------|------| @@ -688,13 +723,74 @@ CREATE TABLE IF NOT EXISTS file_cursors ( | 4.2 | 创建 `WindsurfDBWatcher` | `watchers/windsurf_db_watcher.py` | ~30 行(若格式同 Cursor) | | 4.3 | 单元测试 + Registry 注册 `windsurf_db` | | | -#### Phase 5:集成验证 +#### Phase 5:集成验证 — ✅ 部分完成 -| # | 任务 | 文件 | 预估 | +| # | 任务 | 文件 | 状态 | |---|------|------|------| -| 5.1 | 更新 `test_multi_watcher_integration.py`:加入 DB watcher 测试 | `tests/daemon/` | | -| 5.2 | 端到端:真实 Cursor 对话 → watcher → ETL → viking:// | 手动验证 | | -| 5.3 | 更新 `test_all_watchers_registered` 验证新增 watcher | | 2 行 | +| 5.1 | 更新 `test_multi_watcher_integration.py`:加入 DB watcher 测试 | `tests/daemon/` | ✅ 3 new tests | +| 5.2 | 端到端:真实 Cursor 对话 → watcher → ETL → viking:// | 手动验证 | 🔲 需 `serve --with-daemon` | +| 5.3 | 更新 `test_all_watchers_registered` 验证新增 watcher | | ✅ 6 watchers | +| 5.4 | `_discover_composer_ids` 修正为查 cursorDiskKV | `cursor_db_watcher.py` | 🔲 低优先级 | +| 5.5 | 全量回归测试(53 tests) | — | ✅ all passing | + +--- + +### 四.一、实施记录 + +#### 4.1 真实数据 E2E 验证(2026-06-20) + +使用本机 `C:\Users\20145\AppData\Roaming\Cursor\User\globalStorage\state.vscdb`(1.4MB)对 CursorDBWatcher 进行端到端验证。 + +**DB 概况**: + +| 表 | 行数 | 说明 | +|----|------|------| +| `ItemTable` | 128 | workspace 元数据(`composer.composerHeaders` 等) | +| `cursorDiskKV` | 104 | 对话内容 + composer 数据 | + +**cursorDiskKV key 分布**: + +| 前缀 | 数量 | 说明 | +|------|------|------| +| `agentKv:blob:*` | 66 | Agent 知识片段 | +| `bubbleId:*` | 20 | 对话消息(5 条有文本,15 条空文本/工具调用) | +| `composerData:*` | 13 | 会话元数据(⚠️ 不在 ItemTable 中) | +| `checkpointId:*` | 1 | 检查点 | +| `inlineDiffs-*` | 4 | 内联 diff | + +**E2E 测试结果**: + +| 检查项 | 结果 | +|--------|------| +| DB 发现 & 连接 | ✅ PASS | +| 首次 poll 查出 20 条 bubbleId | ✅ PASS | +| normalize 保留 5 条(过滤 15 条空文本工具调用) | ✅ PASS | +| batch 回调正确投递 5 条事件 | ✅ PASS | +| cursor 持久化到 rowid=91 | ✅ 
PASS | +| 二次 poll 幂等(0 条新数据) | ✅ PASS | +| cursor 二次 poll 不变 | ✅ PASS | +| 角色分布 user=1, assistant=4 | ✅ PASS | +| 53 项单元测试全部通过,0 回归 | ✅ PASS | + +**真实 bubble 格式确认**: +- `_v: 3` ✓ +- `type: 1`(user)/ `2`(assistant) ✓ +- `text` 字段包含对话内容 ✓ +- `createdAt` ISO-8601 时间戳 ✓ +- `toolFormerData` 对象(工具调用,text 为空)✓ +- `capabilityType: 15`(工具调用标记)✓ + +#### 4.2 Cursor 推进 Bug 修复 + +**问题**:`_poll_loop` 中 cursor 只在 normalize 保留事件后才推进。被过滤掉的条目(如空文本工具调用 bubble,`text: ""`)如果恰好是批次中最高 rowid,会导致 cursor 永远不越过它,每次 poll 都重复查出该条目。 + +**症状**:E2E 验证中首次 poll 后 cursor 停在 rowid=90(最后一条有文本的 bubble),但 rowid=91 是一条空文本工具调用 bubble。第二次 poll 返回 1 条 raw event(虽然 normalize 正确返回 None 不产生输出),但浪费查询且 cursor 不推进。 + +**修复**:将 `_cursor_position` 追踪提前到 normalize/filter 之前。无论事件是否被保留,cursor 都推进到已扫描的最大 rowid。 + +**影响范围**:`base_polling_watcher.py` 的 `_poll_loop` 方法。所有 BasePollingWatcher 子类(包括未来的 Trae、Windsurf watcher)均受益。 + +**验证**:修复后二次 poll 返回 0 条 raw event,cursor 稳定在 rowid=91。 --- @@ -724,18 +820,19 @@ CREATE TABLE IF NOT EXISTS file_cursors ( ### 七、文件变更清单 -| 操作 | 文件路径 | -|------|----------| -| **新增** | `openviking/daemon/watchers/base_polling_watcher.py` | -| **新增** | `openviking/daemon/watchers/cursor_db_watcher.py`(P0) | -| **新增** | `openviking/daemon/watchers/trae_db_watcher.py`(P1) | -| **新增** | `openviking/daemon/watchers/windsurf_db_watcher.py`(P2) | -| **新增** | `tests/daemon/test_base_polling_watcher.py` | -| **新增** | `tests/daemon/test_cursor_db_watcher.py` | -| **新增** | `tests/daemon/test_trae_db_watcher.py` | -| **修改** | `openviking/daemon/watchers/registry.py` — 添加 cursor_db / trae_db / windsurf_db 注册 | -| **修改** | `tests/daemon/test_multi_watcher_integration.py` — 添加 DB watcher 测试用例 | -| **不改** | `models.py` / `cursor_manager.py` / `service.py` / `etl_pipeline.py` / `config.py` / `base_file_watcher.py` | +| 操作 | 文件路径 | 状态 | +|------|----------|------| +| ✅ 新增 | `openviking/daemon/watchers/base_polling_watcher.py` | ~170 行,含 cursor 推进 bug fix | +| ✅ 新增 | 
`openviking/daemon/watchers/cursor_db_watcher.py`(P0) | ~170 行 | +| 🔲 新增 | `openviking/daemon/watchers/trae_db_watcher.py`(P1) | 待 Phase 3 | +| 🔲 新增 | `openviking/daemon/watchers/windsurf_db_watcher.py`(P2) | 待 Phase 4 | +| ✅ 新增 | `tests/daemon/test_base_polling_watcher.py` | 12 tests | +| ✅ 新增 | `tests/daemon/test_cursor_db_watcher.py` | 16 tests | +| 🔲 新增 | `tests/daemon/test_trae_db_watcher.py` | 待 Phase 3 | +| ✅ 修改 | `openviking/daemon/watchers/registry.py` | +cursor_db 注册 | +| 🔲 修改 | `openviking/daemon/watchers/registry.py` | +trae_db / +windsurf_db 注册 | +| ✅ 修改 | `tests/daemon/test_multi_watcher_integration.py` | +3 cursor_db 测试 | +| ✅ 不改 | `models.py` / `cursor_manager.py` / `service.py` / `etl_pipeline.py` / `config.py` / `base_file_watcher.py` | 验证无需改动 | --- From 17fd773d29acf0954ab059301a5e661dfc884457 Mon Sep 17 00:00:00 2001 From: baobaodae <2014596548@qq.com> Date: Thu, 25 Jun 2026 21:36:21 +0800 Subject: [PATCH 5/6] chore(daemon): add license headers, trim watchers to verified set, remove Echo2.md - Add SPDX/AGPL license headers to all 21 openviking/daemon/**/*.py files - Remove unverified watchers: aider, cursor (file), continue_dev - Keep only verified watchers: claude_code, generic_jsonl, cursor_db - Update registry, tests, and docs to reflect the trimmed watcher set - Remove Echo2.md (to be published as [RFC] GitHub Discussion) - Fix viking:// URI paths in daemon README --- Echo2.md | 429 ------------------ docs/daemon/README.md | 10 +- docs/daemon/Watcher.md | 32 -- docs/daemon/configuration.md | 47 +- openviking/daemon/__init__.py | 2 + .../daemon/conversation_reconstructor.py | 2 + openviking/daemon/cursor_manager.py | 2 + openviking/daemon/deduplicator.py | 2 + openviking/daemon/etl_pipeline.py | 2 + openviking/daemon/filters.py | 2 + openviking/daemon/knowledge_extractor.py | 2 + openviking/daemon/knowledge_router.py | 2 + openviking/daemon/models.py | 2 + openviking/daemon/service.py | 2 + openviking/daemon/storage_adapter.py | 2 + 
openviking/daemon/watchers/__init__.py | 2 + openviking/daemon/watchers/aider_watcher.py | 148 ------ .../daemon/watchers/base_file_watcher.py | 2 + .../daemon/watchers/base_polling_watcher.py | 2 + .../daemon/watchers/claude_code_watcher.py | 2 + .../daemon/watchers/continue_dev_watcher.py | 68 --- .../daemon/watchers/cursor_db_watcher.py | 2 + openviking/daemon/watchers/cursor_watcher.py | 87 ---- .../daemon/watchers/generic_jsonl_watcher.py | 2 + openviking/daemon/watchers/registry.py | 20 +- tests/daemon/test_aider_watcher.py | 183 -------- tests/daemon/test_continue_dev_watcher.py | 193 -------- tests/daemon/test_cursor_watcher.py | 187 -------- .../daemon/test_multi_watcher_integration.py | 123 +---- tests/daemon/test_service_multi.py | 2 +- 30 files changed, 105 insertions(+), 1458 deletions(-) delete mode 100644 Echo2.md delete mode 100644 docs/daemon/Watcher.md delete mode 100644 openviking/daemon/watchers/aider_watcher.py delete mode 100644 openviking/daemon/watchers/continue_dev_watcher.py delete mode 100644 openviking/daemon/watchers/cursor_watcher.py delete mode 100644 tests/daemon/test_aider_watcher.py delete mode 100644 tests/daemon/test_continue_dev_watcher.py delete mode 100644 tests/daemon/test_cursor_watcher.py diff --git a/Echo2.md b/Echo2.md deleted file mode 100644 index 3f18cb473d..0000000000 --- a/Echo2.md +++ /dev/null @@ -1,429 +0,0 @@ -# Echo II: Multi-Source Watcher Extension - -> 从单一 Claude Code 监听到多工具 AI 编码助手日志的统一监听与知识提取框架 - ---- - -## 一、动机与背景 - -Active Daemon 的 v1 实现仅支持 Claude Code JSONL 日志监听。然而现实中,开发者的日常工具栈远不止于此: - -- **Claude Code CLI** — JSONL 日志(已实现) -- **Cursor** — 日志文件 + SQLite 数据库 -- **Aider** — `.aider.chat.history.md` Markdown 文件 -- **GitHub Copilot** — VS Code workspace 日志 -- **Continue.dev** — `~/.continue/` 下的 JSON 日志 -- **Windsurf / Cascade** — 类 Cursor 的日志目录 - -Echo.md 原文已明确提到需要支持 Cursor(SQLite polling)和 OpenCode(rotating text logs),但 v1 仅完成了最简单的 Claude Code 路径。本次扩展的核心目标是:**将 Daemon 
从单源监听器升级为多源监听框架**,使开发者可以自由配置任意数量的工具监听路径。 - ---- - -## 二、目标工具可行性分析 - -| 工具 | 存储格式 | 驱动模式 | 可行性 | 优先级 | -|------|----------|----------|--------|--------| -| **Claude Code** | JSONL append-only | 文件增量监听 | ★★★★★ 已实现 | P0 | -| **Aider** | Markdown `.aider.chat.history.md` | 文件增量监听 | ★★★★☆ | P1 | -| **Cursor** | SQLite `state.vscdb` + 日志 | DB polling + 文件监听 | ★★★☆☆ | P1 | -| **Continue.dev** | JSON 日志 | 文件增量监听 | ★★★★☆ | P2 | -| **GitHub Copilot** | VS Code workspace logs | 文件增量监听 | ★★★☆☆ | P2 | -| **Generic JSONL** | 任意 JSONL | 文件增量监听 | ★★★★★ | P1 | - -### 驱动模式分类 - -根据日志格式差异,Watcher 分为两种驱动模式: - -**File-Append Watcher**(文件追加监听) -- 使用 watchdog 监听文件变更事件 -- 通过 FileCursor(byte offset)实现增量读取 -- 适用于:Claude Code、Aider、Continue.dev、Generic JSONL - -**Database Polling Watcher**(数据库轮询监听) -- 定时轮询 SQLite 数据库 -- 通过 timestamp cursor 实现增量查询 -- 适用于:Cursor(state.vscdb) - ---- - -## 三、架构设计 - -### 3.1 核心原则 - -**每个 Watcher 负责将自己的原始事件归一化为统一格式**,下游 ETL pipeline 完全不需要感知工具差异。 - -``` - ┌─ ClaudeCodeWatcher ──┐ - │ │ -AiderWatcher ────────┤ ├──→ normalized events ──→ ETL Pipeline ──→ viking:// - │ (shared queue) │ -GenericJSONLWatcher──┘ │ - │ -CursorWatcher (polling) ────────────────────┘ -``` - -### 3.2 归一化事件格式 (NormalizedEvent) - -所有 Watcher 将原始日志转换为以下统一格式的 dict: - -```python -{ - "role": "user" | "assistant", # 必须 - "content": "...", # 必须,消息正文 - "type": "message", # 必须,固定 "message" - "timestamp": "2026-06-16T10:30:00Z", # 可选 - "session_id": "...", # 可选,工具特定 - "project_name": "...", # 可选 - "tool_name": "claude_code", # 必须,来源工具标识 -} -``` - -新增 `tool_name` 字段用于区分来源。现有的 `ConversationReconstructor`、`LowValueFilter`、`KnowledgeExtractor` 均通过 `event.get("role")` / `event.get("content")` 读取,**完全兼容**。 - -### 3.3 BaseWatcher Protocol - -```python -# openviking/daemon/watchers/__init__.py - -from typing import Protocol, runtime_checkable - -@runtime_checkable -class BaseWatcher(Protocol): - """All tool-specific watchers must implement this interface.""" - - @property - def tool_name(self) -> str: ... 
- - def start(self) -> None: ... - def stop(self) -> None: ... - def flush(self) -> None: ... -``` - -### 3.4 BaseFileWatcher 抽象基类 - -为 File-Append 模式的 Watcher 提取公共逻辑(watchdog Observer + 增量读取 + 游标管理 + 批处理缓冲): - -```python -# openviking/daemon/watchers/base_file_watcher.py - -class BaseFileWatcher(ABC): - """Abstract base for file-append based watchers.""" - - def __init__(self, watch_dir, cursor_manager, batch_callback, - file_pattern, batch_trigger_lines, batch_trigger_seconds): - ... - - @abstractmethod - def parse_line(self, line: str) -> Optional[Dict]: - """Parse a raw line into a raw event dict. Tool-specific.""" - ... - - @abstractmethod - def normalize_event(self, raw_event: Dict) -> Optional[Dict]: - """Convert raw event to NormalizedEvent format. Tool-specific.""" - ... - - def filter_event(self, event: Dict) -> bool: - """Optional: additional tool-specific filtering. Default: True.""" - return True - - # --- 公共逻辑(不需要子类重写) --- - def start(self) -> None: ... # 创建 Observer + Handler - def stop(self) -> None: ... - def flush(self) -> None: ... -``` - -子类只需实现 `parse_line()` 和 `normalize_event()` 两个方法。 - -### 3.5 Watcher 注册表 (WatcherRegistry) - -```python -# openviking/daemon/watchers/registry.py - -WATCHER_REGISTRY: Dict[str, Type[BaseWatcher]] = { - "claude_code": ClaudeCodeWatcher, - "aider": AiderWatcher, - "cursor": CursorWatcher, - "continue_dev": ContinueDevWatcher, - "generic_jsonl": GenericJSONLWatcher, -} - -def create_watcher(tool_name: str, **kwargs) -> BaseWatcher: - """Factory function to create a watcher by tool name.""" - cls = WATCHER_REGISTRY.get(tool_name) - if not cls: - raise ValueError(f"Unknown tool: {tool_name}. 
Available: {list(WATCHER_REGISTRY.keys())}") - return cls(**kwargs) -``` - ---- - -## 四、配置设计 - -### 4.1 新配置结构 - -`DaemonConfig` 从单一 `watch_dir` 演进为支持 `watchers` 列表: - -```python -class WatcherConfig(BaseModel): - tool_name: str # 必填:工具标识 - watch_dir: str # 必填:监听路径 - file_pattern: str = "*.jsonl" # 可选:文件匹配模式 - enabled: bool = True # 可选:是否启用 - batch_trigger_lines: int = Field(50, gt=0) # 可选:批处理行数 - batch_trigger_seconds: int = Field(300, gt=0) # 可选:批处理秒数 - extra: Dict[str, Any] = Field(default_factory=dict) # 可选:工具特定参数 - -class DaemonConfig(BaseModel): - enabled: bool = False - db_path: Optional[str] = None - watchers: List[WatcherConfig] = Field(default_factory=list) - - # 向后兼容:保留旧字段 - watch_dir: Optional[str] = None # deprecated - batch_trigger_lines: int = Field(50, gt=0) # deprecated, 全局默认值 - batch_trigger_seconds: int = Field(300, gt=0) # deprecated, 全局默认值 - - def get_effective_watchers(self) -> List[WatcherConfig]: - """向后兼容:如果 watchers 为空但 watch_dir 存在,自动构造单个 Claude Code watcher.""" - if self.watchers: - return [w for w in self.watchers if w.enabled] - elif self.watch_dir: - return [WatcherConfig( - tool_name="claude_code", - watch_dir=self.watch_dir, - batch_trigger_lines=self.batch_trigger_lines, - batch_trigger_seconds=self.batch_trigger_seconds, - )] - else: - return [WatcherConfig( - tool_name="claude_code", - watch_dir=str(Path.home() / ".claude" / "projects"), - batch_trigger_lines=self.batch_trigger_lines, - batch_trigger_seconds=self.batch_trigger_seconds, - )] -``` - -### 4.2 ov.conf 示例 - -```json -{ - "server": { - "daemon": { - "enabled": true, - "db_path": "~/.openviking/daemon/cursors.db", - "watchers": [ - { - "tool_name": "claude_code", - "watch_dir": "~/.claude/projects", - "file_pattern": "*.jsonl" - }, - { - "tool_name": "aider", - "watch_dir": "~/Projects", - "file_pattern": ".aider.chat.history.md" - }, - { - "tool_name": "cursor", - "watch_dir": "~/.cursor", - "extra": {"poll_interval_seconds": 60} - }, - { - "tool_name": "generic_jsonl", 
- "watch_dir": "~/ai-logs/copilot", - "file_pattern": "*.jsonl", - "extra": {"user_role_field": "author", "assistant_role_value": "copilot"} - } - ] - } - } -} -``` - -### 4.3 环境变量 - -| 变量 | 说明 | -|------|------| -| `OV_DAEMON_ENABLED` | 全局开关 | -| `OV_DAEMON_WATCHERS` | JSON 格式的 watchers 列表(覆盖 ov.conf) | - ---- - -## 五、实现计划 - -### Batch 1:基础抽象层(3 文件 + 6 tests) - -| 任务 | 文件 | 说明 | -|------|------|------| -| 1.1 | `watchers/__init__.py` | 定义 `BaseWatcher` Protocol | -| 1.2 | `watchers/base_file_watcher.py` | 抽象基类,提取 ClaudeCodeWatcher 的公共逻辑 | -| 1.3 | `watchers/registry.py` | Watcher 注册表 + `create_watcher()` 工厂 | -| 1.4 | `tests/daemon/test_base_file_watcher.py` | BaseFileWatcher 的 parse/normalize/buffer 测试 | -| 1.5 | `tests/daemon/test_registry.py` | 注册表查找、未知工具报错测试 | - -### Batch 2:重构 ClaudeCodeWatcher + GenericJSONLWatcher(3 文件 + 8 tests) - -| 任务 | 文件 | 说明 | -|------|------|------| -| 2.1 | `watchers/claude_code_watcher.py` | 重构为继承 BaseFileWatcher,实现 parse_line/normalize_event | -| 2.2 | `watchers/generic_jsonl_watcher.py` | 通用 JSONL Watcher,支持自定义字段映射 | -| 2.3 | `models.py` | 在 BatchBuffer 中增加 `tool_name` 字段 | -| 2.4 | `tests/daemon/test_claude_code_watcher.py` | 更新已有测试(保持兼容) | -| 2.5 | `tests/daemon/test_generic_jsonl_watcher.py` | GenericJSONLWatcher 的解析/归一化/字段映射测试 | - -### Batch 3:AiderWatcher + CursorWatcher(4 文件 + 10 tests) - -| 任务 | 文件 | 说明 | -|------|------|------| -| 3.1 | `watchers/aider_watcher.py` | Aider Markdown 历史解析 → 归一化事件 | -| 3.2 | `watchers/cursor_watcher.py` | Cursor SQLite polling + 日志文件监听 | -| 3.3 | `watchers/continue_dev_watcher.py` | Continue.dev JSON 日志监听 | -| 3.4 | `tests/daemon/test_aider_watcher.py` | Markdown 解析、对话重组、增量读取测试 | -| 3.5 | `tests/daemon/test_cursor_watcher.py` | SQLite polling、增量查询测试 | -| 3.6 | `tests/daemon/test_continue_dev_watcher.py` | JSON 日志解析测试 | - -### Batch 4:配置 + 服务层重构(4 文件 + 6 tests) - -| 任务 | 文件 | 说明 | -|------|------|------| -| 4.1 | `server/config.py` | 新增 WatcherConfig,重构 DaemonConfig | -| 4.2 | 
`daemon/service.py` | 单 watcher → watchers 列表,使用 registry 创建 | -| 4.3 | `daemon/knowledge_router.py` | URI 中包含 tool_name:`viking://skills/{tool_name}/...` | -| 4.4 | `daemon/storage_adapter.py` | Markdown 模板中 Source 字段参数化 | -| 4.5 | `daemon/models.py` | ExtractedKnowledge 增加 `source_tool` 字段 | -| 4.6 | `server/bootstrap.py` | 环境变量 OV_DAEMON_WATCHERS 支持 | -| 4.7 | `tests/daemon/test_config.py` | WatcherConfig 验证 + 向后兼容测试 | -| 4.8 | `tests/daemon/test_service_multi.py` | 多 watcher 启动/停止/路由测试 | - -### Batch 5:API + Web Studio(3 文件 + 0 tests) - -| 任务 | 文件 | 说明 | -|------|------|------| -| 5.1 | `server/routers/daemon.py` | 扩展 status 端点:返回每个 watcher 的状态 | -| 5.2 | `web-studio/.../daemon-status-card.tsx` | 重构为多 watcher 视图:表格 + 状态 | -| 5.3 | `web-studio/.../i18n` | 多 watcher 相关翻译 | - -### Batch 6:集成测试 + 收尾(2 文件) - -| 任务 | 文件 | 说明 | -|------|------|------| -| 6.1 | `tests/daemon/test_multi_watcher_integration.py` | 多 watcher → ETL → storage 全链路 | -| 6.2 | 文档更新 | README、Echo.md 引用更新 | - ---- - -## 六、关键设计决策 - -### 6.1 归一化 vs 分支管道 - -**选择:Watcher 层归一化**,而非 ETL 层分支。 - -理由: -- ETL Pipeline 的四个阶段(Filter → Reconstruct → Extract → Deduplicate)已经设计为通用的 -- 如果在 ETL 层引入工具分支,每个阶段都需要 if/else,复杂度 O(tools × stages) -- Watcher 层归一化后,新增工具只需实现 `parse_line()` + `normalize_event()`,ETL 零修改 - -### 6.2 向后兼容策略 - -旧的 `DaemonConfig` 字段(`watch_dir`、`batch_trigger_lines`、`batch_trigger_seconds`)保留为 deprecated,`get_effective_watchers()` 方法自动将旧配置转换为单个 `WatcherConfig`。这确保现有用户的 ov.conf 不需要修改。 - -### 6.3 Cursor SQLite Polling - -Cursor 使用 SQLite `state.vscdb` 存储对话历史。与文件监听不同,需要: -- 定时轮询(默认 60 秒) -- 通过 `last_timestamp` 游标实现增量查询 -- 复用 `CursorManager` 存储轮询位置 - -这是一个独立于 watchdog 的驱动模式,但通过实现相同的 `BaseWatcher` Protocol,对 DaemonService 完全透明。 - -### 6.4 GenericJSONLWatcher 字段映射 - -为了支持任意 JSONL 日志(Copilot、自研工具等),GenericJSONLWatcher 支持通过 `extra` 配置自定义字段映射: - -```json -{ - "tool_name": "generic_jsonl", - "watch_dir": "~/ai-logs/my-tool", - "extra": { - "role_field": "author", - "user_role_value": "human", - 
"assistant_role_value": "ai", - "content_field": "text", - "timestamp_field": "ts" - } -} -``` - -默认值兼容 Claude Code 格式(`role` / `user` / `assistant` / `content` / `timestamp`)。 - ---- - -## 七、API 变更 - -### 7.1 GET /api/v1/daemon/status(增强) - -```json -{ - "enabled": true, - "running": true, - "watchers": [ - { - "tool_name": "claude_code", - "watch_dir": "/Users/xxx/.claude/projects", - "enabled": true, - "running": true, - "cursor_count": 12, - "batch_trigger_lines": 50, - "batch_trigger_seconds": 300 - }, - { - "tool_name": "aider", - "watch_dir": "/Users/xxx/Projects", - "enabled": true, - "running": true, - "cursor_count": 3, - "batch_trigger_lines": 50, - "batch_trigger_seconds": 300 - } - ], - "db_path": "...", - "available_tools": ["claude_code", "aider", "cursor", "continue_dev", "generic_jsonl"] -} -``` - ---- - -## 八、Web Studio 变更 - -Home 页面 DaemonStatusCard 从单状态卡片升级为多 Watcher 表格视图: - -- 顶部:全局状态(enabled/running)+ watcher 数量统计 -- 中部:Watcher 列表表格(工具名、路径、状态、游标数、最后活动) -- 底部:可用工具列表(available_tools) - -每 30 秒自动刷新。 - ---- - -## 九、风险与缓解 - -| 风险 | 影响 | 缓解措施 | -|------|------|----------| -| Cursor SQLite schema 变更 | 解析失败 | 防御性编程 + 版本检测 + 降级到日志文件监听 | -| 多 watcher 并发写入 SQLite cursor DB | 竞态条件 | SQLite WAL mode + 每个 watcher 独立 cursor key | -| Aider Markdown 格式不稳定 | 解析错误 | 宽松正则 + 跳过不可解析段落 + 详细日志 | -| 大量 watcher 导致 ETL 队列拥堵 | 延迟增加 | Queue 大小限制 + 背压 + 优先级队列 | - ---- - -## 十、里程碑 - -| 阶段 | 内容 | 预计 commits | -|------|------|-------------| -| Batch 1 | 基础抽象层 | 3-4 | -| Batch 2 | 重构 + GenericJSONL | 4-5 | -| Batch 3 | Aider + Cursor + Continue | 5-6 | -| Batch 4 | 配置 + 服务层 | 4-5 | -| Batch 5 | API + Web Studio | 3-4 | -| Batch 6 | 集成 + 收尾 | 2-3 | -| **总计** | | **21-27 commits** | diff --git a/docs/daemon/README.md b/docs/daemon/README.md index a7e5173c1f..874b210eac 100644 --- a/docs/daemon/README.md +++ b/docs/daemon/README.md @@ -29,17 +29,19 @@ Daemon 会在服务器启动时自动运行(如果 `OV_DAEMON_ENABLED=true`) 2. **增量读取** — 文件游标技术,只处理新增内容 3. **批量处理** — 累积 50 行或 5 分钟后触发 ETL 管道 4. 
**知识提取** — 调用 LLM 过滤噪声,提取有价值的知识 -5. **自动存储** — 写入 `viking://skills/`、`viking://memories/`、`viking://resources/` +5. **自动存储** — 写入 `viking://resources/skills/`、`viking://resources/memories/`、`viking://resources/` ## 知识分类 | 类型 | 目标路径 | 说明 | |------|---------|------| -| Skills | `viking://skills/claude_code/<title>.md` | 可复用的操作指南 | -| Memories (有项目) | `viking://memories/projects/<project>/decisions.md` | 项目决策日志 | -| Memories (无项目) | `viking://memories/global/<title>.md` | 全局记忆 | +| Skills | `viking://resources/skills/<source>/<title>.md` | 可复用的操作指南 | +| Memories (有项目) | `viking://resources/memories/<project>/decisions.md` | 项目决策日志 | +| Memories (无项目) | `viking://resources/memories/global/<title>.md` | 全局记忆 | | Resources | `viking://resources/<tech>/<title>.md` | 参考资源 | +> **注意**:viking:// URI 只支持 `resources/`、`user/`、`agent/` 三个顶级 scope。所有知识统一路由到 `resources/` 下。 + ## 架构 ``` diff --git a/docs/daemon/Watcher.md b/docs/daemon/Watcher.md deleted file mode 100644 index 5b672d74f2..0000000000 --- a/docs/daemon/Watcher.md +++ /dev/null @@ -1,32 +0,0 @@ -# Daemon Watcher 排查计划 - -## 状态:148 tests pass,服务器在 1966 端口运行中,daemon 已处理 3 个文件 - -## 待完成 - -### 1. 确认端到端写入 -目标:验证知识是否真正写入 `viking://resources/...` - -方法: -- 服务器日志当前输出到后台进程 stdout,无法查看。需要改为写文件:在 `ov.conf` 中加 `"log": {"level": "INFO", "output": "D:\\Develop\\ov-test\\server.log"}` -- 杀掉当前服务器,清 cursor DB,重启 -- touch JSONL 触发处理,等 ETL 跑完 -- 查看 `server.log` 中的写入日志(成功/失败) - -预期日志链路: -``` -[claude_code] Flushing N events -_enqueue_batch: received N events -ETL loop: dequeued N events -Processing batch with N events → After filtering: X → Reconstructed Y turns → Extracted Z items -Knowledge ingested: viking://resources/... ← 关键确认点 -``` - -### 2. 根据结果处理 -- 写入成功 → 提交 commit 和 PR -- 写入失败 → 根据日志修 `storage_adapter.py`(可能是 `resource_service.add_resource()` 调用参数或 URI 格式问题) - -### 3. 
提交 PR -- 分支基于 `upstream/main` -- 只包含 daemon 相关文件变更 -- PR 目标:`volcengine/OpenViking:main` diff --git a/docs/daemon/configuration.md b/docs/daemon/configuration.md index 169bf341e1..422c50e1a1 100644 --- a/docs/daemon/configuration.md +++ b/docs/daemon/configuration.md @@ -10,19 +10,54 @@ | `OV_DAEMON_BATCH_LINES` | 批处理触发行数 | `50` | | `OV_DAEMON_BATCH_SECONDS` | 批处理触发秒数 | `300` | -## YAML 配置 (ov.conf) +## JSON 配置 (ov.conf) + +> **重要**:daemon 配置必须放在 `"server"` 节内,不是顶层配置。ov.conf 的 JSON 解析器不支持 `#` 注释。 + +单 watcher 配置: + +```json +{ + "server": { + "port": 1988, + "daemon": { + "enabled": true, + "watch_dir": "~/.claude/projects", + "batch_trigger_lines": 50, + "batch_trigger_seconds": 300 + } + } +} +``` + +多 watcher 配置(推荐,支持同时监听多个 AI 工具): ```json { - "daemon": { - "enabled": true, - "watch_dir": "~/.claude/projects", - "batch_trigger_lines": 50, - "batch_trigger_seconds": 300 + "server": { + "port": 1988, + "daemon": { + "enabled": true, + "watchers": [ + { + "tool_name": "claude_code", + "watch_dir": "C:\\Users\\xxx\\.claude\\projects", + "batch_trigger_lines": 5, + "batch_trigger_seconds": 60 + }, + { + "tool_name": "cursor_db", + "watch_dir": "C:\\Users\\xxx\\AppData\\Roaming\\Cursor\\User\\globalStorage", + "poll_interval": 60 + } + ] + } } } ``` +`watchers` 数组中每项支持 `tool_name`、`watch_dir`、`batch_trigger_lines`、`batch_trigger_seconds`、`extra` 字段。当 `watchers` 存在时,顶层的 `watch_dir` 被忽略。 + ## Docker 部署 ```yaml diff --git a/openviking/daemon/__init__.py b/openviking/daemon/__init__.py index ba94f0d050..5042a04371 100644 --- a/openviking/daemon/__init__.py +++ b/openviking/daemon/__init__.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 """ OpenViking Active Daemon package. Monitors AI tool logs and automatically extracts knowledge into viking:// storage. 
diff --git a/openviking/daemon/conversation_reconstructor.py b/openviking/daemon/conversation_reconstructor.py index 179873467d..fdfbcdc1e0 100644 --- a/openviking/daemon/conversation_reconstructor.py +++ b/openviking/daemon/conversation_reconstructor.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 """ Reconstruct conversation turns from flat event lists. Pairs user prompts with assistant responses into structured ConversationTurn objects. diff --git a/openviking/daemon/cursor_manager.py b/openviking/daemon/cursor_manager.py index 6b76bc2565..e5009d6082 100644 --- a/openviking/daemon/cursor_manager.py +++ b/openviking/daemon/cursor_manager.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 """ Cursor Manager for tracking file read positions. Persists state in SQLite so Daemon can resume after restart. diff --git a/openviking/daemon/deduplicator.py b/openviking/daemon/deduplicator.py index 9066b57bb4..5721fc2a67 100644 --- a/openviking/daemon/deduplicator.py +++ b/openviking/daemon/deduplicator.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 """ Knowledge deduplication based on content hashing. Prevents duplicate knowledge from being written to viking:// storage. diff --git a/openviking/daemon/etl_pipeline.py b/openviking/daemon/etl_pipeline.py index b0f35d58d1..1f580b43d4 100644 --- a/openviking/daemon/etl_pipeline.py +++ b/openviking/daemon/etl_pipeline.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 """ Batch ETL pipeline for processing conversation events into structured knowledge. 
Orchestrates: Filter -> Reconstruct -> Extract -> Deduplicate diff --git a/openviking/daemon/filters.py b/openviking/daemon/filters.py index 4a3a58bbe2..04f32cb198 100644 --- a/openviking/daemon/filters.py +++ b/openviking/daemon/filters.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 """ Rule-based filters for low-value conversations. Removes noise before LLM processing to save cost and improve quality. diff --git a/openviking/daemon/knowledge_extractor.py b/openviking/daemon/knowledge_extractor.py index 260ca2622d..5e69d3b96c 100644 --- a/openviking/daemon/knowledge_extractor.py +++ b/openviking/daemon/knowledge_extractor.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 """ LLM-based knowledge extraction from conversations. Uses OpenViking's existing VLM configuration for intelligent filtering and summarization. diff --git a/openviking/daemon/knowledge_router.py b/openviking/daemon/knowledge_router.py index e2adcb112f..ea67490510 100644 --- a/openviking/daemon/knowledge_router.py +++ b/openviking/daemon/knowledge_router.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 """ Route extracted knowledge to appropriate viking:// URIs based on category and project. """ diff --git a/openviking/daemon/models.py b/openviking/daemon/models.py index ea88df8953..78a041ae00 100644 --- a/openviking/daemon/models.py +++ b/openviking/daemon/models.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 """ Data models for OpenViking Active Daemon. 
""" diff --git a/openviking/daemon/service.py b/openviking/daemon/service.py index bd6bc8bebe..bbb07dbd6c 100644 --- a/openviking/daemon/service.py +++ b/openviking/daemon/service.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 """ OpenViking Active Daemon main service. Orchestrates multi-tool file watching, ETL processing, and knowledge storage. diff --git a/openviking/daemon/storage_adapter.py b/openviking/daemon/storage_adapter.py index e1a76684ef..5678fae925 100644 --- a/openviking/daemon/storage_adapter.py +++ b/openviking/daemon/storage_adapter.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 """ Adapter for writing extracted knowledge to OpenViking via ResourceService. Generates temporary Markdown files and ingests them through the standard resource pipeline. diff --git a/openviking/daemon/watchers/__init__.py b/openviking/daemon/watchers/__init__.py index 450f68faf4..9e47c6fe8f 100644 --- a/openviking/daemon/watchers/__init__.py +++ b/openviking/daemon/watchers/__init__.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 """ Watcher abstractions for OpenViking Active Daemon. Provides BaseWatcher protocol and watcher registry for multi-tool support. diff --git a/openviking/daemon/watchers/aider_watcher.py b/openviking/daemon/watchers/aider_watcher.py deleted file mode 100644 index cd2e20dc95..0000000000 --- a/openviking/daemon/watchers/aider_watcher.py +++ /dev/null @@ -1,148 +0,0 @@ -""" -Aider chat history watcher. -Monitors .aider.chat.history.md files in project directories. 
-""" -import re -import time -import os -from typing import Dict, List, Optional - -from openviking.daemon.watchers.base_file_watcher import BaseFileWatcher -from openviking.daemon.watchers.registry import register_watcher -from openviking.daemon.models import FileCursor -from openviking_cli.utils.logger import get_logger - -logger = get_logger(__name__) - - -@register_watcher("aider") -class AiderWatcher(BaseFileWatcher): - """Watches Aider .aider.chat.history.md files.""" - - # Regex patterns - HEADER_RE = re.compile(r'^# aider chat started at (.+)$') - PROJECT_RE = re.compile(r'^> (.+)$') - USER_RE = re.compile(r'^#### user:\s*$') - ASSISTANT_RE = re.compile(r'^#### assistant:\s*$') - - def __init__(self, watch_dir, cursor_manager, batch_callback, - file_pattern=".aider.chat.history.md", - batch_trigger_lines=50, batch_trigger_seconds=300, **kwargs): - super().__init__( - watch_dir=watch_dir, - cursor_manager=cursor_manager, - batch_callback=batch_callback, - file_pattern=file_pattern, - batch_trigger_lines=batch_trigger_lines, - batch_trigger_seconds=batch_trigger_seconds, - ) - - @property - def tool_name(self) -> str: - return "aider" - - def matches_file_pattern(self, file_path: str) -> bool: - """Match .aider.chat.history.md files.""" - filename = os.path.basename(file_path) - return filename == ".aider.chat.history.md" - - def parse_line(self, line: str) -> Optional[Dict]: - """Not used - Aider uses multi-line parsing via _process_file override.""" - return None - - def normalize_event(self, raw_event: Dict) -> Optional[Dict]: - """Not used directly - _process_file creates normalized events.""" - return raw_event - - def _process_file(self, file_path: str): - """ - Override: Parse Aider's multi-line markdown format. - Extracts user/assistant conversation blocks from .aider.chat.history.md. 
- """ - try: - cursor = self.cursor_manager.get_cursor(file_path) - file_size = os.path.getsize(file_path) - - if file_size <= cursor.last_position: - return - - with open(file_path, "r", encoding="utf-8", errors="replace") as f: - f.seek(cursor.last_position) - new_content = f.read() - - new_position = cursor.last_position + len(new_content.encode("utf-8")) - - # Parse conversation blocks - events = self._parse_aider_content(new_content) - - for event in events: - event["tool_name"] = self.tool_name - byte_size = len(event.get("content", "").encode("utf-8")) - self._buffer.add_line(event, byte_size) - - self.cursor_manager.update_cursor(file_path, new_position) - self._check_batch_trigger() - - except Exception as e: - logger.error("[%s] Error processing %s: %s", self.tool_name, file_path, e) - - def _parse_aider_content(self, content: str) -> List[Dict]: - """Parse Aider markdown content into normalized events.""" - events = [] - lines = content.splitlines() - - current_role = None - current_content = [] - current_timestamp = None - project_name = None - - for line in lines: - # Check for timestamp header - header_match = self.HEADER_RE.match(line) - if header_match: - current_timestamp = header_match.group(1).strip() - continue - - # Check for project path - project_match = self.PROJECT_RE.match(line) - if project_match: - project_name = project_match.group(1).strip() - continue - - # Check for role markers - if self.USER_RE.match(line): - # Flush previous block - if current_role and current_content: - events.append(self._make_event(current_role, current_content, current_timestamp, project_name)) - current_role = "user" - current_content = [] - continue - - if self.ASSISTANT_RE.match(line): - if current_role and current_content: - events.append(self._make_event(current_role, current_content, current_timestamp, project_name)) - current_role = "assistant" - current_content = [] - continue - - # Accumulate content for current role - if current_role: - 
current_content.append(line) - - # Flush last block - if current_role and current_content: - events.append(self._make_event(current_role, current_content, current_timestamp, project_name)) - - return events - - def _make_event(self, role: str, content_lines: List[str], - timestamp: Optional[str], project_name: Optional[str]) -> Dict: - """Create a normalized event dict from parsed content.""" - content = "\n".join(content_lines).strip() - return { - "role": role, - "content": content, - "type": "message", - "timestamp": timestamp, - "project_name": project_name, - } diff --git a/openviking/daemon/watchers/base_file_watcher.py b/openviking/daemon/watchers/base_file_watcher.py index 1f3e71147d..2ea3760b76 100644 --- a/openviking/daemon/watchers/base_file_watcher.py +++ b/openviking/daemon/watchers/base_file_watcher.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 """ Abstract base class for file-append based watchers. Handles watchdog Observer lifecycle, cursor management, and batch buffering. diff --git a/openviking/daemon/watchers/base_polling_watcher.py b/openviking/daemon/watchers/base_polling_watcher.py index 29059b7a93..5e07760187 100644 --- a/openviking/daemon/watchers/base_polling_watcher.py +++ b/openviking/daemon/watchers/base_polling_watcher.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 """ Abstract base class for database/API-based watchers that use periodic polling. Unlike BaseFileWatcher (watchdog + file cursor), this uses Thread + Event.wait(interval). diff --git a/openviking/daemon/watchers/claude_code_watcher.py b/openviking/daemon/watchers/claude_code_watcher.py index 88ffae8c87..ca8b9c4a88 100644 --- a/openviking/daemon/watchers/claude_code_watcher.py +++ b/openviking/daemon/watchers/claude_code_watcher.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. 
+# SPDX-License-Identifier: AGPL-3.0 """ Claude Code JSONL log watcher. Monitors ~/.claude/projects/<project>/<session>.jsonl files. diff --git a/openviking/daemon/watchers/continue_dev_watcher.py b/openviking/daemon/watchers/continue_dev_watcher.py deleted file mode 100644 index 0d87b64324..0000000000 --- a/openviking/daemon/watchers/continue_dev_watcher.py +++ /dev/null @@ -1,68 +0,0 @@ -""" -Continue.dev log watcher. -Monitors ~/.continue/ JSON log files for AI conversation events. -""" -import json -from typing import Dict, List, Optional - -from openviking.daemon.watchers.base_file_watcher import BaseFileWatcher -from openviking.daemon.watchers.registry import register_watcher -from openviking_cli.utils.logger import get_logger - -logger = get_logger(__name__) - - -@register_watcher("continue_dev") -class ContinueDevWatcher(BaseFileWatcher): - """Watches Continue.dev JSON log files.""" - - def __init__(self, watch_dir, cursor_manager, batch_callback, - file_pattern="*.json", - batch_trigger_lines=50, batch_trigger_seconds=300, **kwargs): - super().__init__( - watch_dir=watch_dir, - cursor_manager=cursor_manager, - batch_callback=batch_callback, - file_pattern=file_pattern, - batch_trigger_lines=batch_trigger_lines, - batch_trigger_seconds=batch_trigger_seconds, - ) - - @property - def tool_name(self) -> str: - return "continue_dev" - - def parse_line(self, line: str) -> Optional[Dict]: - """Parse a Continue.dev JSON log line.""" - if not line: - return None - try: - data = json.loads(line) - if not isinstance(data, dict): - return None - return data - except (json.JSONDecodeError, ValueError): - return None - - def normalize_event(self, raw_event: Dict) -> Optional[Dict]: - """ - Normalize Continue.dev event. 
- Continue.dev uses format: - {"role": "user"|"assistant", "content": "...", "timestamp": "..."} - """ - role = raw_event.get("role", "") - content = raw_event.get("content", "") - - if role not in ("user", "assistant"): - return None - if not content: - return None - - return { - "role": role, - "content": content, - "type": "message", - "timestamp": raw_event.get("timestamp"), - "session_id": raw_event.get("sessionId") or raw_event.get("session_id"), - "project_name": raw_event.get("workspaceDirectory"), - } diff --git a/openviking/daemon/watchers/cursor_db_watcher.py b/openviking/daemon/watchers/cursor_db_watcher.py index 77a6444318..4796510da0 100644 --- a/openviking/daemon/watchers/cursor_db_watcher.py +++ b/openviking/daemon/watchers/cursor_db_watcher.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 """ Cursor IDE SQLite database watcher. Monitors Cursor's dual-SQLite storage for AI conversations: diff --git a/openviking/daemon/watchers/cursor_watcher.py b/openviking/daemon/watchers/cursor_watcher.py deleted file mode 100644 index 6224fc67d7..0000000000 --- a/openviking/daemon/watchers/cursor_watcher.py +++ /dev/null @@ -1,87 +0,0 @@ -""" -Cursor IDE log watcher. -Monitors Cursor log files (JSON format) for AI conversation events. -""" -import json -from typing import Dict, List, Optional - -from openviking.daemon.watchers.base_file_watcher import BaseFileWatcher -from openviking.daemon.watchers.registry import register_watcher -from openviking_cli.utils.logger import get_logger - -logger = get_logger(__name__) - - -@register_watcher("cursor") -class CursorWatcher(BaseFileWatcher): - """ - Watches Cursor IDE log files. - - Cursor stores AI conversations in log files under its workspace storage. - The format varies by version, so we implement flexible parsing. 
- """ - - def __init__(self, watch_dir, cursor_manager, batch_callback, - file_pattern="*.log", - batch_trigger_lines=50, batch_trigger_seconds=300, - extra=None, **kwargs): - super().__init__( - watch_dir=watch_dir, - cursor_manager=cursor_manager, - batch_callback=batch_callback, - file_pattern=file_pattern, - batch_trigger_lines=batch_trigger_lines, - batch_trigger_seconds=batch_trigger_seconds, - ) - self.extra = extra or {} - - @property - def tool_name(self) -> str: - return "cursor" - - def parse_line(self, line: str) -> Optional[Dict]: - """Parse a Cursor log line (JSON format).""" - if not line: - return None - try: - data = json.loads(line) - if not isinstance(data, dict): - return None - return data - except (json.JSONDecodeError, ValueError): - return None - - def normalize_event(self, raw_event: Dict) -> Optional[Dict]: - """ - Normalize Cursor log event. - Cursor logs use various schemas. We look for common patterns: - - {"type": "chat", "role": "user"|"assistant", "message": "..."} - - {"event": "ai_response", "content": "..."} - """ - # Try standard chat format - event_type = raw_event.get("type", "") - role = raw_event.get("role", "") - - if role in ("user", "human", "human_turn"): - content = raw_event.get("message") or raw_event.get("content") or raw_event.get("text") or "" - if content: - return { - "role": "user", - "content": content, - "type": "message", - "timestamp": raw_event.get("timestamp") or raw_event.get("ts"), - "session_id": raw_event.get("conversationId") or raw_event.get("session_id"), - } - - if role in ("assistant", "ai", "ai_response", "bot"): - content = raw_event.get("message") or raw_event.get("content") or raw_event.get("text") or "" - if content: - return { - "role": "assistant", - "content": content, - "type": "message", - "timestamp": raw_event.get("timestamp") or raw_event.get("ts"), - "session_id": raw_event.get("conversationId") or raw_event.get("session_id"), - } - - return None diff --git 
a/openviking/daemon/watchers/generic_jsonl_watcher.py b/openviking/daemon/watchers/generic_jsonl_watcher.py index 1bc1b1da62..a0607fdc2a 100644 --- a/openviking/daemon/watchers/generic_jsonl_watcher.py +++ b/openviking/daemon/watchers/generic_jsonl_watcher.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 """ Generic JSONL log watcher. Supports arbitrary JSONL logs with customizable field mapping. diff --git a/openviking/daemon/watchers/registry.py b/openviking/daemon/watchers/registry.py index b76b9624d3..137d881d1a 100644 --- a/openviking/daemon/watchers/registry.py +++ b/openviking/daemon/watchers/registry.py @@ -1,3 +1,5 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 """ Watcher registry for creating tool-specific watchers by name. """ @@ -50,24 +52,6 @@ def _register_builtins(): except ImportError: pass - try: - from openviking.daemon.watchers.aider_watcher import AiderWatcher - _WATCHER_REGISTRY["aider"] = AiderWatcher - except ImportError: - pass - - try: - from openviking.daemon.watchers.cursor_watcher import CursorWatcher - _WATCHER_REGISTRY["cursor"] = CursorWatcher - except ImportError: - pass - - try: - from openviking.daemon.watchers.continue_dev_watcher import ContinueDevWatcher - _WATCHER_REGISTRY["continue_dev"] = ContinueDevWatcher - except ImportError: - pass - try: from openviking.daemon.watchers.cursor_db_watcher import CursorDBWatcher _WATCHER_REGISTRY["cursor_db"] = CursorDBWatcher diff --git a/tests/daemon/test_aider_watcher.py b/tests/daemon/test_aider_watcher.py deleted file mode 100644 index 6ea7721d08..0000000000 --- a/tests/daemon/test_aider_watcher.py +++ /dev/null @@ -1,183 +0,0 @@ -"""Tests for AiderWatcher parsing logic.""" -import time -from typing import Dict, Optional - -from openviking.daemon.watchers.aider_watcher import AiderWatcher - - -class FakeCursorManager: - """Minimal stub for testing.""" - 
def __init__(self): - self.cursors = {} - self.updates = [] - - def get_cursor(self, file_path): - from openviking.daemon.models import FileCursor - return self.cursors.get(file_path, FileCursor(file_path=file_path)) - - def update_cursor(self, file_path, position): - from openviking.daemon.models import FileCursor - self.cursors[file_path] = FileCursor( - file_path=file_path, - last_position=position, - last_read_time=time.time(), - ) - self.updates.append((file_path, position)) - - -def _make_watcher(tmp_path, batch_trigger_lines=50, batch_trigger_seconds=300): - batches = [] - cursor_mgr = FakeCursorManager() - watcher = AiderWatcher( - watch_dir=str(tmp_path), - cursor_manager=cursor_mgr, - batch_callback=lambda events: batches.append(events), - batch_trigger_lines=batch_trigger_lines, - batch_trigger_seconds=batch_trigger_seconds, - ) - return watcher, batches, cursor_mgr - - -def test_matches_file_pattern(tmp_path): - w, _, _ = _make_watcher(tmp_path) - assert w.matches_file_pattern("/project/.aider.chat.history.md") - assert not w.matches_file_pattern("/project/other.md") - assert not w.matches_file_pattern("/project/aider.chat.history.md") - - -def test_parse_single_user_assistant_block(tmp_path): - w, _, _ = _make_watcher(tmp_path) - content = ( - "# aider chat started at 2024-01-15 10:30:00\n" - "\n" - "> /path/to/project\n" - "\n" - "#### user:\n" - "How do I implement a REST API in Flask?\n" - "\n" - "#### assistant:\n" - "Here's how to create a basic Flask REST API:\n" - "Use Flask and add routes.\n" - ) - events = w._parse_aider_content(content) - assert len(events) == 2 - assert events[0]["role"] == "user" - assert events[0]["content"] == "How do I implement a REST API in Flask?" 
- assert events[1]["role"] == "assistant" - assert "Flask REST API" in events[1]["content"] - - -def test_parse_multiple_blocks(tmp_path): - w, _, _ = _make_watcher(tmp_path) - content = ( - "# aider chat started at 2024-01-15 10:30:00\n" - "> /my/project\n" - "#### user:\n" - "First question\n" - "#### assistant:\n" - "First answer\n" - "#### user:\n" - "Second question\n" - "#### assistant:\n" - "Second answer\n" - ) - events = w._parse_aider_content(content) - assert len(events) == 4 - assert events[0]["role"] == "user" - assert events[0]["content"] == "First question" - assert events[1]["role"] == "assistant" - assert events[1]["content"] == "First answer" - assert events[2]["role"] == "user" - assert events[2]["content"] == "Second question" - assert events[3]["role"] == "assistant" - assert events[3]["content"] == "Second answer" - - -def test_timestamp_and_project_extraction(tmp_path): - w, _, _ = _make_watcher(tmp_path) - content = ( - "# aider chat started at 2024-01-15 10:30:00\n" - "> /home/user/myproject\n" - "#### user:\n" - "Hello\n" - ) - events = w._parse_aider_content(content) - assert len(events) == 1 - assert events[0]["timestamp"] == "2024-01-15 10:30:00" - assert events[0]["project_name"] == "/home/user/myproject" - - -def test_empty_content_handling(tmp_path): - w, _, _ = _make_watcher(tmp_path) - events = w._parse_aider_content("") - assert events == [] - - events = w._parse_aider_content("# aider chat started at 2024-01-15 10:30:00\n") - assert events == [] - - -def test_multiline_content(tmp_path): - w, _, _ = _make_watcher(tmp_path) - content = ( - "#### user:\n" - "Line one\n" - "Line two\n" - "Line three\n" - ) - events = w._parse_aider_content(content) - assert len(events) == 1 - assert events[0]["content"] == "Line one\nLine two\nLine three" - - -def test_process_file_integration(tmp_path): - w, batches, _ = _make_watcher(tmp_path, batch_trigger_lines=2) - - test_file = tmp_path / ".aider.chat.history.md" - test_file.write_text( - "# 
aider chat started at 2024-01-15 10:30:00\n" - "> /project\n" - "#### user:\n" - "Hello\n" - "#### assistant:\n" - "Hi there\n", - encoding="utf-8", - ) - - w._process_file(str(test_file)) - - assert len(batches) == 1 - assert len(batches[0]) == 2 - assert batches[0][0]["role"] == "user" - assert batches[0][0]["tool_name"] == "aider" - assert batches[0][1]["role"] == "assistant" - - -def test_incremental_read(tmp_path): - w, batches, _ = _make_watcher(tmp_path, batch_trigger_lines=100) - - test_file = tmp_path / ".aider.chat.history.md" - test_file.write_text( - "#### user:\n" - "First message\n", - encoding="utf-8", - ) - w._process_file(str(test_file)) - - # Append more content - with open(str(test_file), "a", encoding="utf-8") as f: - f.write( - "#### assistant:\n" - "Response\n" - ) - w._process_file(str(test_file)) - - w.flush() - assert len(batches) == 1 - assert len(batches[0]) == 2 - assert batches[0][0]["role"] == "user" - assert batches[0][1]["role"] == "assistant" - - -def test_tool_name(tmp_path): - w, _, _ = _make_watcher(tmp_path) - assert w.tool_name == "aider" diff --git a/tests/daemon/test_continue_dev_watcher.py b/tests/daemon/test_continue_dev_watcher.py deleted file mode 100644 index 2a6e01dc14..0000000000 --- a/tests/daemon/test_continue_dev_watcher.py +++ /dev/null @@ -1,193 +0,0 @@ -"""Tests for ContinueDevWatcher parsing and normalization logic.""" -import time -from typing import Dict, Optional - -from openviking.daemon.watchers.continue_dev_watcher import ContinueDevWatcher - - -class FakeCursorManager: - """Minimal stub for testing.""" - def __init__(self): - self.cursors = {} - self.updates = [] - - def get_cursor(self, file_path): - from openviking.daemon.models import FileCursor - return self.cursors.get(file_path, FileCursor(file_path=file_path)) - - def update_cursor(self, file_path, position): - from openviking.daemon.models import FileCursor - self.cursors[file_path] = FileCursor( - file_path=file_path, - last_position=position, - 
last_read_time=time.time(), - ) - self.updates.append((file_path, position)) - - -def _make_watcher(tmp_path, batch_trigger_lines=50, batch_trigger_seconds=300): - batches = [] - cursor_mgr = FakeCursorManager() - watcher = ContinueDevWatcher( - watch_dir=str(tmp_path), - cursor_manager=cursor_mgr, - batch_callback=lambda events: batches.append(events), - batch_trigger_lines=batch_trigger_lines, - batch_trigger_seconds=batch_trigger_seconds, - ) - return watcher, batches, cursor_mgr - - -def test_parse_line_valid_json(tmp_path): - w, _, _ = _make_watcher(tmp_path) - line = '{"role": "user", "content": "Hello"}' - result = w.parse_line(line) - assert result is not None - assert result["role"] == "user" - assert result["content"] == "Hello" - - -def test_parse_line_invalid_json(tmp_path): - w, _, _ = _make_watcher(tmp_path) - assert w.parse_line("not valid json") is None - assert w.parse_line("") is None - assert w.parse_line(" ") is None - - -def test_parse_line_non_dict(tmp_path): - w, _, _ = _make_watcher(tmp_path) - assert w.parse_line('"just a string"') is None - assert w.parse_line("[1, 2, 3]") is None - assert w.parse_line("42") is None - - -def test_normalize_standard_format(tmp_path): - w, _, _ = _make_watcher(tmp_path) - raw = {"role": "user", "content": "How do I sort a list?"} - event = w.normalize_event(raw) - assert event is not None - assert event["role"] == "user" - assert event["content"] == "How do I sort a list?" - assert event["type"] == "message" - - -def test_normalize_assistant(tmp_path): - w, _, _ = _make_watcher(tmp_path) - raw = {"role": "assistant", "content": "Use the sorted() function."} - event = w.normalize_event(raw) - assert event is not None - assert event["role"] == "assistant" - assert event["content"] == "Use the sorted() function." 
- - -def test_normalize_missing_content_filtered(tmp_path): - w, _, _ = _make_watcher(tmp_path) - # Empty content - raw = {"role": "user", "content": ""} - assert w.normalize_event(raw) is None - - # Missing content key entirely - raw = {"role": "user"} - assert w.normalize_event(raw) is None - - -def test_normalize_invalid_role_filtered(tmp_path): - w, _, _ = _make_watcher(tmp_path) - raw = {"role": "system", "content": "System message"} - assert w.normalize_event(raw) is None - - raw = {"role": "tool", "content": "Tool output"} - assert w.normalize_event(raw) is None - - raw = {"content": "No role at all"} - assert w.normalize_event(raw) is None - - -def test_normalize_session_id_mapping(tmp_path): - w, _, _ = _make_watcher(tmp_path) - # sessionId field - raw = {"role": "user", "content": "test", "sessionId": "session-abc-123"} - event = w.normalize_event(raw) - assert event is not None - assert event["session_id"] == "session-abc-123" - - # session_id field (alternative) - raw = {"role": "user", "content": "test", "session_id": "session-xyz-789"} - event = w.normalize_event(raw) - assert event is not None - assert event["session_id"] == "session-xyz-789" - - -def test_normalize_workspace_directory_to_project_name(tmp_path): - w, _, _ = _make_watcher(tmp_path) - raw = { - "role": "user", - "content": "test", - "workspaceDirectory": "/home/user/my-project", - } - event = w.normalize_event(raw) - assert event is not None - assert event["project_name"] == "/home/user/my-project" - - -def test_normalize_timestamp(tmp_path): - w, _, _ = _make_watcher(tmp_path) - raw = {"role": "user", "content": "test", "timestamp": "2024-01-15T10:30:00Z"} - event = w.normalize_event(raw) - assert event is not None - assert event["timestamp"] == "2024-01-15T10:30:00Z" - - -def test_normalize_optional_fields_none(tmp_path): - w, _, _ = _make_watcher(tmp_path) - raw = {"role": "user", "content": "minimal event"} - event = w.normalize_event(raw) - assert event is not None - assert 
event["timestamp"] is None - assert event["session_id"] is None - assert event["project_name"] is None - - -def test_tool_name(tmp_path): - w, _, _ = _make_watcher(tmp_path) - assert w.tool_name == "continue_dev" - - -def test_process_file_integration(tmp_path): - w, batches, _ = _make_watcher(tmp_path, batch_trigger_lines=2) - - test_file = tmp_path / "continue.json" - test_file.write_text( - '{"role": "user", "content": "Hello"}\n' - '{"role": "assistant", "content": "Hi there"}\n', - encoding="utf-8", - ) - - w._process_file(str(test_file)) - - assert len(batches) == 1 - assert len(batches[0]) == 2 - assert batches[0][0]["role"] == "user" - assert batches[0][0]["tool_name"] == "continue_dev" - assert batches[0][1]["role"] == "assistant" - - -def test_process_file_filters_non_chat(tmp_path): - w, batches, _ = _make_watcher(tmp_path, batch_trigger_lines=100) - - test_file = tmp_path / "continue.json" - test_file.write_text( - '{"role": "user", "content": "Hello"}\n' - '{"role": "system", "content": "System prompt"}\n' - '{"role": "assistant", "content": "Hi"}\n' - 'not json at all\n', - encoding="utf-8", - ) - - w._process_file(str(test_file)) - w.flush() - - assert len(batches) == 1 - assert len(batches[0]) == 2 # only user + assistant, system filtered - assert batches[0][0]["role"] == "user" - assert batches[0][1]["role"] == "assistant" diff --git a/tests/daemon/test_cursor_watcher.py b/tests/daemon/test_cursor_watcher.py deleted file mode 100644 index e3982608f4..0000000000 --- a/tests/daemon/test_cursor_watcher.py +++ /dev/null @@ -1,187 +0,0 @@ -"""Tests for CursorWatcher parsing and normalization logic.""" -import time -from typing import Dict, Optional - -from openviking.daemon.watchers.cursor_watcher import CursorWatcher - - -class FakeCursorManager: - """Minimal stub for testing.""" - def __init__(self): - self.cursors = {} - self.updates = [] - - def get_cursor(self, file_path): - from openviking.daemon.models import FileCursor - return 
self.cursors.get(file_path, FileCursor(file_path=file_path)) - - def update_cursor(self, file_path, position): - from openviking.daemon.models import FileCursor - self.cursors[file_path] = FileCursor( - file_path=file_path, - last_position=position, - last_read_time=time.time(), - ) - self.updates.append((file_path, position)) - - -def _make_watcher(tmp_path, batch_trigger_lines=50, batch_trigger_seconds=300): - batches = [] - cursor_mgr = FakeCursorManager() - watcher = CursorWatcher( - watch_dir=str(tmp_path), - cursor_manager=cursor_mgr, - batch_callback=lambda events: batches.append(events), - batch_trigger_lines=batch_trigger_lines, - batch_trigger_seconds=batch_trigger_seconds, - ) - return watcher, batches, cursor_mgr - - -def test_parse_line_valid_json(tmp_path): - w, _, _ = _make_watcher(tmp_path) - line = '{"type": "chat", "role": "user", "message": "Hello"}' - result = w.parse_line(line) - assert result is not None - assert result["role"] == "user" - assert result["message"] == "Hello" - - -def test_parse_line_invalid_json(tmp_path): - w, _, _ = _make_watcher(tmp_path) - assert w.parse_line("not valid json") is None - assert w.parse_line("") is None - assert w.parse_line(" ") is None - - -def test_parse_line_non_dict(tmp_path): - w, _, _ = _make_watcher(tmp_path) - assert w.parse_line('"just a string"') is None - assert w.parse_line("[1, 2, 3]") is None - assert w.parse_line("42") is None - - -def test_normalize_standard_chat_format(tmp_path): - w, _, _ = _make_watcher(tmp_path) - raw = {"type": "chat", "role": "user", "message": "How do I sort a list?"} - event = w.normalize_event(raw) - assert event is not None - assert event["role"] == "user" - assert event["content"] == "How do I sort a list?" 
- assert event["type"] == "message" - - -def test_normalize_human_role(tmp_path): - w, _, _ = _make_watcher(tmp_path) - raw = {"role": "human", "content": "What is Python?"} - event = w.normalize_event(raw) - assert event is not None - assert event["role"] == "user" - assert event["content"] == "What is Python?" - - -def test_normalize_human_turn_role(tmp_path): - w, _, _ = _make_watcher(tmp_path) - raw = {"role": "human_turn", "text": "Explain recursion"} - event = w.normalize_event(raw) - assert event is not None - assert event["role"] == "user" - assert event["content"] == "Explain recursion" - - -def test_normalize_ai_response(tmp_path): - w, _, _ = _make_watcher(tmp_path) - raw = {"role": "ai_response", "text": "Recursion is when a function calls itself."} - event = w.normalize_event(raw) - assert event is not None - assert event["role"] == "assistant" - assert event["content"] == "Recursion is when a function calls itself." - - -def test_normalize_assistant_role(tmp_path): - w, _, _ = _make_watcher(tmp_path) - raw = {"role": "assistant", "message": "Here is the answer."} - event = w.normalize_event(raw) - assert event is not None - assert event["role"] == "assistant" - assert event["content"] == "Here is the answer." 
- - -def test_normalize_ai_role(tmp_path): - w, _, _ = _make_watcher(tmp_path) - raw = {"role": "ai", "content": "AI generated response"} - event = w.normalize_event(raw) - assert event is not None - assert event["role"] == "assistant" - assert event["content"] == "AI generated response" - - -def test_normalize_bot_role(tmp_path): - w, _, _ = _make_watcher(tmp_path) - raw = {"role": "bot", "message": "Bot reply"} - event = w.normalize_event(raw) - assert event is not None - assert event["role"] == "assistant" - assert event["content"] == "Bot reply" - - -def test_normalize_non_chat_event_filtered(tmp_path): - w, _, _ = _make_watcher(tmp_path) - # System message should be filtered - raw = {"role": "system", "content": "System prompt"} - assert w.normalize_event(raw) is None - - # No role - raw = {"type": "info", "content": "some log"} - assert w.normalize_event(raw) is None - - # Empty content - raw = {"role": "user", "message": ""} - assert w.normalize_event(raw) is None - - -def test_normalize_timestamp_fields(tmp_path): - w, _, _ = _make_watcher(tmp_path) - raw = {"role": "user", "message": "test", "timestamp": "2024-01-15T10:30:00Z"} - event = w.normalize_event(raw) - assert event["timestamp"] == "2024-01-15T10:30:00Z" - - # Also check ts field - raw = {"role": "user", "message": "test", "ts": "2024-01-15T11:00:00Z"} - event = w.normalize_event(raw) - assert event["timestamp"] == "2024-01-15T11:00:00Z" - - -def test_normalize_session_id_mapping(tmp_path): - w, _, _ = _make_watcher(tmp_path) - raw = {"role": "user", "message": "test", "conversationId": "conv-123"} - event = w.normalize_event(raw) - assert event["session_id"] == "conv-123" - - raw = {"role": "user", "message": "test", "session_id": "sess-456"} - event = w.normalize_event(raw) - assert event["session_id"] == "sess-456" - - -def test_tool_name(tmp_path): - w, _, _ = _make_watcher(tmp_path) - assert w.tool_name == "cursor" - - -def test_process_file_integration(tmp_path): - w, batches, _ = 
_make_watcher(tmp_path, batch_trigger_lines=2) - - test_file = tmp_path / "cursor.log" - test_file.write_text( - '{"role": "user", "message": "Hello"}\n' - '{"role": "assistant", "message": "Hi there"}\n', - encoding="utf-8", - ) - - w._process_file(str(test_file)) - - assert len(batches) == 1 - assert len(batches[0]) == 2 - assert batches[0][0]["role"] == "user" - assert batches[0][0]["tool_name"] == "cursor" - assert batches[0][1]["role"] == "assistant" diff --git a/tests/daemon/test_multi_watcher_integration.py b/tests/daemon/test_multi_watcher_integration.py index ad641c39d9..23ffe2f95c 100644 --- a/tests/daemon/test_multi_watcher_integration.py +++ b/tests/daemon/test_multi_watcher_integration.py @@ -33,9 +33,9 @@ def update_cursor(self, file_path, position): # --- Registry Tests --- def test_all_watchers_registered(): - """All 6 built-in watchers should be registered.""" + """All 3 built-in watchers should be registered.""" available = list_available_watchers() - expected = {"claude_code", "generic_jsonl", "aider", "cursor", "continue_dev", "cursor_db"} + expected = {"claude_code", "generic_jsonl", "cursor_db"} assert expected.issubset(set(available)), f"Missing: {expected - set(available)}" @@ -47,19 +47,20 @@ def test_create_all_watchers(): file_patterns = { "claude_code": "*.jsonl", "generic_jsonl": "*.jsonl", - "aider": ".aider.chat.history.md", - "cursor": "*.log", - "continue_dev": "*.json", } for tool_name in list_available_watchers(): - watcher = create_watcher( + kwargs = dict( tool_name=tool_name, watch_dir="/tmp/test", cursor_manager=cm, batch_callback=lambda e: batches.append(e), - file_pattern=file_patterns.get(tool_name, "*.jsonl"), ) + if tool_name in file_patterns: + kwargs["file_pattern"] = file_patterns[tool_name] + if tool_name == "cursor_db": + kwargs["poll_interval"] = 60 + watcher = create_watcher(**kwargs) assert isinstance(watcher, BaseWatcher) assert watcher.tool_name == tool_name @@ -128,88 +129,6 @@ def 
test_generic_jsonl_custom_mapping(tmp_path): assert batches[0][0]["tool_name"] == "generic_jsonl" -def test_aider_multiline_parsing(tmp_path): - """Aider markdown format should produce user+assistant events.""" - batches = [] - cm = FakeCursorManager() - watcher = create_watcher( - tool_name="aider", - watch_dir=str(tmp_path), - cursor_manager=cm, - batch_callback=lambda e: batches.append(e), - batch_trigger_lines=100, - ) - - test_file = tmp_path / ".aider.chat.history.md" - test_file.write_text( - "# aider chat started at 2026-01-15 10:30:00\n\n" - "> /my/project\n\n" - "#### user:\n" - "How do I use fastapi?\n\n" - "#### assistant:\n" - "FastAPI is great for building APIs.\n" - ) - watcher._process_file(str(test_file)) - watcher.flush() - - assert len(batches) == 1 - events = batches[0] - assert len(events) == 2 - assert events[0]["role"] == "user" - assert events[0]["content"] == "How do I use fastapi?" - assert events[0]["tool_name"] == "aider" - assert events[1]["role"] == "assistant" - - -def test_cursor_log_parsing(tmp_path): - """Cursor log format should be normalized correctly.""" - batches = [] - cm = FakeCursorManager() - watcher = create_watcher( - tool_name="cursor", - watch_dir=str(tmp_path), - cursor_manager=cm, - batch_callback=lambda e: batches.append(e), - file_pattern="*.log", - batch_trigger_lines=1, - ) - - test_file = tmp_path / "cursor.log" - test_file.write_text( - '{"type": "chat", "role": "user", "message": "Fix this bug"}\n' - ) - watcher._process_file(str(test_file)) - - assert len(batches) == 1 - assert batches[0][0]["role"] == "user" - assert batches[0][0]["content"] == "Fix this bug" - assert batches[0][0]["tool_name"] == "cursor" - - -def test_continue_dev_log_parsing(tmp_path): - """Continue.dev JSON format should be normalized.""" - batches = [] - cm = FakeCursorManager() - watcher = create_watcher( - tool_name="continue_dev", - watch_dir=str(tmp_path), - cursor_manager=cm, - batch_callback=lambda e: batches.append(e), - 
file_pattern="*.json", - batch_trigger_lines=1, - ) - - test_file = tmp_path / "continue.json" - test_file.write_text( - '{"role": "user", "content": "Explain this code", "sessionId": "abc123"}\n' - ) - watcher._process_file(str(test_file)) - - assert len(batches) == 1 - assert batches[0][0]["role"] == "user" - assert batches[0][0]["tool_name"] == "continue_dev" - assert batches[0][0]["session_id"] == "abc123" - # --- Cross-Watcher ETL Compatibility --- @@ -222,9 +141,9 @@ def test_normalized_events_compatible_with_reconstructor(): "timestamp": "2026-01-15T10:00:00Z", "session_id": "s1", "project_name": "proj"}, {"role": "assistant", "content": "Answer from CC", "tool_name": "claude_code", "timestamp": "2026-01-15T10:00:01Z", "session_id": "s1", "project_name": "proj"}, - {"role": "user", "content": "Question from Aider", "tool_name": "aider", + {"role": "user", "content": "Question from CursorDB", "tool_name": "cursor_db", "timestamp": "2026-01-15T10:00:02Z"}, - {"role": "assistant", "content": "Answer from Aider", "tool_name": "aider", + {"role": "assistant", "content": "Answer from CursorDB", "tool_name": "cursor_db", "timestamp": "2026-01-15T10:00:03Z"}, ] @@ -233,7 +152,7 @@ def test_normalized_events_compatible_with_reconstructor(): assert len(turns) == 2 assert turns[0].user_prompt == "Question from CC" - assert turns[1].user_prompt == "Question from Aider" + assert turns[1].user_prompt == "Question from CursorDB" def test_normalized_events_compatible_with_filter(): @@ -244,9 +163,9 @@ def test_normalized_events_compatible_with_filter(): {"role": "user", "content": "A meaningful question about architecture", "tool_name": "claude_code"}, {"role": "user", "content": "npm install express", - "tool_name": "cursor"}, + "tool_name": "cursor_db"}, {"role": "assistant", "content": "Here is a detailed explanation of the design pattern", - "tool_name": "aider"}, + "tool_name": "generic_jsonl"}, ] f = LowValueFilter() @@ -261,9 +180,9 @@ def 
test_source_tool_propagated_through_pipeline(): from openviking.daemon.conversation_reconstructor import ConversationReconstructor events = [ - {"role": "user", "content": "How to use FastAPI?", "tool_name": "cursor", + {"role": "user", "content": "How to use FastAPI?", "tool_name": "cursor_db", "timestamp": "2026-01-15T10:00:00Z"}, - {"role": "assistant", "content": "Install FastAPI with pip...", "tool_name": "cursor", + {"role": "assistant", "content": "Install FastAPI with pip...", "tool_name": "cursor_db", "timestamp": "2026-01-15T10:00:01Z"}, ] @@ -271,7 +190,7 @@ def test_source_tool_propagated_through_pipeline(): turns = reconstructor.reconstruct(events) assert len(turns) == 1 - assert turns[0].source_tool == "cursor" + assert turns[0].source_tool == "cursor_db" def test_multi_watcher_config_effective_watchers(): @@ -283,14 +202,14 @@ def test_multi_watcher_config_effective_watchers(): enabled=True, watchers=[ WatcherConfig(tool_name="claude_code", watch_dir="/a"), - WatcherConfig(tool_name="aider", watch_dir="/b"), - WatcherConfig(tool_name="cursor", watch_dir="/c", enabled=False), + WatcherConfig(tool_name="generic_jsonl", watch_dir="/b"), + WatcherConfig(tool_name="cursor_db", watch_dir="/c", enabled=False), ], ) effective = cfg.get_effective_watchers() assert len(effective) == 2 # disabled watcher filtered out assert effective[0].tool_name == "claude_code" - assert effective[1].tool_name == "aider" + assert effective[1].tool_name == "generic_jsonl" # Backward compat: watch_dir only cfg2 = DaemonConfig(enabled=True, watch_dir="~/.claude/projects") @@ -315,10 +234,10 @@ def test_knowledge_router_uses_source_tool(): # With source_tool k1 = ExtractedKnowledge( status="EXTRACTED", category="skills", title="FastAPI Tips", - content="...", confidence=0.9, source_tool="cursor", + content="...", confidence=0.9, source_tool="cursor_db", ) uri1 = router.route(k1) - assert "cursor" in uri1 + assert "cursor_db" in uri1 assert "claude_code" not in uri1 # Without 
source_tool (fallback) diff --git a/tests/daemon/test_service_multi.py b/tests/daemon/test_service_multi.py index 60fe888eff..a84b997cdd 100644 --- a/tests/daemon/test_service_multi.py +++ b/tests/daemon/test_service_multi.py @@ -9,7 +9,7 @@ def test_service_stores_watcher_configs(): from openviking.daemon.service import DaemonService configs = [ WatcherConfig(tool_name="claude_code", watch_dir="/tmp/cc"), - WatcherConfig(tool_name="aider", watch_dir="/tmp/aider"), + WatcherConfig(tool_name="cursor_db", watch_dir="/tmp/cursor"), ] svc = DaemonService( resource_service=MagicMock(), From 3b8a63d0cbda66cb06efc2e24b114af3d07fb6a1 Mon Sep 17 00:00:00 2001 From: baobaodae <2014596548@qq.com> Date: Thu, 25 Jun 2026 21:57:23 +0800 Subject: [PATCH 6/6] fix(daemon): resolve CRLF cursor drift, file truncation, and quiet-session flush gap - Switch _process_file() to binary mode read for exact byte-offset cursors, eliminating CRLF drift on Windows (universal newline translation stripped \r) - Detect file truncation/rotation (file_size < cursor) and reset cursor to 0 instead of silently skipping the file forever - Add independent periodic flush timer thread so buffered events are flushed after batch_trigger_seconds even when no new file events arrive - Ensure final flush on stop() to prevent data loss during shutdown - Add 3 new tests: CRLF accuracy, truncation reset, periodic flush on quiet session --- .../daemon/watchers/base_file_watcher.py | 50 ++++++++-- tests/daemon/test_base_file_watcher.py | 95 +++++++++++++++++++ 2 files changed, 138 insertions(+), 7 deletions(-) diff --git a/openviking/daemon/watchers/base_file_watcher.py b/openviking/daemon/watchers/base_file_watcher.py index 2ea3760b76..4f62408c8d 100644 --- a/openviking/daemon/watchers/base_file_watcher.py +++ b/openviking/daemon/watchers/base_file_watcher.py @@ -6,6 +6,7 @@ Subclasses only need to implement parse_line() and normalize_event(). 
""" import os +import threading import time from abc import ABC, abstractmethod from pathlib import Path @@ -112,16 +113,33 @@ def matches_file_pattern(self, file_path: str) -> bool: return filename == self.file_pattern def start(self) -> None: - """Start the watchdog Observer.""" + """Start the watchdog Observer and periodic flush timer.""" self._handler = _FileHandler(self) self._observer = Observer() self._observer.schedule(self._handler, self.watch_dir, recursive=True) self._observer.daemon = True self._observer.start() + + # Periodic flush timer for time-based batch trigger + self._stop_event = threading.Event() + self._flush_thread = threading.Thread( + target=self._periodic_flush_loop, daemon=True + ) + self._flush_thread.start() + logger.info("[%s] Watcher started on %s", self.tool_name, self.watch_dir) def stop(self) -> None: - """Stop the watchdog Observer.""" + """Stop the watchdog Observer and periodic flush timer, flushing remaining data.""" + # Signal flush thread to stop and wait for it + if hasattr(self, "_stop_event"): + self._stop_event.set() + if hasattr(self, "_flush_thread") and self._flush_thread: + self._flush_thread.join(timeout=5) + + # Final flush to avoid losing buffered events on shutdown + self._flush_buffer() + if self._observer: self._observer.stop() self._observer.join(timeout=5) @@ -131,23 +149,41 @@ def flush(self) -> None: """Force flush the buffer.""" self._flush_buffer() + def _periodic_flush_loop(self): + """Background thread that periodically flushes the buffer based on time threshold.""" + while not self._stop_event.wait(timeout=self.batch_trigger_seconds): + if not self._buffer.is_empty(): + age = time.time() - self._buffer.created_at + if age >= self.batch_trigger_seconds: + logger.debug("[%s] Periodic flush: %d events (age %.0fs)", + self.tool_name, len(self._buffer.lines), age) + self._flush_buffer() + def _process_file(self, file_path: str): """Read new content from file using cursor, parse, normalize, buffer.""" try: 
cursor = self.cursor_manager.get_cursor(file_path) file_size = os.path.getsize(file_path) - if file_size <= cursor.last_position: + if file_size < cursor.last_position: + # File truncated or rotated — reset cursor to beginning + logger.warning("[%s] File truncated (size %d < cursor %d), resetting", + self.tool_name, file_size, cursor.last_position) + self.cursor_manager.update_cursor(file_path, 0) + return + if file_size == cursor.last_position: return - with open(file_path, "r", encoding="utf-8", errors="replace") as f: + # Use binary mode to get exact byte offsets (avoids CRLF drift on Windows) + with open(file_path, "rb") as f: f.seek(cursor.last_position) - new_content = f.read() + raw_bytes = f.read() - new_position = cursor.last_position + len(new_content.encode("utf-8")) + new_position = cursor.last_position + len(raw_bytes) + new_content = raw_bytes.decode("utf-8", errors="replace") logger.info("[%s] Processing %s: %d bytes new content from pos %d", - self.tool_name, file_path, len(new_content.encode("utf-8")), cursor.last_position) + self.tool_name, file_path, len(raw_bytes), cursor.last_position) event_count = 0 for line in new_content.splitlines(): diff --git a/tests/daemon/test_base_file_watcher.py b/tests/daemon/test_base_file_watcher.py index badefba21f..8ab404435b 100644 --- a/tests/daemon/test_base_file_watcher.py +++ b/tests/daemon/test_base_file_watcher.py @@ -161,3 +161,98 @@ def filter_event(self, event): assert len(batches) == 1 assert len(batches[0]) == 1 assert batches[0][0]["content"] == "keep this" + + +def test_crlf_byte_offset_accuracy(tmp_path): + """Binary mode read ensures cursor byte offset is exact even with CRLF line endings.""" + w, batches, cm = _make_watcher(tmp_path, batch_trigger_lines=100) + + test_file = tmp_path / "test.jsonl" + # Write CRLF-terminated lines (simulating Windows line endings) + with open(str(test_file), "wb") as f: + f.write(b'{"role": "user", "content": "hello"}\r\n') + f.write(b'{"role": "assistant", 
"content": "hi"}\r\n') + + w._process_file(str(test_file)) + + # Cursor should point to exact end of file (including \r\n bytes) + cursor = cm.get_cursor(str(test_file)) + actual_size = os.path.getsize(str(test_file)) + assert cursor.last_position == actual_size, ( + f"Cursor {cursor.last_position} != file size {actual_size} (CRLF drift)" + ) + + # Append more and verify incremental read still works + with open(str(test_file), "ab") as f: + f.write(b'{"role": "user", "content": "second"}\r\n') + w._process_file(str(test_file)) + + cursor2 = cm.get_cursor(str(test_file)) + assert cursor2.last_position == os.path.getsize(str(test_file)) + + w.flush() + assert len(batches) == 1 + assert len(batches[0]) == 3 + + +def test_file_truncation_resets_cursor(tmp_path): + """When a file is truncated/rotated (size < cursor), cursor resets to 0.""" + w, batches, cm = _make_watcher(tmp_path, batch_trigger_lines=100) + + test_file = tmp_path / "test.jsonl" + test_file.write_text( + '{"role": "user", "content": "first line"}\n' + '{"role": "assistant", "content": "first response"}\n' + ) + w._process_file(str(test_file)) + + # Cursor should be at end of file + cursor = cm.get_cursor(str(test_file)) + assert cursor.last_position > 0 + + # Flush to clear buffer from first read + w.flush() + batches.clear() + + # Simulate file truncation/rotation: rewrite with shorter content + test_file.write_text('{"role": "user", "content": "new"}\n') + + # First call detects truncation and resets cursor to 0 + w._process_file(str(test_file)) + cursor_reset = cm.get_cursor(str(test_file)) + assert cursor_reset.last_position == 0 + + # Second call reads from beginning + w._process_file(str(test_file)) + cursor2 = cm.get_cursor(str(test_file)) + assert cursor2.last_position == os.path.getsize(str(test_file)) + + w.flush() + assert len(batches) == 1 + assert batches[0][0]["content"] == "new" + + +def test_periodic_flush_on_quiet_session(tmp_path): + """Periodic flush thread should auto-flush buffered 
events after timeout.""" + w, batches, _ = _make_watcher( + tmp_path, batch_trigger_lines=100, batch_trigger_seconds=1 + ) + + test_file = tmp_path / "test.jsonl" + test_file.write_text('{"role": "user", "content": "lonely message"}\n') + w._process_file(str(test_file)) + + # Not flushed yet (line threshold not reached) + assert len(batches) == 0 + + # Start the watcher (launches periodic flush thread) + w.start() + try: + # Wait for the periodic flush (1 second trigger + buffer) + time.sleep(2.5) + finally: + w.stop() + + # The periodic flush thread should have flushed the buffer + assert len(batches) == 1 + assert batches[0][0]["content"] == "lonely message"