From 52b0a34f9fdc1097070140a1c9f58f4f6eb14f2a Mon Sep 17 00:00:00 2001
From: baobaodae <2014596548@qq.com>
Date: Wed, 17 Jun 2026 23:27:36 +0800
Subject: [PATCH 1/6] feat: Active Daemon - multi-source AI tool log watcher
framework
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- Background daemon monitors AI coding tool logs (Claude Code, Aider,
Cursor, Continue.dev, generic JSONL) via watchdog filesystem observers
- Incremental ETL pipeline: watch → filter → reconstruct → LLM extract
→ deduplicate → route → viking:// storage
- Watcher abstraction with BaseFileWatcher ABC and registry-based factory
- SQLite cursor persistence for incremental file reads across restarts
- Server integration: GET /api/v1/daemon/status, --with-daemon CLI flag
- Web UI: daemon status card on home dashboard with per-watcher metrics
- Full test suite (18 test files) and documentation
---
Echo2.md | 429 ++++++
docs/daemon/README.md | 61 +
docs/daemon/configuration.md | 50 +
docs/daemon/multi-source-design.md | 1227 +++++++++++++++++
examples/daemon/docker-compose-daemon.yml | 14 +
examples/daemon/start-daemon.bat | 10 +
examples/daemon/start-daemon.sh | 12 +
openviking/daemon/__init__.py | 7 +
.../daemon/conversation_reconstructor.py | 55 +
openviking/daemon/cursor_manager.py | 92 ++
openviking/daemon/deduplicator.py | 35 +
openviking/daemon/etl_pipeline.py | 64 +
openviking/daemon/filters.py | 47 +
openviking/daemon/knowledge_extractor.py | 121 ++
openviking/daemon/knowledge_router.py | 56 +
openviking/daemon/models.py | 88 ++
openviking/daemon/service.py | 190 +++
openviking/daemon/storage_adapter.py | 155 +++
openviking/daemon/watchers/__init__.py | 27 +
openviking/daemon/watchers/aider_watcher.py | 148 ++
.../daemon/watchers/base_file_watcher.py | 192 +++
.../daemon/watchers/claude_code_watcher.py | 60 +
.../daemon/watchers/continue_dev_watcher.py | 68 +
openviking/daemon/watchers/cursor_watcher.py | 87 ++
.../daemon/watchers/generic_jsonl_watcher.py | 104 ++
openviking/daemon/watchers/registry.py | 72 +
openviking/server/app.py | 39 +
openviking/server/bootstrap.py | 10 +-
openviking/server/config.py | 82 +-
openviking/server/routers/__init__.py | 2 +
openviking/server/routers/daemon.py | 81 ++
tests/daemon/__init__.py | 0
tests/daemon/test_aider_watcher.py | 183 +++
tests/daemon/test_base_file_watcher.py | 163 +++
tests/daemon/test_claude_code_watcher.py | 134 ++
tests/daemon/test_config.py | 85 ++
tests/daemon/test_continue_dev_watcher.py | 193 +++
.../daemon/test_conversation_reconstructor.py | 58 +
tests/daemon/test_cursor_manager.py | 61 +
tests/daemon/test_cursor_watcher.py | 187 +++
tests/daemon/test_deduplicator.py | 42 +
tests/daemon/test_filters.py | 53 +
tests/daemon/test_generic_jsonl_watcher.py | 156 +++
tests/daemon/test_integration.py | 189 +++
tests/daemon/test_knowledge_extractor.py | 117 ++
tests/daemon/test_knowledge_router.py | 73 +
.../daemon/test_multi_watcher_integration.py | 330 +++++
tests/daemon/test_registry.py | 31 +
tests/daemon/test_service_multi.py | 31 +
web-studio/src/i18n/locales/en.ts | 15 +
web-studio/src/i18n/locales/zh-CN.ts | 15 +
.../home/-components/daemon-status-card.tsx | 186 +++
web-studio/src/routes/home/route.tsx | 3 +
53 files changed, 5988 insertions(+), 2 deletions(-)
create mode 100644 Echo2.md
create mode 100644 docs/daemon/README.md
create mode 100644 docs/daemon/configuration.md
create mode 100644 docs/daemon/multi-source-design.md
create mode 100644 examples/daemon/docker-compose-daemon.yml
create mode 100644 examples/daemon/start-daemon.bat
create mode 100644 examples/daemon/start-daemon.sh
create mode 100644 openviking/daemon/__init__.py
create mode 100644 openviking/daemon/conversation_reconstructor.py
create mode 100644 openviking/daemon/cursor_manager.py
create mode 100644 openviking/daemon/deduplicator.py
create mode 100644 openviking/daemon/etl_pipeline.py
create mode 100644 openviking/daemon/filters.py
create mode 100644 openviking/daemon/knowledge_extractor.py
create mode 100644 openviking/daemon/knowledge_router.py
create mode 100644 openviking/daemon/models.py
create mode 100644 openviking/daemon/service.py
create mode 100644 openviking/daemon/storage_adapter.py
create mode 100644 openviking/daemon/watchers/__init__.py
create mode 100644 openviking/daemon/watchers/aider_watcher.py
create mode 100644 openviking/daemon/watchers/base_file_watcher.py
create mode 100644 openviking/daemon/watchers/claude_code_watcher.py
create mode 100644 openviking/daemon/watchers/continue_dev_watcher.py
create mode 100644 openviking/daemon/watchers/cursor_watcher.py
create mode 100644 openviking/daemon/watchers/generic_jsonl_watcher.py
create mode 100644 openviking/daemon/watchers/registry.py
create mode 100644 openviking/server/routers/daemon.py
create mode 100644 tests/daemon/__init__.py
create mode 100644 tests/daemon/test_aider_watcher.py
create mode 100644 tests/daemon/test_base_file_watcher.py
create mode 100644 tests/daemon/test_claude_code_watcher.py
create mode 100644 tests/daemon/test_config.py
create mode 100644 tests/daemon/test_continue_dev_watcher.py
create mode 100644 tests/daemon/test_conversation_reconstructor.py
create mode 100644 tests/daemon/test_cursor_manager.py
create mode 100644 tests/daemon/test_cursor_watcher.py
create mode 100644 tests/daemon/test_deduplicator.py
create mode 100644 tests/daemon/test_filters.py
create mode 100644 tests/daemon/test_generic_jsonl_watcher.py
create mode 100644 tests/daemon/test_integration.py
create mode 100644 tests/daemon/test_knowledge_extractor.py
create mode 100644 tests/daemon/test_knowledge_router.py
create mode 100644 tests/daemon/test_multi_watcher_integration.py
create mode 100644 tests/daemon/test_registry.py
create mode 100644 tests/daemon/test_service_multi.py
create mode 100644 web-studio/src/routes/home/-components/daemon-status-card.tsx
diff --git a/Echo2.md b/Echo2.md
new file mode 100644
index 0000000000..3f18cb473d
--- /dev/null
+++ b/Echo2.md
@@ -0,0 +1,429 @@
+# Echo II: Multi-Source Watcher Extension
+
+> 从单一 Claude Code 监听到多工具 AI 编码助手日志的统一监听与知识提取框架
+
+---
+
+## 一、动机与背景
+
+Active Daemon 的 v1 实现仅支持 Claude Code JSONL 日志监听。然而现实中,开发者的日常工具栈远不止于此:
+
+- **Claude Code CLI** — JSONL 日志(已实现)
+- **Cursor** — 日志文件 + SQLite 数据库
+- **Aider** — `.aider.chat.history.md` Markdown 文件
+- **GitHub Copilot** — VS Code workspace 日志
+- **Continue.dev** — `~/.continue/` 下的 JSON 日志
+- **Windsurf / Cascade** — 类 Cursor 的日志目录
+
+Echo.md 原文已明确提到需要支持 Cursor(SQLite polling)和 OpenCode(rotating text logs),但 v1 仅完成了最简单的 Claude Code 路径。本次扩展的核心目标是:**将 Daemon 从单源监听器升级为多源监听框架**,使开发者可以自由配置任意数量的工具监听路径。
+
+---
+
+## 二、目标工具可行性分析
+
+| 工具 | 存储格式 | 驱动模式 | 可行性 | 优先级 |
+|------|----------|----------|--------|--------|
+| **Claude Code** | JSONL append-only | 文件增量监听 | ★★★★★ 已实现 | P0 |
+| **Aider** | Markdown `.aider.chat.history.md` | 文件增量监听 | ★★★★☆ | P1 |
+| **Cursor** | SQLite `state.vscdb` + 日志 | DB polling + 文件监听 | ★★★☆☆ | P1 |
+| **Continue.dev** | JSON 日志 | 文件增量监听 | ★★★★☆ | P2 |
+| **GitHub Copilot** | VS Code workspace logs | 文件增量监听 | ★★★☆☆ | P2 |
+| **Generic JSONL** | 任意 JSONL | 文件增量监听 | ★★★★★ | P1 |
+
+### 驱动模式分类
+
+根据日志格式差异,Watcher 分为两种驱动模式:
+
+**File-Append Watcher**(文件追加监听)
+- 使用 watchdog 监听文件变更事件
+- 通过 FileCursor(byte offset)实现增量读取
+- 适用于:Claude Code、Aider、Continue.dev、Generic JSONL
+
+**Database Polling Watcher**(数据库轮询监听)
+- 定时轮询 SQLite 数据库
+- 通过 timestamp cursor 实现增量查询
+- 适用于:Cursor(state.vscdb)
+
+---
+
+## 三、架构设计
+
+### 3.1 核心原则
+
+**每个 Watcher 负责将自己的原始事件归一化为统一格式**,下游 ETL pipeline 完全不需要感知工具差异。
+
+```
+ ┌─ ClaudeCodeWatcher ──┐
+ │ │
+AiderWatcher ────────┤ ├──→ normalized events ──→ ETL Pipeline ──→ viking://
+ │ (shared queue) │
+GenericJSONLWatcher──┘ │
+ │
+CursorWatcher (polling) ────────────────────┘
+```
+
+### 3.2 归一化事件格式 (NormalizedEvent)
+
+所有 Watcher 将原始日志转换为以下统一格式的 dict:
+
+```python
+{
+ "role": "user" | "assistant", # 必须
+ "content": "...", # 必须,消息正文
+ "type": "message", # 必须,固定 "message"
+ "timestamp": "2026-06-16T10:30:00Z", # 可选
+ "session_id": "...", # 可选,工具特定
+ "project_name": "...", # 可选
+ "tool_name": "claude_code", # 必须,来源工具标识
+}
+```
+
+新增 `tool_name` 字段用于区分来源。现有的 `ConversationReconstructor`、`LowValueFilter`、`KnowledgeExtractor` 均通过 `event.get("role")` / `event.get("content")` 读取,**完全兼容**。
+
+### 3.3 BaseWatcher Protocol
+
+```python
+# openviking/daemon/watchers/__init__.py
+
+from typing import Protocol, runtime_checkable
+
+@runtime_checkable
+class BaseWatcher(Protocol):
+ """All tool-specific watchers must implement this interface."""
+
+ @property
+ def tool_name(self) -> str: ...
+
+ def start(self) -> None: ...
+ def stop(self) -> None: ...
+ def flush(self) -> None: ...
+```
+
+### 3.4 BaseFileWatcher 抽象基类
+
+为 File-Append 模式的 Watcher 提取公共逻辑(watchdog Observer + 增量读取 + 游标管理 + 批处理缓冲):
+
+```python
+# openviking/daemon/watchers/base_file_watcher.py
+
+class BaseFileWatcher(ABC):
+ """Abstract base for file-append based watchers."""
+
+ def __init__(self, watch_dir, cursor_manager, batch_callback,
+ file_pattern, batch_trigger_lines, batch_trigger_seconds):
+ ...
+
+ @abstractmethod
+ def parse_line(self, line: str) -> Optional[Dict]:
+ """Parse a raw line into a raw event dict. Tool-specific."""
+ ...
+
+ @abstractmethod
+ def normalize_event(self, raw_event: Dict) -> Optional[Dict]:
+ """Convert raw event to NormalizedEvent format. Tool-specific."""
+ ...
+
+ def filter_event(self, event: Dict) -> bool:
+ """Optional: additional tool-specific filtering. Default: True."""
+ return True
+
+ # --- 公共逻辑(不需要子类重写) ---
+ def start(self) -> None: ... # 创建 Observer + Handler
+ def stop(self) -> None: ...
+ def flush(self) -> None: ...
+```
+
+子类只需实现 `parse_line()` 和 `normalize_event()` 两个方法。
+
+### 3.5 Watcher 注册表 (WatcherRegistry)
+
+```python
+# openviking/daemon/watchers/registry.py
+
+WATCHER_REGISTRY: Dict[str, Type[BaseWatcher]] = {
+ "claude_code": ClaudeCodeWatcher,
+ "aider": AiderWatcher,
+ "cursor": CursorWatcher,
+ "continue_dev": ContinueDevWatcher,
+ "generic_jsonl": GenericJSONLWatcher,
+}
+
+def create_watcher(tool_name: str, **kwargs) -> BaseWatcher:
+ """Factory function to create a watcher by tool name."""
+ cls = WATCHER_REGISTRY.get(tool_name)
+ if not cls:
+ raise ValueError(f"Unknown tool: {tool_name}. Available: {list(WATCHER_REGISTRY.keys())}")
+ return cls(**kwargs)
+```
+
+---
+
+## 四、配置设计
+
+### 4.1 新配置结构
+
+`DaemonConfig` 从单一 `watch_dir` 演进为支持 `watchers` 列表:
+
+```python
+class WatcherConfig(BaseModel):
+ tool_name: str # 必填:工具标识
+ watch_dir: str # 必填:监听路径
+ file_pattern: str = "*.jsonl" # 可选:文件匹配模式
+ enabled: bool = True # 可选:是否启用
+ batch_trigger_lines: int = Field(50, gt=0) # 可选:批处理行数
+ batch_trigger_seconds: int = Field(300, gt=0) # 可选:批处理秒数
+ extra: Dict[str, Any] = Field(default_factory=dict) # 可选:工具特定参数
+
+class DaemonConfig(BaseModel):
+ enabled: bool = False
+ db_path: Optional[str] = None
+ watchers: List[WatcherConfig] = Field(default_factory=list)
+
+ # 向后兼容:保留旧字段
+ watch_dir: Optional[str] = None # deprecated
+ batch_trigger_lines: int = Field(50, gt=0) # deprecated, 全局默认值
+ batch_trigger_seconds: int = Field(300, gt=0) # deprecated, 全局默认值
+
+ def get_effective_watchers(self) -> List[WatcherConfig]:
+ """返回生效的 watcher 列表:优先使用 watchers 中已启用的项;若 watchers 为空,则由 watch_dir(未设置时使用默认的 ~/.claude/projects)构造单个 Claude Code watcher。"""
+ if self.watchers:
+ return [w for w in self.watchers if w.enabled]
+ elif self.watch_dir:
+ return [WatcherConfig(
+ tool_name="claude_code",
+ watch_dir=self.watch_dir,
+ batch_trigger_lines=self.batch_trigger_lines,
+ batch_trigger_seconds=self.batch_trigger_seconds,
+ )]
+ else:
+ return [WatcherConfig(
+ tool_name="claude_code",
+ watch_dir=str(Path.home() / ".claude" / "projects"),
+ batch_trigger_lines=self.batch_trigger_lines,
+ batch_trigger_seconds=self.batch_trigger_seconds,
+ )]
+```
+
+### 4.2 ov.conf 示例
+
+```json
+{
+ "server": {
+ "daemon": {
+ "enabled": true,
+ "db_path": "~/.openviking/daemon/cursors.db",
+ "watchers": [
+ {
+ "tool_name": "claude_code",
+ "watch_dir": "~/.claude/projects",
+ "file_pattern": "*.jsonl"
+ },
+ {
+ "tool_name": "aider",
+ "watch_dir": "~/Projects",
+ "file_pattern": ".aider.chat.history.md"
+ },
+ {
+ "tool_name": "cursor",
+ "watch_dir": "~/.cursor",
+ "extra": {"poll_interval_seconds": 60}
+ },
+ {
+ "tool_name": "generic_jsonl",
+ "watch_dir": "~/ai-logs/copilot",
+ "file_pattern": "*.jsonl",
+ "extra": {"role_field": "author", "assistant_role_value": "copilot"}
+ }
+ ]
+ }
+ }
+}
+```
+
+### 4.3 环境变量
+
+| 变量 | 说明 |
+|------|------|
+| `OV_DAEMON_ENABLED` | 全局开关 |
+| `OV_DAEMON_WATCHERS` | JSON 格式的 watchers 列表(覆盖 ov.conf) |
+
+---
+
+## 五、实现计划
+
+### Batch 1:基础抽象层(3 文件 + 6 tests)
+
+| 任务 | 文件 | 说明 |
+|------|------|------|
+| 1.1 | `watchers/__init__.py` | 定义 `BaseWatcher` Protocol |
+| 1.2 | `watchers/base_file_watcher.py` | 抽象基类,提取 ClaudeCodeWatcher 的公共逻辑 |
+| 1.3 | `watchers/registry.py` | Watcher 注册表 + `create_watcher()` 工厂 |
+| 1.4 | `tests/daemon/test_base_file_watcher.py` | BaseFileWatcher 的 parse/normalize/buffer 测试 |
+| 1.5 | `tests/daemon/test_registry.py` | 注册表查找、未知工具报错测试 |
+
+### Batch 2:重构 ClaudeCodeWatcher + GenericJSONLWatcher(3 文件 + 8 tests)
+
+| 任务 | 文件 | 说明 |
+|------|------|------|
+| 2.1 | `watchers/claude_code_watcher.py` | 重构为继承 BaseFileWatcher,实现 parse_line/normalize_event |
+| 2.2 | `watchers/generic_jsonl_watcher.py` | 通用 JSONL Watcher,支持自定义字段映射 |
+| 2.3 | `models.py` | 在 BatchBuffer 中增加 `tool_name` 字段 |
+| 2.4 | `tests/daemon/test_claude_code_watcher.py` | 更新已有测试(保持兼容) |
+| 2.5 | `tests/daemon/test_generic_jsonl_watcher.py` | GenericJSONLWatcher 的解析/归一化/字段映射测试 |
+
+### Batch 3:AiderWatcher + CursorWatcher + ContinueDevWatcher(3 文件 + 10 tests)
+
+| 任务 | 文件 | 说明 |
+|------|------|------|
+| 3.1 | `watchers/aider_watcher.py` | Aider Markdown 历史解析 → 归一化事件 |
+| 3.2 | `watchers/cursor_watcher.py` | Cursor SQLite polling + 日志文件监听 |
+| 3.3 | `watchers/continue_dev_watcher.py` | Continue.dev JSON 日志监听 |
+| 3.4 | `tests/daemon/test_aider_watcher.py` | Markdown 解析、对话重组、增量读取测试 |
+| 3.5 | `tests/daemon/test_cursor_watcher.py` | SQLite polling、增量查询测试 |
+| 3.6 | `tests/daemon/test_continue_dev_watcher.py` | JSON 日志解析测试 |
+
+### Batch 4:配置 + 服务层重构(6 文件 + 6 tests)
+
+| 任务 | 文件 | 说明 |
+|------|------|------|
+| 4.1 | `server/config.py` | 新增 WatcherConfig,重构 DaemonConfig |
+| 4.2 | `daemon/service.py` | 单 watcher → watchers 列表,使用 registry 创建 |
+| 4.3 | `daemon/knowledge_router.py` | URI 中包含 tool_name:`viking://skills/{tool_name}/...` |
+| 4.4 | `daemon/storage_adapter.py` | Markdown 模板中 Source 字段参数化 |
+| 4.5 | `daemon/models.py` | ExtractedKnowledge 增加 `source_tool` 字段 |
+| 4.6 | `server/bootstrap.py` | 环境变量 OV_DAEMON_WATCHERS 支持 |
+| 4.7 | `tests/daemon/test_config.py` | WatcherConfig 验证 + 向后兼容测试 |
+| 4.8 | `tests/daemon/test_service_multi.py` | 多 watcher 启动/停止/路由测试 |
+
+### Batch 5:API + Web Studio(3 文件 + 0 tests)
+
+| 任务 | 文件 | 说明 |
+|------|------|------|
+| 5.1 | `server/routers/daemon.py` | 扩展 status 端点:返回每个 watcher 的状态 |
+| 5.2 | `web-studio/.../daemon-status-card.tsx` | 重构为多 watcher 视图:表格 + 状态 |
+| 5.3 | `web-studio/.../i18n` | 多 watcher 相关翻译 |
+
+### Batch 6:集成测试 + 收尾(2 文件)
+
+| 任务 | 文件 | 说明 |
+|------|------|------|
+| 6.1 | `tests/daemon/test_multi_watcher_integration.py` | 多 watcher → ETL → storage 全链路 |
+| 6.2 | 文档更新 | README、Echo.md 引用更新 |
+
+---
+
+## 六、关键设计决策
+
+### 6.1 归一化 vs 分支管道
+
+**选择:Watcher 层归一化**,而非 ETL 层分支。
+
+理由:
+- ETL Pipeline 的四个阶段(Filter → Reconstruct → Extract → Deduplicate)已经设计为通用的
+- 如果在 ETL 层引入工具分支,每个阶段都需要 if/else,复杂度 O(tools × stages)
+- Watcher 层归一化后,新增工具只需实现 `parse_line()` + `normalize_event()`,ETL 零修改
+
+### 6.2 向后兼容策略
+
+旧的 `DaemonConfig` 字段(`watch_dir`、`batch_trigger_lines`、`batch_trigger_seconds`)保留为 deprecated,`get_effective_watchers()` 方法自动将旧配置转换为单个 `WatcherConfig`。这确保现有用户的 ov.conf 不需要修改。
+
+### 6.3 Cursor SQLite Polling
+
+Cursor 使用 SQLite `state.vscdb` 存储对话历史。与文件监听不同,需要:
+- 定时轮询(默认 60 秒)
+- 通过 `last_timestamp` 游标实现增量查询
+- 复用 `CursorManager` 存储轮询位置
+
+这是一个独立于 watchdog 的驱动模式,但通过实现相同的 `BaseWatcher` Protocol,对 DaemonService 完全透明。
+
+### 6.4 GenericJSONLWatcher 字段映射
+
+为了支持任意 JSONL 日志(Copilot、自研工具等),GenericJSONLWatcher 支持通过 `extra` 配置自定义字段映射:
+
+```json
+{
+ "tool_name": "generic_jsonl",
+ "watch_dir": "~/ai-logs/my-tool",
+ "extra": {
+ "role_field": "author",
+ "user_role_value": "human",
+ "assistant_role_value": "ai",
+ "content_field": "text",
+ "timestamp_field": "ts"
+ }
+}
+```
+
+默认值兼容 Claude Code 格式(`role` / `user` / `assistant` / `content` / `timestamp`)。
+
+---
+
+## 七、API 变更
+
+### 7.1 GET /api/v1/daemon/status(增强)
+
+```json
+{
+ "enabled": true,
+ "running": true,
+ "watchers": [
+ {
+ "tool_name": "claude_code",
+ "watch_dir": "/Users/xxx/.claude/projects",
+ "enabled": true,
+ "running": true,
+ "cursor_count": 12,
+ "batch_trigger_lines": 50,
+ "batch_trigger_seconds": 300
+ },
+ {
+ "tool_name": "aider",
+ "watch_dir": "/Users/xxx/Projects",
+ "enabled": true,
+ "running": true,
+ "cursor_count": 3,
+ "batch_trigger_lines": 50,
+ "batch_trigger_seconds": 300
+ }
+ ],
+ "db_path": "...",
+ "available_tools": ["claude_code", "aider", "cursor", "continue_dev", "generic_jsonl"]
+}
+```
+
+---
+
+## 八、Web Studio 变更
+
+Home 页面 DaemonStatusCard 从单状态卡片升级为多 Watcher 表格视图:
+
+- 顶部:全局状态(enabled/running)+ watcher 数量统计
+- 中部:Watcher 列表表格(工具名、路径、状态、游标数、最后活动)
+- 底部:可用工具列表(available_tools)
+
+每 30 秒自动刷新。
+
+---
+
+## 九、风险与缓解
+
+| 风险 | 影响 | 缓解措施 |
+|------|------|----------|
+| Cursor SQLite schema 变更 | 解析失败 | 防御性编程 + 版本检测 + 降级到日志文件监听 |
+| 多 watcher 并发写入 SQLite cursor DB | 竞态条件 | SQLite WAL mode + 每个 watcher 独立 cursor key |
+| Aider Markdown 格式不稳定 | 解析错误 | 宽松正则 + 跳过不可解析段落 + 详细日志 |
+| 大量 watcher 导致 ETL 队列拥堵 | 延迟增加 | Queue 大小限制 + 背压 + 优先级队列 |
+
+---
+
+## 十、里程碑
+
+| 阶段 | 内容 | 预计 commits |
+|------|------|-------------|
+| Batch 1 | 基础抽象层 | 3-4 |
+| Batch 2 | 重构 + GenericJSONL | 4-5 |
+| Batch 3 | Aider + Cursor + Continue | 5-6 |
+| Batch 4 | 配置 + 服务层 | 4-5 |
+| Batch 5 | API + Web Studio | 3-4 |
+| Batch 6 | 集成 + 收尾 | 2-3 |
+| **总计** | | **21-27 commits** |
diff --git a/docs/daemon/README.md b/docs/daemon/README.md
new file mode 100644
index 0000000000..a7e5173c1f
--- /dev/null
+++ b/docs/daemon/README.md
@@ -0,0 +1,61 @@
+# OpenViking Active Daemon
+
+自动监听 Claude Code 会话日志,提取知识并写入 OpenViking 知识库。
+
+## 快速开始
+
+### 启用 Daemon
+
+设置环境变量:
+
+```bash
+export OV_DAEMON_ENABLED=true
+export OV_DAEMON_WATCH_DIR=~/.claude/projects # 监听目录(可选)
+export OV_DAEMON_BATCH_LINES=50 # 批处理触发行数(可选)
+export OV_DAEMON_BATCH_SECONDS=300 # 批处理触发秒数(可选)
+```
+
+### 启动 OpenViking Server
+
+```bash
+openviking serve
+```
+
+Daemon 会在服务器启动时自动运行(如果 `OV_DAEMON_ENABLED=true`)。
+
+## 工作原理
+
+1. **文件监听** — 监控 `~/.claude/projects/` 下的 `.jsonl` 文件变化
+2. **增量读取** — 文件游标技术,只处理新增内容
+3. **批量处理** — 累积 50 行或 5 分钟后触发 ETL 管道
+4. **知识提取** — 调用 LLM 过滤噪声,提取有价值的知识
+5. **自动存储** — 写入 `viking://skills/`、`viking://memories/`、`viking://resources/`
+
+## 知识分类
+
+| 类型 | 目标路径 | 说明 |
+|------|---------|------|
+| Skills | `viking://skills/claude_code/{name}.md` | 可复用的操作指南 |
+| Memories (有项目) | `viking://memories/projects/{project}/decisions.md` | 项目决策日志 |
+| Memories (无项目) | `viking://memories/global/{name}.md` | 全局记忆 |
+| Resources | `viking://resources/{category}/{name}.md` | 参考资源 |
+
+## 架构
+
+```
+Claude Code JSONL → File Watcher → Batch Buffer → Filter → Reconstruct
+ → LLM Extract → Deduplicate → Route → viking:// Storage
+```
+
+## 故障排查
+
+### Daemon 未启动
+检查日志中是否有 `Active Daemon is disabled` 消息,确认 `OV_DAEMON_ENABLED=true`。
+
+### 没有提取到知识
+- 确认 Claude Code 正在写入 JSONL 文件(`~/.claude/projects/` 下有 `.jsonl` 文件)
+- 对话内容可能不够有价值(简单问答会被过滤)
+- 查看日志中的 ETL 处理信息
+
+### 知识写入失败
+检查 OpenViking ResourceService 是否正常运行。
diff --git a/docs/daemon/configuration.md b/docs/daemon/configuration.md
new file mode 100644
index 0000000000..169bf341e1
--- /dev/null
+++ b/docs/daemon/configuration.md
@@ -0,0 +1,50 @@
+# Active Daemon 配置指南
+
+## 环境变量
+
+| 变量 | 说明 | 默认值 |
+|------|------|--------|
+| `OV_DAEMON_ENABLED` | 启用 Daemon | `false` |
+| `OV_DAEMON_WATCH_DIR` | 监听目录 | `~/.claude/projects` |
+| `OV_DAEMON_DB_PATH` | 游标数据库路径 | `~/.qoderworkcn/openviking/daemon_cursors.db` |
+| `OV_DAEMON_BATCH_LINES` | 批处理触发行数 | `50` |
+| `OV_DAEMON_BATCH_SECONDS` | 批处理触发秒数 | `300` |
+
+## JSON 配置 (ov.conf)
+
+```json
+{
+ "daemon": {
+ "enabled": true,
+ "watch_dir": "~/.claude/projects",
+ "batch_trigger_lines": 50,
+ "batch_trigger_seconds": 300
+ }
+}
+```
+
+## Docker 部署
+
+```yaml
+services:
+ openviking:
+ image: openviking:latest
+ environment:
+ - OV_DAEMON_ENABLED=true
+ - OV_DAEMON_WATCH_DIR=/data/claude-projects
+ - OV_DAEMON_DB_PATH=/data/daemon.db
+ volumes:
+ - ./claude-projects:/data/claude-projects
+ - ./daemon-data:/data
+ ports:
+ - "1933:1933"
+```
+
+## 日志
+
+Daemon 使用 OpenViking 标准日志系统。关键日志:
+
+- `Claude Code watcher started on ...` — 监听器启动
+- `Flushing batch with N events` — 批处理触发
+- `Extracted N knowledge items` — 知识提取完成
+- `Knowledge ingested: viking://...` — 知识写入成功
diff --git a/docs/daemon/multi-source-design.md b/docs/daemon/multi-source-design.md
new file mode 100644
index 0000000000..abf3828a09
--- /dev/null
+++ b/docs/daemon/multi-source-design.md
@@ -0,0 +1,1227 @@
+# Daemon 多源监听拓展设计
+
+| 项目 | 信息 |
+|-----|------|
+| 状态 | `草案` |
+| 创建日期 | 2026-06-16 |
+| 基线分支 | `feature/active-daemon` |
+
+---
+
+## 概述
+
+将 Active Daemon 从单一 Claude Code CLI 监听源拓展为支持多个可配置监听源的通用知识采集框架。核心变更包括:引入 Watcher 抽象层、多源配置模型、Claude Desktop 专用 Watcher、以及 Source-aware 的知识路由。
+
+---
+
+## 目录
+
+- [背景与问题](#背景与问题)
+- [目标与非目标](#目标与非目标)
+- [架构设计](#架构设计)
+- [核心抽象](#核心抽象)
+- [配置设计](#配置设计)
+- [各 Watcher 实现](#各-watcher-实现)
+- [ETL 管道适配](#etl-管道适配)
+- [路由与存储适配](#路由与存储适配)
+- [文件变更清单](#文件变更清单)
+- [分阶段实施](#分阶段实施)
+- [风险与权衡](#风险与权衡)
+- [验证方案](#验证方案)
+
+---
+
+## 背景与问题
+
+### 当前架构
+
+Daemon 目前仅支持一个监听源:Claude Code CLI 的 JSONL 会话日志(`~/.claude/projects/`)。架构是单 Watcher 硬编码:
+
+```
+DaemonService
+ └── ClaudeCodeWatcher(watch_dir="~/.claude/projects")
+ └── ClaudeCodeLogHandler → BatchBuffer → asyncio.Queue
+ ↓
+ BatchETLPipeline (Filter → Reconstruct → Extract → Dedup)
+ ↓
+ VikingStorageAdapter → viking://skills/claude_code/*
+```
+
+### 问题
+
+| 问题 | 说明 |
+|-----|------|
+| 单源硬编码 | `DaemonService` 只实例化一个 `ClaudeCodeWatcher`,无法扩展 |
+| 无抽象基类 | `watchers/__init__.py` 为空,没有 Watcher 协议定义 |
+| 路由硬编码 | `KnowledgeRouter` 把 skills 统一路由到 `viking://skills/claude_code/`,不区分来源 |
+| 模板硬编码 | `VikingStorageAdapter._format_skill()` 固定写 `Source: Claude Code session` |
+| 配置扁平 | `DaemonConfig` 只有 `watch_dir`(单路径),无法表达多源 |
+| Claude Desktop 未覆盖 | Claude Desktop 的会话存储在 IndexedDB 中(`%APPDATA%\Claude\`),当前完全忽略 |
+
+### 动机
+
+用户可能同时使用多个 AI 编码工具(Claude Code CLI、Claude Desktop、Cursor、Aider 等),每个工具产生不同格式的日志。Daemon 应能从所有这些源中提取知识,统一汇入 OpenViking。
+
+---
+
+## 目标与非目标
+
+### 目标
+
+1. **Watcher 抽象**:定义统一接口,新增监听源只需实现一个 Watcher 类
+2. **多路径配置**:支持配置任意数量的监听源,每个源可指定独立路径和参数
+3. **Claude Desktop 支持**:提取 Claude Desktop 的会话数据(JSONL + IndexedDB)
+4. **Source-aware 路由**:知识路由和存储模板反映数据来源
+5. **向后兼容**:单源配置自动迁移为多源格式,现有行为不变
+6. **共享 ETL**:所有 Watcher 的输出进入同一个 ETL 管道,无需管道层改动
+
+### 非目标
+
+1. **不支持 Cursor/Aider/Copilot** —— 本期只做 Claude 生态(CLI + Desktop),架构预留扩展点但不实现
+2. **不做实时处理** —— 保持批量处理模型(50 行 / 5 分钟阈值)
+3. **不改变 LLM 提取逻辑** —— Prompt 和置信度阈值不变
+4. **不引入新依赖** —— IndexedDB 提取如需要 plyvel,作为 optional dependency
+
+---
+
+## 架构设计
+
+### 多源架构总览
+
+```
+DaemonService
+ │
+ ├── WatcherRegistry
+ │ │
+ │ ├── ClaudeCodeWatcher (JSONL, ~/.claude/projects/)
+ │ │ └── ClaudeCodeLogHandler → normalize → ┐
+ │ │ │
+ │ ├── ClaudeDesktopWatcher (JSONL + IndexedDB) │
+ │ │ ├── DesktopJsonlHandler → normalize → ──┤
+ │ │ └── IndexedDBPoller → normalize → ──────┤
+ │ │ │
+ │ └── [未来] GenericJsonlWatcher (自定义路径) │
+ │ └── GenericLogHandler → normalize → ────┤
+ │ │
+ │ ┌───────────────┘
+ │ ↓
+ │ asyncio.Queue[NormalizedEvent]
+ │ ↓
+ └──────────────────→ BatchETLPipeline (Filter → Reconstruct → Extract → Dedup)
+ ↓
+ SourceAwareRouter → viking://skills/{source}/*
+ ↓
+ VikingStorageAdapter (source-aware templates)
+```
+
+### 数据流对比
+
+```
+┌─ 当前 ───────────────────────────────────────────────────────┐
+│ │
+│ ~/.claude/projects/**/*.jsonl │
+│ ↓ ClaudeCodeWatcher │
+│ {role, content, type, timestamp} │
+│ ↓ Queue │
+│ ETL → viking://skills/claude_code/xxx │
+│ │
+└──────────────────────────────────────────────────────────────┘
+
+┌─ 目标 ───────────────────────────────────────────────────────┐
+│ │
+│ Source A: ~/.claude/projects/**/*.jsonl │
+│ ↓ ClaudeCodeWatcher → normalize → { ..., source: │
+│ "claude_code" } │
+│ Source B: %APPDATA%/Claude/... (JSONL / IndexedDB) │
+│ ↓ ClaudeDesktopWatcher → normalize → { ..., source: │
+│ "claude_desktop"│
+│ } │
+│ Source C: /custom/path/**/*.jsonl (用户自定义) │
+│ ↓ GenericJsonlWatcher → normalize → { ..., source: │
+│ "custom_xxx" } │
+│ ↓ asyncio.Queue (共享) │
+│ ETL (unchanged) │
+│ ↓ │
+│ SourceAwareRouter → viking://skills/{source}/xxx │
+│ │
+└──────────────────────────────────────────────────────────────┘
+```
+
+---
+
+## 核心抽象
+
+### 1. NormalizedEvent — 统一事件模型
+
+在现有 `BatchBuffer` 的 `Dict[str, Any]` 事件基础上,增加 `source` 字段:
+
+```python
+# openviking/daemon/models.py — 新增字段
+
+@dataclass
+class NormalizedEvent:
+ """所有 Watcher 产出的标准化事件。"""
+ role: str # "user" | "assistant"
+ content: str # 消息正文
+ type: str # "message"
+ timestamp: Optional[str] # ISO-8601
+ session_id: Optional[str] # 会话 ID
+ project_name: Optional[str] # 项目名
+ source: str # 来源标识: "claude_code" | "claude_desktop" | "custom:xxx"
+
+ def to_dict(self) -> Dict[str, Any]:
+ return {
+ "role": self.role,
+ "content": self.content,
+ "type": self.type,
+ "timestamp": self.timestamp,
+ "session_id": self.session_id,
+ "project_name": self.project_name,
+ "source": self.source,
+ }
+```
+
+**为什么新增 NormalizedEvent 而不是复用 Dict**:当前 pipeline 里事件是裸 Dict,各 Watcher 自行构造。增加 `source` 字段后需要类型约束,避免拼写错误导致路由分错。同时 `to_dict()` 保证与现有 `BatchETLPipeline.process_batch(events: List[Dict])` 接口兼容。
+
+### 2. BaseWatcher — Watcher 抽象基类
+
+```python
+# openviking/daemon/watchers/base.py — 新文件
+
+from abc import ABC, abstractmethod
+from typing import Callable, List, Dict, Optional
+
+from openviking.daemon.cursor_manager import CursorManager
+
+
+class BaseWatcher(ABC):
+ """所有日志监听器的抽象基类。"""
+
+ def __init__(
+ self,
+ paths: List[str],
+ source_id: str,
+ cursor_manager: CursorManager,
+ batch_callback: Callable[[List[Dict]], None],
+ batch_trigger_lines: int = 50,
+ batch_trigger_seconds: int = 300,
+ ):
+ self.paths = paths
+ self.source_id = source_id
+ self.cursor_manager = cursor_manager
+ self.batch_callback = batch_callback
+ self.batch_trigger_lines = batch_trigger_lines
+ self.batch_trigger_seconds = batch_trigger_seconds
+
+ @abstractmethod
+ def start(self):
+ """启动监听。可启动一个或多个 watchdog Observer 线程。"""
+ ...
+
+ @abstractmethod
+ def stop(self):
+ """停止所有监听线程。"""
+ ...
+
+ @abstractmethod
+ def flush(self):
+ """强制刷新所有缓冲区。"""
+ ...
+
+ @property
+ @abstractmethod
+ def watcher_type(self) -> str:
+ """Watcher 类型标识,用于配置解析。"""
+ ...
+
+ @property
+ def status(self) -> Dict:
+ """返回当前状态摘要,供 /daemon/status API 使用。"""
+ return {
+ "type": self.watcher_type,
+ "source_id": self.source_id,
+ "paths": self.paths,
+ }
+```
+
+**设计要点**:
+- `paths: List[str]` — 支持多个监听路径(一个 Watcher 可监听多个目录)
+- `source_id: str` — 来源标识,注入到 NormalizedEvent.source 和路由路径
+- `cursor_manager` 共享 — 所有 Watcher 共用同一个 SQLite 游标库,通过 file_path 主键天然隔离
+- `batch_callback` 共享 — 所有 Watcher 向同一个 `asyncio.Queue` 投递事件
+
+### 3. WatcherFactory — 配置驱动的实例化
+
+```python
+# openviking/daemon/watchers/factory.py — 新文件
+
+from typing import Dict, List, Type
+
+from openviking.daemon.watchers.base import BaseWatcher
+
+
+class WatcherFactory:
+ """根据配置创建 Watcher 实例。"""
+
+ _registry: Dict[str, Type[BaseWatcher]] = {}
+
+ @classmethod
+ def register(cls, watcher_type: str, watcher_class: Type[BaseWatcher]):
+ cls._registry[watcher_type] = watcher_class
+
+ @classmethod
+ def create(
+ cls,
+ source_config: Dict,
+ cursor_manager,
+ batch_callback,
+ default_lines: int = 50,
+ default_seconds: int = 300,
+ ) -> BaseWatcher:
+ watcher_type = source_config["type"]
+ watcher_class = cls._registry.get(watcher_type)
+ if not watcher_class:
+ raise ValueError(
+ f"Unknown watcher type: {watcher_type}. "
+ f"Available: {list(cls._registry.keys())}"
+ )
+ return watcher_class(
+ paths=source_config.get("paths", []),
+ source_id=source_config.get("source_id", watcher_type),
+ cursor_manager=cursor_manager,
+ batch_callback=batch_callback,
+ batch_trigger_lines=source_config.get(
+ "batch_trigger_lines", default_lines
+ ),
+ batch_trigger_seconds=source_config.get(
+ "batch_trigger_seconds", default_seconds
+ ),
+ # source-specific options
+ options=source_config.get("options", {}),
+ )
+```
+
+注册时机在 `DaemonService.start()` 中,通过 import 触发:
+
+```python
+# openviking/daemon/watchers/__init__.py
+
+from openviking.daemon.watchers.factory import WatcherFactory
+from openviking.daemon.watchers.claude_code_watcher import ClaudeCodeWatcher
+from openviking.daemon.watchers.claude_desktop_watcher import ClaudeDesktopWatcher
+
+WatcherFactory.register("claude_code", ClaudeCodeWatcher)
+WatcherFactory.register("claude_desktop", ClaudeDesktopWatcher)
+```
+
+---
+
+## 配置设计
+
+### 新配置模型
+
+```python
+# openviking/server/config.py — DaemonConfig 重构
+
+class SourceConfig(BaseModel):
+ """单个监听源的配置。"""
+ type: str # "claude_code" | "claude_desktop"
+ paths: List[str] # 监听路径列表
+ source_id: Optional[str] = None # 来源标识(默认 = type)
+ enabled: bool = True
+ batch_trigger_lines: Optional[int] = None # 覆盖全局值
+ batch_trigger_seconds: Optional[int] = None # 覆盖全局值
+ options: Dict[str, Any] = {} # Watcher 特定选项
+
+class DaemonConfig(BaseModel):
+ """Active Daemon 配置(多源)。"""
+ enabled: bool = False
+ sources: List[SourceConfig] = [] # 多源配置
+ db_path: Optional[str] = None
+ batch_trigger_lines: int = Field(50, gt=0)
+ batch_trigger_seconds: int = Field(300, gt=0)
+
+ @classmethod
+ def from_env(cls) -> "DaemonConfig":
+ ... # 见下方兼容性设计
+```
+
+### ov.conf 示例
+
+```json
+{
+ "daemon": {
+ "enabled": true,
+ "db_path": "~/.qoderworkcn/openviking/daemon_cursors.db",
+ "batch_trigger_lines": 50,
+ "batch_trigger_seconds": 300,
+ "sources": [
+ {
+ "type": "claude_code",
+ "paths": ["~/.claude/projects"],
+ "source_id": "claude_code"
+ },
+ {
+ "type": "claude_desktop",
+ "paths": ["%APPDATA%/Claude"],
+ "source_id": "claude_desktop",
+ "options": {
+ "enable_indexeddb": true,
+ "indexeddb_path": "IndexedDB/https_claude.ai_0.indexeddb.leveldb"
+ }
+ },
+ {
+ "type": "claude_code",
+ "paths": ["D:/Work/other-claude/projects", "E:/Shared/logs/claude"],
+ "source_id": "claude_code_work"
+ }
+ ]
+ }
+}
+```
+
+### 向后兼容:单源 → 多源自动迁移
+
+旧配置格式仍可使用:
+
+```json
+{
+ "daemon": {
+ "enabled": true,
+ "watch_dir": "~/.claude/projects"
+ }
+}
+```
+
+迁移逻辑在 `DaemonConfig` 的 model_validator 中:
+
+```python
+class DaemonConfig(BaseModel):
+ enabled: bool = False
+ # 旧字段(兼容)
+ watch_dir: Optional[str] = None
+ # 新字段
+ sources: List[SourceConfig] = []
+ db_path: Optional[str] = None
+ ...
+
+ @model_validator(mode="after")
+ def migrate_legacy_config(self):
+ if self.watch_dir and not self.sources:
+ self.sources = [
+ SourceConfig(
+ type="claude_code",
+ paths=[self.watch_dir],
+ source_id="claude_code",
+ )
+ ]
+ return self
+```
+
+### 环境变量兼容
+
+```python
+@classmethod
+def from_env(cls) -> "DaemonConfig":
+ import json
+ import os
+ # 单源(旧): OV_DAEMON_WATCH_DIR
+ watch_dir = os.getenv("OV_DAEMON_WATCH_DIR")
+
+ # 多源(新): OV_DAEMON_SOURCES (JSON 数组)
+ sources_json = os.getenv("OV_DAEMON_SOURCES")
+
+ sources = []
+ if sources_json:
+ sources = [SourceConfig(**s) for s in json.loads(sources_json)]
+ elif watch_dir:
+ sources = [SourceConfig(
+ type="claude_code",
+ paths=[watch_dir],
+ source_id="claude_code",
+ )]
+
+ return cls(
+ enabled=os.getenv("OV_DAEMON_ENABLED", "false").lower() == "true",
+ sources=sources,
+ db_path=os.getenv("OV_DAEMON_DB_PATH"),
+ batch_trigger_lines=int(os.getenv("OV_DAEMON_BATCH_LINES", "50")),
+ batch_trigger_seconds=int(os.getenv("OV_DAEMON_BATCH_SECONDS", "300")),
+ )
+```
+
+---
+
+## 各 Watcher 实现
+
+### 1. ClaudeCodeWatcher(重构)
+
+将现有 `ClaudeCodeWatcher` 从硬编码单路径重构为继承 `BaseWatcher`、支持多路径:
+
+```python
+# openviking/daemon/watchers/claude_code_watcher.py — 重构
+
+class ClaudeCodeWatcher(BaseWatcher):
+ """监听 Claude Code CLI 的 JSONL 会话日志。"""
+
+ watcher_type = "claude_code"
+
+ def __init__(self, paths, source_id, cursor_manager, batch_callback,
+ batch_trigger_lines=50, batch_trigger_seconds=300,
+ options=None):
+ super().__init__(paths, source_id or "claude_code",
+ cursor_manager, batch_callback,
+ batch_trigger_lines, batch_trigger_seconds)
+ self.observers: List[Observer] = []
+ self.handlers: List[ClaudeCodeLogHandler] = []
+
+ def start(self):
+ for path in self.paths:
+ expanded = os.path.expanduser(os.path.expandvars(path))
+ handler = ClaudeCodeLogHandler(
+ cursor_manager=self.cursor_manager,
+ batch_callback=self.batch_callback,
+ batch_trigger_lines=self.batch_trigger_lines,
+ batch_trigger_seconds=self.batch_trigger_seconds,
+ source_id=self.source_id, # 注入到 NormalizedEvent
+ )
+ observer = Observer()
+ observer.schedule(handler, expanded, recursive=True)
+ observer.start()
+ self.observers.append(observer)
+ self.handlers.append(handler)
+ logger.info("ClaudeCodeWatcher started on %s (source=%s)",
+ expanded, self.source_id)
+
+ def stop(self):
+ for obs in self.observers:
+ obs.stop()
+ obs.join()
+
+ def flush(self):
+ for h in self.handlers:
+ h.force_flush()
+```
+
+**ClaudeCodeLogHandler 变更**:在 `_process_file` 产出的事件 Dict 中注入 `"source": self.source_id`。事件过滤逻辑(`_filter_events`)不变。
+
+### 2. ClaudeDesktopWatcher(新实现)
+
+Claude Desktop 的会话数据有两个存储位置:
+
+| 位置 | 格式 | 内容 |
+|-----|------|------|
+| `%APPDATA%/Claude/logs/main.log` | 纯文本 | Electron 应用日志,包含 API 调用痕迹 |
+| `%APPDATA%/Claude/IndexedDB/` | LevelDB | Chromium IndexedDB,存储 claude.ai Web 会话 |
+
+**策略:日志文件(`main.log`)增量解析 + IndexedDB 轮询双通道**
+
+```python
+# openviking/daemon/watchers/claude_desktop_watcher.py — 新文件
+
+class ClaudeDesktopWatcher(BaseWatcher):
+ """
+ 监听 Claude Desktop 的会话数据。
+
+ 双通道:
+ 1. JSONL 扫描 — 扫描 paths 下的 .jsonl 文件(如果有外部导出)
+ 2. IndexedDB 轮询 — 读取 LevelDB 中的会话数据
+ """
+
+ watcher_type = "claude_desktop"
+
+ def __init__(self, paths, source_id, cursor_manager, batch_callback,
+ batch_trigger_lines=50, batch_trigger_seconds=300,
+ options=None):
+ super().__init__(paths, source_id or "claude_desktop",
+ cursor_manager, batch_callback,
+ batch_trigger_lines, batch_trigger_seconds)
+ self.options = options or {}
+ self.enable_indexeddb = self.options.get("enable_indexeddb", False)
+ self.indexeddb_relpath = self.options.get(
+ "indexeddb_path",
+ "IndexedDB/https_claude.ai_0.indexeddb.leveldb"
+ )
+ self.observers: List[Observer] = []
+ self.handlers: List[DesktopLogHandler] = []
+ self._indexeddb_poller: Optional[IndexedDBPoller] = None
+
+ def start(self):
+ # 通道 1: JSONL 文件监听
+ for path in self.paths:
+ expanded = os.path.expanduser(os.path.expandvars(path))
+ handler = DesktopLogHandler(
+ cursor_manager=self.cursor_manager,
+ batch_callback=self.batch_callback,
+ source_id=self.source_id,
+ ...
+ )
+ observer = Observer()
+ observer.schedule(handler, expanded, recursive=True)
+ observer.start()
+ self.observers.append(observer)
+ self.handlers.append(handler)
+
+ # 通道 2: IndexedDB 轮询(可选)
+ if self.enable_indexeddb:
+ self._indexeddb_poller = IndexedDBPoller(
+ base_paths=self.paths,
+ relpath=self.indexeddb_relpath,
+ cursor_manager=self.cursor_manager,
+ batch_callback=self.batch_callback,
+ source_id=self.source_id,
+ poll_interval=self.batch_trigger_seconds,
+ )
+ self._indexeddb_poller.start()
+
+ def stop(self):
+ for obs in self.observers:
+ obs.stop()
+ obs.join()
+ if self._indexeddb_poller:
+ self._indexeddb_poller.stop()
+
+ def flush(self):
+ for h in self.handlers:
+ h.force_flush()
+ if self._indexeddb_poller:
+ self._indexeddb_poller.force_flush()
+```
+
+#### DesktopLogHandler — 纯文本日志解析
+
+`main.log` 是 Electron 应用日志,格式如下:
+
+```
+2026-03-23 12:34:38 [info] Starting app { ... }
+2026-03-23 12:34:39 [info] API call to /v1/messages { model: "claude-sonnet-4-20250514" }
+```
+
+DesktopLogHandler 的策略是**仅提取 JSON 负载中包含对话内容的行**,过滤掉纯应用日志:
+
+```python
+class DesktopLogHandler(FileSystemEventHandler):
+ """解析 Claude Desktop 的 main.log 文件。"""
+
+ # 匹配带 JSON 负载的日志行
+ LOG_PATTERN = re.compile(
+ r"^(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})\s+\[(\w+)\]\s+(.+)$"
+ )
+
+ def on_modified(self, event):
+ if event.is_directory or not event.src_path.endswith(".log"):
+ return
+ self._process_file(event.src_path)
+
+ def _process_file(self, file_path):
+ cursor = self.cursor_manager.get_cursor(file_path)
+ current_size = os.path.getsize(file_path)
+ if current_size <= cursor.last_position:
+ return
+
+ with open(file_path, "r", encoding="utf-8", errors="replace") as f:
+ f.seek(cursor.last_position)
+ for line in f:
+ event = self._parse_log_line(line)
+ if event:
+ self.buffer.add_line(event, len(line))
+
+ new_position = f.tell()
+
+ self.cursor_manager.update_cursor(file_path, new_position)
+ self._check_batch_trigger()
+
+ def _parse_log_line(self, line) -> Optional[Dict]:
+ """
+ 从日志行中提取对话事件。
+ 仅保留包含 user/assistant 消息内容的行。
+ """
+ match = self.LOG_PATTERN.match(line.strip())
+ if not match:
+ return None
+
+ timestamp, level, message = match.groups()
+
+ # 尝试从 message 中提取 JSON 负载
+ try:
+ # 找到第一个 { 并解析到末尾
+ json_start = message.index("{")
+ payload = json.loads(message[json_start:])
+ except (ValueError, json.JSONDecodeError):
+ return None
+
+ # 仅保留包含对话角色的事件
+ role = payload.get("role")
+ if role not in ("user", "assistant"):
+ return None
+
+ return {
+ "role": role,
+ "content": payload.get("content", ""),
+ "type": "message",
+ "timestamp": timestamp,
+ "session_id": payload.get("sessionId"),
+ "project_name": payload.get("projectName"),
+ "source": self.source_id,
+ }
+```
+
+#### IndexedDBPoller — LevelDB 轮询提取(可选)
+
+```python
+# openviking/daemon/watchers/indexeddb_poller.py — 新文件
+
+class IndexedDBPoller:
+ """
+ 定期轮询 Claude Desktop 的 IndexedDB LevelDB 存储。
+ 提取新增/更新的会话记录,转换为 NormalizedEvent。
+
+ 需要 plyvel 库(pip install plyvel)。
+ """
+
+ def __init__(self, base_paths, relpath, cursor_manager,
+ batch_callback, source_id, poll_interval=300):
+ self.base_paths = base_paths
+ self.relpath = relpath
+ self.cursor_manager = cursor_manager
+ self.batch_callback = batch_callback
+ self.source_id = source_id
+ self.poll_interval = poll_interval
+ self._thread: Optional[threading.Thread] = None
+ self._stop_event = threading.Event()
+
+ def start(self):
+ self._thread = threading.Thread(
+ target=self._poll_loop, daemon=True
+ )
+ self._thread.start()
+ logger.info("IndexedDB poller started (interval=%ds)", self.poll_interval)
+
+ def stop(self):
+ self._stop_event.set()
+ if self._thread:
+ self._thread.join(timeout=10)
+
+ def _poll_loop(self):
+ while not self._stop_event.is_set():
+ for base_path in self.base_paths:
+ db_path = os.path.join(
+ os.path.expanduser(os.path.expandvars(base_path)),
+ self.relpath,
+ )
+ if os.path.isdir(db_path):
+ try:
+ self._extract_from_leveldb(db_path)
+ except ImportError:
+ logger.error(
+ "plyvel not installed. "
+ "Run: pip install plyvel"
+ )
+ return # 不重试
+ except Exception as e:
+ logger.error("IndexedDB extraction failed: %s", e)
+
+ self._stop_event.wait(self.poll_interval)
+
+ def _extract_from_leveldb(self, db_path: str):
+ """
+ 从 LevelDB 中提取会话数据。
+
+ IndexedDB 在 LevelDB 中的 key 格式:
+ _file_version
+ ---
+
+ Claude Desktop 的会话数据存储在 object store 中,
+ key 前缀通常为 conversation 或 message 相关。
+
+ 由于 schema 未公开文档化,采用启发式提取:
+ 1. 遍历所有 key-value 对
+ 2. 尝试 JSON 解析 value
+ 3. 筛选包含 role=user/assistant 的记录
+ """
+ import plyvel
+
+ cursor_key = f"indexeddb:{db_path}"
+ cursor = self.cursor_manager.get_cursor(cursor_key)
+ last_count = cursor.last_position # 复用 last_position 存已处理记录数
+
+ db = plyvel.DB(db_path, create_if_missing=False)
+ try:
+ events = []
+ count = 0
+ for key, value in db.iterator():
+ count += 1
+ if count <= last_count:
+ continue
+
+ try:
+ record = json.loads(value)
+ except (json.JSONDecodeError, UnicodeDecodeError):
+ continue
+
+ event = self._try_extract_event(record)
+ if event:
+ events.append(event)
+
+ if events:
+ self.batch_callback(events)
+
+ self.cursor_manager.update_cursor(cursor_key, count)
+ finally:
+ db.close()
+
+ def _try_extract_event(self, record: Dict) -> Optional[Dict]:
+ """启发式提取:从 JSON 记录中查找对话消息。"""
+ # 场景 1: 记录本身是消息
+ if record.get("role") in ("user", "assistant"):
+ return {
+ "role": record["role"],
+ "content": record.get("content", ""),
+ "type": "message",
+ "timestamp": record.get("timestamp"),
+ "session_id": record.get("conversationId",
+ record.get("sessionId")),
+ "project_name": None,
+ "source": self.source_id,
+ }
+
+ # 场景 2: 记录包含 messages 数组
+ messages = record.get("messages", [])
+ for msg in messages:
+ if isinstance(msg, dict) and msg.get("role") in ("user", "assistant"):
+ return {
+ "role": msg["role"],
+ "content": msg.get("content", ""),
+ "type": "message",
+ "timestamp": msg.get("timestamp"),
+ "session_id": record.get("id",
+ record.get("conversationId")),
+ "project_name": None,
+ "source": self.source_id,
+ }
+
+ return None
+
+ def force_flush(self):
+ """触发一次即时提取。"""
+        # TODO: 尚未实现,当前为空操作;计划通过唤醒轮询线程来立即触发一次提取
+ pass
+```
+
+### 3. GenericJsonlWatcher(预留,本期不实现)
+
+```python
+# openviking/daemon/watchers/generic_jsonl_watcher.py — 预留接口
+
+class GenericJsonlWatcher(BaseWatcher):
+ """
+ 通用 JSONL 文件监听器。
+ 支持自定义 JSON 字段映射(role_field, content_field 等)。
+ 本期不实现,仅注册到 Factory 以便未来扩展。
+ """
+ watcher_type = "generic_jsonl"
+ # ...
+```
+
+---
+
+## ETL 管道适配
+
+### BatchETLPipeline — 最小改动
+
+当前 `process_batch(events: List[Dict])` 接收裸 Dict 列表,不需要改接口。
+
+需要改动的地方:
+
+**1. ConversationTurn 增加 source 字段**
+
+```python
+# models.py
+
+@dataclass
+class ConversationTurn:
+ user_prompt: str
+ assistant_response: str
+ session_id: Optional[str] = None
+ project_name: Optional[str] = None
+ timestamp: Optional[str] = None
+ source: Optional[str] = None # 新增
+```
+
+**2. ConversationReconstructor 传递 source**
+
+```python
+# conversation_reconstructor.py — reconstruct() 方法
+
+def reconstruct(self, events: List[Dict]) -> List[ConversationTurn]:
+ # ... 现有配对逻辑 ...
+ turns.append(ConversationTurn(
+ user_prompt=user_event["content"],
+ assistant_response=assistant_event["content"],
+ session_id=user_event.get("session_id"),
+ project_name=user_event.get("project_name"),
+ timestamp=user_event.get("timestamp"),
+ source=user_event.get("source"), # 新增
+ ))
+ return turns
+```
+
+**3. ExtractedKnowledge 增加 source 字段**
+
+```python
+# models.py
+
+@dataclass
+class ExtractedKnowledge:
+ status: str
+ category: str
+ title: str
+ content: str
+ confidence: float = 0.0
+ project_name: Optional[str] = None
+ entity_links: List[str] = field(default_factory=list)
+ actionable_steps: List[str] = field(default_factory=list)
+ timestamp: Optional[str] = None
+ source: Optional[str] = None # 新增
+```
+
+**4. KnowledgeExtractor 传递 source**
+
+`extract()` 方法在构造 `ExtractedKnowledge` 时,从 `ConversationTurn.source` 赋值到 `ExtractedKnowledge.source`。LLM prompt 不需要改。
+
+---
+
+## 路由与存储适配
+
+### KnowledgeRouter — Source-aware 路由
+
+```python
+# knowledge_router.py — 重构 route()
+
+class KnowledgeRouter:
+ """Routes knowledge items to viking:// URIs based on category and source."""
+
+ def route(self, knowledge: ExtractedKnowledge) -> Optional[str]:
+ category = knowledge.category
+ source = knowledge.source or "unknown"
+ project_name = knowledge.project_name
+ title = self._sanitize_filename(knowledge.title)
+
+ if category == "skills":
+ # viking://skills/{source}/{title}.md
+ # 例: viking://skills/claude_code/fix-docker-networking.md
+ # viking://skills/claude_desktop/design-review-pattern.md
+ return f"viking://skills/{self._sanitize_filename(source)}/{title}.md"
+
+ elif category == "memories":
+ if project_name:
+ safe_project = self._sanitize_filename(project_name)
+ return f"viking://memories/projects/{safe_project}/decisions.md"
+ else:
+ return f"viking://memories/global/{title}.md"
+
+ elif category == "resources":
+ entity_links = knowledge.entity_links
+ tech_stack = (
+ self._sanitize_filename(entity_links[0])
+ if entity_links else "general"
+ )
+ return f"viking://resources/{tech_stack}/{title}.md"
+
+ else:
+ logger.warning("Unknown category: %s", category)
+ return None
+```
+
+**变更影响**:原来 skills 统一写入 `viking://skills/claude_code/`,现在按 `source` 分目录。`source_id` 保持默认值 `claude_code` 时路径不变,已有数据不受影响;其他源的新数据会写入 `viking://skills/claude_desktop/` 等子目录,缺少 `source` 的知识会落入 `viking://skills/unknown/`。
+
+### VikingStorageAdapter — Source-aware 模板
+
+```python
+# storage_adapter.py — 修改 _format_skill() 和 _format_resource()
+
+@staticmethod
+def _format_skill(knowledge: ExtractedKnowledge, timestamp: str) -> str:
+ source_label = knowledge.source or "Claude Code"
+ # 将 source_id 映射为人类可读名称
+ SOURCE_LABELS = {
+ "claude_code": "Claude Code CLI",
+ "claude_desktop": "Claude Desktop",
+ }
+ label = SOURCE_LABELS.get(source_label, source_label)
+
+ steps = ""
+ if knowledge.actionable_steps:
+ steps = "\n## Steps\n\n"
+ for i, step in enumerate(knowledge.actionable_steps, 1):
+ steps += f"{i}. {step}\n"
+ else:
+ steps = "\n## Steps\n\n(No specific steps)\n"
+
+ return f"""# {knowledge.title}
+
+**Extracted**: {timestamp}
+**Source**: {label} session
+**Confidence**: {knowledge.confidence}
+
+## Content
+
+{knowledge.content}
+{steps}
+---
+*Auto-generated by OpenViking Active Daemon*
+"""
+```
+
+`_format_resource()` 同理,将 `Source: Claude Code session` 替换为动态 source label。
+
+### REST API 扩展
+
+```python
+# routers/daemon.py — 扩展 status 响应
+
+@router.get("/api/v1/daemon/status")
+async def get_daemon_status():
+ daemon = get_daemon_service()
+ if not daemon:
+ return {"enabled": False}
+
+ return {
+ "enabled": True,
+ "running": daemon.is_running,
+ "sources": [w.status for w in daemon.watchers], # 多源状态
+ "db_path": daemon.db_path,
+ "batch_trigger_lines": daemon.batch_trigger_lines,
+ "batch_trigger_seconds": daemon.batch_trigger_seconds,
+ "cursor_count": daemon.cursor_manager.count() if daemon.cursor_manager else 0,
+ }
+```
+
+---
+
+## DaemonService 重构
+
+```python
+# service.py — 多源版本
+
+class DaemonService:
+
+ def __init__(self, resource_service, config: DaemonConfig):
+ self.resource_service = resource_service
+ self.config = config
+ self.db_path = config.db_path or str(
+ Path.home() / ".qoderworkcn" / "openviking" / "daemon_cursors.db"
+ )
+ self.batch_trigger_lines = config.batch_trigger_lines
+ self.batch_trigger_seconds = config.batch_trigger_seconds
+
+ self.cursor_manager: Optional[CursorManager] = None
+ self.watchers: List[BaseWatcher] = [] # 从单个变为列表
+ self.etl_pipeline: Optional[BatchETLPipeline] = None
+ self.storage_adapter: Optional[VikingStorageAdapter] = None
+
+ self._running = False
+ self._etl_task: Optional[asyncio.Task] = None
+ self._batch_queue: asyncio.Queue = asyncio.Queue()
+
+ async def start(self):
+ logger.info("Starting OpenViking Active Daemon (multi-source)...")
+
+ self.cursor_manager = CursorManager(self.db_path)
+ self.etl_pipeline = BatchETLPipeline()
+ self.storage_adapter = VikingStorageAdapter(self.resource_service)
+
+ # 确保 Watcher 类型已注册
+ import openviking.daemon.watchers # noqa: F401 触发注册
+
+ # 为每个 source 创建 Watcher
+ for source_config in self.config.sources:
+ if not source_config.enabled:
+ continue
+
+ # 展开路径中的环境变量和 ~
+ expanded_paths = [
+ os.path.expanduser(os.path.expandvars(p))
+ for p in source_config.paths
+ ]
+
+            # 确保每个监听路径对应的目录都存在(缺失则创建)
+ for p in expanded_paths:
+ Path(p).mkdir(parents=True, exist_ok=True)
+
+ watcher = WatcherFactory.create(
+ source_config=source_config.model_dump(),
+ cursor_manager=self.cursor_manager,
+ batch_callback=self._enqueue_batch,
+ default_lines=self.batch_trigger_lines,
+ default_seconds=self.batch_trigger_seconds,
+ )
+ watcher.start()
+ self.watchers.append(watcher)
+
+ self._etl_task = asyncio.create_task(self._etl_loop())
+ self._running = True
+ logger.info("Daemon started with %d source(s)", len(self.watchers))
+
+ async def stop(self):
+ self._running = False
+ for watcher in self.watchers:
+ watcher.stop()
+ if self._etl_task:
+ await self._batch_queue.put(None)
+ try:
+ await asyncio.wait_for(self._etl_task, timeout=10)
+ except asyncio.TimeoutError:
+ self._etl_task.cancel()
+ logger.info("Daemon stopped")
+
+ async def flush(self):
+ for watcher in self.watchers:
+ watcher.flush()
+```
+
+---
+
+## 文件变更清单
+
+| 文件路径 | 操作 | 说明 |
+|---------|------|------|
+| `openviking/daemon/models.py` | 修改 | 新增 `NormalizedEvent`,`ConversationTurn` 和 `ExtractedKnowledge` 增加 `source` 字段 |
+| `openviking/daemon/watchers/base.py` | **新建** | `BaseWatcher` 抽象基类 |
+| `openviking/daemon/watchers/factory.py` | **新建** | `WatcherFactory` 配置驱动实例化 |
+| `openviking/daemon/watchers/__init__.py` | 修改 | 注册所有 Watcher 类型 |
+| `openviking/daemon/watchers/claude_code_watcher.py` | 修改 | 继承 `BaseWatcher`,支持多路径,注入 `source_id` |
+| `openviking/daemon/watchers/claude_desktop_watcher.py` | **新建** | `ClaudeDesktopWatcher` + `DesktopLogHandler` |
+| `openviking/daemon/watchers/indexeddb_poller.py` | **新建** | IndexedDB LevelDB 轮询提取器(可选) |
+| `openviking/daemon/service.py` | 修改 | 多 Watcher 管理,配置驱动初始化 |
+| `openviking/daemon/conversation_reconstructor.py` | 修改 | 传递 `source` 到 `ConversationTurn` |
+| `openviking/daemon/knowledge_extractor.py` | 修改 | 传递 `source` 到 `ExtractedKnowledge` |
+| `openviking/daemon/knowledge_router.py` | 修改 | Source-aware URI 路由 |
+| `openviking/daemon/storage_adapter.py` | 修改 | Source-aware Markdown 模板 |
+| `openviking/server/config.py` | 修改 | 新增 `SourceConfig`,重构 `DaemonConfig`,兼容迁移 |
+| `openviking/server/routers/daemon.py` | 修改 | 扩展 status API 返回多源状态 |
+| `openviking/server/app.py` | 修改 | 适配新的 `DaemonService(config=...)` 签名 |
+| `tests/daemon/test_multi_source.py` | **新建** | 多源配置、Factory、路由测试 |
+| `tests/daemon/test_claude_desktop_watcher.py` | **新建** | Desktop Watcher 单元测试 |
+| `tests/daemon/test_indexeddb_poller.py` | **新建** | IndexedDB 提取测试(mock LevelDB) |
+| `docs/daemon/configuration.md` | 修改 | 新增多源配置文档 |
+
+---
+
+## 分阶段实施
+
+### Phase 1: 抽象层与多源框架(Day 1-2)
+
+| 任务 | 文件 | 验收标准 |
+|-----|------|---------|
+| 创建 `BaseWatcher` ABC | `watchers/base.py` | 抽象方法定义完整,`status` 属性可用 |
+| 创建 `WatcherFactory` | `watchers/factory.py` | `register()` + `create()` 工作正常 |
+| 重构 `ClaudeCodeWatcher` | `watchers/claude_code_watcher.py` | 继承 `BaseWatcher`,支持 `paths: List[str]`,注入 `source`,现有测试仍通过 |
+| 新增 `NormalizedEvent` | `models.py` | `to_dict()` 输出与旧 Dict 格式兼容 |
+| 重构 `DaemonConfig` | `server/config.py` | 旧配置自动迁移,新配置正常解析 |
+| 重构 `DaemonService` | `service.py` | 多 Watcher 列表管理,ETL 循环不变 |
+| Source-aware 路由 | `knowledge_router.py` | skills 路由包含 source 前缀 |
+| Source-aware 模板 | `storage_adapter.py` | Markdown 模板显示正确 source |
+| 传递 source 到 Turn/Knowledge | `reconstructor.py`, `extractor.py` | `source` 字段端到端传递 |
+| 集成测试 | `tests/daemon/test_multi_source.py` | 模拟双源配置,验证两条数据链路 |
+
+**Phase 1 里程碑**:配置多个 `claude_code` 类型源(不同路径),两个 Watcher 各自监听、共享 ETL、路由到不同 `viking://skills/{source_id}/` 路径。
+
+### Phase 2: Claude Desktop Watcher(Day 3-4)
+
+| 任务 | 文件 | 验收标准 |
+|-----|------|---------|
+| 实现 `ClaudeDesktopWatcher` | `watchers/claude_desktop_watcher.py` | JSONL 通道可用 |
+| 实现 `DesktopLogHandler` | 同上 | 正确解析 `main.log` 格式,过滤非对话行 |
+| 单元测试 | `tests/daemon/test_claude_desktop_watcher.py` | 覆盖日志解析、过滤、缓冲触发 |
+| REST API 扩展 | `routers/daemon.py` | `/daemon/status` 返回多源状态 |
+| 文档更新 | `docs/daemon/configuration.md` | 多源配置示例完整 |
+
+**Phase 2 里程碑**:配置 `claude_desktop` 源后,能解析 Desktop 日志文件并提取知识。
+
+### Phase 3: IndexedDB 提取(Day 5,可选)
+
+| 任务 | 文件 | 验收标准 |
+|-----|------|---------|
+| 实现 `IndexedDBPoller` | `watchers/indexeddb_poller.py` | 能从 LevelDB 提取会话 |
+| plyvel 可选依赖 | `pyproject.toml` | 未安装时优雅降级(日志警告) |
+| 单元测试 | `tests/daemon/test_indexeddb_poller.py` | mock LevelDB 数据,验证提取逻辑 |
+| 端到端验证 | — | 实际 Claude Desktop IndexedDB 数据提取成功 |
+
+**Phase 3 里程碑**:Claude Desktop 的 IndexedDB 会话数据可被自动提取和入库。
+
+---
+
+## 风险与权衡
+
+| 风险 | 影响 | 缓解方案 |
+|-----|------|---------|
+| Claude Desktop IndexedDB schema 未公开 | 提取逻辑可能在版本更新后失效 | 启发式提取 + 异常兜底 + 日志告警 |
+| `main.log` 不含完整对话内容 | Desktop Watcher 可能提取不到知识 | IndexedDB 作为补充通道;纯 JSONL 作为 fallback |
+| plyvel 在 Windows 上安装困难 | IndexedDB 提取不可用 | 设为 optional dependency,未安装时跳过 |
+| 多 Watcher 共享 CursorManager 的并发写入 | SQLite 锁争用 | CursorManager 每次读写都新建并关闭独立的 SQLite 连接,不跨线程共享连接;每个 Watcher 操作不同 file_path 主键 |
+| 多源事件到达顺序不确定 | 同一会话的 user/assistant 可能被分到不同 batch | ConversationReconstructor 已有排序逻辑;共享 Queue 保证 FIFO |
+| source_id 冲突(两个源用同一 ID) | 路由路径碰撞 | 配置校验:启动时检查 source_id 唯一性 |
+| 旧配置迁移出错 | 现有部署启动失败 | `model_validator` 中 try-except 兜底,日志警告 |
+
+---
+
+## 验证方案
+
+### 单元测试
+
+```bash
+# Phase 1 验收
+python -m pytest tests/daemon/test_multi_source.py -v
+# 预期:多源 Factory 创建、配置迁移、Source-aware 路由 全部通过
+
+# Phase 2 验收
+python -m pytest tests/daemon/test_claude_desktop_watcher.py -v
+# 预期:日志解析、过滤规则、缓冲触发 全部通过
+
+# 回归测试
+python -m pytest tests/daemon/ -v
+# 预期:所有现有测试(test_claude_code_watcher, test_cursor_manager, test_integration)仍通过
+```
+
+### 集成测试场景
+
+**场景 A:双 Claude Code 源**
+
+```json
+{
+ "sources": [
+ {"type": "claude_code", "paths": ["~/.claude/projects"], "source_id": "cc_main"},
+ {"type": "claude_code", "paths": ["D:/Work/claude-logs"], "source_id": "cc_work"}
+ ]
+}
+```
+
+验证:两个目录的 JSONL 文件各自被监听,知识分别路由到 `viking://skills/cc_main/` 和 `viking://skills/cc_work/`。
+
+**场景 B:CLI + Desktop 混合**
+
+```json
+{
+ "sources": [
+ {"type": "claude_code", "paths": ["~/.claude/projects"]},
+ {"type": "claude_desktop", "paths": ["%APPDATA%/Claude"]}
+ ]
+}
+```
+
+验证:两个源各自正常采集,CLI 源走 JSONL 通道,Desktop 源走 log 解析通道,两者共享 ETL 管道。
+
+### 手动验证步骤
+
+1. 启动 Daemon:`openviking-server --with-daemon --config test-config.json`
+2. 向 `~/.claude/projects` 下写入测试 JSONL → 观察 `cc_main` 路由
+3. 向 `%APPDATA%/Claude/logs/` 下追加模拟 `main.log` 行 → 观察 `claude_desktop` 路由
+4. 调用 `GET /api/v1/daemon/status` → 确认两个 source 状态正确
+5. 检查 `viking://skills/` 下的目录结构 → 确认按 source 分目录
+
+---
+
+## 一句话总结
+
+通过引入 BaseWatcher 抽象层 + WatcherFactory + SourceConfig 配置模型,将 Daemon 从单源硬编码升级为多源可配置框架,在保持 ETL 管道不变的前提下支持 Claude Code CLI、Claude Desktop、以及未来任意 JSONL 日志源的知识采集。
diff --git a/examples/daemon/docker-compose-daemon.yml b/examples/daemon/docker-compose-daemon.yml
new file mode 100644
index 0000000000..81dba7604b
--- /dev/null
+++ b/examples/daemon/docker-compose-daemon.yml
@@ -0,0 +1,14 @@
+version: '3.8'
+
+services:
+  openviking:
+    image: openviking:latest
+    environment:
+      - OV_DAEMON_ENABLED=true
+      - OV_DAEMON_WATCH_DIR=/data/claude-projects  # container side of the first volume below
+      - OV_DAEMON_DB_PATH=/data/daemon.db  # stored on the ./daemon-data bind mount
+    volumes:
+      - ./claude-projects:/data/claude-projects  # host directory the daemon watches
+      - ./daemon-data:/data  # host directory that holds daemon.db
+    ports:
+      - "1933:1933"
diff --git a/examples/daemon/start-daemon.bat b/examples/daemon/start-daemon.bat
new file mode 100644
index 0000000000..3e06e7e327
--- /dev/null
+++ b/examples/daemon/start-daemon.bat
@@ -0,0 +1,10 @@
+@echo off
+REM Start OpenViking with Active Daemon enabled on Windows.
+REM Pre-set OV_DAEMON_* values win; setlocal keeps ours out of the calling shell.
+setlocal
+set "OV_DAEMON_ENABLED=true"
+if not defined OV_DAEMON_WATCH_DIR set "OV_DAEMON_WATCH_DIR=%USERPROFILE%\.claude\projects"
+if not defined OV_DAEMON_BATCH_LINES set "OV_DAEMON_BATCH_LINES=50"
+if not defined OV_DAEMON_BATCH_SECONDS set "OV_DAEMON_BATCH_SECONDS=300"
+echo Starting OpenViking with Active Daemon...
+openviking serve %*
diff --git a/examples/daemon/start-daemon.sh b/examples/daemon/start-daemon.sh
new file mode 100644
index 0000000000..26129bb4f1
--- /dev/null
+++ b/examples/daemon/start-daemon.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+# Start OpenViking with Active Daemon enabled.
+# WATCH_DIR / BATCH_* values already set by the caller are kept as-is.
+set -e
+echo "Starting OpenViking with Active Daemon..."
+
+export OV_DAEMON_ENABLED=true
+export OV_DAEMON_WATCH_DIR="${OV_DAEMON_WATCH_DIR:-$HOME/.claude/projects}"
+export OV_DAEMON_BATCH_LINES="${OV_DAEMON_BATCH_LINES:-50}"
+export OV_DAEMON_BATCH_SECONDS="${OV_DAEMON_BATCH_SECONDS:-300}"
+# exec replaces this shell so SIGTERM/SIGINT are delivered to the server itself.
+exec openviking serve "$@"
diff --git a/openviking/daemon/__init__.py b/openviking/daemon/__init__.py
new file mode 100644
index 0000000000..ba94f0d050
--- /dev/null
+++ b/openviking/daemon/__init__.py
@@ -0,0 +1,7 @@
+"""
+OpenViking Active Daemon package.
+Monitors AI tool logs and automatically extracts knowledge into viking:// storage.
+"""
+from openviking.daemon.service import DaemonService
+
+__all__ = ["DaemonService"]
diff --git a/openviking/daemon/conversation_reconstructor.py b/openviking/daemon/conversation_reconstructor.py
new file mode 100644
index 0000000000..179873467d
--- /dev/null
+++ b/openviking/daemon/conversation_reconstructor.py
@@ -0,0 +1,55 @@
+"""
+Reconstruct conversation turns from flat event lists.
+Pairs user prompts with assistant responses into structured ConversationTurn objects.
+"""
+from typing import Dict, List
+
+from openviking.daemon.models import ConversationTurn
+from openviking_cli.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+class ConversationReconstructor:
+ """Reconstructs paired conversation turns from chronological events."""
+
+ def reconstruct(self, events: List[Dict]) -> List[ConversationTurn]:
+ """
+ Pair user prompts with assistant responses.
+ Events are sorted by timestamp. An assistant response is paired
+ with the most recent unpaired user prompt.
+ """
+ turns: List[ConversationTurn] = []
+ current_user_prompt = None
+ current_metadata: Dict = {}
+
+ sorted_events = sorted(events, key=lambda e: e.get("timestamp", ""))
+
+ for event in sorted_events:
+ role = event.get("role")
+ content = event.get("content", "")
+
+ if role == "user":
+ current_user_prompt = content
+ current_metadata = {
+ "session_id": event.get("session_id"),
+ "project_name": event.get("project_name"),
+ "timestamp": event.get("timestamp"),
+ "source_tool": event.get("tool_name"),
+ }
+ elif role == "assistant" and current_user_prompt:
+ turns.append(
+ ConversationTurn(
+ user_prompt=current_user_prompt,
+ assistant_response=content,
+ session_id=current_metadata.get("session_id"),
+ project_name=current_metadata.get("project_name"),
+ timestamp=current_metadata.get("timestamp"),
+ source_tool=current_metadata.get("source_tool"),
+ )
+ )
+ current_user_prompt = None
+ current_metadata = {}
+
+ logger.info("Reconstructed %d conversation turns from %d events", len(turns), len(events))
+ return turns
diff --git a/openviking/daemon/cursor_manager.py b/openviking/daemon/cursor_manager.py
new file mode 100644
index 0000000000..6b76bc2565
--- /dev/null
+++ b/openviking/daemon/cursor_manager.py
@@ -0,0 +1,92 @@
+"""
+Cursor Manager for tracking file read positions.
+Persists state in SQLite so Daemon can resume after restart.
+"""
+import sqlite3
+import time
+from pathlib import Path
+from typing import Dict
+
+from openviking.daemon.models import FileCursor
+from openviking_cli.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+class CursorManager:
+ """Manages file cursor state with SQLite persistence."""
+
+ def __init__(self, db_path: str):
+ self.db_path = db_path
+ self._init_db()
+
+ def _init_db(self):
+ """Initialize the SQLite database and schema."""
+ Path(self.db_path).parent.mkdir(parents=True, exist_ok=True)
+
+ conn = sqlite3.connect(self.db_path)
+ try:
+ conn.execute("""
+ CREATE TABLE IF NOT EXISTS file_cursors (
+ file_path TEXT PRIMARY KEY,
+ last_position INTEGER NOT NULL DEFAULT 0,
+ last_read_time REAL NOT NULL DEFAULT 0.0,
+ updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+ )
+ """)
+ conn.commit()
+ finally:
+ conn.close()
+
+ def get_cursor(self, file_path: str) -> FileCursor:
+ """Get the cursor state for a file. Returns zero-position cursor if not found."""
+ conn = sqlite3.connect(self.db_path)
+ try:
+ row = conn.execute(
+ "SELECT last_position, last_read_time FROM file_cursors WHERE file_path = ?",
+ (file_path,),
+ ).fetchone()
+
+ if row:
+ return FileCursor(
+ file_path=file_path,
+ last_position=row[0],
+ last_read_time=row[1],
+ )
+ return FileCursor(file_path=file_path)
+ finally:
+ conn.close()
+
+ def update_cursor(self, file_path: str, position: int):
+ """Update the cursor position for a file."""
+ conn = sqlite3.connect(self.db_path)
+ try:
+ conn.execute(
+ """
+ INSERT OR REPLACE INTO file_cursors (file_path, last_position, last_read_time)
+ VALUES (?, ?, ?)
+ """,
+ (file_path, position, time.time()),
+ )
+ conn.commit()
+ finally:
+ conn.close()
+
+ def get_all_cursors(self) -> Dict[str, FileCursor]:
+ """Get all tracked cursor states."""
+ conn = sqlite3.connect(self.db_path)
+ try:
+ rows = conn.execute(
+ "SELECT file_path, last_position, last_read_time FROM file_cursors"
+ ).fetchall()
+
+ return {
+ row[0]: FileCursor(
+ file_path=row[0],
+ last_position=row[1],
+ last_read_time=row[2],
+ )
+ for row in rows
+ }
+ finally:
+ conn.close()
diff --git a/openviking/daemon/deduplicator.py b/openviking/daemon/deduplicator.py
new file mode 100644
index 0000000000..9066b57bb4
--- /dev/null
+++ b/openviking/daemon/deduplicator.py
@@ -0,0 +1,35 @@
+"""
+Knowledge deduplication based on content hashing.
+Prevents duplicate knowledge from being written to viking:// storage.
+"""
+import hashlib
+from typing import Set
+
+from openviking.daemon.models import ExtractedKnowledge
+from openviking_cli.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+class KnowledgeDeduplicator:
+ """Deduplicates knowledge items based on MD5 content hash."""
+
+ def __init__(self):
+ self.seen_hashes: Set[str] = set()
+
+ def is_duplicate(self, knowledge: ExtractedKnowledge) -> bool:
+ """Check if this knowledge is a duplicate of something already seen."""
+ content_hash = hashlib.md5(
+ knowledge.content.encode("utf-8")
+ ).hexdigest()
+
+ if content_hash in self.seen_hashes:
+ logger.debug("Duplicate knowledge skipped: %s", knowledge.title)
+ return True
+
+ self.seen_hashes.add(content_hash)
+ return False
+
+ def clear(self):
+ """Clear the dedup cache."""
+ self.seen_hashes.clear()
diff --git a/openviking/daemon/etl_pipeline.py b/openviking/daemon/etl_pipeline.py
new file mode 100644
index 0000000000..35a47315f8
--- /dev/null
+++ b/openviking/daemon/etl_pipeline.py
@@ -0,0 +1,64 @@
+"""
+Batch ETL pipeline for processing conversation events into structured knowledge.
+Orchestrates: Filter -> Reconstruct -> Extract -> Deduplicate
+"""
+import asyncio
+from typing import Dict, List
+
+from openviking.daemon.models import ExtractedKnowledge
+from openviking.daemon.filters import LowValueFilter
+from openviking.daemon.conversation_reconstructor import ConversationReconstructor
+from openviking.daemon.knowledge_extractor import KnowledgeExtractor
+from openviking.daemon.deduplicator import KnowledgeDeduplicator
+from openviking_cli.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+class BatchETLPipeline:
+ """Orchestrates the full ETL flow from raw events to structured knowledge."""
+
+ def __init__(self, vlm_config=None):
+ self.filter = LowValueFilter()
+ self.reconstructor = ConversationReconstructor()
+ self.extractor = KnowledgeExtractor(vlm_config=vlm_config)
+ self.deduplicator = KnowledgeDeduplicator()
+
+ async def process_batch(self, events: List[Dict]) -> List[ExtractedKnowledge]:
+ """
+ Process a batch of raw conversation events.
+
+ Flow: events -> filter -> reconstruct -> extract (parallel) -> deduplicate
+ """
+ logger.info("Processing batch with %d events", len(events))
+
+ # Step 1: Filter low-value content
+ filtered_events = self.filter.apply(events)
+ logger.info("After filtering: %d events", len(filtered_events))
+
+ if not filtered_events:
+ return []
+
+ # Step 2: Reconstruct conversation turns
+ turns = self.reconstructor.reconstruct(filtered_events)
+ logger.info("Reconstructed %d conversation turns", len(turns))
+
+ if not turns:
+ return []
+
+ # Step 3: Extract knowledge in parallel
+ tasks = [self.extractor.extract(turn) for turn in turns]
+ results = await asyncio.gather(*tasks, return_exceptions=True)
+
+ # Step 4: Filter errors and deduplicate
+ extracted: List[ExtractedKnowledge] = []
+ for result in results:
+ if isinstance(result, Exception):
+ logger.error("Extraction failed: %s", result)
+ continue
+
+ if result is not None and not self.deduplicator.is_duplicate(result):
+ extracted.append(result)
+
+ logger.info("Extracted %d knowledge items from batch", len(extracted))
+ return extracted
diff --git a/openviking/daemon/filters.py b/openviking/daemon/filters.py
new file mode 100644
index 0000000000..4a3a58bbe2
--- /dev/null
+++ b/openviking/daemon/filters.py
@@ -0,0 +1,47 @@
+"""
+Rule-based filters for low-value conversations.
+Removes noise before LLM processing to save cost and improve quality.
+"""
+import re
+from typing import Dict, List
+
+from openviking_cli.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+class LowValueFilter:
+    """Filter out low-value events using a length floor and regex rules."""
+
+    # Each pattern is tested with ``re.match`` (i.e. anchored at the start)
+    # against the stripped event content, case-insensitively.
+    NOISE_PATTERNS = [
+        r"^npm\s+(install|update|remove)",
+        r"^yarn\s+(add|remove)",
+        r"^pip\s+(install|uninstall)",
+        r"^git\s+(commit|push|pull|merge)",
+        r"^(SyntaxError|TypeError|ImportError|ModuleNotFoundError)",
+        r"^Retry\s+\d+/",
+        r"^Loading\.+",
+        r"^(format|indent|align)\s+(this|the)\s+code",
+    ]
+
+    # Events whose stripped content is shorter than this are dropped.
+    MIN_CONTENT_LENGTH = 20
+
+    def apply(self, events: List[Dict]) -> List[Dict]:
+        """Return the events that survive the filtering rules.
+
+        Args:
+            events: Event dicts; only the ``"content"`` key is inspected.
+
+        Returns:
+            A new list holding the kept events, in their original order.
+            Events whose content is missing or not a string are dropped
+            instead of raising, so one malformed event cannot abort a batch.
+        """
+        # Compile once per call rather than once per event; reading the class
+        # attribute here keeps subclass overrides of NOISE_PATTERNS working.
+        noise = [re.compile(p, re.IGNORECASE) for p in self.NOISE_PATTERNS]
+        filtered = []
+
+        for event in events:
+            content = event.get("content")
+            if not isinstance(content, str):
+                continue
+            content = content.strip()
+
+            # Rule 1: too short
+            if len(content) < self.MIN_CONTENT_LENGTH:
+                continue
+
+            # Rule 2: noise pattern match
+            if any(pattern.match(content) for pattern in noise):
+                continue
+
+            filtered.append(event)
+
+        logger.debug("Filtered %d events down to %d", len(events), len(filtered))
+        return filtered
diff --git a/openviking/daemon/knowledge_extractor.py b/openviking/daemon/knowledge_extractor.py
new file mode 100644
index 0000000000..260ca2622d
--- /dev/null
+++ b/openviking/daemon/knowledge_extractor.py
@@ -0,0 +1,121 @@
+"""
+LLM-based knowledge extraction from conversations.
+Uses OpenViking's existing VLM configuration for intelligent filtering and summarization.
+"""
+from typing import Dict, Optional
+
+from openviking.daemon.models import ConversationTurn, ExtractedKnowledge
+from openviking_cli.utils.config import get_openviking_config
+from openviking_cli.utils.llm import parse_json_from_response
+from openviking_cli.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+# Rendered with str.format(user_prompt=..., assistant_response=...); literal
+# braces in the JSON examples are therefore doubled.
+KNOWLEDGE_EXTRACTION_PROMPT = """# Role: OpenViking Automated Context Extraction Expert
+
+# Task:
+Analyze the AI-assisted development conversation below and convert it into structured knowledge for viking:// storage.
+
+# Input:
+User question: {user_prompt}
+AI answer: {assistant_response}
+
+# Filtering rules:
+Output {{"status": "IGNORED"}} if the conversation is just:
+- Minor syntax error fixes (missing semicolons, typos)
+- Invalid commands or retry loops
+- Pure code formatting or indentation changes
+- Simple factual questions ("what is Python")
+
+Extract ONLY when the conversation contains:
+- Explicit system configuration decisions
+- Root cause analysis for complex bugs
+- New architectural rules for the project
+- Core development intent
+- Reusable skills or best practices
+
+# Output (strict JSON, no markdown code blocks):
+{{
+  "status": "EXTRACTED" | "IGNORED",
+  "category": "skills" | "memories" | "resources",
+  "confidence": 0.0-1.0,
+  "title": "One-line summary (max 30 chars)",
+  "project_name": "project name or null",
+  "entity_links": ["tech tags", "module names"],
+  "content": "Concise conclusion. What was the problem, what was the solution, why this choice.",
+  "actionable_steps": ["steps if skills, else empty"]
+}}
+"""
+
+
+class KnowledgeExtractor:
+    """Extract structured knowledge from conversation turns using an LLM."""
+
+    # Results whose model-reported confidence is below this are discarded.
+    MIN_CONFIDENCE = 0.6
+    # Hard cap applied to the title returned by the model.
+    MAX_TITLE_LENGTH = 50
+
+    def __init__(self, vlm_config=None):
+        """
+        Args:
+            vlm_config: Optional VLMConfig instance. If None, uses OpenViking's global config.
+                Pass a mock for testing.
+        """
+        self._vlm_config = vlm_config
+
+    def _get_vlm(self):
+        """Return the VLM config, loading OpenViking's global one on first use."""
+        if self._vlm_config is None:
+            self._vlm_config = get_openviking_config().vlm
+        return self._vlm_config
+
+    async def extract(self, turn: ConversationTurn) -> Optional[ExtractedKnowledge]:
+        """Extract knowledge from one conversation turn.
+
+        Args:
+            turn: The user/assistant exchange to analyse.
+
+        Returns:
+            An ``ExtractedKnowledge`` when the model answers with status
+            ``"EXTRACTED"``, a category and a confidence of at least
+            ``MIN_CONFIDENCE``; otherwise ``None``. Errors are logged and also
+            yield ``None``; this method does not raise.
+        """
+        prompt = KNOWLEDGE_EXTRACTION_PROMPT.format(
+            user_prompt=turn.user_prompt,
+            assistant_response=turn.assistant_response,
+        )
+
+        try:
+            response = await self._call_llm(prompt)
+
+            # The parsed JSON may legally be a list or scalar; only an object
+            # can carry the fields read below.
+            if not isinstance(response, dict) or response.get("status") != "EXTRACTED":
+                return None
+
+            confidence = self._coerce_confidence(response.get("confidence"))
+            if confidence < self.MIN_CONFIDENCE:
+                return None
+
+            category = response.get("category")
+            if not category:
+                logger.warning("LLM response is missing a category; skipping")
+                return None
+
+            return ExtractedKnowledge(
+                status=response["status"],
+                category=category,
+                # ``or ""`` covers an explicit JSON null as well as a missing key.
+                title=str(response.get("title") or "")[: self.MAX_TITLE_LENGTH],
+                content=self._clean_content(str(response.get("content") or "")),
+                confidence=confidence,
+                project_name=response.get("project_name"),
+                entity_links=self._as_str_list(response.get("entity_links")),
+                actionable_steps=self._as_str_list(response.get("actionable_steps")),
+                timestamp=turn.timestamp,
+                source_tool=turn.source_tool,
+            )
+
+        except Exception as e:
+            logger.error("Error extracting knowledge: %s", e)
+            return None
+
+    async def _call_llm(self, prompt: str) -> Optional[Dict]:
+        """Call the VLM via ``get_completion_async`` and parse its JSON reply.
+
+        Returns:
+            Whatever ``parse_json_from_response`` yields, or ``None`` when the
+            call or the parsing raises (the error is logged).
+        """
+        try:
+            vlm = self._get_vlm()
+            # get_completion_async returns str when no tools are provided
+            raw_response = await vlm.get_completion_async(prompt=prompt)
+
+            # parse_json_from_response handles markdown code block stripping,
+            # JSON extraction, and json_repair fallback
+            return parse_json_from_response(raw_response)
+
+        except Exception as e:
+            logger.error("LLM call failed: %s", e)
+            return None
+
+    @staticmethod
+    def _coerce_confidence(value) -> float:
+        """Convert a model-supplied confidence to ``float``; 0.0 if unusable."""
+        try:
+            return float(value)
+        except (TypeError, ValueError):
+            return 0.0
+
+    @staticmethod
+    def _as_str_list(value) -> list:
+        """Normalize a model-supplied list field to a list of strings.
+
+        A bare string becomes a one-element list; ``None`` and any other
+        non-list value become an empty list; ``None`` items are dropped.
+        """
+        if isinstance(value, str):
+            return [value]
+        if isinstance(value, (list, tuple)):
+            return [str(item) for item in value if item is not None]
+        return []
+
+    @staticmethod
+    def _clean_content(content: str) -> str:
+        """Remove fenced code blocks (fences and their contents) and trim."""
+        import re
+        content = re.sub(r"```.*?```", "", content, flags=re.DOTALL)
+        return content.strip()
diff --git a/openviking/daemon/knowledge_router.py b/openviking/daemon/knowledge_router.py
new file mode 100644
index 0000000000..cd043d905a
--- /dev/null
+++ b/openviking/daemon/knowledge_router.py
@@ -0,0 +1,56 @@
+"""
+Route extracted knowledge to appropriate viking:// URIs based on category and project.
+"""
+import re
+from typing import Optional
+
+from openviking.daemon.models import ExtractedKnowledge
+from openviking_cli.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+class KnowledgeRouter:
+    """Route knowledge items to viking:// URIs based on category."""
+
+    # Used when sanitizing leaves nothing usable (empty title, "..", etc.).
+    FALLBACK_NAME = "untitled"
+    # Maximum length of a single sanitized path segment.
+    MAX_NAME_LENGTH = 50
+
+    def route(self, knowledge: ExtractedKnowledge) -> Optional[str]:
+        """Determine the target URI for a knowledge item.
+
+        Routing rules:
+        - skills -> viking://skills/<source_tool>/<title>.md
+        - memories (with project) -> viking://memories/projects/<project>/decisions.md
+        - memories (no project) -> viking://memories/global/<title>.md
+        - resources -> viking://resources/<first entity link>/<title>.md
+
+        ``<source_tool>`` and ``<first entity link>`` fall back to
+        ``general`` when absent. Every segment is passed through
+        ``_sanitize_filename``.
+
+        Returns:
+            The URI string, or ``None`` (with a warning logged) for an
+            unknown category.
+        """
+        category = knowledge.category
+        title = self._sanitize_filename(knowledge.title)
+
+        if category == "skills":
+            safe_source = self._sanitize_filename(knowledge.source_tool or "general")
+            return f"viking://skills/{safe_source}/{title}.md"
+
+        if category == "memories":
+            project_name = knowledge.project_name
+            if project_name:
+                safe_project = self._sanitize_filename(project_name)
+                return f"viking://memories/projects/{safe_project}/decisions.md"
+            return f"viking://memories/global/{title}.md"
+
+        if category == "resources":
+            entity_links = knowledge.entity_links
+            tech_stack = self._sanitize_filename(entity_links[0]) if entity_links else "general"
+            return f"viking://resources/{tech_stack}/{title}.md"
+
+        logger.warning("Unknown category: %s", category)
+        return None
+
+    @staticmethod
+    def _sanitize_filename(name: str) -> str:
+        """Sanitize a string for use as a single URI path segment.
+
+        Reserved characters and every whitespace character become ``_``, the
+        result is capped at 50 characters, and leading/trailing dots are
+        removed so that values such as ``..`` cannot act as a parent-directory
+        segment. ``None`` or an empty result yields ``"untitled"``.
+        """
+        text = "" if name is None else str(name)
+        text = re.sub(r'[<>:"/\\|?*]', '_', text)
+        # One underscore per whitespace character, so plain spaces map exactly
+        # as before while newlines and tabs no longer leak into the URI.
+        text = re.sub(r'\s', '_', text.strip())
+        text = text[:KnowledgeRouter.MAX_NAME_LENGTH].strip('.')
+        return text or KnowledgeRouter.FALLBACK_NAME
diff --git a/openviking/daemon/models.py b/openviking/daemon/models.py
new file mode 100644
index 0000000000..f409762b53
--- /dev/null
+++ b/openviking/daemon/models.py
@@ -0,0 +1,88 @@
+"""
+Data models for OpenViking Active Daemon.
+"""
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+
+
+@dataclass
+class FileCursor:
+    """Read position of one watched file, kept for incremental processing."""
+    file_path: str
+    # Position at which the next read of the file should resume.
+    last_position: int = 0
+    last_read_time: float = 0.0
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return the cursor as a plain dict keyed by field name."""
+        keys = ("file_path", "last_position", "last_read_time")
+        return {key: getattr(self, key) for key in keys}
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "FileCursor":
+        """Build a cursor from a dict such as the one ``to_dict`` produces.
+
+        ``file_path`` is required (``KeyError`` if absent); the two counters
+        default to zero when missing.
+        """
+        path = data["file_path"]
+        position = data.get("last_position", 0)
+        read_time = data.get("last_read_time", 0.0)
+        return cls(file_path=path, last_position=position, last_read_time=read_time)
+
+
+@dataclass
+class BatchBuffer:
+    """Buffer that accumulates events until a batch is flushed."""
+    lines: List[Dict[str, Any]] = field(default_factory=list)
+    byte_count: int = 0
+    # time.time() at which the current batch received its first event;
+    # 0.0 while the buffer is empty.
+    created_at: float = 0.0
+
+    def add_line(self, line: Dict[str, Any], byte_size: int):
+        """Append one event and add its size to the running byte count.
+
+        The first event of a batch also stamps ``created_at``; without that
+        stamp the batch never has an age, so an age-based flush trigger can
+        never fire.
+        """
+        if self.created_at <= 0:
+            # Local import: this module only imports dataclasses and typing.
+            import time
+            self.created_at = time.time()
+        self.lines.append(line)
+        self.byte_count += byte_size
+
+    def is_empty(self) -> bool:
+        """Return True when no events are buffered."""
+        return not self.lines
+
+    def clear(self):
+        """Drop all buffered events and reset the counters and the age stamp."""
+        self.lines.clear()
+        self.byte_count = 0
+        self.created_at = 0.0
+
+
+@dataclass
+class ConversationTurn:
+    """One user prompt paired with the assistant response that answered it."""
+    # The two halves of the exchange; both are required.
+    user_prompt: str
+    assistant_response: str
+    # Optional provenance metadata; each defaults to None when unknown.
+    session_id: Optional[str] = None
+    project_name: Optional[str] = None
+    timestamp: Optional[str] = None
+    source_tool: Optional[str] = None
+
+
+@dataclass
+class ExtractedKnowledge:
+    """Structured knowledge item extracted from a conversation."""
+    status: str  # "EXTRACTED" | "IGNORED"
+    category: str  # "skills" | "memories" | "resources"
+    title: str
+    content: str
+    confidence: float = 0.0
+    project_name: Optional[str] = None
+    entity_links: List[str] = field(default_factory=list)
+    actionable_steps: List[str] = field(default_factory=list)
+    timestamp: Optional[str] = None
+    source_tool: Optional[str] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return every field as a plain dict, in declaration order.
+
+        The list values are the instance's own lists, not copies.
+        """
+        names = (
+            "status",
+            "category",
+            "title",
+            "content",
+            "confidence",
+            "project_name",
+            "entity_links",
+            "actionable_steps",
+            "timestamp",
+            "source_tool",
+        )
+        return {name: getattr(self, name) for name in names}
diff --git a/openviking/daemon/service.py b/openviking/daemon/service.py
new file mode 100644
index 0000000000..8ca1c1e2d1
--- /dev/null
+++ b/openviking/daemon/service.py
@@ -0,0 +1,190 @@
+"""
+OpenViking Active Daemon main service.
+Orchestrates multi-tool file watching, ETL processing, and knowledge storage.
+"""
+import asyncio
+import os
+from pathlib import Path
+from typing import List, Optional
+
+from openviking.daemon.cursor_manager import CursorManager
+from openviking.daemon.etl_pipeline import BatchETLPipeline
+from openviking.daemon.storage_adapter import VikingStorageAdapter
+from openviking.daemon.watchers.registry import create_watcher
+from openviking.daemon.watchers import BaseWatcher
+from openviking_cli.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+class DaemonService:
+    """
+    OpenViking Active Daemon main service.
+    Monitors multiple AI tool logs and extracts knowledge into viking:// storage.
+
+    Watchers hand batches of events to ``_enqueue_batch``; a single
+    background task (``_etl_loop``) takes them off an ``asyncio.Queue``, runs
+    the ETL pipeline and writes the results through the storage adapter.
+    """
+
+    def __init__(
+        self,
+        resource_service,
+        watcher_configs: Optional[List] = None,
+        db_path: Optional[str] = None,
+        # Backward-compatible single watcher args
+        watch_dir: Optional[str] = None,
+        batch_trigger_lines: int = 50,
+        batch_trigger_seconds: int = 300,
+    ):
+        """
+        Args:
+            resource_service: Service handed to ``VikingStorageAdapter``.
+            watcher_configs: Watcher configurations. When empty or None, a
+                single ``claude_code`` watcher is configured from the
+                remaining arguments.
+            db_path: Cursor database path; defaults to a file under the
+                user's home directory.
+            watch_dir: Directory for the fallback watcher; defaults to
+                ``~/.claude/projects``.
+            batch_trigger_lines: Line-count trigger for the fallback watcher.
+            batch_trigger_seconds: Age trigger for the fallback watcher.
+        """
+        self.resource_service = resource_service
+
+        home = Path.home()
+        self.db_path = db_path or str(
+            home / ".qoderworkcn" / "openviking" / "daemon_cursors.db"
+        )
+
+        self.batch_trigger_lines = batch_trigger_lines
+        self.batch_trigger_seconds = batch_trigger_seconds
+
+        # Build watcher config list
+        if watcher_configs:
+            self._watcher_configs = watcher_configs
+        else:
+            # Backward compatible: single claude_code watcher
+            from openviking.server.config import WatcherConfig
+            wd = watch_dir or str(home / ".claude" / "projects")
+            self._watcher_configs = [WatcherConfig(
+                tool_name="claude_code",
+                watch_dir=wd,
+                batch_trigger_lines=batch_trigger_lines,
+                batch_trigger_seconds=batch_trigger_seconds,
+            )]
+
+        # Components (created in start())
+        self.cursor_manager: Optional[CursorManager] = None
+        self.watchers: List[BaseWatcher] = []
+        self.etl_pipeline: Optional[BatchETLPipeline] = None
+        self.storage_adapter: Optional[VikingStorageAdapter] = None
+
+        self._running = False
+        self._etl_task: Optional[asyncio.Task] = None
+        self._batch_queue: asyncio.Queue = asyncio.Queue()
+        # Event loop that consumes the queue; captured in start() so that
+        # watcher threads can hand batches over thread-safely.
+        self._loop: Optional[asyncio.AbstractEventLoop] = None
+
+    async def start(self):
+        """Start the ETL loop and every configured watcher.
+
+        A watcher that fails to start is logged and skipped; the others keep
+        running. Calling ``start()`` on a running service is a no-op.
+        """
+        if self._running:
+            logger.warning("Daemon already running; start() ignored")
+            return
+
+        logger.info("Starting OpenViking Active Daemon...")
+        self._loop = asyncio.get_running_loop()
+
+        # Create the directory before the cursor manager is constructed, in
+        # case the manager opens the database file in its constructor.
+        Path(self.db_path).parent.mkdir(parents=True, exist_ok=True)
+
+        self.cursor_manager = CursorManager(self.db_path)
+        self.etl_pipeline = BatchETLPipeline()
+        self.storage_adapter = VikingStorageAdapter(self.resource_service)
+
+        # Mark running before the consumer task exists so its state never
+        # depends on when the task is first scheduled.
+        self._running = True
+        self._etl_task = asyncio.create_task(self._etl_loop())
+
+        # Create and start each watcher
+        for wc in self._watcher_configs:
+            try:
+                # Inside the try: an uncreatable directory must only disable
+                # this watcher, not abort the remaining ones.
+                watch_dir = os.path.expanduser(wc.watch_dir)
+                Path(watch_dir).mkdir(parents=True, exist_ok=True)
+
+                watcher = create_watcher(
+                    tool_name=wc.tool_name,
+                    watch_dir=watch_dir,
+                    cursor_manager=self.cursor_manager,
+                    batch_callback=self._enqueue_batch,
+                    file_pattern=wc.file_pattern,
+                    batch_trigger_lines=wc.batch_trigger_lines,
+                    batch_trigger_seconds=wc.batch_trigger_seconds,
+                    extra=wc.extra,
+                )
+                watcher.start()
+                self.watchers.append(watcher)
+                logger.info("Watcher started: %s -> %s", wc.tool_name, watch_dir)
+            except Exception as e:
+                logger.warning("Failed to start watcher %s: %s", wc.tool_name, e)
+
+        logger.info("Daemon started with %d watcher(s)", len(self.watchers))
+
+    async def stop(self):
+        """Stop all watchers, hand over buffered events, and drain the ETL loop.
+
+        Batches already queued are processed before the loop exits; the loop
+        is cancelled if that takes longer than 10 seconds.
+        """
+        logger.info("Stopping OpenViking Active Daemon...")
+
+        self._running = False
+
+        for watcher in self.watchers:
+            try:
+                watcher.stop()
+            except Exception as e:
+                logger.warning("Error stopping watcher: %s", e)
+            try:
+                # The file watchers advance their cursors when events are
+                # buffered, so anything left unflushed here would be skipped
+                # on the next run.
+                watcher.flush()
+            except Exception as e:
+                logger.warning("Error flushing watcher: %s", e)
+        self.watchers = []
+
+        if self._etl_task:
+            # Sentinel: the loop exits once everything queued before it is done.
+            await self._batch_queue.put(None)
+            try:
+                await asyncio.wait_for(self._etl_task, timeout=10)
+            except asyncio.TimeoutError:
+                self._etl_task.cancel()
+            self._etl_task = None
+
+        logger.info("Daemon stopped")
+
+    def _enqueue_batch(self, events):
+        """Sync callback for watchers: put a batch of events onto the async queue.
+
+        ``asyncio.Queue`` is not thread-safe, so calls arriving from a watcher
+        thread are marshalled onto the owning loop with
+        ``call_soon_threadsafe``. Calls made on the loop thread itself (for
+        example from ``flush()``) enqueue directly, which keeps them ordered
+        ahead of a stop sentinel queued afterwards.
+        """
+        try:
+            loop = self._loop
+            if loop is None:
+                # Not started yet: no consumer and no foreign thread involved.
+                self._batch_queue.put_nowait(events)
+                return
+
+            try:
+                on_loop_thread = asyncio.get_running_loop() is loop
+            except RuntimeError:
+                # No loop running in this thread, i.e. a watcher thread.
+                on_loop_thread = False
+
+            if on_loop_thread:
+                self._batch_queue.put_nowait(events)
+            else:
+                loop.call_soon_threadsafe(self._batch_queue.put_nowait, events)
+        except Exception as e:
+            logger.error("Failed to enqueue batch: %s", e)
+
+    async def _etl_loop(self):
+        """Background loop: process queued batches until the ``None`` sentinel."""
+        logger.info("ETL processing loop started")
+
+        while True:
+            events = await self._batch_queue.get()
+            if events is None:
+                break
+
+            try:
+                await self._store_batch(events)
+            except Exception as e:
+                # One failing batch must not end the loop.
+                logger.error("Error in ETL processing: %s", e, exc_info=True)
+
+        logger.info("ETL processing loop stopped")
+
+    async def _store_batch(self, events):
+        """Run one batch through the ETL pipeline and write each extracted item."""
+        extracted = await self.etl_pipeline.process_batch(events)
+        if not extracted:
+            logger.info("No knowledge extracted from batch")
+            return
+
+        # Imported at call time rather than at module level, as before.
+        from openviking.server.identity import RequestContext, Role
+        from openviking_cli.session.user_id import UserIdentifier
+
+        for knowledge in extracted:
+            try:
+                ctx = RequestContext(
+                    user=UserIdentifier.the_default_user(),
+                    role=Role.ROOT,
+                )
+                success = await self.storage_adapter.write_knowledge(
+                    knowledge, ctx
+                )
+                if success:
+                    logger.info("Successfully wrote: %s", knowledge.title)
+                else:
+                    logger.warning("Failed to write: %s", knowledge.title)
+            except Exception as e:
+                logger.error("Error writing knowledge: %s", e)
+
+    async def flush(self):
+        """Force flush all watchers' buffers."""
+        for watcher in self.watchers:
+            watcher.flush()
+        logger.info("Manual flush triggered for %d watchers", len(self.watchers))
+
+    @property
+    def is_running(self) -> bool:
+        """True between a successful ``start()`` and the next ``stop()``."""
+        return self._running
diff --git a/openviking/daemon/storage_adapter.py b/openviking/daemon/storage_adapter.py
new file mode 100644
index 0000000000..e1a76684ef
--- /dev/null
+++ b/openviking/daemon/storage_adapter.py
@@ -0,0 +1,155 @@
+"""
+Adapter for writing extracted knowledge to OpenViking via ResourceService.
+Generates temporary Markdown files and ingests them through the standard resource pipeline.
+"""
+import os
+import tempfile
+from datetime import datetime
+from typing import Optional
+
+from openviking.daemon.models import ExtractedKnowledge
+from openviking.daemon.knowledge_router import KnowledgeRouter
+from openviking_cli.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+class VikingStorageAdapter:
+    """Writes structured knowledge to OpenViking via ResourceService.add_resource()."""
+
+    def __init__(self, resource_service):
+        """
+        Args:
+            resource_service: Object exposing the awaitable ``add_resource``
+                used to ingest the generated Markdown file.
+        """
+        self.resource_service = resource_service
+        self.router = KnowledgeRouter()
+
+    async def write_knowledge(
+        self,
+        knowledge: ExtractedKnowledge,
+        ctx,
+    ) -> bool:
+        """
+        Write a knowledge item to viking:// storage.
+
+        The item is rendered to a temporary Markdown file, ingested with
+        ``add_resource`` and the temporary file is removed afterwards.
+
+        Args:
+            knowledge: The extracted knowledge to write
+            ctx: OpenViking RequestContext
+
+        Returns:
+            True if successful, False otherwise (unroutable item, rendering
+            failure or ingestion failure; the cause is logged).
+        """
+        uri = self.router.route(knowledge)
+        if not uri:
+            logger.warning("Cannot route knowledge: %s", knowledge.title)
+            return False
+
+        temp_file = None
+        try:
+            temp_file = self._generate_temp_markdown(knowledge)
+            await self.resource_service.add_resource(
+                path=temp_file,
+                ctx=ctx,
+                to=uri,
+                build_index=True,
+                summarize=False,
+            )
+            logger.info("Knowledge ingested: %s", uri)
+            return True
+
+        except Exception as e:
+            logger.error("Failed to ingest knowledge to %s: %s", uri, e)
+            return False
+
+        finally:
+            if temp_file is not None:
+                # Unlink directly instead of exists()+unlink(): no window in
+                # which the file can vanish between the two calls.
+                try:
+                    os.unlink(temp_file)
+                except FileNotFoundError:
+                    pass
+                except OSError as e:
+                    logger.warning("Could not remove temp file %s: %s", temp_file, e)
+
+    def _generate_temp_markdown(self, knowledge: ExtractedKnowledge) -> str:
+        """Render the item to a new temporary ``.md`` file and return its path.
+
+        The caller owns the file and must delete it.
+        """
+        # Render first so a formatting error cannot leave an orphaned file.
+        content = self._format_content(knowledge)
+
+        fd, path = tempfile.mkstemp(suffix=".md")
+        try:
+            with os.fdopen(fd, "w", encoding="utf-8") as f:
+                f.write(content)
+        except Exception:
+            # Do not leave a half-written file behind.
+            try:
+                os.unlink(path)
+            except OSError:
+                pass
+            raise
+
+        return path
+
+    def _format_content(self, knowledge: ExtractedKnowledge) -> str:
+        """Format knowledge content as Markdown based on category.
+
+        ``skills`` and ``memories`` have dedicated layouts; every other
+        category uses the resource layout.
+        """
+        timestamp = datetime.now().isoformat()
+
+        if knowledge.category == "skills":
+            return self._format_skill(knowledge, timestamp)
+        if knowledge.category == "memories":
+            return self._format_memory(knowledge, timestamp)
+        return self._format_resource(knowledge, timestamp)
+
+    @staticmethod
+    def _format_skill(knowledge: ExtractedKnowledge, timestamp: str) -> str:
+        """Format a skill knowledge item, including its numbered steps."""
+        if knowledge.actionable_steps:
+            numbered = "".join(
+                f"{i}. {step}\n"
+                for i, step in enumerate(knowledge.actionable_steps, 1)
+            )
+            steps = "\n## Steps\n\n" + numbered
+        else:
+            steps = "\n## Steps\n\n(No specific steps)\n"
+
+        source = knowledge.source_tool or "Claude Code"
+
+        return f"""# {knowledge.title}
+
+**Extracted**: {timestamp}
+**Source**: {source} session
+**Confidence**: {knowledge.confidence}
+
+## Content
+
+{knowledge.content}
+{steps}
+---
+*Auto-generated by OpenViking Active Daemon*
+"""
+
+    @staticmethod
+    def _format_memory(knowledge: ExtractedKnowledge, timestamp: str) -> str:
+        """Format a memory knowledge item (append-friendly).
+
+        The heading shows the item's own timestamp as ``YYYY-MM-DD HH:MM``
+        when it is a parseable ISO-8601 string; otherwise the extraction
+        ``timestamp`` passed in is used verbatim.
+        """
+        ts = timestamp
+        raw = knowledge.timestamp
+        # Only strings can be ISO timestamps; anything else (e.g. a numeric
+        # epoch) would make fromisoformat() raise TypeError.
+        if isinstance(raw, str) and raw:
+            # datetime.fromisoformat() rejects a trailing "Z" before
+            # Python 3.11, so spell the UTC offset out explicitly.
+            candidate = raw[:-1] + "+00:00" if raw.endswith("Z") else raw
+            try:
+                ts = datetime.fromisoformat(candidate).strftime("%Y-%m-%d %H:%M")
+            except ValueError:
+                ts = timestamp
+
+        tags = ", ".join(knowledge.entity_links) if knowledge.entity_links else "none"
+
+        return f"""## [{ts}] {knowledge.title}
+
+{knowledge.content}
+
+**Tags**: {tags}
+
+---
+
+"""
+
+    @staticmethod
+    def _format_resource(knowledge: ExtractedKnowledge, timestamp: str) -> str:
+        """Format a resource knowledge item; the first entity link is its category."""
+        tech = knowledge.entity_links[0] if knowledge.entity_links else "general"
+        source = knowledge.source_tool or "Claude Code"
+
+        return f"""# {knowledge.title}
+
+**Category**: {tech}
+**Extracted**: {timestamp}
+**Source**: {source} session
+
+## Content
+
+{knowledge.content}
+
+---
+*Auto-generated by OpenViking Active Daemon*
+"""
diff --git a/openviking/daemon/watchers/__init__.py b/openviking/daemon/watchers/__init__.py
new file mode 100644
index 0000000000..450f68faf4
--- /dev/null
+++ b/openviking/daemon/watchers/__init__.py
@@ -0,0 +1,27 @@
+"""
+Watcher abstractions for OpenViking Active Daemon.
+Provides BaseWatcher protocol and watcher registry for multi-tool support.
+"""
+from typing import Protocol, runtime_checkable
+
+
+@runtime_checkable
+class BaseWatcher(Protocol):
+    """Structural interface every tool-specific watcher has to satisfy.
+
+    Because the protocol is ``runtime_checkable``, ``isinstance`` can be used
+    to test whether an object provides these four members.
+    """
+
+    @property
+    def tool_name(self) -> str:
+        """Identifier of the tool this watcher covers (e.g. 'claude_code', 'aider')."""
+        ...
+
+    def start(self) -> None:
+        """Begin watching for file/database changes."""
+        ...
+
+    def stop(self) -> None:
+        """Stop watching and release resources."""
+        ...
+
+    def flush(self) -> None:
+        """Force any buffered events to be flushed."""
+        ...
diff --git a/openviking/daemon/watchers/aider_watcher.py b/openviking/daemon/watchers/aider_watcher.py
new file mode 100644
index 0000000000..cd2e20dc95
--- /dev/null
+++ b/openviking/daemon/watchers/aider_watcher.py
@@ -0,0 +1,148 @@
+"""
+Aider chat history watcher.
+Monitors .aider.chat.history.md files in project directories.
+"""
+import re
+import time
+import os
+from typing import Dict, List, Optional
+
+from openviking.daemon.watchers.base_file_watcher import BaseFileWatcher
+from openviking.daemon.watchers.registry import register_watcher
+from openviking.daemon.models import FileCursor
+from openviking_cli.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@register_watcher("aider")
+class AiderWatcher(BaseFileWatcher):
+    """Watches Aider .aider.chat.history.md files.
+
+    The history is multi-line Markdown, so ``_process_file`` is overridden to
+    parse whole chunks instead of going through ``parse_line``.
+    """
+
+    # Regex patterns
+    HEADER_RE = re.compile(r'^# aider chat started at (.+)$')
+    PROJECT_RE = re.compile(r'^> (.+)$')
+    USER_RE = re.compile(r'^#### user:\s*$')
+    ASSISTANT_RE = re.compile(r'^#### assistant:\s*$')
+
+    def __init__(self, watch_dir, cursor_manager, batch_callback,
+                 file_pattern=".aider.chat.history.md",
+                 batch_trigger_lines=50, batch_trigger_seconds=300, **kwargs):
+        """Forward the common settings to ``BaseFileWatcher``.
+
+        Extra keyword arguments are accepted and ignored so that one factory
+        call signature works for every watcher.
+        """
+        super().__init__(
+            watch_dir=watch_dir,
+            cursor_manager=cursor_manager,
+            batch_callback=batch_callback,
+            file_pattern=file_pattern,
+            batch_trigger_lines=batch_trigger_lines,
+            batch_trigger_seconds=batch_trigger_seconds,
+        )
+
+    @property
+    def tool_name(self) -> str:
+        return "aider"
+
+    def matches_file_pattern(self, file_path: str) -> bool:
+        """Match files named exactly ``.aider.chat.history.md``.
+
+        The ``file_pattern`` constructor argument is not consulted here.
+        """
+        filename = os.path.basename(file_path)
+        return filename == ".aider.chat.history.md"
+
+    def parse_line(self, line: str) -> Optional[Dict]:
+        """Not used - Aider uses multi-line parsing via _process_file override."""
+        return None
+
+    def normalize_event(self, raw_event: Dict) -> Optional[Dict]:
+        """Not used directly - _process_file creates normalized events."""
+        return raw_event
+
+    def _process_file(self, file_path: str):
+        """
+        Override: Parse Aider's multi-line markdown format.
+        Extracts user/assistant conversation blocks from .aider.chat.history.md.
+
+        Reads the bytes appended since the stored cursor, buffers one event
+        per parsed block, then advances the cursor. Errors are logged, not
+        raised.
+
+        NOTE(review): parsing state is not carried across calls, so a block
+        that is split over two reads yields two events.
+        """
+        try:
+            cursor = self.cursor_manager.get_cursor(file_path)
+            start = cursor.last_position
+            file_size = os.path.getsize(file_path)
+
+            if file_size < start:
+                # The file shrank (truncated or replaced): the old offset is
+                # meaningless, so read it again from the beginning.
+                logger.info("[%s] %s shrank; re-reading from start", self.tool_name, file_path)
+                start = 0
+            if file_size == start:
+                return
+
+            # Binary mode keeps the cursor in real byte offsets. Re-encoding
+            # decoded text miscounts whenever newlines were translated or
+            # invalid bytes were replaced.
+            with open(file_path, "rb") as f:
+                f.seek(start)
+                data = f.read()
+
+            # Consume complete lines only; a line still being written is left
+            # for the next modification event.
+            cut = data.rfind(b"\n")
+            if cut < 0:
+                return
+            data = data[:cut + 1]
+            new_position = start + len(data)
+            new_content = data.decode("utf-8", errors="replace")
+
+            # Parse conversation blocks
+            events = self._parse_aider_content(new_content)
+
+            for event in events:
+                event["tool_name"] = self.tool_name
+                byte_size = len(event.get("content", "").encode("utf-8"))
+                if self._buffer.is_empty():
+                    # Stamp the batch start so the age-based trigger can fire.
+                    self._buffer.created_at = time.time()
+                self._buffer.add_line(event, byte_size)
+
+            self.cursor_manager.update_cursor(file_path, new_position)
+            self._check_batch_trigger()
+
+        except Exception as e:
+            logger.error("[%s] Error processing %s: %s", self.tool_name, file_path, e)
+
+    def _parse_aider_content(self, content: str) -> List[Dict]:
+        """Parse Aider markdown content into normalized events.
+
+        A ``#### user:`` or ``#### assistant:`` line opens a block, and the
+        following lines are its content. A session header closes the pending
+        block, and ``> ...`` lines outside a block set the project name.
+        """
+        events = []
+
+        current_role = None
+        current_content = []
+        current_timestamp = None
+        project_name = None
+
+        for line in content.splitlines():
+            # Session header: close the pending block *before* updating the
+            # timestamp, so the block keeps the timestamp of its own session.
+            header_match = self.HEADER_RE.match(line)
+            if header_match:
+                if current_role and current_content:
+                    events.append(self._make_event(current_role, current_content, current_timestamp, project_name))
+                current_role = None
+                current_content = []
+                current_timestamp = header_match.group(1).strip()
+                continue
+
+            # Role markers
+            if self.USER_RE.match(line) or self.ASSISTANT_RE.match(line):
+                if current_role and current_content:
+                    events.append(self._make_event(current_role, current_content, current_timestamp, project_name))
+                current_role = "user" if self.USER_RE.match(line) else "assistant"
+                current_content = []
+                continue
+
+            if current_role:
+                # Inside a message every line is content, including Markdown
+                # blockquotes; they must not overwrite the project name.
+                current_content.append(line)
+                continue
+
+            # Outside any message: a "> ..." line carries the project path.
+            project_match = self.PROJECT_RE.match(line)
+            if project_match:
+                project_name = project_match.group(1).strip()
+
+        # Flush last block
+        if current_role and current_content:
+            events.append(self._make_event(current_role, current_content, current_timestamp, project_name))
+
+        return events
+
+    def _make_event(self, role: str, content_lines: List[str],
+                    timestamp: Optional[str], project_name: Optional[str]) -> Dict:
+        """Create a normalized event dict from parsed content."""
+        content = "\n".join(content_lines).strip()
+        return {
+            "role": role,
+            "content": content,
+            "type": "message",
+            "timestamp": timestamp,
+            "project_name": project_name,
+        }
diff --git a/openviking/daemon/watchers/base_file_watcher.py b/openviking/daemon/watchers/base_file_watcher.py
new file mode 100644
index 0000000000..0e6cce1e0f
--- /dev/null
+++ b/openviking/daemon/watchers/base_file_watcher.py
@@ -0,0 +1,192 @@
+"""
+Abstract base class for file-append based watchers.
+Handles watchdog Observer lifecycle, cursor management, and batch buffering.
+Subclasses only need to implement parse_line() and normalize_event().
+"""
+import os
+import time
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Callable, Dict, List, Optional
+
+from watchdog.events import FileSystemEventHandler, FileModifiedEvent
+from watchdog.observers import Observer
+
+from openviking.daemon.models import BatchBuffer, FileCursor
+from openviking.daemon.cursor_manager import CursorManager
+from openviking_cli.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+class _FileHandler(FileSystemEventHandler):
+    """Watchdog handler that forwards file modifications to its watcher."""
+
+    def __init__(self, watcher: "BaseFileWatcher"):
+        super().__init__()
+        self._watcher = watcher
+
+    def on_modified(self, event):
+        """Process a modified file if it is not a directory and matches the pattern."""
+        if not event.is_directory:
+            changed_path = event.src_path
+            owner = self._watcher
+            if owner.matches_file_pattern(changed_path):
+                owner._process_file(changed_path)
+
+
+class BaseFileWatcher(ABC):
+    """
+    Abstract base for file-append based watchers.
+
+    Subclasses must implement:
+    - tool_name (property): Return tool identifier string
+    - parse_line(line): Parse a raw text line into a raw event dict (or None)
+    - normalize_event(raw_event): Convert raw event to normalized dict (or None to skip)
+
+    Optional overrides:
+    - filter_event(event): Additional filtering. Return True to keep, False to skip.
+    - matches_file_pattern(path): Custom file matching logic.
+
+    Batch triggers are evaluated only after a file has been processed; there
+    is no timer, so an aged batch is flushed on the next file event or on an
+    explicit ``flush()``.
+
+    NOTE(review): the buffer is mutated from the watchdog callback and from
+    ``flush()`` callers without locking - confirm whether a lock is needed.
+    """
+
+    def __init__(
+        self,
+        watch_dir: str,
+        cursor_manager: CursorManager,
+        batch_callback: Callable[[List[Dict]], None],
+        file_pattern: str = "*.jsonl",
+        batch_trigger_lines: int = 50,
+        batch_trigger_seconds: int = 300,
+    ):
+        """
+        Args:
+            watch_dir: Directory to watch recursively; ``~`` is expanded.
+            cursor_manager: Store for per-file read positions.
+            batch_callback: Called with the list of buffered events on flush.
+            file_pattern: Pattern used by ``matches_file_pattern``.
+            batch_trigger_lines: Flush once this many events are buffered.
+            batch_trigger_seconds: Flush once the batch is at least this old.
+        """
+        self.watch_dir = os.path.expanduser(watch_dir)
+        self.cursor_manager = cursor_manager
+        self.batch_callback = batch_callback
+        self.file_pattern = file_pattern
+        self.batch_trigger_lines = batch_trigger_lines
+        self.batch_trigger_seconds = batch_trigger_seconds
+
+        self._buffer = BatchBuffer()
+        self._observer: Optional[Observer] = None
+        self._handler: Optional[_FileHandler] = None
+
+    @property
+    @abstractmethod
+    def tool_name(self) -> str:
+        """Return tool identifier (e.g. 'claude_code', 'aider')."""
+        ...
+
+    @abstractmethod
+    def parse_line(self, line: str) -> Optional[Dict]:
+        """Parse a raw text line into a raw event dict. Return None to skip."""
+        ...
+
+    @abstractmethod
+    def normalize_event(self, raw_event: Dict) -> Optional[Dict]:
+        """
+        Convert a raw event dict to normalized format.
+        Normalized format must have at minimum: role, content, type, tool_name.
+        Return None to skip this event.
+        """
+        ...
+
+    def filter_event(self, event: Dict) -> bool:
+        """Additional filtering. Override for tool-specific rules. Default: keep all."""
+        return True
+
+    def matches_file_pattern(self, file_path: str) -> bool:
+        """Check if the file's base name matches the watcher's file pattern.
+
+        - ``*.ext``: the name ends with ``.ext``.
+        - ``.name``: the name starts with ``.name`` or equals the pattern
+          with its leading dots removed.
+        - anything else: exact name match.
+        """
+        filename = os.path.basename(file_path)
+        if self.file_pattern.startswith("*."):
+            return filename.endswith(self.file_pattern[1:])
+        elif self.file_pattern.startswith("."):
+            return filename.startswith(self.file_pattern) or filename == self.file_pattern.lstrip(".")
+        return filename == self.file_pattern
+
+    def start(self) -> None:
+        """Start a daemon watchdog Observer on ``watch_dir`` (recursive)."""
+        self._handler = _FileHandler(self)
+        self._observer = Observer()
+        self._observer.schedule(self._handler, self.watch_dir, recursive=True)
+        self._observer.daemon = True
+        self._observer.start()
+        logger.info("[%s] Watcher started on %s", self.tool_name, self.watch_dir)
+
+    def stop(self) -> None:
+        """Stop the watchdog Observer, waiting up to 5 seconds for its thread."""
+        if self._observer:
+            self._observer.stop()
+            self._observer.join(timeout=5)
+        logger.info("[%s] Watcher stopped", self.tool_name)
+
+    def flush(self) -> None:
+        """Force flush the buffer."""
+        self._flush_buffer()
+
+    def _process_file(self, file_path: str):
+        """Read the bytes appended since the cursor; parse, normalize and buffer them.
+
+        Only complete lines are consumed. Errors are logged, not raised, and
+        leave the cursor untouched.
+        """
+        try:
+            cursor = self.cursor_manager.get_cursor(file_path)
+            start = cursor.last_position
+            file_size = os.path.getsize(file_path)
+
+            if file_size < start:
+                # The file shrank (truncated or replaced): the old offset is
+                # meaningless, so read it again from the beginning.
+                logger.info("[%s] %s shrank; re-reading from start", self.tool_name, file_path)
+                start = 0
+            if file_size == start:
+                return
+
+            # Binary mode keeps the cursor in real byte offsets. Re-encoding
+            # decoded text miscounts whenever newlines were translated or
+            # invalid bytes were replaced.
+            with open(file_path, "rb") as f:
+                f.seek(start)
+                data = f.read()
+
+            # Consume complete lines only: a record still being written would
+            # fail to parse and be skipped for good once the cursor passed it.
+            cut = data.rfind(b"\n")
+            if cut < 0:
+                return
+            data = data[:cut + 1]
+            new_position = start + len(data)
+            new_content = data.decode("utf-8", errors="replace")
+
+            # split("\n") rather than splitlines(): the latter also breaks on
+            # characters such as U+2028, which may occur inside a record.
+            for line in new_content.split("\n"):
+                line = line.strip()
+                if not line:
+                    continue
+
+                raw_event = self.parse_line(line)
+                if raw_event is None:
+                    continue
+
+                normalized = self.normalize_event(raw_event)
+                if normalized is None:
+                    continue
+
+                if not self.filter_event(normalized):
+                    continue
+
+                # Ensure tool_name is set
+                normalized["tool_name"] = self.tool_name
+
+                self._buffer_event(normalized, len(line.encode("utf-8")))
+
+            self.cursor_manager.update_cursor(file_path, new_position)
+
+            self._check_batch_trigger()
+
+        except Exception as e:
+            logger.error("[%s] Error processing file %s: %s", self.tool_name, file_path, e)
+
+    def _buffer_event(self, event: Dict, byte_size: int):
+        """Append an event to the buffer, stamping the batch start if needed."""
+        if self._buffer.is_empty():
+            # Without this stamp the age computed in _check_batch_trigger is
+            # always 0 and the time-based trigger can never fire.
+            self._buffer.created_at = time.time()
+        self._buffer.add_line(event, byte_size)
+
+    def _check_batch_trigger(self):
+        """Flush when the buffer holds enough events or is old enough."""
+        if self._buffer.is_empty():
+            return
+
+        line_count = len(self._buffer.lines)
+        age = time.time() - self._buffer.created_at if self._buffer.created_at > 0 else 0
+
+        if line_count >= self.batch_trigger_lines or age >= self.batch_trigger_seconds:
+            self._flush_buffer()
+
+    def _flush_buffer(self):
+        """Hand the buffered events to ``batch_callback`` and clear the buffer.
+
+        A failing callback is logged; the events are not re-buffered.
+        """
+        if self._buffer.is_empty():
+            return
+
+        events = self._buffer.lines.copy()
+        self._buffer.clear()
+
+        logger.info("[%s] Flushing %d events", self.tool_name, len(events))
+        try:
+            self.batch_callback(events)
+        except Exception as e:
+            logger.error("[%s] Batch callback failed: %s", self.tool_name, e)
diff --git a/openviking/daemon/watchers/claude_code_watcher.py b/openviking/daemon/watchers/claude_code_watcher.py
new file mode 100644
index 0000000000..4174d559b7
--- /dev/null
+++ b/openviking/daemon/watchers/claude_code_watcher.py
@@ -0,0 +1,60 @@
+"""
+Claude Code JSONL log watcher.
+Monitors ~/.claude/projects//.jsonl files.
+"""
+import json
+from typing import Dict, List, Optional
+
+from openviking.daemon.watchers.base_file_watcher import BaseFileWatcher
+from openviking.daemon.watchers.registry import register_watcher
+from openviking_cli.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@register_watcher("claude_code")
+class ClaudeCodeWatcher(BaseFileWatcher):
+    """Watches Claude Code JSONL log files and extracts conversation events.
+
+    Each line is parsed as one JSON object. Only ``user``/``assistant`` events
+    whose ``type`` is absent, empty, or ``"message"`` are kept.
+    """
+
+    @property
+    def tool_name(self) -> str:
+        """Tool identifier of this watcher (the name it is registered under)."""
+        return "claude_code"
+
+    def __init__(self, watch_dir, cursor_manager, batch_callback,
+                 batch_trigger_lines=50, batch_trigger_seconds=300, **kwargs):
+        """Create the watcher with the file pattern fixed to ``*.jsonl``.
+
+        Extra keyword arguments are accepted but not used.
+        """
+        super().__init__(
+            watch_dir=watch_dir,
+            cursor_manager=cursor_manager,
+            batch_callback=batch_callback,
+            file_pattern="*.jsonl",
+            batch_trigger_lines=batch_trigger_lines,
+            batch_trigger_seconds=batch_trigger_seconds,
+        )
+
+    def parse_line(self, line: str) -> Optional[Dict]:
+        """Parse a JSONL line into a raw event dict.
+
+        Returns ``None`` for an empty line, for invalid JSON, and for valid
+        JSON that is not an object.
+        """
+        if not line:
+            return None
+        try:
+            data = json.loads(line)
+        except (json.JSONDecodeError, ValueError):
+            return None
+        # json.loads also accepts numbers, strings, lists, ...; normalize_event
+        # calls .get() on the result, so anything but a dict must be dropped here.
+        if not isinstance(data, dict):
+            return None
+        return data
+
+    def normalize_event(self, raw_event: Dict) -> Optional[Dict]:
+        """Normalize Claude Code event to common format.
+
+        Returns ``None`` unless ``role`` is ``user`` or ``assistant`` and the
+        event ``type`` is missing, empty, or ``"message"``.
+        """
+        role = raw_event.get("role")
+        event_type = raw_event.get("type", "")
+
+        if role not in ("user", "assistant"):
+            return None
+        if event_type and event_type != "message":
+            return None
+
+        return {
+            "role": role,
+            "content": raw_event.get("content", ""),
+            "type": "message",
+            "timestamp": raw_event.get("timestamp"),
+            "session_id": raw_event.get("session_id"),
+            "project_name": raw_event.get("project_name"),
+        }
diff --git a/openviking/daemon/watchers/continue_dev_watcher.py b/openviking/daemon/watchers/continue_dev_watcher.py
new file mode 100644
index 0000000000..0d87b64324
--- /dev/null
+++ b/openviking/daemon/watchers/continue_dev_watcher.py
@@ -0,0 +1,68 @@
+"""
+Continue.dev log watcher.
+Monitors ~/.continue/ JSON log files for AI conversation events.
+"""
+import json
+from typing import Dict, List, Optional
+
+from openviking.daemon.watchers.base_file_watcher import BaseFileWatcher
+from openviking.daemon.watchers.registry import register_watcher
+from openviking_cli.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@register_watcher("continue_dev")
+class ContinueDevWatcher(BaseFileWatcher):
+    """Watcher for Continue.dev JSON logs, parsed one JSON object per line."""
+
+    def __init__(self, watch_dir, cursor_manager, batch_callback,
+                 file_pattern="*.json",
+                 batch_trigger_lines=50, batch_trigger_seconds=300, **kwargs):
+        """Forward the common watcher options to the base class; extra kwargs are unused."""
+        base_options = dict(
+            watch_dir=watch_dir,
+            cursor_manager=cursor_manager,
+            batch_callback=batch_callback,
+            file_pattern=file_pattern,
+            batch_trigger_lines=batch_trigger_lines,
+            batch_trigger_seconds=batch_trigger_seconds,
+        )
+        super().__init__(**base_options)
+
+    @property
+    def tool_name(self) -> str:
+        """Tool identifier of this watcher."""
+        return "continue_dev"
+
+    def parse_line(self, line: str) -> Optional[Dict]:
+        """Decode one log line; return the dict, or None if it is empty, invalid, or not an object."""
+        if not line:
+            return None
+        try:
+            parsed = json.loads(line)
+        except (json.JSONDecodeError, ValueError):
+            return None
+        return parsed if isinstance(parsed, dict) else None
+
+    def normalize_event(self, raw_event: Dict) -> Optional[Dict]:
+        """
+        Convert a Continue.dev event to the common message format.
+        Expected input shape:
+        {"role": "user"|"assistant", "content": "...", "timestamp": "..."}
+        Events with another role or with empty content yield None.
+        """
+        role = raw_event.get("role", "")
+        content = raw_event.get("content", "")
+        if role not in ("user", "assistant") or not content:
+            return None
+
+        # Either the camelCase or the snake_case session key may be present.
+        session_id = raw_event.get("sessionId") or raw_event.get("session_id")
+        return {
+            "role": role,
+            "content": content,
+            "type": "message",
+            "timestamp": raw_event.get("timestamp"),
+            "session_id": session_id,
+            "project_name": raw_event.get("workspaceDirectory"),
+        }
diff --git a/openviking/daemon/watchers/cursor_watcher.py b/openviking/daemon/watchers/cursor_watcher.py
new file mode 100644
index 0000000000..6224fc67d7
--- /dev/null
+++ b/openviking/daemon/watchers/cursor_watcher.py
@@ -0,0 +1,87 @@
+"""
+Cursor IDE log watcher.
+Monitors Cursor log files (JSON format) for AI conversation events.
+"""
+import json
+from typing import Dict, List, Optional
+
+from openviking.daemon.watchers.base_file_watcher import BaseFileWatcher
+from openviking.daemon.watchers.registry import register_watcher
+from openviking_cli.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@register_watcher("cursor")
+class CursorWatcher(BaseFileWatcher):
+    """
+    Watches Cursor IDE log files.
+
+    Cursor stores AI conversations in log files under its workspace storage.
+    The format varies by version, so we implement flexible parsing.
+    """
+
+    # Raw role/event values accepted for each normalized role.
+    _USER_ROLES = ("user", "human", "human_turn")
+    _ASSISTANT_ROLES = ("assistant", "ai", "ai_response", "bot")
+
+    def __init__(self, watch_dir, cursor_manager, batch_callback,
+                 file_pattern="*.log",
+                 batch_trigger_lines=50, batch_trigger_seconds=300,
+                 extra=None, **kwargs):
+        """Forward the common options to the base class and keep ``extra`` (default: empty dict)."""
+        super().__init__(
+            watch_dir=watch_dir,
+            cursor_manager=cursor_manager,
+            batch_callback=batch_callback,
+            file_pattern=file_pattern,
+            batch_trigger_lines=batch_trigger_lines,
+            batch_trigger_seconds=batch_trigger_seconds,
+        )
+        self.extra = extra or {}
+
+    @property
+    def tool_name(self) -> str:
+        """Tool identifier of this watcher."""
+        return "cursor"
+
+    def parse_line(self, line: str) -> Optional[Dict]:
+        """Parse a Cursor log line (JSON format).
+
+        Returns ``None`` for an empty line, invalid JSON, or JSON that is not an object.
+        """
+        if not line:
+            return None
+        try:
+            data = json.loads(line)
+            if not isinstance(data, dict):
+                return None
+            return data
+        except (json.JSONDecodeError, ValueError):
+            return None
+
+    def normalize_event(self, raw_event: Dict) -> Optional[Dict]:
+        """
+        Normalize Cursor log event.
+        Cursor logs use various schemas. We look for common patterns:
+        - {"type": "chat", "role": "user"|"assistant", "message": "..."}
+        - {"event": "ai_response", "content": "..."}
+
+        The speaker is taken from ``role``; when ``role`` is missing or empty
+        the ``event`` field is used instead, so the second pattern above is
+        matched. Returns ``None`` for unknown speakers or empty content.
+        """
+        speaker = raw_event.get("role") or raw_event.get("event") or ""
+
+        # Tuple membership (not a dict lookup) so an unhashable value is simply rejected.
+        if speaker in self._USER_ROLES:
+            role = "user"
+        elif speaker in self._ASSISTANT_ROLES:
+            role = "assistant"
+        else:
+            return None
+
+        # The text lives under different keys depending on the log schema.
+        content = raw_event.get("message") or raw_event.get("content") or raw_event.get("text") or ""
+        if not content:
+            return None
+
+        return {
+            "role": role,
+            "content": content,
+            "type": "message",
+            "timestamp": raw_event.get("timestamp") or raw_event.get("ts"),
+            "session_id": raw_event.get("conversationId") or raw_event.get("session_id"),
+        }
diff --git a/openviking/daemon/watchers/generic_jsonl_watcher.py b/openviking/daemon/watchers/generic_jsonl_watcher.py
new file mode 100644
index 0000000000..1bc1b1da62
--- /dev/null
+++ b/openviking/daemon/watchers/generic_jsonl_watcher.py
@@ -0,0 +1,104 @@
+"""
+Generic JSONL log watcher.
+Supports arbitrary JSONL logs with customizable field mapping.
+"""
+import json
+from typing import Dict, List, Optional
+
+from openviking.daemon.watchers.base_file_watcher import BaseFileWatcher
+from openviking.daemon.watchers.registry import register_watcher
+from openviking_cli.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@register_watcher("generic_jsonl")
+class GenericJSONLWatcher(BaseFileWatcher):
+    """
+    Generic JSONL watcher with customizable field mapping.
+
+    Mapping options (passed via the ``extra`` dict; other keyword arguments
+    are accepted but not used):
+    - role_field (str): JSON key for role. Default: "role"
+    - user_role_value (str): Value that indicates user. Default: "user"
+    - assistant_role_value (str): Value that indicates assistant. Default: "assistant"
+    - content_field (str): JSON key for content. Default: "content"
+    - timestamp_field (str): JSON key for timestamp. Default: "timestamp"
+    - session_id_field (str): JSON key for session_id. Default: "session_id"
+    - project_name_field (str): JSON key for project_name. Default: "project_name"
+    - type_field (str): JSON key for event type. Default: "type"
+    - message_type_value (str): Value that indicates a message event. Default: "message"
+    """
+
+    # Default field mappings
+    DEFAULTS = {
+        "role_field": "role",
+        "user_role_value": "user",
+        "assistant_role_value": "assistant",
+        "content_field": "content",
+        "timestamp_field": "timestamp",
+        "session_id_field": "session_id",
+        "project_name_field": "project_name",
+        "type_field": "type",
+        "message_type_value": "message",
+    }
+
+    def __init__(self, watch_dir, cursor_manager, batch_callback,
+                 file_pattern="*.jsonl",
+                 batch_trigger_lines=50, batch_trigger_seconds=300,
+                 extra=None, **kwargs):
+        """Forward the common options to the base class and build the field mapping.
+
+        ``self.mapping`` is ``DEFAULTS`` overridden by the entries of ``extra``.
+        """
+        super().__init__(
+            watch_dir=watch_dir,
+            cursor_manager=cursor_manager,
+            batch_callback=batch_callback,
+            file_pattern=file_pattern,
+            batch_trigger_lines=batch_trigger_lines,
+            batch_trigger_seconds=batch_trigger_seconds,
+        )
+        self.extra = extra or {}
+        self.mapping = {**self.DEFAULTS, **self.extra}
+
+    @property
+    def tool_name(self) -> str:
+        """Tool identifier of this watcher."""
+        return "generic_jsonl"
+
+    def parse_line(self, line: str) -> Optional[Dict]:
+        """Parse one JSONL line; return ``None`` if it is empty, invalid, or not a JSON object."""
+        if not line:
+            return None
+        try:
+            data = json.loads(line)
+        except (json.JSONDecodeError, ValueError):
+            return None
+        # normalize_event calls .get() on the result, so scalars and lists
+        # (valid JSON, but not events) must be rejected here.
+        if not isinstance(data, dict):
+            return None
+        return data
+
+    def normalize_event(self, raw_event: Dict) -> Optional[Dict]:
+        """Map a raw event onto the common message format using ``self.mapping``.
+
+        Returns ``None`` when the role matches neither configured role value,
+        when the content is empty, or when the configured type field is present
+        in the event with a value other than the configured message type.
+        """
+        m = self.mapping
+
+        role_field = m["role_field"]
+        raw_role = raw_event.get(role_field, "")
+
+        # Map raw role to normalized role
+        if raw_role == m["user_role_value"]:
+            role = "user"
+        elif raw_role == m["assistant_role_value"]:
+            role = "assistant"
+        else:
+            return None
+
+        content = raw_event.get(m["content_field"], "")
+        if not content:
+            return None
+
+        # Optional type check: only enforced when the event carries the type field.
+        type_field = m.get("type_field")
+        message_type = m.get("message_type_value")
+        if type_field and type_field in raw_event:
+            if raw_event[type_field] != message_type:
+                return None
+
+        return {
+            "role": role,
+            "content": content,
+            "type": "message",
+            "timestamp": raw_event.get(m["timestamp_field"]),
+            "session_id": raw_event.get(m["session_id_field"]),
+            "project_name": raw_event.get(m["project_name_field"]),
+        }
diff --git a/openviking/daemon/watchers/registry.py b/openviking/daemon/watchers/registry.py
new file mode 100644
index 0000000000..18a436347d
--- /dev/null
+++ b/openviking/daemon/watchers/registry.py
@@ -0,0 +1,72 @@
+"""
+Watcher registry for creating tool-specific watchers by name.
+"""
+from typing import Dict, Type
+
+from openviking.daemon.watchers import BaseWatcher
+from openviking_cli.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+# Registry mapping tool names to watcher classes
+_WATCHER_REGISTRY: Dict[str, Type] = {}
+
+
+def register_watcher(tool_name: str):
+    """Return a class decorator that stores the class in the registry under ``tool_name``."""
+    def _register(watcher_cls):
+        # Store, then hand the class back unchanged so it stays usable as defined.
+        _WATCHER_REGISTRY[tool_name] = watcher_cls
+        return watcher_cls
+
+    return _register
+
+
+def create_watcher(tool_name: str, **kwargs) -> BaseWatcher:
+    """Factory: instantiate the watcher class registered for ``tool_name``.
+
+    All keyword arguments are passed to the class constructor. Raises
+    ``ValueError`` listing the registered names when ``tool_name`` is unknown.
+    """
+    watcher_cls = _WATCHER_REGISTRY.get(tool_name)
+    if watcher_cls is not None:
+        return watcher_cls(**kwargs)
+
+    available = list(_WATCHER_REGISTRY.keys())
+    raise ValueError(f"Unknown watcher tool: '{tool_name}'. Available: {available}")
+
+
+def list_available_watchers() -> list:
+    """Return the registered watcher tool names as a new list."""
+    names = [*_WATCHER_REGISTRY.keys()]
+    return names
+
+
+def _register_builtins():
+    """Register built-in watchers. Called lazily to avoid import cycles.
+
+    Does nothing when the registry already has entries. A watcher whose module
+    cannot be imported (or does not yet expose its class) is skipped so the
+    remaining built-ins stay available; the reason is logged at debug level
+    instead of being dropped silently.
+    """
+    if _WATCHER_REGISTRY:
+        return
+
+    import importlib
+
+    # (tool name, module path, class name), in registration order.
+    builtin_watchers = (
+        ("claude_code", "openviking.daemon.watchers.claude_code_watcher", "ClaudeCodeWatcher"),
+        ("generic_jsonl", "openviking.daemon.watchers.generic_jsonl_watcher", "GenericJSONLWatcher"),
+        ("aider", "openviking.daemon.watchers.aider_watcher", "AiderWatcher"),
+        ("cursor", "openviking.daemon.watchers.cursor_watcher", "CursorWatcher"),
+        ("continue_dev", "openviking.daemon.watchers.continue_dev_watcher", "ContinueDevWatcher"),
+    )
+
+    for tool_name, module_path, class_name in builtin_watchers:
+        try:
+            module = importlib.import_module(module_path)
+        except ImportError as exc:
+            logger.debug("Built-in watcher '%s' not registered: %s", tool_name, exc)
+            continue
+
+        # A module that is still being imported (import cycle) is returned
+        # without its class; `from module import Class` raised ImportError in
+        # that case, so treat a missing attribute the same way.
+        watcher_cls = getattr(module, class_name, None)
+        if watcher_cls is None:
+            logger.debug(
+                "Built-in watcher '%s' not registered: %s has no attribute %s",
+                tool_name, module_path, class_name,
+            )
+            continue
+
+        _WATCHER_REGISTRY[tool_name] = watcher_cls
+
+
+_register_builtins()
diff --git a/openviking/server/app.py b/openviking/server/app.py
index f221b289bd..31811504ee 100644
--- a/openviking/server/app.py
+++ b/openviking/server/app.py
@@ -34,6 +34,7 @@
code_router,
console_router,
content_router,
+ daemon_router,
debug_router,
filesystem_router,
metrics_router,
@@ -276,6 +277,33 @@ async def _oauth_gc_loop(store) -> None: # noqa: ANN001
task_tracker = get_task_tracker()
task_tracker.start_cleanup_loop()
+ # Start Active Daemon if enabled
+ daemon_service = None
+ if config.daemon.enabled:
+ try:
+ from openviking.daemon.service import DaemonService
+ from openviking.server.config import DaemonConfig
+
+ daemon_config = config.daemon
+ if not daemon_config.enabled:
+ daemon_config = DaemonConfig.from_env()
+
+ if daemon_config.enabled:
+ resource_service = service.resources
+ daemon_service = DaemonService(
+ resource_service=resource_service,
+ watcher_configs=daemon_config.get_effective_watchers(),
+ db_path=daemon_config.db_path,
+ batch_trigger_lines=daemon_config.batch_trigger_lines,
+ batch_trigger_seconds=daemon_config.batch_trigger_seconds,
+ )
+ await daemon_service.start()
+ from openviking.server.routers.daemon import set_daemon_service
+ set_daemon_service(daemon_service)
+ logger.info("Active Daemon started with %d watcher(s)", len(daemon_service.watchers))
+ except Exception as e:
+ logger.warning("Failed to start Active Daemon: %s", e)
+
# Initialize tracing and OTLP log export from server.observability.
from openviking.telemetry import tracer_module
@@ -290,6 +318,16 @@ async def _oauth_gc_loop(store) -> None: # noqa: ANN001
await _initialize_runtime_state(app, service, config)
yield
+ # Stop Active Daemon on shutdown
+ if daemon_service is not None:
+ try:
+ from openviking.server.routers.daemon import set_daemon_service
+ set_daemon_service(None)
+ await daemon_service.stop()
+ logger.info("Active Daemon stopped")
+ except Exception as e:
+ logger.warning("Failed to stop Active Daemon: %s", e)
+
# Cleanup
from openviking.metrics.global_api import shutdown_metrics_async
from openviking.observability.usage_audit import shutdown_usage_audit
@@ -539,6 +577,7 @@ async def general_error_handler(request: Request, exc: Exception):
app.include_router(watches_router)
app.include_router(webdav_router)
app.include_router(bot_router, prefix="/bot/v1")
+ app.include_router(daemon_router)
# OAuth 2.1: when enabled, mount the official MCP SDK auth routes
# (DCR / authorize / token / metadata) plus our authorize page + consent /
diff --git a/openviking/server/bootstrap.py b/openviking/server/bootstrap.py
index 7fa0228e25..c48aff8964 100644
--- a/openviking/server/bootstrap.py
+++ b/openviking/server/bootstrap.py
@@ -17,7 +17,7 @@
import uvicorn
from openviking.server.app import create_app
-from openviking.server.config import load_server_config
+from openviking.server.config import DaemonConfig, load_server_config
from openviking_cli.utils.config import OPENVIKING_CONFIG_ENV
from openviking_cli.utils.config.config_loader import resolve_config_path
from openviking_cli.utils.config.consts import (
@@ -169,6 +169,12 @@ def main():
dest="with_bot",
help="Enable Bot API proxy to Vikingbot (requires Vikingbot running)",
)
+ parser.add_argument(
+ "--with-daemon",
+ action="store_true",
+ dest="with_daemon",
+ help="Enable Active Daemon for automatic knowledge extraction from AI tool logs",
+ )
parser.add_argument(
"--bot-port",
type=int,
@@ -243,6 +249,8 @@ def main():
config.workers = args.workers
if args.with_bot:
config.with_bot = True
+ if args.with_daemon:
+ config.daemon.enabled = True
# Configure logging for Uvicorn
configure_uvicorn_logging()
diff --git a/openviking/server/config.py b/openviking/server/config.py
index 88943f8f4a..4a6becdd8a 100644
--- a/openviking/server/config.py
+++ b/openviking/server/config.py
@@ -3,7 +3,8 @@
"""Server configuration for OpenViking HTTP Server."""
import sys
-from typing import Dict, List, Literal, Optional
+from pathlib import Path
+from typing import Any, Dict, List, Literal, Optional
from pydantic import BaseModel, Field, ValidationError
@@ -156,6 +157,84 @@ class ToolOutputExternalizationConfig(BaseModel):
model_config = {"extra": "forbid"}
+class WatcherConfig(BaseModel):
+ """Configuration for a single tool watcher.
+
+ Unknown keys are rejected (``extra`` is set to ``"forbid"`` in ``model_config``).
+ """
+
+ # Tool identifier, e.g. "claude_code" (the value DaemonConfig uses for its fallback watcher).
+ tool_name: str
+ # Directory the watcher is pointed at.
+ watch_dir: str
+ # File name pattern; presumably glob-style -- confirm against the watcher's matching code.
+ file_pattern: str = "*.jsonl"
+ # Disabled entries are filtered out by DaemonConfig.get_effective_watchers().
+ enabled: bool = True
+ # Batch trigger thresholds; both are validated to be > 0.
+ batch_trigger_lines: int = Field(50, gt=0)
+ batch_trigger_seconds: int = Field(300, gt=0)
+ # Free-form additional options; presumably forwarded to the watcher -- verify against the caller.
+ extra: Dict[str, Any] = Field(default_factory=dict)
+
+ model_config = {"extra": "forbid"}
+
+
+class DaemonConfig(BaseModel):
+    """Configuration for OpenViking Active Daemon."""
+
+    # Daemon is off unless explicitly enabled.
+    enabled: bool = False
+    # Optional database path; None leaves the choice to the consumer of this config.
+    db_path: Optional[str] = None
+    # Explicit watcher list; when empty, get_effective_watchers() builds a fallback.
+    watchers: List[WatcherConfig] = Field(default_factory=list)
+
+    # Backward-compatible deprecated fields
+    watch_dir: Optional[str] = None
+    batch_trigger_lines: int = Field(50, gt=0)
+    batch_trigger_seconds: int = Field(300, gt=0)
+
+    model_config = {"extra": "forbid"}
+
+    def get_effective_watchers(self) -> List[WatcherConfig]:
+        """
+        Return list of enabled watchers.
+        If watchers list is empty, create a single claude_code watcher for
+        backward compatibility: on ``watch_dir`` when it is set, otherwise on
+        ``~/.claude/projects``.
+        """
+        if self.watchers:
+            return [w for w in self.watchers if w.enabled]
+
+        # Both fallbacks differ only in the directory being watched.
+        watch_dir = self.watch_dir or str(Path.home() / ".claude" / "projects")
+        return [WatcherConfig(
+            tool_name="claude_code",
+            watch_dir=watch_dir,
+            batch_trigger_lines=self.batch_trigger_lines,
+            batch_trigger_seconds=self.batch_trigger_seconds,
+        )]
+
+    @classmethod
+    def from_env(cls) -> "DaemonConfig":
+        """Load configuration from OV_DAEMON_* environment variables.
+
+        ``OV_DAEMON_WATCHERS`` must be a JSON array of watcher objects; any
+        other value is ignored with a warning and the watcher list stays empty.
+        ``OV_DAEMON_ENABLED`` enables the daemon only when it equals "true"
+        (case-insensitive).
+
+        Raises:
+            ValueError: if OV_DAEMON_BATCH_LINES or OV_DAEMON_BATCH_SECONDS is
+                not an integer (from ``int()``), or fails model validation.
+        """
+        import json as json_lib
+        import logging
+        import os
+
+        enabled = os.getenv("OV_DAEMON_ENABLED", "false").lower() == "true"
+        watchers_json = os.getenv("OV_DAEMON_WATCHERS")
+
+        watchers = []
+        if watchers_json:
+            try:
+                raw = json_lib.loads(watchers_json)
+                if not isinstance(raw, list):
+                    raise TypeError("expected a JSON array of watcher objects")
+                watchers = [WatcherConfig(**w) for w in raw]
+            except (json_lib.JSONDecodeError, ValueError, TypeError) as exc:
+                # TypeError: an entry that is not a JSON object cannot be
+                # expanded with ** and previously escaped this handler.
+                logging.getLogger(__name__).warning(
+                    "Ignoring invalid OV_DAEMON_WATCHERS: %s", exc
+                )
+                watchers = []
+
+        return cls(
+            enabled=enabled,
+            watch_dir=os.getenv("OV_DAEMON_WATCH_DIR"),
+            db_path=os.getenv("OV_DAEMON_DB_PATH"),
+            batch_trigger_lines=int(os.getenv("OV_DAEMON_BATCH_LINES", "50")),
+            batch_trigger_seconds=int(os.getenv("OV_DAEMON_BATCH_SECONDS", "300")),
+            watchers=watchers,
+        )
+
+
class ServerConfig(BaseModel):
host: str = "127.0.0.1"
port: int = 1933
@@ -180,6 +259,7 @@ class ServerConfig(BaseModel):
tool_output_externalization: ToolOutputExternalizationConfig = Field(
default_factory=ToolOutputExternalizationConfig
)
+ daemon: DaemonConfig = Field(default_factory=DaemonConfig)
model_config = {"extra": "forbid"}
diff --git a/openviking/server/routers/__init__.py b/openviking/server/routers/__init__.py
index ded23d4892..2d7e98903a 100644
--- a/openviking/server/routers/__init__.py
+++ b/openviking/server/routers/__init__.py
@@ -7,6 +7,7 @@
from openviking.server.routers.code import router as code_router
from openviking.server.routers.console import router as console_router
from openviking.server.routers.content import router as content_router
+from openviking.server.routers.daemon import router as daemon_router
from openviking.server.routers.debug import router as debug_router
from openviking.server.routers.filesystem import router as filesystem_router
from openviking.server.routers.metrics import router as metrics_router
@@ -28,6 +29,7 @@
"admin_router",
"bot_router",
"code_router",
+ "daemon_router",
"system_router",
"resources_router",
"filesystem_router",
diff --git a/openviking/server/routers/daemon.py b/openviking/server/routers/daemon.py
new file mode 100644
index 0000000000..7474e09d33
--- /dev/null
+++ b/openviking/server/routers/daemon.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
+"""Daemon status API endpoints."""
+from typing import Any, Dict, List, Optional
+
+from fastapi import APIRouter
+
+from openviking_cli.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+router = APIRouter(prefix="/api/v1/daemon", tags=["daemon"])
+
+# Module-level reference to the running DaemonService
+_daemon_service = None
+
+
+def set_daemon_service(service: Optional[Any]) -> None:
+ """Called by app.py lifespan to register the daemon service.
+
+ Stores ``service`` in the module-level ``_daemon_service`` reference;
+ passing ``None`` clears it.
+ """
+ global _daemon_service
+ _daemon_service = service
+
+
+@router.get("/status")
+async def get_daemon_status() -> Dict[str, Any]:
+    """
+    Get multi-watcher daemon status.
+
+    Returns:
+        {
+            "enabled": bool,
+            "running": bool,
+            "watchers": [...],
+            "available_tools": [...],
+            "db_path": str | null
+        }
+
+    When no daemon service is registered, ``enabled`` and ``db_path`` come from
+    the OV_DAEMON_* environment variables and ``watchers`` is empty. The
+    ``cursor_count`` reported for each watcher is the total number of cursors
+    held by the service's cursor manager, not a per-watcher figure.
+    """
+    from openviking.daemon.watchers.registry import list_available_watchers
+
+    if _daemon_service is None:
+        # Daemon not running — return config-based fallback
+        from openviking.server.config import DaemonConfig
+
+        config = DaemonConfig.from_env()
+        return {
+            "enabled": config.enabled,
+            "running": False,
+            "watchers": [],
+            "available_tools": list_available_watchers(),
+            "db_path": config.db_path,
+        }
+
+    svc = _daemon_service
+
+    # The count does not depend on the watcher, so query the cursor manager
+    # once instead of once per loop iteration.
+    cursor_count = 0
+    try:
+        if svc.cursor_manager:
+            cursor_count = len(svc.cursor_manager.get_all_cursors())
+    except Exception:
+        # Best effort: status must still be served, but leave a trace.
+        logger.debug("Could not read daemon cursor count", exc_info=True)
+
+    watcher_statuses: List[Dict[str, Any]] = []
+    for i, watcher in enumerate(svc.watchers):
+        # Watchers and their configs are paired by position; a watcher
+        # without a matching config reports None for the config fields.
+        wc = svc._watcher_configs[i] if i < len(svc._watcher_configs) else None
+        watcher_statuses.append({
+            "tool_name": watcher.tool_name,
+            "watch_dir": wc.watch_dir if wc else None,
+            "file_pattern": wc.file_pattern if wc else None,
+            "enabled": True,
+            "running": True,
+            "cursor_count": cursor_count,
+            "batch_trigger_lines": wc.batch_trigger_lines if wc else None,
+            "batch_trigger_seconds": wc.batch_trigger_seconds if wc else None,
+        })
+
+    return {
+        "enabled": True,
+        "running": svc.is_running,
+        "watchers": watcher_statuses,
+        "available_tools": list_available_watchers(),
+        "db_path": svc.db_path,
+    }
diff --git a/tests/daemon/__init__.py b/tests/daemon/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/daemon/test_aider_watcher.py b/tests/daemon/test_aider_watcher.py
new file mode 100644
index 0000000000..6ea7721d08
--- /dev/null
+++ b/tests/daemon/test_aider_watcher.py
@@ -0,0 +1,183 @@
+"""Tests for AiderWatcher parsing logic."""
+import time
+from typing import Dict, Optional
+
+from openviking.daemon.watchers.aider_watcher import AiderWatcher
+
+
+class FakeCursorManager:
+    """In-memory cursor store for tests; also records every update call."""
+
+    def __init__(self):
+        self.cursors = {}
+        self.updates = []
+
+    def get_cursor(self, file_path):
+        """Return the stored cursor for ``file_path`` or a fresh default one."""
+        from openviking.daemon.models import FileCursor
+
+        fallback = FileCursor(file_path=file_path)
+        return self.cursors.get(file_path, fallback)
+
+    def update_cursor(self, file_path, position):
+        """Store a new cursor at ``position`` and log the call in ``updates``."""
+        from openviking.daemon.models import FileCursor
+
+        refreshed = FileCursor(
+            file_path=file_path,
+            last_position=position,
+            last_read_time=time.time(),
+        )
+        self.cursors[file_path] = refreshed
+        self.updates.append((file_path, position))
+
+
+def _make_watcher(tmp_path, batch_trigger_lines=50, batch_trigger_seconds=300):
+    """Build an AiderWatcher on ``tmp_path``; return (watcher, batches, cursor manager)."""
+    collected = []
+
+    def _collect(events):
+        collected.append(events)
+
+    fake_cursors = FakeCursorManager()
+    aider_watcher = AiderWatcher(
+        watch_dir=str(tmp_path),
+        cursor_manager=fake_cursors,
+        batch_callback=_collect,
+        batch_trigger_lines=batch_trigger_lines,
+        batch_trigger_seconds=batch_trigger_seconds,
+    )
+    return aider_watcher, collected, fake_cursors
+
+
+def test_matches_file_pattern(tmp_path):
+    """Only the dot-prefixed Aider history file name is accepted."""
+    watcher = _make_watcher(tmp_path)[0]
+
+    assert watcher.matches_file_pattern("/project/.aider.chat.history.md")
+    for rejected in ("/project/other.md", "/project/aider.chat.history.md"):
+        assert not watcher.matches_file_pattern(rejected)
+
+
+def test_parse_single_user_assistant_block(tmp_path):
+    """One user block followed by one assistant block yields two events."""
+    watcher = _make_watcher(tmp_path)[0]
+    transcript = "".join([
+        "# aider chat started at 2024-01-15 10:30:00\n",
+        "\n",
+        "> /path/to/project\n",
+        "\n",
+        "#### user:\n",
+        "How do I implement a REST API in Flask?\n",
+        "\n",
+        "#### assistant:\n",
+        "Here's how to create a basic Flask REST API:\n",
+        "Use Flask and add routes.\n",
+    ])
+
+    parsed = watcher._parse_aider_content(transcript)
+
+    assert len(parsed) == 2
+    question, answer = parsed
+    assert question["role"] == "user"
+    assert question["content"] == "How do I implement a REST API in Flask?"
+    assert answer["role"] == "assistant"
+    assert "Flask REST API" in answer["content"]
+
+
+def test_parse_multiple_blocks(tmp_path):
+    """Alternating user/assistant blocks come back in order with their text."""
+    watcher = _make_watcher(tmp_path)[0]
+    transcript = "".join([
+        "# aider chat started at 2024-01-15 10:30:00\n",
+        "> /my/project\n",
+        "#### user:\n",
+        "First question\n",
+        "#### assistant:\n",
+        "First answer\n",
+        "#### user:\n",
+        "Second question\n",
+        "#### assistant:\n",
+        "Second answer\n",
+    ])
+    expected = [
+        ("user", "First question"),
+        ("assistant", "First answer"),
+        ("user", "Second question"),
+        ("assistant", "Second answer"),
+    ]
+
+    parsed = watcher._parse_aider_content(transcript)
+
+    assert len(parsed) == 4
+    for event, (role, text) in zip(parsed, expected):
+        assert event["role"] == role
+        assert event["content"] == text
+
+
+def test_timestamp_and_project_extraction(tmp_path):
+    """Header timestamp and project path are attached to the parsed event."""
+    watcher = _make_watcher(tmp_path)[0]
+    transcript = "".join([
+        "# aider chat started at 2024-01-15 10:30:00\n",
+        "> /home/user/myproject\n",
+        "#### user:\n",
+        "Hello\n",
+    ])
+
+    parsed = watcher._parse_aider_content(transcript)
+
+    assert len(parsed) == 1
+    only_event = parsed[0]
+    assert only_event["timestamp"] == "2024-01-15 10:30:00"
+    assert only_event["project_name"] == "/home/user/myproject"
+
+
+def test_empty_content_handling(tmp_path):
+    """Empty input and a header-only transcript both produce no events."""
+    watcher = _make_watcher(tmp_path)[0]
+
+    nothing = watcher._parse_aider_content("")
+    assert nothing == []
+
+    header_only = watcher._parse_aider_content("# aider chat started at 2024-01-15 10:30:00\n")
+    assert header_only == []
+
+
+def test_multiline_content(tmp_path):
+    """Consecutive lines of one block are joined with newlines."""
+    watcher = _make_watcher(tmp_path)[0]
+    transcript = "".join([
+        "#### user:\n",
+        "Line one\n",
+        "Line two\n",
+        "Line three\n",
+    ])
+
+    parsed = watcher._parse_aider_content(transcript)
+
+    assert len(parsed) == 1
+    assert parsed[0]["content"] == "Line one\nLine two\nLine three"
+
+
+def test_process_file_integration(tmp_path):
+    """Processing a history file with two messages flushes one tagged batch of two."""
+    watcher, batches, _ = _make_watcher(tmp_path, batch_trigger_lines=2)
+
+    history = tmp_path / ".aider.chat.history.md"
+    transcript = "".join([
+        "# aider chat started at 2024-01-15 10:30:00\n",
+        "> /project\n",
+        "#### user:\n",
+        "Hello\n",
+        "#### assistant:\n",
+        "Hi there\n",
+    ])
+    history.write_text(transcript, encoding="utf-8")
+
+    watcher._process_file(str(history))
+
+    assert len(batches) == 1
+    batch = batches[0]
+    assert len(batch) == 2
+    assert batch[0]["role"] == "user"
+    assert batch[0]["tool_name"] == "aider"
+    assert batch[1]["role"] == "assistant"
+
+
+def test_incremental_read(tmp_path):
+    """Content appended after a first pass is picked up by the second pass."""
+    watcher, batches, _ = _make_watcher(tmp_path, batch_trigger_lines=100)
+
+    history = tmp_path / ".aider.chat.history.md"
+    history_path = str(history)
+    history.write_text(
+        "".join(["#### user:\n", "First message\n"]),
+        encoding="utf-8",
+    )
+    watcher._process_file(history_path)
+
+    # Append more content
+    with open(history_path, "a", encoding="utf-8") as handle:
+        handle.write("".join(["#### assistant:\n", "Response\n"]))
+    watcher._process_file(history_path)
+
+    # The line threshold is never reached, so flush explicitly.
+    watcher.flush()
+    assert len(batches) == 1
+    batch = batches[0]
+    assert len(batch) == 2
+    assert batch[0]["role"] == "user"
+    assert batch[1]["role"] == "assistant"
+
+
+def test_tool_name(tmp_path):
+    """The Aider watcher reports ``aider`` as its tool name."""
+    watcher = _make_watcher(tmp_path)[0]
+    assert watcher.tool_name == "aider"
diff --git a/tests/daemon/test_base_file_watcher.py b/tests/daemon/test_base_file_watcher.py
new file mode 100644
index 0000000000..badefba21f
--- /dev/null
+++ b/tests/daemon/test_base_file_watcher.py
@@ -0,0 +1,163 @@
+"""Tests for BaseFileWatcher abstraction."""
+import os
+import time
+from typing import Dict, List, Optional
+from unittest.mock import MagicMock
+
+from openviking.daemon.watchers.base_file_watcher import BaseFileWatcher
+
+
+class ConcreteWatcher(BaseFileWatcher):
+    """Minimal JSON-lines subclass used to exercise BaseFileWatcher."""
+
+    @property
+    def tool_name(self) -> str:
+        """Fixed tool name checked by the tests."""
+        return "test_tool"
+
+    def parse_line(self, line: str) -> Optional[Dict]:
+        """Decode ``line`` as JSON; undecodable lines give None."""
+        import json
+
+        try:
+            decoded = json.loads(line)
+        except (json.JSONDecodeError, ValueError):
+            return None
+        return decoded
+
+    def normalize_event(self, raw_event: Dict) -> Optional[Dict]:
+        """Keep only user/assistant events and reduce them to the common keys."""
+        role = raw_event.get("role")
+        content = raw_event.get("content", "")
+        if role in ("user", "assistant"):
+            return {
+                "role": role,
+                "content": content,
+                "type": "message",
+                "timestamp": raw_event.get("timestamp"),
+            }
+        return None
+
+
+class FakeCursorManager:
+    """In-memory cursor store standing in for the real cursor manager."""
+
+    def __init__(self):
+        self.cursors = {}
+
+    def get_cursor(self, file_path):
+        """Return the stored cursor for ``file_path`` or a fresh default one."""
+        from openviking.daemon.models import FileCursor
+
+        fallback = FileCursor(file_path=file_path)
+        return self.cursors.get(file_path, fallback)
+
+    def update_cursor(self, file_path, position):
+        """Replace the cursor for ``file_path`` with one at ``position``."""
+        from openviking.daemon.models import FileCursor
+
+        refreshed = FileCursor(
+            file_path=file_path,
+            last_position=position,
+            last_read_time=time.time(),
+        )
+        self.cursors[file_path] = refreshed
+
+
+def _make_watcher(tmp_path, batch_trigger_lines=50, batch_trigger_seconds=300):
+    """Build a ConcreteWatcher on ``tmp_path``; return (watcher, batches, cursor manager)."""
+    collected = []
+
+    def _collect(events):
+        collected.append(events)
+
+    fake_cursors = FakeCursorManager()
+    concrete = ConcreteWatcher(
+        watch_dir=str(tmp_path),
+        cursor_manager=fake_cursors,
+        batch_callback=_collect,
+        file_pattern="*.jsonl",
+        batch_trigger_lines=batch_trigger_lines,
+        batch_trigger_seconds=batch_trigger_seconds,
+    )
+    return concrete, collected, fake_cursors
+
+
+def test_matches_file_pattern_jsonl(tmp_path):
+    """The ``*.jsonl`` pattern accepts .jsonl paths and rejects .txt paths."""
+    watcher = _make_watcher(tmp_path)[0]
+    accepted = watcher.matches_file_pattern("/foo/bar.jsonl")
+    assert accepted
+    rejected = watcher.matches_file_pattern("/foo/bar.txt")
+    assert not rejected
+
+
+def test_matches_file_pattern_exact(tmp_path):
+    """A pattern without wildcards matches only that exact file name."""
+    watcher = _make_watcher(tmp_path)[0]
+    watcher.file_pattern = "history.md"
+
+    assert watcher.matches_file_pattern("/foo/history.md")
+    assert not watcher.matches_file_pattern("/foo/other.md")
+
+
+def test_process_file_parses_and_normalizes(tmp_path):
+    """Two valid lines reach the line threshold and are flushed as one batch."""
+    watcher, batches, _ = _make_watcher(tmp_path, batch_trigger_lines=2)
+
+    # Create a test file
+    log_file = tmp_path / "test.jsonl"
+    payload = "".join([
+        '{"role": "user", "content": "hello"}\n',
+        '{"role": "assistant", "content": "hi there"}\n',
+    ])
+    log_file.write_text(payload)
+
+    # Process it
+    watcher._process_file(str(log_file))
+
+    # Should have flushed (2 lines >= batch_trigger_lines=2)
+    assert len(batches) == 1
+    batch = batches[0]
+    assert len(batch) == 2
+    assert batch[0]["role"] == "user"
+    assert batch[0]["tool_name"] == "test_tool"
+    assert batch[1]["role"] == "assistant"
+
+
+def test_process_file_skips_invalid_lines(tmp_path):
+    """Unparseable lines and non user/assistant roles are left out of the batch."""
+    watcher, batches, _ = _make_watcher(tmp_path, batch_trigger_lines=100)
+
+    log_file = tmp_path / "test.jsonl"
+    payload = "".join([
+        '{"role": "user", "content": "hello"}\n',
+        'not valid json\n',
+        '{"role": "tool", "content": "skipped"}\n',
+    ])
+    log_file.write_text(payload)
+
+    watcher._process_file(str(log_file))
+    watcher.flush()
+
+    assert len(batches) == 1
+    batch = batches[0]
+    assert len(batch) == 1  # only user message, tool role is filtered
+    assert batch[0]["role"] == "user"
+
+
+def test_incremental_read_via_cursor(tmp_path):
+    """A second pass over an appended file adds the new event to the first one."""
+    watcher, batches, _cursors = _make_watcher(tmp_path, batch_trigger_lines=100)
+
+    log_file = tmp_path / "test.jsonl"
+    log_path = str(log_file)
+    log_file.write_text('{"role": "user", "content": "first"}\n')
+    watcher._process_file(log_path)
+
+    # Append more content
+    with open(log_path, "a", encoding="utf-8") as handle:
+        handle.write('{"role": "assistant", "content": "second"}\n')
+    watcher._process_file(log_path)
+
+    watcher.flush()
+    assert len(batches) == 1
+    assert len(batches[0]) == 2
+
+
+def test_filter_event_override(tmp_path):
+    """Events rejected by an overridden ``filter_event`` never reach the batch."""
+
+    class FilteredWatcher(ConcreteWatcher):
+        def filter_event(self, event):
+            # Reject any event whose content mentions "skip".
+            text = event.get("content", "")
+            return "skip" not in text
+
+    collected = []
+
+    def _collect(events):
+        collected.append(events)
+
+    watcher = FilteredWatcher(
+        watch_dir=str(tmp_path),
+        cursor_manager=FakeCursorManager(),
+        batch_callback=_collect,
+        file_pattern="*.jsonl",
+        batch_trigger_lines=100,
+        batch_trigger_seconds=300,
+    )
+
+    log_file = tmp_path / "test.jsonl"
+    payload = "".join([
+        '{"role": "user", "content": "keep this"}\n',
+        '{"role": "user", "content": "skip this please"}\n',
+    ])
+    log_file.write_text(payload)
+
+    watcher._process_file(str(log_file))
+    watcher.flush()
+
+    assert len(collected) == 1
+    batch = collected[0]
+    assert len(batch) == 1
+    assert batch[0]["content"] == "keep this"
diff --git a/tests/daemon/test_claude_code_watcher.py b/tests/daemon/test_claude_code_watcher.py
new file mode 100644
index 0000000000..a31b03e21a
--- /dev/null
+++ b/tests/daemon/test_claude_code_watcher.py
@@ -0,0 +1,134 @@
+"""Tests for ClaudeCodeWatcher parsing and filtering logic."""
+import json
+import os
+import tempfile
+
+from openviking.daemon.watchers.claude_code_watcher import ClaudeCodeWatcher
+
+
+class FakeCursorManager:
+    """In-memory CursorManager stub: stores cursors and logs every update."""
+    def __init__(self):
+        from openviking.daemon.models import FileCursor
+        self._cursor_cls, self.cursors, self.updates = FileCursor, {}, []
+
+    def get_cursor(self, file_path):
+        return self.cursors.get(file_path, self._cursor_cls(file_path=file_path))
+
+    def update_cursor(self, file_path, position):
+        self.cursors[file_path] = self._cursor_cls(file_path=file_path, last_position=position)
+        self.updates.append((file_path, position))
+
+
+def _make_watcher(batch_trigger_lines=50, batch_trigger_seconds=300):
+    """Return (watcher, captured batches, fake cursor manager) for one test."""
+    captured, fake_cursors = [], FakeCursorManager()
+    claude_watcher = ClaudeCodeWatcher(
+        watch_dir=tempfile.gettempdir(),
+        cursor_manager=fake_cursors,
+        batch_callback=lambda lines: captured.append(lines),
+        batch_trigger_lines=batch_trigger_lines,
+        batch_trigger_seconds=batch_trigger_seconds,
+    )
+    return claude_watcher, captured, fake_cursors
+
+
+def test_tool_name():
+    """The watcher reports ``claude_code`` as its tool name."""
+    assert _make_watcher()[0].tool_name == "claude_code"
+
+
+def test_parse_valid_jsonl_line():
+    """parse_line() returns the fields of a well-formed JSONL record."""
+    raw = '{"timestamp": "2026-06-15T10:30:00Z", "role": "user", "content": "Hello", "type": "message"}'
+    parsed = _make_watcher()[0].parse_line(raw)
+    assert parsed is not None
+    assert parsed["role"] == "user"
+    assert parsed["content"] == "Hello"
+
+
+def test_parse_invalid_line():
+    """parse_line() returns None for non-JSON text and for an empty line."""
+    parse = _make_watcher()[0].parse_line
+    assert all(parse(bad) is None for bad in ("not valid json", ""))
+
+
+def test_normalize_event_user_message():
+    """A user message keeps its role, type and content after normalization."""
+    payload = {"role": "user", "type": "message", "content": "Hello", "timestamp": "2026-06-15T10:30:00Z"}
+    normalized = _make_watcher()[0].normalize_event(payload)
+    assert normalized is not None
+    assert normalized["role"] == "user"
+    assert normalized["type"] == "message"
+    assert normalized["content"] == "Hello"
+
+
+def test_normalize_event_assistant_message():
+    """An assistant message keeps its role and content after normalization."""
+    payload = {"role": "assistant", "type": "message", "content": "AI answer"}
+    normalized = _make_watcher()[0].normalize_event(payload)
+    assert normalized is not None
+    assert normalized["role"] == "assistant"
+    assert normalized["content"] == "AI answer"
+
+
+def test_normalize_event_excludes_system_role():
+    """System-role messages are dropped (normalize_event returns None)."""
+    payload = {"role": "system", "type": "message", "content": "System msg"}
+    assert _make_watcher()[0].normalize_event(payload) is None
+
+
+def test_normalize_event_excludes_tool_call():
+    """Events of type ``tool_call`` are dropped (normalize_event returns None)."""
+    payload = {"role": "assistant", "type": "tool_call", "content": "call"}
+    assert _make_watcher()[0].normalize_event(payload) is None
+
+
+def test_normalize_event_excludes_tool_result():
+    """Events of type ``tool_result`` are dropped (normalize_event returns None)."""
+    payload = {"role": "assistant", "type": "tool_result", "content": "result"}
+    assert _make_watcher()[0].normalize_event(payload) is None
+
+
+def test_filter_event_keeps_messages():
+    """filter_event() returns True for an ordinary user message."""
+    message = {"role": "user", "type": "message", "content": "Hello", "tool_name": "claude_code"}
+    assert _make_watcher()[0].filter_event(message) is True
+
+
+def test_process_file():
+    """_process_file() reads, parses, normalizes and buffers events from disk."""
+    watcher, flushed, fake_cursors = _make_watcher(batch_trigger_lines=2)
+    records = [
+        {"role": "user", "type": "message", "content": "Hello"},
+        {"role": "assistant", "type": "message", "content": "Hi"},
+        {"role": "system", "type": "message", "content": "ignored"},
+    ]
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False, encoding="utf-8") as handle:
+        handle.writelines(json.dumps(record) + "\n" for record in records)
+        log_path = handle.name
+
+    try:
+        watcher._process_file(log_path)
+        # Two valid events reach batch_trigger_lines=2, so one flush happens.
+        assert len(flushed) == 1
+        assert [event["role"] for event in flushed[0]] == ["user", "assistant"]
+        assert all(event["tool_name"] == "claude_code" for event in flushed[0])
+        # Exactly one cursor update, recorded against the file just processed.
+        assert len(fake_cursors.updates) == 1
+        assert fake_cursors.updates[0][0] == log_path
+    finally:
+        os.unlink(log_path)
+
+
+def test_force_flush_empty_buffer():
+    watcher, flushed, _ = _make_watcher()
+    watcher.flush()  # nothing is buffered, so the batch callback must stay silent
+    assert flushed == []
+
+
+def test_force_flush_with_data():
+    watcher, flushed, _ = _make_watcher()
+    watcher._buffer.add_line({"role": "user", "content": "test", "tool_name": "claude_code"}, 10)
+    watcher.flush()  # one directly buffered event -> one callback invocation
+    assert len(flushed) == 1
diff --git a/tests/daemon/test_config.py b/tests/daemon/test_config.py
new file mode 100644
index 0000000000..3118d45b61
--- /dev/null
+++ b/tests/daemon/test_config.py
@@ -0,0 +1,85 @@
+"""Tests for WatcherConfig and DaemonConfig multi-watcher support."""
+import pytest
+from openviking.server.config import WatcherConfig, DaemonConfig
+
+
+def test_watcher_config_defaults():
+    """Fields not passed to WatcherConfig fall back to their defaults."""
+    config = WatcherConfig(tool_name="test", watch_dir="/tmp/test")
+    assert (config.file_pattern, config.batch_trigger_lines) == ("*.jsonl", 50)
+    assert config.enabled is True
+    assert config.extra == {}
+
+
+def test_watcher_config_custom():
+    """Explicitly supplied values are readable back from the config."""
+    overrides = {
+        "tool_name": "aider",
+        "watch_dir": "~/Projects",
+        "file_pattern": ".aider.chat.history.md",
+        "batch_trigger_lines": 100,
+        "extra": {"key": "value"},
+    }
+    config = WatcherConfig(**overrides)
+    assert all(getattr(config, f) == overrides[f] for f in ("tool_name", "file_pattern", "extra"))
+
+
+def test_watcher_config_forbid_extra():
+    """Passing a field WatcherConfig does not declare raises ValueError."""
+    pytest.raises(ValueError, WatcherConfig, tool_name="test", watch_dir="/tmp", unknown_field="x")
+
+
+def test_daemon_config_get_effective_watchers_explicit():
+    """Explicitly configured watchers are returned in declaration order."""
+    declared = [
+        WatcherConfig(tool_name="claude_code", watch_dir="/a"),
+        WatcherConfig(tool_name="aider", watch_dir="/b"),
+    ]
+
+    daemon_config = DaemonConfig(enabled=True, watchers=declared)
+    active = daemon_config.get_effective_watchers()
+
+    assert len(active) == 2
+    assert [watcher.tool_name for watcher in active] == ["claude_code", "aider"]
+
+
+def test_daemon_config_get_effective_watchers_disabled_filtered():
+    """Watchers declared with enabled=False are left out of the result."""
+    declared = [
+        WatcherConfig(tool_name="claude_code", watch_dir="/a"),
+        WatcherConfig(tool_name="aider", watch_dir="/b", enabled=False),
+    ]
+
+    active = DaemonConfig(enabled=True, watchers=declared).get_effective_watchers()
+
+    assert len(active) == 1
+    assert active[0].tool_name == "claude_code"
+
+
+def test_daemon_config_backward_compat_watch_dir():
+    """A bare watch_dir yields a single claude_code watcher on that directory."""
+    legacy_config = DaemonConfig(enabled=True, watch_dir="~/.claude/projects")
+    active = legacy_config.get_effective_watchers()
+    assert len(active) == 1
+    assert (active[0].tool_name, active[0].watch_dir) == ("claude_code", "~/.claude/projects")
+
+
+def test_daemon_config_backward_compat_default():
+    """With no watchers configured, a single claude_code watcher is used."""
+    active = DaemonConfig(enabled=True).get_effective_watchers()
+    assert len(active) == 1
+    assert active[0].tool_name == "claude_code"
+
+
+def test_daemon_config_from_env_watchers(monkeypatch):
+    """DaemonConfig.from_env() reads OV_DAEMON_ENABLED and OV_DAEMON_WATCHERS."""
+    # monkeypatch restores any pre-existing values on teardown, whereas the
+    # earlier os.environ.pop() cleanup silently discarded them.
+    monkeypatch.setenv("OV_DAEMON_ENABLED", "true")
+    monkeypatch.setenv(
+        "OV_DAEMON_WATCHERS", '[{"tool_name": "aider", "watch_dir": "/tmp"}]'
+    )
+    cfg = DaemonConfig.from_env()
+    assert cfg.enabled is True
+    assert len(cfg.watchers) == 1
+    assert cfg.watchers[0].tool_name == "aider"
diff --git a/tests/daemon/test_continue_dev_watcher.py b/tests/daemon/test_continue_dev_watcher.py
new file mode 100644
index 0000000000..2a6e01dc14
--- /dev/null
+++ b/tests/daemon/test_continue_dev_watcher.py
@@ -0,0 +1,193 @@
+"""Tests for ContinueDevWatcher parsing and normalization logic."""
+import time
+from typing import Dict, Optional
+
+from openviking.daemon.watchers.continue_dev_watcher import ContinueDevWatcher
+
+
+class FakeCursorManager:
+    """In-memory cursor store that also logs each (path, position) update."""
+    def __init__(self):
+        self.cursors = {}
+        self.updates = []
+
+    def get_cursor(self, file_path):
+        from openviking.daemon.models import FileCursor
+        if file_path in self.cursors:
+            return self.cursors[file_path]
+        return FileCursor(file_path=file_path)
+
+    def update_cursor(self, file_path, position):
+        from openviking.daemon.models import FileCursor
+        stamped = FileCursor(file_path=file_path, last_position=position, last_read_time=time.time())
+        self.cursors[file_path] = stamped
+        # Keep a chronological record so tests can assert on cursor writes.
+        self.updates.append((file_path, position))
+
+
+def _make_watcher(tmp_path, batch_trigger_lines=50, batch_trigger_seconds=300):
+    """Return (watcher, captured batches, fake cursor manager) rooted at tmp_path."""
+    captured, fake_cursors = [], FakeCursorManager()
+    continue_watcher = ContinueDevWatcher(
+        watch_dir=str(tmp_path),
+        cursor_manager=fake_cursors,
+        batch_callback=lambda events: captured.append(events),
+        batch_trigger_lines=batch_trigger_lines,
+        batch_trigger_seconds=batch_trigger_seconds,
+    )
+    return continue_watcher, captured, fake_cursors
+
+
+def test_parse_line_valid_json(tmp_path):
+    """parse_line() returns the fields of a well-formed JSON object."""
+    watcher = _make_watcher(tmp_path)[0]
+    parsed = watcher.parse_line('{"role": "user", "content": "Hello"}')
+    assert parsed is not None
+    assert parsed["role"] == "user"
+    assert parsed["content"] == "Hello"
+
+
+def test_parse_line_invalid_json(tmp_path):
+    """Malformed, empty and whitespace-only lines all parse to None."""
+    parse = _make_watcher(tmp_path)[0].parse_line
+    for bad_line in ("not valid json", "", " "):
+        assert parse(bad_line) is None
+
+
+def test_parse_line_non_dict(tmp_path):
+    """Valid JSON that is not an object (string, array, number) parses to None."""
+    parse = _make_watcher(tmp_path)[0].parse_line
+    for non_object in ('"just a string"', "[1, 2, 3]", "42"):
+        assert parse(non_object) is None
+
+
+def test_normalize_standard_format(tmp_path):
+    """A role/content record normalizes to a ``message``-typed event."""
+    payload = {"role": "user", "content": "How do I sort a list?"}
+    normalized = _make_watcher(tmp_path)[0].normalize_event(payload)
+    assert normalized is not None
+    assert normalized["role"] == "user"
+    assert normalized["content"] == "How do I sort a list?"
+    assert normalized["type"] == "message"
+
+
+def test_normalize_assistant(tmp_path):
+    """An assistant record keeps its role and content after normalization."""
+    payload = {"role": "assistant", "content": "Use the sorted() function."}
+    normalized = _make_watcher(tmp_path)[0].normalize_event(payload)
+    assert normalized is not None
+    assert normalized["role"] == "assistant"
+    assert normalized["content"] == "Use the sorted() function."
+
+
+def test_normalize_missing_content_filtered(tmp_path):
+    """Events whose content is empty or absent are dropped."""
+    normalize = _make_watcher(tmp_path)[0].normalize_event
+
+    # Present but empty content.
+    assert normalize({"role": "user", "content": ""}) is None
+
+    # No content key at all.
+    assert normalize({"role": "user"}) is None
+
+
+def test_normalize_invalid_role_filtered(tmp_path):
+    """System, tool and role-less events are all dropped."""
+    normalize = _make_watcher(tmp_path)[0].normalize_event
+    rejected = [
+        {"role": "system", "content": "System message"},
+        {"role": "tool", "content": "Tool output"},
+        {"content": "No role at all"},
+    ]
+    for payload in rejected:
+        assert normalize(payload) is None
+
+
+def test_normalize_session_id_mapping(tmp_path):
+    """Both ``sessionId`` and ``session_id`` populate the event's session_id."""
+    normalize = _make_watcher(tmp_path)[0].normalize_event
+
+    # camelCase spelling
+    camel = normalize({"role": "user", "content": "test", "sessionId": "session-abc-123"})
+    assert camel is not None
+    assert camel["session_id"] == "session-abc-123"
+
+    # snake_case spelling
+    snake = normalize({"role": "user", "content": "test", "session_id": "session-xyz-789"})
+    assert snake is not None
+    assert snake["session_id"] == "session-xyz-789"
+
+
+def test_normalize_workspace_directory_to_project_name(tmp_path):
+    """``workspaceDirectory`` is copied verbatim into ``project_name``."""
+    workspace = "/home/user/my-project"
+    payload = {"role": "user", "content": "test", "workspaceDirectory": workspace}
+
+    normalized = _make_watcher(tmp_path)[0].normalize_event(payload)
+
+    assert normalized is not None
+    # The full path is expected, not just its final component.
+    assert normalized["project_name"] == "/home/user/my-project"
+
+
+def test_normalize_timestamp(tmp_path):
+    """The raw ``timestamp`` string is carried over unchanged."""
+    payload = {"role": "user", "content": "test", "timestamp": "2024-01-15T10:30:00Z"}
+    normalized = _make_watcher(tmp_path)[0].normalize_event(payload)
+    assert normalized is not None
+    assert normalized["timestamp"] == "2024-01-15T10:30:00Z"
+
+
+def test_normalize_optional_fields_none(tmp_path):
+    """Optional fields missing from the input come back as None."""
+    normalized = _make_watcher(tmp_path)[0].normalize_event(
+        {"role": "user", "content": "minimal event"}
+    )
+    assert normalized is not None
+    for optional_key in ("timestamp", "session_id", "project_name"):
+        assert normalized[optional_key] is None
+
+
+def test_tool_name(tmp_path):
+    """The watcher reports ``continue_dev`` as its tool name."""
+    assert _make_watcher(tmp_path)[0].tool_name == "continue_dev"
+
+
+def test_process_file_integration(tmp_path):
+    """Two chat lines with batch_trigger_lines=2 arrive as one flushed batch."""
+    watcher, flushed, _ = _make_watcher(tmp_path, batch_trigger_lines=2)
+
+    log_path = tmp_path / "continue.json"
+    log_path.write_text(
+        '{"role": "user", "content": "Hello"}\n'
+        '{"role": "assistant", "content": "Hi there"}\n',
+        encoding="utf-8",
+    )
+
+    watcher._process_file(str(log_path))
+
+    # No explicit flush() here: the batch is expected from _process_file alone.
+    assert len(flushed) == 1
+    assert [event["role"] for event in flushed[0]] == ["user", "assistant"]
+    assert flushed[0][0]["tool_name"] == "continue_dev"
+
+
+def test_process_file_filters_non_chat(tmp_path):
+    """System messages and non-JSON lines are absent from the flushed batch."""
+    watcher, flushed, _ = _make_watcher(tmp_path, batch_trigger_lines=100)
+
+    log_path = tmp_path / "continue.json"
+    log_path.write_text(
+        '{"role": "user", "content": "Hello"}\n'
+        '{"role": "system", "content": "System prompt"}\n'
+        '{"role": "assistant", "content": "Hi"}\n'
+        'not json at all\n',
+        encoding="utf-8",
+    )
+
+    watcher._process_file(str(log_path))
+    watcher.flush()
+
+    # Only the user and assistant events remain, in file order.
+    assert len(flushed) == 1
+    assert [event["role"] for event in flushed[0]] == ["user", "assistant"]
diff --git a/tests/daemon/test_conversation_reconstructor.py b/tests/daemon/test_conversation_reconstructor.py
new file mode 100644
index 0000000000..b8c588855d
--- /dev/null
+++ b/tests/daemon/test_conversation_reconstructor.py
@@ -0,0 +1,58 @@
+"""Tests for ConversationReconstructor."""
+from openviking.daemon.conversation_reconstructor import ConversationReconstructor
+
+
+def test_reconstruct_simple_conversation():
+    """One user prompt followed by one assistant reply forms a single turn."""
+    question = {"role": "user", "content": "How to configure PostgreSQL?", "timestamp": "2026-06-15T10:00:00Z"}
+    answer = {"role": "assistant", "content": "Edit postgresql.conf", "timestamp": "2026-06-15T10:00:01Z"}
+
+    turns = ConversationReconstructor().reconstruct([question, answer])
+
+    assert len(turns) == 1
+    assert turns[0].user_prompt == "How to configure PostgreSQL?"
+    assert turns[0].assistant_response == "Edit postgresql.conf"
+
+
+def test_skip_orphaned_assistant():
+    """An assistant reply with no preceding user prompt yields no turn."""
+    orphan = {"role": "assistant", "content": "Orphan answer", "timestamp": "2026-06-15T10:00:00Z"}
+
+    turns = ConversationReconstructor().reconstruct([orphan])
+
+    assert len(turns) == 0
+
+
+def test_multiple_turns():
+    """Alternating user/assistant events pair up into consecutive turns."""
+    exchanges = [
+        {"role": "user", "content": "Q1", "timestamp": "2026-06-15T10:00:00Z"},
+        {"role": "assistant", "content": "A1", "timestamp": "2026-06-15T10:00:01Z"},
+        {"role": "user", "content": "Q2", "timestamp": "2026-06-15T10:00:02Z"},
+        {"role": "assistant", "content": "A2", "timestamp": "2026-06-15T10:00:03Z"},
+    ]
+    turns = ConversationReconstructor().reconstruct(exchanges)
+
+    assert len(turns) == 2
+    assert [turn.user_prompt for turn in turns] == ["Q1", "Q2"]
+
+
+def test_unpaired_user_prompt():
+    """A user prompt that never receives a reply yields no turn."""
+    unanswered = {"role": "user", "content": "No answer", "timestamp": "2026-06-15T10:00:00Z"}
+
+    turns = ConversationReconstructor().reconstruct([unanswered])
+
+    assert len(turns) == 0
+
+
+def test_preserves_metadata():
+    """The turn carries the user event's session, project and timestamp."""
+    question = {"role": "user", "content": "Q", "timestamp": "2026-06-15T10:00:00Z", "session_id": "s1", "project_name": "proj"}
+    answer = {"role": "assistant", "content": "A", "timestamp": "2026-06-15T10:00:01Z"}
+
+    turn = ConversationReconstructor().reconstruct([question, answer])[0]
+
+    assert turn.session_id == "s1"
+    assert turn.project_name == "proj"
+    assert turn.timestamp == "2026-06-15T10:00:00Z"
diff --git a/tests/daemon/test_cursor_manager.py b/tests/daemon/test_cursor_manager.py
new file mode 100644
index 0000000000..375df56d09
--- /dev/null
+++ b/tests/daemon/test_cursor_manager.py
@@ -0,0 +1,61 @@
+"""Tests for CursorManager."""
+import os
+import tempfile
+
+import pytest
+
+from openviking.daemon.cursor_manager import CursorManager
+
+
+@pytest.fixture
+def temp_db():
+    descriptor, db_path = tempfile.mkstemp(suffix=".db")
+    os.close(descriptor)  # mkstemp returns an open descriptor; only the path is used
+    yield db_path
+    if os.path.exists(db_path):  # tolerate the file already being gone
+        os.unlink(db_path)
+
+
+def test_default_cursor(temp_db):
+    """An unknown path yields a cursor at position 0 with read time 0.0."""
+    fresh = CursorManager(temp_db).get_cursor("/path/to/file.jsonl")
+    assert fresh.last_position == 0
+    assert fresh.last_read_time == 0.0
+
+
+def test_save_and_load_cursor(temp_db):
+    """A stored position is returned by get_cursor with a positive read time."""
+    log_path = "/path/to/file.jsonl"
+    store = CursorManager(temp_db)
+    store.update_cursor(log_path, 1024)
+    loaded = store.get_cursor(log_path)
+    assert loaded.last_position == 1024 and loaded.last_read_time > 0
+
+
+def test_persist_across_instances(temp_db):
+    """A position written by one manager is visible to a new one on the same DB."""
+    writer = CursorManager(temp_db)
+    writer.update_cursor("/path/to/file.jsonl", 2048)
+
+    reopened = CursorManager(temp_db)
+    assert reopened.get_cursor("/path/to/file.jsonl").last_position == 2048
+
+
+def test_get_all_cursors(temp_db):
+    """get_all_cursors() returns every stored cursor keyed by file path."""
+    expected = {"/path/file1.jsonl": 100, "/path/file2.jsonl": 200}
+    store = CursorManager(temp_db)
+    for log_path, offset in expected.items():
+        store.update_cursor(log_path, offset)
+    found = store.get_all_cursors()
+    assert len(found) == 2
+    assert all(found[path].last_position == offset for path, offset in expected.items())
+
+
+def test_update_existing_cursor(temp_db):
+    """A later update for the same path replaces the earlier position."""
+    store = CursorManager(temp_db)
+    for offset in (100, 500):
+        store.update_cursor("/path/file.jsonl", offset)
+
+    assert store.get_cursor("/path/file.jsonl").last_position == 500
diff --git a/tests/daemon/test_cursor_watcher.py b/tests/daemon/test_cursor_watcher.py
new file mode 100644
index 0000000000..e3982608f4
--- /dev/null
+++ b/tests/daemon/test_cursor_watcher.py
@@ -0,0 +1,187 @@
+"""Tests for CursorWatcher parsing and normalization logic."""
+import time
+from typing import Dict, Optional
+
+from openviking.daemon.watchers.cursor_watcher import CursorWatcher
+
+
+class FakeCursorManager:
+    """In-memory cursor store that also logs each (path, position) update."""
+    def __init__(self):
+        self.cursors = {}
+        self.updates = []
+
+    def get_cursor(self, file_path):
+        from openviking.daemon.models import FileCursor
+        if file_path in self.cursors:
+            return self.cursors[file_path]
+        return FileCursor(file_path=file_path)
+
+    def update_cursor(self, file_path, position):
+        from openviking.daemon.models import FileCursor
+        stamped = FileCursor(file_path=file_path, last_position=position, last_read_time=time.time())
+        self.cursors[file_path] = stamped
+        # Keep a chronological record so tests can assert on cursor writes.
+        self.updates.append((file_path, position))
+
+
+def _make_watcher(tmp_path, batch_trigger_lines=50, batch_trigger_seconds=300):
+    """Return (watcher, captured batches, fake cursor manager) rooted at tmp_path."""
+    captured, fake_cursors = [], FakeCursorManager()
+    cursor_watcher = CursorWatcher(
+        watch_dir=str(tmp_path),
+        cursor_manager=fake_cursors,
+        batch_callback=lambda events: captured.append(events),
+        batch_trigger_lines=batch_trigger_lines,
+        batch_trigger_seconds=batch_trigger_seconds,
+    )
+    return cursor_watcher, captured, fake_cursors
+
+
+def test_parse_line_valid_json(tmp_path):
+    """parse_line() returns the fields of a well-formed JSON object."""
+    watcher = _make_watcher(tmp_path)[0]
+    parsed = watcher.parse_line('{"type": "chat", "role": "user", "message": "Hello"}')
+    assert parsed is not None
+    assert parsed["role"] == "user"
+    assert parsed["message"] == "Hello"
+
+
+def test_parse_line_invalid_json(tmp_path):
+    """Malformed, empty and whitespace-only lines all parse to None."""
+    parse = _make_watcher(tmp_path)[0].parse_line
+    for bad_line in ("not valid json", "", " "):
+        assert parse(bad_line) is None
+
+
+def test_parse_line_non_dict(tmp_path):
+    """Valid JSON that is not an object (string, array, number) parses to None."""
+    parse = _make_watcher(tmp_path)[0].parse_line
+    for non_object in ('"just a string"', "[1, 2, 3]", "42"):
+        assert parse(non_object) is None
+
+
+def test_normalize_standard_chat_format(tmp_path):
+    """A chat record's ``message`` becomes ``content`` on a message event."""
+    payload = {"type": "chat", "role": "user", "message": "How do I sort a list?"}
+    normalized = _make_watcher(tmp_path)[0].normalize_event(payload)
+    assert normalized is not None
+    assert normalized["role"] == "user"
+    assert normalized["content"] == "How do I sort a list?"
+    assert normalized["type"] == "message"
+
+
+def test_normalize_human_role(tmp_path):
+    """The ``human`` role is normalized to ``user``."""
+    payload = {"role": "human", "content": "What is Python?"}
+    normalized = _make_watcher(tmp_path)[0].normalize_event(payload)
+    assert normalized is not None
+    assert normalized["role"] == "user"
+    assert normalized["content"] == "What is Python?"
+
+
+def test_normalize_human_turn_role(tmp_path):
+    """The ``human_turn`` role maps to ``user`` and ``text`` to ``content``."""
+    payload = {"role": "human_turn", "text": "Explain recursion"}
+    normalized = _make_watcher(tmp_path)[0].normalize_event(payload)
+    assert normalized is not None
+    assert normalized["role"] == "user"
+    assert normalized["content"] == "Explain recursion"
+
+
+def test_normalize_ai_response(tmp_path):
+    """The ``ai_response`` role maps to ``assistant`` and ``text`` to ``content``."""
+    payload = {"role": "ai_response", "text": "Recursion is when a function calls itself."}
+    normalized = _make_watcher(tmp_path)[0].normalize_event(payload)
+    assert normalized is not None
+    assert normalized["role"] == "assistant"
+    assert normalized["content"] == "Recursion is when a function calls itself."
+
+
+def test_normalize_assistant_role(tmp_path):
+    """An ``assistant`` record's ``message`` becomes the event content."""
+    payload = {"role": "assistant", "message": "Here is the answer."}
+    normalized = _make_watcher(tmp_path)[0].normalize_event(payload)
+    assert normalized is not None
+    assert normalized["role"] == "assistant"
+    assert normalized["content"] == "Here is the answer."
+
+
+def test_normalize_ai_role(tmp_path):
+    """The ``ai`` role is normalized to ``assistant``."""
+    payload = {"role": "ai", "content": "AI generated response"}
+    normalized = _make_watcher(tmp_path)[0].normalize_event(payload)
+    assert normalized is not None
+    assert normalized["role"] == "assistant"
+    assert normalized["content"] == "AI generated response"
+
+
+def test_normalize_bot_role(tmp_path):
+    """The ``bot`` role is normalized to ``assistant``."""
+    payload = {"role": "bot", "message": "Bot reply"}
+    normalized = _make_watcher(tmp_path)[0].normalize_event(payload)
+    assert normalized is not None
+    assert normalized["role"] == "assistant"
+    assert normalized["content"] == "Bot reply"
+
+
+def test_normalize_non_chat_event_filtered(tmp_path):
+    """System messages, role-less records and empty messages are dropped."""
+    normalize = _make_watcher(tmp_path)[0].normalize_event
+    rejected = [
+        # System message
+        {"role": "system", "content": "System prompt"},
+        # No role
+        {"type": "info", "content": "some log"},
+        # Empty content
+        {"role": "user", "message": ""},
+    ]
+    for payload in rejected:
+        assert normalize(payload) is None
+
+
+def test_normalize_timestamp_fields(tmp_path):
+    """Both ``timestamp`` and ``ts`` populate the event's timestamp."""
+    normalize = _make_watcher(tmp_path)[0].normalize_event
+
+    stamped = normalize({"role": "user", "message": "test", "timestamp": "2024-01-15T10:30:00Z"})
+    assert stamped is not None  # fail clearly instead of with a TypeError below
+    assert stamped["timestamp"] == "2024-01-15T10:30:00Z"
+    short_key = normalize({"role": "user", "message": "test", "ts": "2024-01-15T11:00:00Z"})
+    assert short_key is not None
+    assert short_key["timestamp"] == "2024-01-15T11:00:00Z"
+
+
+def test_normalize_session_id_mapping(tmp_path):
+    """``conversationId`` and ``session_id`` both map to the event's session_id."""
+    normalize = _make_watcher(tmp_path)[0].normalize_event
+    from_conversation = normalize({"role": "user", "message": "test", "conversationId": "conv-123"})
+    assert from_conversation is not None  # fail clearly instead of with a TypeError
+    assert from_conversation["session_id"] == "conv-123"
+    from_session = normalize({"role": "user", "message": "test", "session_id": "sess-456"})
+    assert from_session is not None
+    assert from_session["session_id"] == "sess-456"
+
+
+def test_tool_name(tmp_path):
+    """The watcher reports ``cursor`` as its tool name."""
+    assert _make_watcher(tmp_path)[0].tool_name == "cursor"
+
+
+def test_process_file_integration(tmp_path):
+    """Two chat lines with batch_trigger_lines=2 arrive as one flushed batch."""
+    watcher, flushed, _ = _make_watcher(tmp_path, batch_trigger_lines=2)
+
+    log_path = tmp_path / "cursor.log"
+    log_path.write_text(
+        '{"role": "user", "message": "Hello"}\n'
+        '{"role": "assistant", "message": "Hi there"}\n',
+        encoding="utf-8",
+    )
+
+    watcher._process_file(str(log_path))
+
+    # No explicit flush() here: the batch is expected from _process_file alone.
+    assert len(flushed) == 1
+    assert [event["role"] for event in flushed[0]] == ["user", "assistant"]
+    assert flushed[0][0]["tool_name"] == "cursor"
diff --git a/tests/daemon/test_deduplicator.py b/tests/daemon/test_deduplicator.py
new file mode 100644
index 0000000000..569c14a3fb
--- /dev/null
+++ b/tests/daemon/test_deduplicator.py
@@ -0,0 +1,42 @@
+"""Tests for KnowledgeDeduplicator."""
+from openviking.daemon.deduplicator import KnowledgeDeduplicator
+from openviking.daemon.models import ExtractedKnowledge
+
+
+def _make_knowledge(title: str, content: str) -> ExtractedKnowledge:
+    """Build an EXTRACTED ``memories`` record with the given title and content."""
+    fixed_fields = {
+        "status": "EXTRACTED",
+        "category": "memories",
+    }
+    return ExtractedKnowledge(title=title, content=content, **fixed_fields)
+
+
+def test_first_occurrence_not_duplicate():
+    """Knowledge seen for the first time is not reported as a duplicate."""
+    first_seen = _make_knowledge("Test", "Some unique content")
+    assert not KnowledgeDeduplicator().is_duplicate(first_seen)
+
+
+def test_same_content_is_duplicate():
+    """Identical content is a duplicate even when the titles differ."""
+    dedup = KnowledgeDeduplicator()
+    original, repeat = (_make_knowledge(title, "Same content") for title in ("Title A", "Title B"))
+    assert not dedup.is_duplicate(original)
+    assert dedup.is_duplicate(repeat)
+
+
+def test_different_content_not_duplicate():
+    """Entries with different content are each treated as new."""
+    dedup = KnowledgeDeduplicator()
+    entries = [_make_knowledge("A", "Content 1"), _make_knowledge("B", "Content 2")]
+    for entry in entries:
+        assert not dedup.is_duplicate(entry)
+
+
+def test_clear_resets_cache():
+    """clear() must forget content that is_duplicate() has already recorded."""
+    dedup, k = KnowledgeDeduplicator(), _make_knowledge("Test", "Content")
+    assert not dedup.is_duplicate(k) and dedup.is_duplicate(k)  # new, then cached
+    dedup.clear()
+    assert not dedup.is_duplicate(k)
diff --git a/tests/daemon/test_filters.py b/tests/daemon/test_filters.py
new file mode 100644
index 0000000000..52d0c07d50
--- /dev/null
+++ b/tests/daemon/test_filters.py
@@ -0,0 +1,53 @@
+"""Tests for LowValueFilter."""
+from openviking.daemon.filters import LowValueFilter
+
+
+def test_filter_short_content():
+    """Of one very short and one long message, exactly one passes the filter."""
+    candidates = [
+        {"content": "Short"},
+        {"content": "This is a longer meaningful conversation about architecture"},
+    ]
+    survivors = LowValueFilter().apply(candidates)
+    assert len(survivors) == 1
+
+
+def test_filter_noise_patterns():
+    """npm and git command lines are removed; the design discussion is kept."""
+    candidates = [
+        {"content": "npm install lodash --save"},
+        {"content": "git commit -m 'fix bug'"},
+        {"content": "Let's discuss the architecture design pattern for the new module"},
+    ]
+    survivors = LowValueFilter().apply(candidates)
+    assert len(survivors) == 1
+    assert "architecture" in survivors[0]["content"]
+
+
+def test_filter_pip_install():
+    """A message beginning with a pip install command is filtered out."""
+    candidates = [{"content": "pip install requests library for HTTP calls"}]
+
+    survivors = LowValueFilter().apply(candidates)
+
+    assert len(survivors) == 0
+
+
+def test_preserves_valid_content():
+    """Substantive decision and root-cause messages both pass the filter."""
+    candidates = [
+        {"content": "We decided to use PostgreSQL instead of MySQL for better JSON support"},
+        {"content": "The memory leak was caused by unclosed database connections"},
+    ]
+    survivors = LowValueFilter().apply(candidates)
+    assert len(survivors) == 2
+
+
+def test_empty_content_filtered():
+    """Empty and whitespace-only content is filtered out."""
+    candidates = [
+        {"content": ""},
+        {"content": " "},
+    ]
+    survivors = LowValueFilter().apply(candidates)
+    assert len(survivors) == 0
diff --git a/tests/daemon/test_generic_jsonl_watcher.py b/tests/daemon/test_generic_jsonl_watcher.py
new file mode 100644
index 0000000000..38707cba98
--- /dev/null
+++ b/tests/daemon/test_generic_jsonl_watcher.py
@@ -0,0 +1,156 @@
+"""Tests for GenericJSONLWatcher with default and custom field mappings."""
+import json
+import os
+import tempfile
+
+from openviking.daemon.watchers.generic_jsonl_watcher import GenericJSONLWatcher
+
+
+class FakeCursorManager:
+    """In-memory CursorManager stub: stores cursors and logs every update."""
+    def __init__(self):
+        from openviking.daemon.models import FileCursor
+        self._cursor_cls, self.cursors, self.updates = FileCursor, {}, []
+
+    def get_cursor(self, file_path):
+        return self.cursors.get(file_path, self._cursor_cls(file_path=file_path))
+
+    def update_cursor(self, file_path, position):
+        self.cursors[file_path] = self._cursor_cls(file_path=file_path, last_position=position)
+        self.updates.append((file_path, position))
+
+
+def _make_watcher(extra=None, batch_trigger_lines=50, batch_trigger_seconds=300):
+    """Return (watcher, captured batches, fake cursor manager) for one test."""
+    captured, fake_cursors = [], FakeCursorManager()
+    generic_watcher = GenericJSONLWatcher(
+        watch_dir=tempfile.gettempdir(),
+        cursor_manager=fake_cursors,
+        batch_callback=lambda lines: captured.append(lines),
+        batch_trigger_lines=batch_trigger_lines,
+        batch_trigger_seconds=batch_trigger_seconds,
+        extra=extra,
+    )
+    return generic_watcher, captured, fake_cursors
+
+
+# -- parse_line tests --
+
+def test_parse_valid_jsonl():
+    """parse_line() returns the fields of a well-formed JSON line."""
+    watcher = _make_watcher()[0]
+    parsed = watcher.parse_line('{"role": "user", "content": "Hello"}')
+    assert parsed is not None
+    assert parsed["role"] == "user"
+
+
+def test_parse_invalid_jsonl():
+    """parse_line() returns None for non-JSON text and for an empty line."""
+    parse = _make_watcher()[0].parse_line
+    assert all(parse(bad) is None for bad in ("not json", ""))
+
+
+# -- normalize_event with default mapping --
+
+def test_normalize_event_default_user():
+    """With the default mapping a user message keeps role, content and type."""
+    payload = {"role": "user", "content": "Hello", "type": "message"}
+    normalized = _make_watcher()[0].normalize_event(payload)
+    assert normalized is not None
+    assert normalized["role"] == "user"
+    assert normalized["content"] == "Hello"
+    assert normalized["type"] == "message"
+
+
+def test_normalize_event_default_assistant():
+    """With the default mapping an assistant message keeps its role."""
+    payload = {"role": "assistant", "content": "Response", "type": "message"}
+    normalized = _make_watcher()[0].normalize_event(payload)
+    assert normalized is not None
+    assert normalized["role"] == "assistant"
+
+
+def test_normalize_event_excludes_unknown_role():
+    """A role other than user/assistant (here ``system``) is dropped."""
+    payload = {"role": "system", "content": "System msg", "type": "message"}
+    assert _make_watcher()[0].normalize_event(payload) is None
+
+
+def test_normalize_event_excludes_empty_content():
+    """A message whose content is the empty string is dropped."""
+    payload = {"role": "user", "content": "", "type": "message"}
+    assert _make_watcher()[0].normalize_event(payload) is None
+
+
+def test_normalize_event_excludes_wrong_type():
+    """A record whose type is not ``message`` (here ``tool_call``) is dropped."""
+    payload = {"role": "user", "content": "Hello", "type": "tool_call"}
+    assert _make_watcher()[0].normalize_event(payload) is None
+
+
+def test_normalize_event_passes_when_no_type_field():
+    """A record without a type field is still accepted."""
+    typeless = {"role": "user", "content": "Hello"}
+    normalized = _make_watcher()[0].normalize_event(typeless)
+
+    assert normalized is not None
+    assert normalized["role"] == "user"
+
+
+# -- normalize_event with custom field mapping --
+
+def test_normalize_event_custom_role_field():
+    """A configured role field and user value are mapped to ``user``."""
+    mapping = {"role_field": "speaker", "user_role_value": "human", "assistant_role_value": "bot"}
+    payload = {"speaker": "human", "content": "Hi", "type": "message"}
+    normalized = _make_watcher(extra=mapping)[0].normalize_event(payload)
+    assert normalized is not None
+    assert normalized["role"] == "user"
+
+
+def test_normalize_event_custom_content_field():
+    """A configured content field supplies the event's ``content``."""
+    mapping = {"content_field": "text"}
+    payload = {"role": "user", "text": "Custom content", "type": "message"}
+    normalized = _make_watcher(extra=mapping)[0].normalize_event(payload)
+    assert normalized is not None
+    assert normalized["content"] == "Custom content"
+
+
+def test_normalize_event_custom_type_field():
+    """Only records whose custom type field equals the message value pass."""
+    mapping = {"type_field": "event_type", "message_type_value": "chat"}
+    normalize = _make_watcher(extra=mapping)[0].normalize_event
+
+    assert normalize({"role": "user", "content": "Hello", "event_type": "chat"}) is not None
+
+    # Any other value in the custom type field is rejected.
+    assert normalize({"role": "user", "content": "Hello", "event_type": "tool_use"}) is None
+
+
+# -- _process_file integration --
+
+def test_process_file():
+    """Two valid events with batch_trigger_lines=2 arrive as one tagged batch."""
+    watcher, flushed, fake_cursors = _make_watcher(batch_trigger_lines=2)
+    records = [
+        {"role": "user", "content": "Hello", "type": "message"},
+        {"role": "assistant", "content": "Hi", "type": "message"},
+        {"role": "system", "content": "ignored", "type": "message"},
+    ]
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False, encoding="utf-8") as handle:
+        handle.writelines(json.dumps(record) + "\n" for record in records)
+        log_path = handle.name
+    try:
+        watcher._process_file(log_path)
+        assert len(flushed) == 1
+        assert [event["role"] for event in flushed[0]] == ["user", "assistant"]
+        assert all(event["tool_name"] == "generic_jsonl" for event in flushed[0])
+        assert len(fake_cursors.updates) == 1
+    finally:
+        os.unlink(log_path)
+
+
+def test_tool_name():
+    """The watcher reports ``generic_jsonl`` as its tool name."""
+    assert _make_watcher()[0].tool_name == "generic_jsonl"
diff --git a/tests/daemon/test_integration.py b/tests/daemon/test_integration.py
new file mode 100644
index 0000000000..c4b497092e
--- /dev/null
+++ b/tests/daemon/test_integration.py
@@ -0,0 +1,189 @@
+"""Integration tests for OpenViking Active Daemon."""
+import json
+import os
+import tempfile
+from pathlib import Path
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+
+from openviking.daemon.cursor_manager import CursorManager
+from openviking.daemon.deduplicator import KnowledgeDeduplicator
+from openviking.daemon.etl_pipeline import BatchETLPipeline
+from openviking.daemon.filters import LowValueFilter
+from openviking.daemon.conversation_reconstructor import ConversationReconstructor
+from openviking.daemon.knowledge_router import KnowledgeRouter
+from openviking.daemon.storage_adapter import VikingStorageAdapter
+from openviking.daemon.models import ExtractedKnowledge
+
+
+@pytest.fixture
+def temp_watch_dir():
+    with tempfile.TemporaryDirectory() as scratch:
+        yield Path(scratch)  # the directory is removed when the context exits
+
+
+@pytest.fixture
+def temp_db():
+    """Yield the path of an empty temporary ``.db`` file, removed on teardown."""
+    handle, db_file = tempfile.mkstemp(suffix=".db")
+    os.close(handle)
+    yield db_file
+    Path(db_file).unlink(missing_ok=True)
+
+
+def test_jsonl_file_created_and_readable(temp_watch_dir):
+    """Round-trip two events through a JSONL file inside the watch directory."""
+    target = temp_watch_dir / "test_session.jsonl"
+
+    question = {
+        "timestamp": "2026-06-15T10:00:00Z",
+        "role": "user",
+        "content": "How to configure PostgreSQL for high availability?",
+        "type": "message",
+    }
+    answer = {
+        "timestamp": "2026-06-15T10:00:01Z",
+        "role": "assistant",
+        "content": "Edit postgresql.conf, set max_connections=100 and enable WAL archiving",
+        "type": "message",
+    }
+
+    # One JSON document per line, each terminated by a newline.
+    with open(target, "w", encoding="utf-8") as sink:
+        for event in (question, answer):
+            sink.write(json.dumps(event) + "\n")
+
+    assert target.exists()
+    raw_lines = target.read_text(encoding="utf-8").strip().split("\n")
+    assert len(raw_lines) == 2
+
+    # Each line parses back to JSON carrying the expected role.
+    first, second = (json.loads(raw) for raw in raw_lines)
+    assert first["role"] == "user"
+    assert second["role"] == "assistant"
+
+
+def test_cursor_tracks_jsonl_progress(temp_watch_dir, temp_db):
+    """A persisted cursor lets a second read pick up only the appended line."""
+    session_path = temp_watch_dir / "session.jsonl"
+    key = str(session_path)
+
+    def dump(role, content):
+        return json.dumps({"role": role, "content": content, "type": "message"}) + "\n"
+
+    # Seed the file with a single event.
+    with open(session_path, "w", encoding="utf-8") as out:
+        out.write(dump("user", "First question"))
+
+    manager = CursorManager(temp_db)
+    start = manager.get_cursor(key)
+    assert start.last_position == 0
+
+    # Consume everything from the stored offset and note where reading stopped.
+    with open(session_path, "r", encoding="utf-8") as src:
+        src.seek(start.last_position)
+        first_pass = src.readlines()
+        offset = src.tell()
+
+    assert len(first_pass) == 1
+    assert offset > 0
+
+    # The offset must come back unchanged from the cursor manager.
+    manager.update_cursor(key, offset)
+    resumed = manager.get_cursor(key)
+    assert resumed.last_position == offset
+
+    # Append a second event, then read from the resumed offset only.
+    with open(session_path, "a", encoding="utf-8") as out:
+        out.write(dump("assistant", "Answer"))
+    with open(session_path, "r", encoding="utf-8") as src:
+        src.seek(resumed.last_position)
+        second_pass = src.readlines()
+
+    assert [json.loads(raw)["role"] for raw in second_pass] == ["assistant"]
+
+
+def test_filter_and_reconstruct_pipeline():
+    """Low-value events are dropped and the remaining pair forms one turn."""
+    raw_events = [
+        {"role": "user", "content": "Hi", "type": "message", "timestamp": "2026-06-15T10:00:00Z"},  # Too short
+        {"role": "user", "content": "npm install lodash --save-dev", "type": "message", "timestamp": "2026-06-15T10:00:01Z"},  # Noise
+        {"role": "user", "content": "How should we handle database migrations in production?", "type": "message", "timestamp": "2026-06-15T10:00:02Z"},
+        {"role": "assistant", "content": "Use Alembic with versioned migration scripts and run them in a CI pipeline before deployment", "type": "message", "timestamp": "2026-06-15T10:00:03Z"},
+    ]
+
+    # Stage 1: filtering removes the "Hi" and "npm install" events.
+    low_value = LowValueFilter()
+    kept = low_value.apply(raw_events)
+    assert len(kept) == 2
+
+    # Stage 2: the surviving user/assistant pair becomes a single turn.
+    turns = ConversationReconstructor().reconstruct(kept)
+    assert len(turns) == 1
+    turn = turns[0]
+    assert "database migrations" in turn.user_prompt
+    assert "Alembic" in turn.assistant_response
+
+
+def test_knowledge_router_all_categories():
+    """Each category is routed to a URI containing its expected path fragment."""
+    common = {"status": "EXTRACTED", "content": "..."}
+    # (expected URI fragment, knowledge-specific fields)
+    cases = [
+        ("skills/claude_code", {"category": "skills", "title": "PG Config", "source_tool": "claude_code"}),
+        ("memories/projects/myapp", {"category": "memories", "title": "Arch", "project_name": "myapp"}),
+        ("memories/global", {"category": "memories", "title": "General"}),
+        ("resources/Redis", {"category": "resources", "title": "Redis", "entity_links": ["Redis"]}),
+    ]
+    router = KnowledgeRouter()
+
+    for fragment, fields in cases:
+        knowledge = ExtractedKnowledge(**common, **fields)
+        assert fragment in router.route(knowledge)
+
+
+def test_deduplicator_prevents_duplicates():
+    """Content seen before is flagged as duplicate even under a new title."""
+    def knowledge(title, content):
+        return ExtractedKnowledge(status="EXTRACTED", category="memories", title=title, content=content)
+
+    dedup = KnowledgeDeduplicator()
+    verdicts = [
+        bool(dedup.is_duplicate(knowledge(title, content)))
+        for title, content in [("A", "Same content here"), ("B", "Same content here"), ("C", "Different content")]
+    ]
+    assert verdicts == [False, True, False]
+
+
+def test_storage_adapter_formats_content():
+    """The formatted Markdown carries the title plus category-specific details."""
+    adapter = VikingStorageAdapter(MagicMock())
+
+    def render(**fields):
+        # Build a knowledge item and format it through the adapter under test.
+        return adapter._format_content(ExtractedKnowledge(status="EXTRACTED", **fields))
+
+    skill_md = render(
+        category="skills", title="Test Skill",
+        content="Do this thing", confidence=0.9,
+        actionable_steps=["Step 1", "Step 2"],
+    )
+    assert "Test Skill" in skill_md
+    assert "Step 1" in skill_md
+    assert "Step 2" in skill_md
+
+    memory_md = render(
+        category="memories", title="Decision",
+        content="We chose X", entity_links=["tag1"],
+        timestamp="2026-06-15T10:00:00Z",
+    )
+    assert "Decision" in memory_md
+    assert "tag1" in memory_md
+
+    resource_md = render(
+        category="resources", title="Guide",
+        content="Reference material", entity_links=["Docker"],
+    )
+    assert "Guide" in resource_md
+    assert "Docker" in resource_md
diff --git a/tests/daemon/test_knowledge_extractor.py b/tests/daemon/test_knowledge_extractor.py
new file mode 100644
index 0000000000..905cc440d3
--- /dev/null
+++ b/tests/daemon/test_knowledge_extractor.py
@@ -0,0 +1,117 @@
+"""Tests for KnowledgeExtractor."""
+import json
+from typing import Optional
+
+import pytest
+
+from openviking.daemon.knowledge_extractor import KnowledgeExtractor
+from openviking.daemon.models import ConversationTurn, ExtractedKnowledge
+
+
+class MockVLMConfig:
+    """Test double for a VLM config: every completion request yields one fixed string."""
+
+    def __init__(self, response_text: str):
+        self.response_text = response_text  # canned reply handed back verbatim
+
+    async def get_completion_async(self, prompt: str = "", **kwargs) -> str:
+        return self.response_text  # prompt and kwargs are accepted but ignored
+
+
+def _make_turn(user="How to configure PostgreSQL?", assistant="Edit postgresql.conf"):
+    """Build a ``ConversationTurn`` for the extractor tests."""
+    turn_fields = {"user_prompt": user, "assistant_response": assistant}
+    # Every generated turn carries the same hard-coded timestamp.
+    turn_fields["timestamp"] = "2026-06-15T10:00:00Z"
+    return ConversationTurn(**turn_fields)
+
+
+@pytest.mark.asyncio
+async def test_extract_valid_knowledge():
+    """A confident EXTRACTED payload comes back with status, category and confidence."""
+    payload = {
+        "status": "EXTRACTED",
+        "category": "skills",
+        "confidence": 0.9,
+        "title": "PostgreSQL Config",
+        "content": "Configure PostgreSQL by editing postgresql.conf",
+        "project_name": "my-project",
+        "entity_links": ["PostgreSQL"],
+        "actionable_steps": ["Edit postgresql.conf"],
+    }
+    vlm = MockVLMConfig(json.dumps(payload))
+    knowledge = await KnowledgeExtractor(vlm_config=vlm).extract(_make_turn())
+
+    assert knowledge is not None
+    observed = (knowledge.status, knowledge.category, knowledge.confidence)
+    assert observed == ("EXTRACTED", "skills", 0.9)
+
+
+@pytest.mark.asyncio
+async def test_extract_ignored_status():
+    """A payload the model marks as IGNORED yields no knowledge."""
+    payload = {
+        "status": "IGNORED",
+        "category": "memories",
+        "confidence": 0.3,
+        "title": "Typo fix",
+        "content": "Fixed typo",
+    }
+    vlm = MockVLMConfig(json.dumps(payload))
+    outcome = await KnowledgeExtractor(vlm_config=vlm).extract(_make_turn())
+    assert outcome is None
+
+
+@pytest.mark.asyncio
+async def test_extract_low_confidence():
+    """An EXTRACTED payload with confidence 0.4 yields no knowledge."""
+    payload = {
+        "status": "EXTRACTED",
+        "category": "memories",
+        "confidence": 0.4,
+        "title": "Low confidence",
+        "content": "Some content",
+    }
+    vlm = MockVLMConfig(json.dumps(payload))
+    outcome = await KnowledgeExtractor(vlm_config=vlm).extract(_make_turn())
+    assert outcome is None
+
+
+@pytest.mark.asyncio
+async def test_extract_invalid_json():
+    """A response that is not JSON yields no knowledge."""
+    vlm = MockVLMConfig("not valid json")
+    outcome = await KnowledgeExtractor(vlm_config=vlm).extract(_make_turn())
+    assert outcome is None
+
+
+@pytest.mark.asyncio
+async def test_extract_cleans_markdown():
+    """Code-fence markers in the payload content do not reach the result."""
+    payload = {
+        "status": "EXTRACTED",
+        "category": "memories",
+        "confidence": 0.8,
+        "title": "Test",
+        "content": "```python\nsome code\n```\nActual content here",
+    }
+    vlm = MockVLMConfig(json.dumps(payload))
+    knowledge = await KnowledgeExtractor(vlm_config=vlm).extract(_make_turn())
+    assert knowledge is not None
+    assert "```" not in knowledge.content
+
+
+@pytest.mark.asyncio
+async def test_title_truncated():
+    """A 100-character title comes back no longer than 50 characters."""
+    payload = {
+        "status": "EXTRACTED",
+        "category": "memories",
+        "confidence": 0.8,
+        "title": "A" * 100,
+        "content": "Content",
+    }
+    vlm = MockVLMConfig(json.dumps(payload))
+    knowledge = await KnowledgeExtractor(vlm_config=vlm).extract(_make_turn())
+    assert knowledge is not None
+    assert len(knowledge.title) <= 50
diff --git a/tests/daemon/test_knowledge_router.py b/tests/daemon/test_knowledge_router.py
new file mode 100644
index 0000000000..a045dc4a5a
--- /dev/null
+++ b/tests/daemon/test_knowledge_router.py
@@ -0,0 +1,73 @@
+"""Tests for KnowledgeRouter."""
+from openviking.daemon.knowledge_router import KnowledgeRouter
+from openviking.daemon.models import ExtractedKnowledge
+
+
+def _make(**kwargs) -> ExtractedKnowledge:
+    """Build an ``ExtractedKnowledge`` from test defaults, overridden by *kwargs*."""
+    # Fill in only the fields the caller did not supply.
+    kwargs.setdefault("status", "EXTRACTED")
+    kwargs.setdefault("category", "memories")
+    kwargs.setdefault("title", "Test Title")
+    kwargs.setdefault("content", "Some content")
+
+    return ExtractedKnowledge(**kwargs)
+
+
+def test_route_skills():
+    """Skills without a source tool are routed under ``viking://skills/general/``."""
+    knowledge = _make(category="skills", title="PostgreSQL Config")
+    uri = KnowledgeRouter().route(knowledge)
+    assert uri.startswith("viking://skills/general/")
+    assert uri.endswith(".md")
+
+
+def test_route_skills_with_source_tool():
+    """Skills with a source tool are routed under that tool's folder."""
+    knowledge = _make(category="skills", title="PostgreSQL Config", source_tool="claude_code")
+    uri = KnowledgeRouter().route(knowledge)
+    assert uri.startswith("viking://skills/claude_code/")
+    assert uri.endswith(".md")
+
+
+def test_route_memories_with_project():
+    """Project memories are routed to that project's ``decisions.md``."""
+    knowledge = _make(category="memories", title="Arch Decision", project_name="my-project")
+    uri = KnowledgeRouter().route(knowledge)
+    assert "my-project" in uri
+    assert uri.endswith("decisions.md")
+
+
+def test_route_memories_global():
+    """Memories without a project are routed under ``viking://memories/global/``."""
+    knowledge = _make(category="memories", title="Global Memory")
+    uri = KnowledgeRouter().route(knowledge)
+    assert uri.startswith("viking://memories/global/")
+
+
+def test_route_resources():
+    """A resource's entity link appears in its Markdown URI."""
+    knowledge = _make(category="resources", title="Redis Guide", entity_links=["Redis"])
+    uri = KnowledgeRouter().route(knowledge)
+    assert "Redis" in uri
+    assert uri.endswith(".md")
+
+
+def test_route_resources_no_tags():
+    """A resource without entity links gets a URI containing ``general``."""
+    knowledge = _make(category="resources", title="General", entity_links=[])
+    uri = KnowledgeRouter().route(knowledge)
+    assert "general" in uri
+
+
+def test_route_unknown_category():
+    """An unrecognized category is not routed: ``route`` returns ``None``."""
+    knowledge = _make(category="unknown")
+    uri = KnowledgeRouter().route(knowledge)
+    assert uri is None
+
+
+def test_sanitize_filename():
+    sanitize = KnowledgeRouter()._sanitize_filename
+    assert sanitize('file<>:name') == 'file___name'  # '<', '>' and ':' each become '_'
+    assert sanitize("a" * 100) == "a" * 50  # 100 characters are cut down to 50
diff --git a/tests/daemon/test_multi_watcher_integration.py b/tests/daemon/test_multi_watcher_integration.py
new file mode 100644
index 0000000000..8623a9e304
--- /dev/null
+++ b/tests/daemon/test_multi_watcher_integration.py
@@ -0,0 +1,330 @@
+"""
+Integration tests for multi-watcher daemon pipeline.
+Tests the full flow: multiple watchers -> normalized events -> ETL compatibility.
+"""
+import os
+import time
+import pytest
+from pathlib import Path
+
+from openviking.daemon.watchers.registry import create_watcher, list_available_watchers
+from openviking.daemon.watchers import BaseWatcher
+
+
+class FakeCursorManager:
+    """Dictionary-backed stand-in for the cursor manager used by the watchers."""
+
+    def __init__(self):
+        self.cursors = {}  # file path -> FileCursor
+
+    def get_cursor(self, file_path):
+        """Return the stored cursor, or a fresh ``FileCursor`` for an unknown path."""
+        from openviking.daemon.models import FileCursor
+        fresh = FileCursor(file_path=file_path)
+        return self.cursors.get(file_path, fresh)
+
+    def update_cursor(self, file_path, position):
+        """Store *position* for *file_path*, stamped with the current time."""
+        from openviking.daemon.models import FileCursor
+        stamp = time.time()
+        self.cursors[file_path] = FileCursor(file_path=file_path, last_position=position, last_read_time=stamp)
+
+
+# --- Registry Tests ---
+
+def test_all_watchers_registered():
+    """The registry lists every one of the five built-in watchers."""
+    builtin = {"claude_code", "generic_jsonl", "aider", "cursor", "continue_dev"}
+    missing = builtin - set(list_available_watchers())
+    assert not missing, f"Missing: {missing}"
+
+
+def test_create_all_watchers(tmp_path):
+    """Verify all registered watchers can be instantiated via factory."""
+    batches = []
+    cm = FakeCursorManager()
+
+    file_patterns = {
+        "claude_code": "*.jsonl",
+        "generic_jsonl": "*.jsonl",
+        "aider": ".aider.chat.history.md",
+        "cursor": "*.log",
+        "continue_dev": "*.json",
+    }
+
+    for tool_name in list_available_watchers():
+        watcher = create_watcher(
+            tool_name=tool_name,
+            watch_dir=str(tmp_path),  # isolated per-test dir instead of hard-coded /tmp/test
+            cursor_manager=cm,
+            batch_callback=batches.append,
+            file_pattern=file_patterns.get(tool_name, "*.jsonl"),
+        )
+        assert isinstance(watcher, BaseWatcher)
+        assert watcher.tool_name == tool_name
+
+
+def test_create_unknown_watcher_raises():
+    """The factory rejects an unregistered tool name with ``ValueError``."""
+    unused_cursor_manager = FakeCursorManager()
+    with pytest.raises(ValueError, match="Unknown watcher tool"):
+        create_watcher(
+            tool_name="nonexistent", watch_dir="/tmp",
+            cursor_manager=unused_cursor_manager,
+            batch_callback=lambda _events: None,
+        )
+
+
+# --- Multi-Watcher Normalization Tests ---
+
+def test_claude_code_events_have_tool_name(tmp_path):
+    """Every event emitted by the Claude Code watcher is tagged ``claude_code``."""
+    batches = []
+    watcher = create_watcher(
+        tool_name="claude_code",
+        watch_dir=str(tmp_path),
+        cursor_manager=FakeCursorManager(),
+        batch_callback=batches.append,
+        batch_trigger_lines=2,
+    )
+
+    session = tmp_path / "session.jsonl"
+    lines = [
+        '{"role": "user", "content": "Hello from CC"}\n',
+        '{"role": "assistant", "content": "Hi from CC"}\n',
+    ]
+    session.write_text("".join(lines))
+    watcher._process_file(str(session))
+
+    assert len(batches) == 1
+    assert all(event["tool_name"] == "claude_code" for event in batches[0])
+
+
+def test_generic_jsonl_custom_mapping(tmp_path):
+    """Custom role/content field names are mapped onto the normalized event keys."""
+    field_mapping = {
+        "role_field": "author",
+        "user_role_value": "human",
+        "assistant_role_value": "ai",
+        "content_field": "text",
+    }
+    batches = []
+    watcher = create_watcher(
+        tool_name="generic_jsonl",
+        watch_dir=str(tmp_path),
+        cursor_manager=FakeCursorManager(),
+        batch_callback=batches.append,
+        batch_trigger_lines=1,
+        extra=field_mapping,
+    )
+
+    source = tmp_path / "custom.jsonl"
+    source.write_text('{"author": "human", "text": "Custom format test"}\n')
+    watcher._process_file(str(source))
+
+    assert len(batches) == 1
+    event = batches[0][0]
+    assert (event["role"], event["content"]) == ("user", "Custom format test")
+    assert event["tool_name"] == "generic_jsonl"
+
+
+def test_aider_multiline_parsing(tmp_path):
+    """An Aider markdown history yields one user and one assistant event."""
+    batches = []
+    watcher = create_watcher(
+        tool_name="aider",
+        watch_dir=str(tmp_path),
+        cursor_manager=FakeCursorManager(),
+        batch_callback=batches.append,
+        batch_trigger_lines=100,
+    )
+
+    history = tmp_path / ".aider.chat.history.md"
+    transcript = (
+        "# aider chat started at 2026-01-15 10:30:00\n\n"
+        "> /my/project\n\n"
+        "#### user:\n"
+        "How do I use fastapi?\n\n"
+        "#### assistant:\n"
+        "FastAPI is great for building APIs.\n"
+    )
+    history.write_text(transcript)
+    watcher._process_file(str(history))
+    watcher.flush()
+
+    assert len(batches) == 1
+    assert len(batches[0]) == 2
+    user_event, assistant_event = batches[0]
+    assert user_event["role"] == "user"
+    assert user_event["content"] == "How do I use fastapi?"
+    assert user_event["tool_name"] == "aider"
+    assert assistant_event["role"] == "assistant"
+
+
+def test_cursor_log_parsing(tmp_path):
+    """A Cursor chat log line is normalized to role, content and tool_name."""
+    batches = []
+    watcher = create_watcher(
+        tool_name="cursor",
+        watch_dir=str(tmp_path),
+        cursor_manager=FakeCursorManager(),
+        batch_callback=batches.append,
+        file_pattern="*.log",
+        batch_trigger_lines=1,
+    )
+
+    log_file = tmp_path / "cursor.log"
+    log_file.write_text(
+        '{"type": "chat", "role": "user", "message": "Fix this bug"}\n'
+    )
+    watcher._process_file(str(log_file))
+
+    assert len(batches) == 1
+    event = batches[0][0]
+    expected = {"role": "user", "content": "Fix this bug", "tool_name": "cursor"}
+    for key, value in expected.items():
+        assert event[key] == value
+
+
+def test_continue_dev_log_parsing(tmp_path):
+    """A Continue.dev line keeps its role and exposes ``sessionId`` as ``session_id``."""
+    batches = []
+    watcher = create_watcher(
+        tool_name="continue_dev",
+        watch_dir=str(tmp_path),
+        cursor_manager=FakeCursorManager(),
+        batch_callback=batches.append,
+        file_pattern="*.json",
+        batch_trigger_lines=1,
+    )
+
+    session_log = tmp_path / "continue.json"
+    session_log.write_text(
+        '{"role": "user", "content": "Explain this code", "sessionId": "abc123"}\n'
+    )
+    watcher._process_file(str(session_log))
+
+    assert len(batches) == 1
+    event = batches[0][0]
+    assert event["role"] == "user"
+    assert event["tool_name"] == "continue_dev"
+    assert event["session_id"] == "abc123"
+
+
+# --- Cross-Watcher ETL Compatibility ---
+
+def test_normalized_events_compatible_with_reconstructor():
+    """Events tagged by different watchers reconstruct into one turn per exchange."""
+    from openviking.daemon.conversation_reconstructor import ConversationReconstructor
+
+    # Two complete exchanges: one tagged claude_code, one tagged aider.
+    mixed_events = [
+        {"role": "user", "content": "Question from CC", "tool_name": "claude_code",
+         "timestamp": "2026-01-15T10:00:00Z", "session_id": "s1", "project_name": "proj"},
+        {"role": "assistant", "content": "Answer from CC", "tool_name": "claude_code",
+         "timestamp": "2026-01-15T10:00:01Z", "session_id": "s1", "project_name": "proj"},
+        {"role": "user", "content": "Question from Aider", "tool_name": "aider",
+         "timestamp": "2026-01-15T10:00:02Z"},
+        {"role": "assistant", "content": "Answer from Aider", "tool_name": "aider",
+         "timestamp": "2026-01-15T10:00:03Z"},
+    ]
+
+    turns = ConversationReconstructor().reconstruct(mixed_events)
+
+    # Comparing the whole list checks both the turn count and the order.
+    prompts = [turn.user_prompt for turn in turns]
+    assert prompts == ["Question from CC", "Question from Aider"]
+
+
+def test_normalized_events_compatible_with_filter():
+    """The low-value filter accepts multi-tool events and drops the npm noise."""
+    from openviking.daemon.filters import LowValueFilter
+
+    candidates = [
+        {"role": "user", "content": "A meaningful question about architecture",
+         "tool_name": "claude_code"},
+        {"role": "user", "content": "npm install express",
+         "tool_name": "cursor"},
+        {"role": "assistant", "content": "Here is a detailed explanation of the design pattern",
+         "tool_name": "aider"},
+    ]
+
+    survivors = LowValueFilter().apply(candidates)
+
+    assert len(survivors) == 2
+    contents = [event["content"] for event in survivors]
+    assert not any("npm install" in text for text in contents)
+
+
+def test_source_tool_propagated_through_pipeline():
+    """The events' ``tool_name`` surfaces as ``source_tool`` on the turn."""
+    from openviking.daemon.conversation_reconstructor import ConversationReconstructor
+
+    exchange = [
+        {"role": "user", "content": "How to use FastAPI?", "tool_name": "cursor",
+         "timestamp": "2026-01-15T10:00:00Z"},
+        {"role": "assistant", "content": "Install FastAPI with pip...", "tool_name": "cursor",
+         "timestamp": "2026-01-15T10:00:01Z"},
+    ]
+
+    turns = ConversationReconstructor().reconstruct(exchange)
+
+    # A single user/assistant exchange must yield exactly one turn.
+    assert len(turns) == 1
+    assert turns[0].source_tool == "cursor"
+
+
+def test_multi_watcher_config_effective_watchers():
+    """``get_effective_watchers()`` covers explicit, legacy and default configs."""
+    from openviking.server.config import WatcherConfig, DaemonConfig
+
+    def tool_names(config):
+        """Return the tool names of the watchers the config resolves to."""
+        return [w.tool_name for w in config.get_effective_watchers()]
+
+    # Explicit list: the disabled cursor watcher is dropped, order is kept.
+    explicit = DaemonConfig(
+        enabled=True,
+        watchers=[
+            WatcherConfig(tool_name="claude_code", watch_dir="/a"),
+            WatcherConfig(tool_name="aider", watch_dir="/b"),
+            WatcherConfig(tool_name="cursor", watch_dir="/c", enabled=False),
+        ],
+    )
+    names = tool_names(explicit)
+    assert len(names) == 2  # disabled watcher filtered out
+    assert names == ["claude_code", "aider"]
+
+    # Backward compat: watch_dir only resolves to a single claude_code watcher.
+    legacy = DaemonConfig(enabled=True, watch_dir="~/.claude/projects")
+    legacy_names = tool_names(legacy)
+    assert legacy_names == ["claude_code"]
+
+    # Default fallback: no watcher settings at all.
+    fallback = DaemonConfig(enabled=True)
+    assert tool_names(fallback) == ["claude_code"]
+
+
+def test_knowledge_router_uses_source_tool():
+    """The routed URI reflects ``source_tool`` and falls back to ``general``."""
+    from openviking.daemon.knowledge_router import KnowledgeRouter
+    from openviking.daemon.models import ExtractedKnowledge
+
+    router = KnowledgeRouter()
+    shared = {
+        "status": "EXTRACTED",
+        "category": "skills",
+        "content": "...",
+        "confidence": 0.9,
+    }
+
+    # With source_tool: that tool, not claude_code, appears in the URI.
+    from_cursor = ExtractedKnowledge(title="FastAPI Tips", source_tool="cursor", **shared)
+    cursor_uri = router.route(from_cursor)
+    assert "cursor" in cursor_uri
+    assert "claude_code" not in cursor_uri
+
+    # Without source_tool the URI contains the "general" fallback.
+    anonymous = ExtractedKnowledge(title="Python Tips", **shared)
+    fallback_uri = router.route(anonymous)
+    assert "general" in fallback_uri
diff --git a/tests/daemon/test_registry.py b/tests/daemon/test_registry.py
new file mode 100644
index 0000000000..50bd2f77bc
--- /dev/null
+++ b/tests/daemon/test_registry.py
@@ -0,0 +1,31 @@
+"""Tests for watcher registry."""
+import pytest
+from openviking.daemon.watchers.registry import (
+ create_watcher,
+ list_available_watchers,
+ _WATCHER_REGISTRY,
+ register_watcher,
+)
+
+
+def test_list_available_includes_claude_code():
+    assert {"claude_code"} <= set(list_available_watchers())  # claude_code must be listed
+
+
+def test_create_watcher_unknown_raises():
+    """Creating a watcher for an unregistered tool name raises ``ValueError``."""
+    with pytest.raises(ValueError, match="Unknown watcher tool"):
+        create_watcher("nonexistent_tool", watch_dir="/tmp", cursor_manager=None, batch_callback=lambda _: None)
+
+
+def test_register_watcher_decorator():
+    """``@register_watcher`` adds an entry under the given tool name."""
+    try:
+        @register_watcher("test_tool_xyz")
+        class Decorated:
+            pass
+
+        assert "test_tool_xyz" in _WATCHER_REGISTRY
+    finally:
+        # Always undo the registration so a failure cannot leak into other tests.
+        _WATCHER_REGISTRY.pop("test_tool_xyz", None)
diff --git a/tests/daemon/test_service_multi.py b/tests/daemon/test_service_multi.py
new file mode 100644
index 0000000000..60fe888eff
--- /dev/null
+++ b/tests/daemon/test_service_multi.py
@@ -0,0 +1,31 @@
+"""Tests for multi-watcher DaemonService."""
+import pytest
+from unittest.mock import MagicMock
+from openviking.server.config import WatcherConfig
+
+
+def test_service_stores_watcher_configs(tmp_path):
+    """DaemonService should accept and store watcher_configs."""
+    from openviking.daemon.service import DaemonService
+    configs = [
+        WatcherConfig(tool_name="claude_code", watch_dir=str(tmp_path / "cc")),
+        WatcherConfig(tool_name="aider", watch_dir=str(tmp_path / "aider")),
+    ]
+    svc = DaemonService(
+        resource_service=MagicMock(),
+        watcher_configs=configs,
+        db_path=str(tmp_path / "test.db"),  # per-test DB instead of shared /tmp/test.db
+    )
+    assert len(svc._watcher_configs) == 2
+
+
+def test_service_backward_compat_single_dir(tmp_path):
+    """DaemonService should create single claude_code config from watch_dir."""
+    from openviking.daemon.service import DaemonService
+    svc = DaemonService(
+        resource_service=MagicMock(),
+        watch_dir=str(tmp_path / "cc"),
+        db_path=str(tmp_path / "test.db"),  # per-test DB instead of shared /tmp/test.db
+    )
+    assert len(svc._watcher_configs) == 1
+    assert svc._watcher_configs[0].tool_name == "claude_code"
diff --git a/web-studio/src/i18n/locales/en.ts b/web-studio/src/i18n/locales/en.ts
index 7326f436a8..f824efa947 100644
--- a/web-studio/src/i18n/locales/en.ts
+++ b/web-studio/src/i18n/locales/en.ts
@@ -301,6 +301,21 @@ const en = {
usageDisabled:
'Usage/Audit is not initialized, so live usage stats are unavailable.',
},
+ daemon: {
+ status: 'Active Daemon',
+ enabled: 'Enabled',
+ disabled: 'Disabled',
+ running: 'Running',
+ stopped: 'Stopped',
+ watchDir: 'Watch Directory',
+ batchLines: 'Batch Lines',
+ batchSeconds: 'Batch Seconds',
+ cursorCount: 'Tracked Files',
+ lastFlush: 'Last Flush',
+ watcherCount: 'watcher(s) active',
+ activeWatchers: 'Active Watchers',
+ availableTools: 'Available Tools',
+ },
operations: {
page: {
placeholder: 'Operations dashboard is under construction.',
diff --git a/web-studio/src/i18n/locales/zh-CN.ts b/web-studio/src/i18n/locales/zh-CN.ts
index 031b077cb0..f056f9da53 100644
--- a/web-studio/src/i18n/locales/zh-CN.ts
+++ b/web-studio/src/i18n/locales/zh-CN.ts
@@ -298,6 +298,21 @@ const zhCN = {
},
usageDisabled: 'Usage/Audit 未初始化,暂无实时统计。',
},
+ daemon: {
+ status: '主动守护进程',
+ enabled: '已启用',
+ disabled: '已禁用',
+ running: '运行中',
+ stopped: '已停止',
+ watchDir: '监听目录',
+ batchLines: '批处理行数',
+ batchSeconds: '批处理间隔(秒)',
+ cursorCount: '追踪文件数',
+ lastFlush: '上次刷新',
+ watcherCount: '个监听器运行中',
+ activeWatchers: '活跃监听器',
+ availableTools: '可用工具',
+ },
operations: {
page: {
placeholder: '运维面板能力尚未接入。',
diff --git a/web-studio/src/routes/home/-components/daemon-status-card.tsx b/web-studio/src/routes/home/-components/daemon-status-card.tsx
new file mode 100644
index 0000000000..a7dfe7984f
--- /dev/null
+++ b/web-studio/src/routes/home/-components/daemon-status-card.tsx
@@ -0,0 +1,186 @@
+import { useQuery } from '@tanstack/react-query';
+import { Activity, Clock, Database, FileText, Monitor, ToggleLeft, ToggleRight } from 'lucide-react';
+import { useTranslation } from 'react-i18next';
+import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card';
+import { Badge } from '@/components/ui/badge';
+import { Skeleton } from '@/components/ui/skeleton';
+import { ovClient } from '@/lib/ov-client/client';
+
+interface WatcherStatus { // one element of DaemonStatus.watchers
+  tool_name: string;
+  watch_dir: string | null; // the card shows only the last path segment
+  file_pattern: string | null;
+  enabled: boolean;
+  running: boolean;
+  cursor_count: number;
+  batch_trigger_lines: number | null; // rendered only when truthy
+  batch_trigger_seconds: number | null; // rendered only when truthy, suffixed with "s"
+}
+
+interface DaemonStatus { // shape the /api/v1/daemon/status response body is cast to
+  enabled: boolean;
+  running: boolean;
+  watchers: WatcherStatus[];
+  available_tools: string[];
+  db_path: string | null;
+}
+
+async function fetchDaemonStatus(): Promise<DaemonStatus> {
+  const response = await ovClient.instance.get('/api/v1/daemon/status'); // GET the daemon status
+  return response.data as DaemonStatus; // body is trusted to match DaemonStatus (no runtime check)
+}
+
+function ToolBadge({ toolName }: { toolName: string }) {
+ const colors: Record = {
+ claude_code: 'bg-orange-500/10 text-orange-600 dark:text-orange-400 border-orange-200 dark:border-orange-800',
+ generic_jsonl: 'bg-blue-500/10 text-blue-600 dark:text-blue-400 border-blue-200 dark:border-blue-800',
+ aider: 'bg-green-500/10 text-green-600 dark:text-green-400 border-green-200 dark:border-green-800',
+ cursor: 'bg-purple-500/10 text-purple-600 dark:text-purple-400 border-purple-200 dark:border-purple-800',
+ continue_dev: 'bg-cyan-500/10 text-cyan-600 dark:text-cyan-400 border-cyan-200 dark:border-cyan-800',
+ };
+ return (
+
+ {toolName}
+
+ );
+}
+
+export function DaemonStatusCard() {
+ const { t } = useTranslation();
+
+ const { data, isLoading, error } = useQuery({
+ queryKey: ['daemon-status'],
+ queryFn: fetchDaemonStatus,
+ refetchInterval: 30000,
+ });
+
+ if (isLoading) {
+ return (
+
+
+
+
+
+
+
+
+
+
+
+
+ );
+ }
+
+ if (error || !data) {
+ return (
+
+
+
+
+ {t('daemon.status')}
+
+
+
+ Failed to load daemon status
+
+
+ );
+ }
+
+ const statusColor = data.enabled && data.running
+ ? 'bg-green-500'
+ : data.enabled
+ ? 'bg-yellow-500'
+ : 'bg-gray-400';
+
+ const statusLabel = data.enabled && data.running
+ ? t('daemon.running')
+ : data.enabled
+ ? t('daemon.stopped')
+ : t('daemon.disabled');
+
+ return (
+
+
+
+
+
+ {t('daemon.status')}
+
+
+ {statusLabel}
+
+
+
+
+ {/* Global Status */}
+
+
+ {data.enabled ? : }
+ {t('daemon.enabled')}
+
+
+ {data.watchers.length} {t('daemon.watcherCount')}
+
+
+
+ {/* Watcher List */}
+ {data.watchers.length > 0 && (
+
+
+ {t('daemon.activeWatchers')}
+
+ {data.watchers.map((w, i) => (
+
+
+
+ {w.watch_dir && (
+
+ {w.watch_dir.split(/[\\/]/).pop() || w.watch_dir}
+
+ )}
+
+
+
+
+ {w.cursor_count}
+
+ {w.batch_trigger_lines && (
+
+
+ {w.batch_trigger_lines}
+
+ )}
+ {w.batch_trigger_seconds && (
+
+
+ {w.batch_trigger_seconds}s
+
+ )}
+
+
+ ))}
+
+ )}
+
+ {/* Available Tools */}
+ {data.available_tools.length > 0 && (
+
+
+
+ {t('daemon.availableTools')}
+
+
+ {data.available_tools.map((tool) => (
+
+ ))}
+
+
+ )}
+
+
+ );
+}
diff --git a/web-studio/src/routes/home/route.tsx b/web-studio/src/routes/home/route.tsx
index a99aefa00d..fae60d7e7d 100644
--- a/web-studio/src/routes/home/route.tsx
+++ b/web-studio/src/routes/home/route.tsx
@@ -3,6 +3,7 @@ import { useQuery } from '@tanstack/react-query'
import { createFileRoute } from '@tanstack/react-router'
import { ContextCommitsPanel } from './-components/context-commits-panel'
+import { DaemonStatusCard } from './-components/daemon-status-card'
import {
ContextDataPanel,
TodayRetrievalsPanel,
@@ -90,6 +91,8 @@ function HomePage() {
/>
+
+
Date: Fri, 19 Jun 2026 16:39:28 +0800
Subject: [PATCH 2/6] fix(daemon): fix ETL pipeline not flushing events and
invalid URI routes
Root cause: BatchBuffer.created_at was never set in add_line(), so the
time-based trigger never fired and events stayed buffered indefinitely.
Additional fixes:
- Thread-safe enqueue from watchdog thread via loop.call_soon_threadsafe()
- _flush_buffer() now calls callback before clearing buffer (prevents data loss)
- KnowledgeRouter uses valid viking://resources/ scope (skills/memories were invalid)
- Non-ASCII titles sanitized via sha256 hash to produce valid URI paths
- VLM extraction concurrency limited to 2 via semaphore
- ClaudeCodeWatcher rewrite: handle nested message.content (text blocks, tool_use)
- Project name derived from file path via _post_normalize hook
- All exception handlers now include exc_info=True for stack traces
- 148 tests passing
---
docs/daemon/Watcher.md | 32 ++
openviking/daemon/etl_pipeline.py | 12 +-
openviking/daemon/knowledge_router.py | 24 +-
openviking/daemon/models.py | 3 +
openviking/daemon/service.py | 20 +-
.../daemon/watchers/base_file_watcher.py | 23 +-
.../daemon/watchers/claude_code_watcher.py | 79 ++++-
tests/daemon/test_claude_code_watcher.py | 295 ++++++++++++++++--
tests/daemon/test_integration.py | 2 +-
tests/daemon/test_knowledge_router.py | 14 +-
.../daemon/test_multi_watcher_integration.py | 4 +-
11 files changed, 448 insertions(+), 60 deletions(-)
create mode 100644 docs/daemon/Watcher.md
diff --git a/docs/daemon/Watcher.md b/docs/daemon/Watcher.md
new file mode 100644
index 0000000000..5b672d74f2
--- /dev/null
+++ b/docs/daemon/Watcher.md
@@ -0,0 +1,32 @@
+# Daemon Watcher 排查计划
+
+## 状态:148 tests pass,服务器在 1966 端口运行中,daemon 已处理 3 个文件
+
+## 待完成
+
+### 1. 确认端到端写入
+目标:验证知识是否真正写入 `viking://resources/...`
+
+方法:
+- 服务器日志当前输出到后台进程 stdout,无法查看。需要改为写文件:在 `ov.conf` 中加 `"log": {"level": "INFO", "output": "D:\\Develop\\ov-test\\server.log"}`
+- 杀掉当前服务器,清 cursor DB,重启
+- touch JSONL 触发处理,等 ETL 跑完
+- 查看 `server.log` 中的写入日志(成功/失败)
+
+预期日志链路:
+```
+[claude_code] Flushing N events
+_enqueue_batch: received N events
+ETL loop: dequeued N events
+Processing batch with N events → After filtering: X → Reconstructed Y turns → Extracted Z items
+Knowledge ingested: viking://resources/... ← 关键确认点
+```
+
+### 2. 根据结果处理
+- 写入成功 → 提交 commit 和 PR
+- 写入失败 → 根据日志修 `storage_adapter.py`(可能是 `resource_service.add_resource()` 调用参数或 URI 格式问题)
+
+### 3. 提交 PR
+- 分支基于 `upstream/main`
+- 只包含 daemon 相关文件变更
+- PR 目标:`volcengine/OpenViking:main`
diff --git a/openviking/daemon/etl_pipeline.py b/openviking/daemon/etl_pipeline.py
index 35a47315f8..b0f35d58d1 100644
--- a/openviking/daemon/etl_pipeline.py
+++ b/openviking/daemon/etl_pipeline.py
@@ -46,15 +46,21 @@ async def process_batch(self, events: List[Dict]) -> List[ExtractedKnowledge]:
if not turns:
return []
- # Step 3: Extract knowledge in parallel
- tasks = [self.extractor.extract(turn) for turn in turns]
+ # Step 3: Extract knowledge with limited concurrency (2 parallel VLM calls max)
+ sem = asyncio.Semaphore(2)
+
+ async def _guarded_extract(turn):
+ async with sem:
+ return await self.extractor.extract(turn)
+
+ tasks = [_guarded_extract(turn) for turn in turns]
results = await asyncio.gather(*tasks, return_exceptions=True)
# Step 4: Filter errors and deduplicate
extracted: List[ExtractedKnowledge] = []
for result in results:
if isinstance(result, Exception):
- logger.error("Extraction failed: %s", result)
+ logger.error("Extraction failed: %s", result, exc_info=result)
continue
if result is not None and not self.deduplicator.is_duplicate(result):
diff --git a/openviking/daemon/knowledge_router.py b/openviking/daemon/knowledge_router.py
index cd043d905a..e2adcb112f 100644
--- a/openviking/daemon/knowledge_router.py
+++ b/openviking/daemon/knowledge_router.py
@@ -1,6 +1,7 @@
"""
Route extracted knowledge to appropriate viking:// URIs based on category and project.
"""
+import hashlib
import re
from typing import Optional
@@ -17,10 +18,10 @@ def route(self, knowledge: ExtractedKnowledge) -> Optional[str]:
"""
Determine the target URI for a knowledge item.
- Routing rules:
- - skills -> viking://skills//.md
- - memories (with project) -> viking://memories/projects//decisions.md
- - memories (no project) -> viking://memories/global/.md
+ Routing rules (all under resources/ scope):
+ - skills -> viking://resources/skills/<source>/<title>.md
+ - memories (with project) -> viking://resources/memories/<project>/decisions.md
+ - memories (no project) -> viking://resources/memories/global/<title>.md
- resources -> viking://resources//.md
"""
category = knowledge.category
@@ -30,14 +31,14 @@ def route(self, knowledge: ExtractedKnowledge) -> Optional[str]:
if category == "skills":
safe_source = self._sanitize_filename(source)
- return f"viking://skills/{safe_source}/{title}.md"
+ return f"viking://resources/skills/{safe_source}/{title}.md"
elif category == "memories":
if project_name:
safe_project = self._sanitize_filename(project_name)
- return f"viking://memories/projects/{safe_project}/decisions.md"
+ return f"viking://resources/memories/{safe_project}/decisions.md"
else:
- return f"viking://memories/global/{title}.md"
+ return f"viking://resources/memories/global/{title}.md"
elif category == "resources":
entity_links = knowledge.entity_links
@@ -50,7 +51,12 @@ def route(self, knowledge: ExtractedKnowledge) -> Optional[str]:
@staticmethod
def _sanitize_filename(name: str) -> str:
- """Sanitize a string for use as a filename."""
+ """Sanitize a string for use as a filename. Non-ASCII names are replaced with a short hash."""
sanitized = re.sub(r'[<>:"/\\|?*]', '_', name)
sanitized = sanitized.strip().replace(' ', '_')
- return sanitized[:50]
+ sanitized = sanitized[:50]
+ # If any non-ASCII characters remain, use a truncated hash instead
+ if not sanitized.isascii():
+ name_hash = hashlib.sha256(name.encode("utf-8")).hexdigest()[:16]
+ sanitized = name_hash
+ return sanitized
diff --git a/openviking/daemon/models.py b/openviking/daemon/models.py
index f409762b53..ea88df8953 100644
--- a/openviking/daemon/models.py
+++ b/openviking/daemon/models.py
@@ -1,6 +1,7 @@
"""
Data models for OpenViking Active Daemon.
"""
+import time
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
@@ -36,6 +37,8 @@ class BatchBuffer:
created_at: float = 0.0
def add_line(self, line: Dict[str, Any], byte_size: int):
+ if self.is_empty():
+ self.created_at = time.time()
self.lines.append(line)
self.byte_count += byte_size
diff --git a/openviking/daemon/service.py b/openviking/daemon/service.py
index 8ca1c1e2d1..bd6bc8bebe 100644
--- a/openviking/daemon/service.py
+++ b/openviking/daemon/service.py
@@ -66,10 +66,12 @@ def __init__(
self._running = False
self._etl_task: Optional[asyncio.Task] = None
self._batch_queue: asyncio.Queue = asyncio.Queue()
+ self._loop: Optional[asyncio.AbstractEventLoop] = None
async def start(self):
"""Start the Daemon service with all configured watchers."""
logger.info("Starting OpenViking Active Daemon...")
+ self._loop = asyncio.get_running_loop()
self.cursor_manager = CursorManager(self.db_path)
self.etl_pipeline = BatchETLPipeline()
@@ -101,7 +103,7 @@ async def start(self):
self.watchers.append(watcher)
logger.info("Watcher started: %s -> %s", wc.tool_name, watch_dir)
except Exception as e:
- logger.warning("Failed to start watcher %s: %s", wc.tool_name, e)
+ logger.warning("Failed to start watcher %s: %s", wc.tool_name, e, exc_info=True)
self._running = True
logger.info("Daemon started with %d watcher(s)", len(self.watchers))
@@ -128,11 +130,16 @@ async def stop(self):
logger.info("Daemon stopped")
def _enqueue_batch(self, events):
- """Sync callback from watcher thread - puts events onto async queue."""
+ """Sync callback from watcher thread - thread-safe enqueue onto async queue."""
try:
- self._batch_queue.put_nowait(events)
+ logger.info("_enqueue_batch: received %d events, queue size before: %d",
+ len(events), self._batch_queue.qsize())
+ if self._loop is not None and self._loop.is_running():
+ self._loop.call_soon_threadsafe(self._batch_queue.put_nowait, events)
+ else:
+ self._batch_queue.put_nowait(events)
except Exception as e:
- logger.error("Failed to enqueue batch: %s", e)
+ logger.error("Failed to enqueue batch: %s", e, exc_info=True)
async def _etl_loop(self):
"""Background loop that processes batches from the queue."""
@@ -149,6 +156,9 @@ async def _etl_loop(self):
if events is None:
break
+ logger.info("ETL loop: dequeued %d events, queue size after: %d",
+ len(events), self._batch_queue.qsize())
+
try:
extracted = await self.etl_pipeline.process_batch(events)
if not extracted:
@@ -172,7 +182,7 @@ async def _etl_loop(self):
else:
logger.warning("Failed to write: %s", knowledge.title)
except Exception as e:
- logger.error("Error writing knowledge: %s", e)
+ logger.error("Error writing knowledge: %s", e, exc_info=True)
except Exception as e:
logger.error("Error in ETL processing: %s", e, exc_info=True)
diff --git a/openviking/daemon/watchers/base_file_watcher.py b/openviking/daemon/watchers/base_file_watcher.py
index 0e6cce1e0f..1f3e71147d 100644
--- a/openviking/daemon/watchers/base_file_watcher.py
+++ b/openviking/daemon/watchers/base_file_watcher.py
@@ -93,6 +93,13 @@ def filter_event(self, event: Dict) -> bool:
"""Additional filtering. Override for tool-specific rules. Default: keep all."""
return True
+ def _post_normalize(self, event: Dict, file_path: str) -> Dict:
+ """Hook for subclasses to enrich normalized events with file-path context.
+ Called after normalize_event() and before filter_event().
+ Default: return event unchanged.
+ """
+ return event
+
def matches_file_pattern(self, file_path: str) -> bool:
"""Check if file matches the watcher's file pattern."""
filename = os.path.basename(file_path)
@@ -137,6 +144,10 @@ def _process_file(self, file_path: str):
new_position = cursor.last_position + len(new_content.encode("utf-8"))
+ logger.info("[%s] Processing %s: %d bytes new content from pos %d",
+ self.tool_name, file_path, len(new_content.encode("utf-8")), cursor.last_position)
+
+ event_count = 0
for line in new_content.splitlines():
line = line.strip()
if not line:
@@ -150,6 +161,8 @@ def _process_file(self, file_path: str):
if normalized is None:
continue
+ normalized = self._post_normalize(normalized, file_path)
+
if not self.filter_event(normalized):
continue
@@ -158,13 +171,15 @@ def _process_file(self, file_path: str):
byte_size = len(line.encode("utf-8"))
self._buffer.add_line(normalized, byte_size)
+ event_count += 1
+ logger.info("[%s] Added %d events to buffer (total: %d, trigger at %d)",
+ self.tool_name, event_count, len(self._buffer.lines), self.batch_trigger_lines)
self.cursor_manager.update_cursor(file_path, new_position)
-
self._check_batch_trigger()
except Exception as e:
- logger.error("[%s] Error processing file %s: %s", self.tool_name, file_path, e)
+ logger.error("[%s] Error processing file %s: %s", self.tool_name, file_path, e, exc_info=True)
def _check_batch_trigger(self):
"""Check if batch trigger conditions are met."""
@@ -183,10 +198,10 @@ def _flush_buffer(self):
return
events = self._buffer.lines.copy()
- self._buffer.clear()
logger.info("[%s] Flushing %d events", self.tool_name, len(events))
try:
self.batch_callback(events)
+ self._buffer.clear()
except Exception as e:
- logger.error("[%s] Batch callback failed: %s", self.tool_name, e)
+ logger.error("[%s] Batch callback failed: %s", self.tool_name, e, exc_info=True)
diff --git a/openviking/daemon/watchers/claude_code_watcher.py b/openviking/daemon/watchers/claude_code_watcher.py
index 4174d559b7..88ffae8c87 100644
--- a/openviking/daemon/watchers/claude_code_watcher.py
+++ b/openviking/daemon/watchers/claude_code_watcher.py
@@ -1,8 +1,17 @@
"""
Claude Code JSONL log watcher.
Monitors ~/.claude/projects//.jsonl files.
+
+Real log format (verified against 288 sessions, 29541 lines):
+- Top-level "type": "user" | "assistant" | "attachment" | "queue-operation" | "system" | ...
+- "role" and "content" are nested inside "message" object
+- "message.content" can be a plain string OR an array of content blocks
+ (e.g. [{"type": "text", "text": "..."}, {"type": "tool_use", ...}])
+- Session ID is at top-level "sessionId" (camelCase, no underscore)
+- Project name is NOT in the JSON — derived from file path
"""
import json
+import os
from typing import Dict, List, Optional
from openviking.daemon.watchers.base_file_watcher import BaseFileWatcher
@@ -12,6 +21,26 @@
logger = get_logger(__name__)
+def _extract_text_from_content(content) -> str:
+ """Extract plain text from Claude Code message content.
+
+ content can be:
+ - str: a plain text message
+ - list: array of content blocks (text, tool_use, thinking, tool_result)
+ Returns concatenated text from all "text" blocks, or "" if none.
+ """
+ if isinstance(content, str):
+ return content
+ if isinstance(content, list):
+ text_parts = []
+ for block in content:
+ if isinstance(block, dict):
+ if block.get("type") == "text" and "text" in block:
+ text_parts.append(block["text"])
+ return "\n".join(text_parts)
+ return ""
+
+
@register_watcher("claude_code")
class ClaudeCodeWatcher(BaseFileWatcher):
"""Watches Claude Code JSONL log files and extracts conversation events."""
@@ -41,20 +70,58 @@ def parse_line(self, line: str) -> Optional[Dict]:
return None
def normalize_event(self, raw_event: Dict) -> Optional[Dict]:
- """Normalize Claude Code event to common format."""
- role = raw_event.get("role")
+ """Normalize Claude Code event to common format.
+
+ Only extracts events where top-level type is "user" or "assistant"
+ and message.role is "user" or "assistant".
+ Skips messages with no text content (e.g. tool_use-only) and messages carrying tool_result blocks.
+ """
event_type = raw_event.get("type", "")
+ # Only process conversation messages
+ if event_type not in ("user", "assistant"):
+ return None
+
+ msg = raw_event.get("message")
+ if not isinstance(msg, dict):
+ return None
+
+ role = msg.get("role")
if role not in ("user", "assistant"):
return None
- if event_type and event_type != "message":
+
+ # Extract text content — skip if empty (e.g. tool_use-only messages)
+ raw_content = msg.get("content", "")
+ content = _extract_text_from_content(raw_content)
+ if not content:
return None
+ # Skip tool_result messages (type="user" but content is tool_result array)
+ if isinstance(raw_content, list):
+ has_tool_result = any(
+ isinstance(b, dict) and b.get("type") == "tool_result"
+ for b in raw_content
+ )
+ if has_tool_result:
+ return None
+
return {
"role": role,
- "content": raw_event.get("content", ""),
+ "content": content,
"type": "message",
"timestamp": raw_event.get("timestamp"),
- "session_id": raw_event.get("session_id"),
- "project_name": raw_event.get("project_name"),
+ "session_id": raw_event.get("sessionId"),
+ "project_name": None, # injected by _post_normalize
}
+
+ def _post_normalize(self, event: Dict, file_path: str) -> Dict:
+ """Derive project_name from file path: ~/.claude/projects/<project>/<session>.jsonl"""
+ if not event.get("project_name"):
+ parts = file_path.replace("\\", "/").split("/")
+ try:
+ projects_idx = parts.index("projects")
+ if projects_idx + 1 < len(parts) - 1:
+ event["project_name"] = parts[projects_idx + 1]
+ except ValueError:
+ pass
+ return event
diff --git a/tests/daemon/test_claude_code_watcher.py b/tests/daemon/test_claude_code_watcher.py
index a31b03e21a..201670f045 100644
--- a/tests/daemon/test_claude_code_watcher.py
+++ b/tests/daemon/test_claude_code_watcher.py
@@ -1,9 +1,19 @@
-"""Tests for ClaudeCodeWatcher parsing and filtering logic."""
+"""Tests for ClaudeCodeWatcher parsing and filtering logic.
+
+All test fixtures use the REAL Claude Code JSONL format:
+- Top-level "type": "user" | "assistant" | ...
+- "role" and "content" nested inside "message" object
+- "sessionId" at top level (camelCase)
+- "content" can be string or array of content blocks
+"""
import json
import os
import tempfile
-from openviking.daemon.watchers.claude_code_watcher import ClaudeCodeWatcher
+from openviking.daemon.watchers.claude_code_watcher import (
+ ClaudeCodeWatcher,
+ _extract_text_from_content,
+)
class FakeCursorManager:
@@ -33,18 +43,156 @@ def _make_watcher(batch_trigger_lines=50, batch_trigger_seconds=300):
return watcher, batches, cursor_mgr
+# --- Fixtures matching REAL Claude Code JSONL format ---
+
+def _make_user_event(content="Hello", session_id="test-session-001"):
+ """Create a realistic user event matching real Claude Code logs."""
+ return {
+ "type": "user",
+ "message": {"role": "user", "content": content},
+ "uuid": "user-uuid-001",
+ "timestamp": "2026-06-15T10:30:00.000Z",
+ "sessionId": session_id,
+ }
+
+
+def _make_assistant_event(content="AI answer", session_id="test-session-001"):
+ """Create a realistic assistant event with string content."""
+ return {
+ "type": "assistant",
+ "message": {
+ "role": "assistant",
+ "content": content,
+ },
+ "uuid": "asst-uuid-001",
+ "timestamp": "2026-06-15T10:30:01.000Z",
+ "sessionId": session_id,
+ }
+
+
+def _make_assistant_event_blocks(blocks, session_id="test-session-001"):
+ """Create an assistant event with array content blocks."""
+ return {
+ "type": "assistant",
+ "message": {
+ "role": "assistant",
+ "content": blocks,
+ },
+ "uuid": "asst-uuid-002",
+ "timestamp": "2026-06-15T10:30:02.000Z",
+ "sessionId": session_id,
+ }
+
+
+def _make_queue_event():
+ """Create a queue-operation event (should be skipped)."""
+ return {
+ "type": "queue-operation",
+ "operation": "enqueue",
+ "timestamp": "2026-06-15T10:29:59.000Z",
+ "sessionId": "test-session-001",
+ "content": "Some prompt",
+ }
+
+
+def _make_system_event():
+ """Create a system event (should be skipped)."""
+ return {
+ "type": "system",
+ "subtype": "stop_hook_summary",
+ "uuid": "sys-uuid-001",
+ "timestamp": "2026-06-15T10:30:03.000Z",
+ "sessionId": "test-session-001",
+ }
+
+
+def _make_attachment_event():
+ """Create an attachment event (should be skipped)."""
+ return {
+ "type": "attachment",
+ "attachment": {"type": "hook_success"},
+ "uuid": "att-uuid-001",
+ "timestamp": "2026-06-15T10:29:58.000Z",
+ "sessionId": "test-session-001",
+ }
+
+
+def _make_tool_result_user_event():
+ """Create a user event carrying tool_result (should be skipped)."""
+ return {
+ "type": "user",
+ "message": {
+ "role": "user",
+ "content": [
+ {
+ "type": "tool_result",
+ "tool_use_id": "call_001",
+ "content": "output here",
+ "is_error": False,
+ }
+ ],
+ },
+ "uuid": "tr-uuid-001",
+ "timestamp": "2026-06-15T10:30:04.000Z",
+ "sessionId": "test-session-001",
+ }
+
+
+# --- Tests ---
+
def test_tool_name():
watcher, _, _ = _make_watcher()
assert watcher.tool_name == "claude_code"
+# --- _extract_text_from_content helper ---
+
+def test_extract_text_from_string():
+ assert _extract_text_from_content("hello world") == "hello world"
+
+
+def test_extract_text_from_array_with_text():
+ blocks = [
+ {"type": "thinking", "thinking": "let me think..."},
+ {"type": "text", "text": "Here is the answer."},
+ {"type": "tool_use", "name": "Bash", "input": {"command": "ls"}},
+ ]
+ assert _extract_text_from_content(blocks) == "Here is the answer."
+
+
+def test_extract_text_from_array_multiple_texts():
+ blocks = [
+ {"type": "text", "text": "Part 1"},
+ {"type": "text", "text": "Part 2"},
+ ]
+ assert _extract_text_from_content(blocks) == "Part 1\nPart 2"
+
+
+def test_extract_text_from_array_tool_only():
+ blocks = [
+ {"type": "tool_use", "name": "Read", "input": {"file_path": "/x"}},
+ ]
+ assert _extract_text_from_content(blocks) == ""
+
+
+def test_extract_text_from_none():
+ assert _extract_text_from_content(None) == ""
+
+
+def test_extract_text_from_empty_list():
+ assert _extract_text_from_content([]) == ""
+
+
+# --- parse_line ---
+
def test_parse_valid_jsonl_line():
watcher, _, _ = _make_watcher()
- line = '{"timestamp": "2026-06-15T10:30:00Z", "role": "user", "content": "Hello", "type": "message"}'
- event = watcher.parse_line(line)
- assert event is not None
- assert event["role"] == "user"
- assert event["content"] == "Hello"
+ event = _make_user_event("Hello")
+ line = json.dumps(event)
+ result = watcher.parse_line(line)
+ assert result is not None
+ assert result["type"] == "user"
+ assert result["message"]["role"] == "user"
def test_parse_invalid_line():
@@ -53,72 +201,165 @@ def test_parse_invalid_line():
assert watcher.parse_line("") is None
-def test_normalize_event_user_message():
+# --- normalize_event ---
+
+def test_normalize_user_string_message():
watcher, _, _ = _make_watcher()
- raw = {"role": "user", "type": "message", "content": "Hello", "timestamp": "2026-06-15T10:30:00Z"}
+ raw = _make_user_event("Hello world")
result = watcher.normalize_event(raw)
assert result is not None
assert result["role"] == "user"
+ assert result["content"] == "Hello world"
assert result["type"] == "message"
- assert result["content"] == "Hello"
+ assert result["session_id"] == "test-session-001"
+
+
+def test_normalize_assistant_string_message():
+ watcher, _, _ = _make_watcher()
+ raw = _make_assistant_event("AI response here")
+ result = watcher.normalize_event(raw)
+ assert result is not None
+ assert result["role"] == "assistant"
+ assert result["content"] == "AI response here"
-def test_normalize_event_assistant_message():
+def test_normalize_assistant_text_block():
+ """Assistant with text content block should be extracted."""
watcher, _, _ = _make_watcher()
- raw = {"role": "assistant", "type": "message", "content": "AI answer"}
+ raw = _make_assistant_event_blocks([
+ {"type": "thinking", "thinking": "internal reasoning"},
+ {"type": "text", "text": "The answer is 42."},
+ ])
result = watcher.normalize_event(raw)
assert result is not None
assert result["role"] == "assistant"
- assert result["content"] == "AI answer"
+ assert result["content"] == "The answer is 42."
+ assert "internal reasoning" not in result["content"]
-def test_normalize_event_excludes_system_role():
+def test_normalize_assistant_tool_use_only_skipped():
+ """Assistant message with only tool_use blocks (no text) should be skipped."""
watcher, _, _ = _make_watcher()
- raw = {"role": "system", "type": "message", "content": "System msg"}
+ raw = _make_assistant_event_blocks([
+ {"type": "tool_use", "name": "Bash", "input": {"command": "ls"}},
+ ])
+ result = watcher.normalize_event(raw)
+ assert result is None
+
+
+def test_normalize_tool_result_user_skipped():
+ """User events carrying tool_result should be skipped."""
+ watcher, _, _ = _make_watcher()
+ raw = _make_tool_result_user_event()
+ result = watcher.normalize_event(raw)
+ assert result is None
+
+
+def test_normalize_queue_operation_skipped():
+ watcher, _, _ = _make_watcher()
+ raw = _make_queue_event()
assert watcher.normalize_event(raw) is None
-def test_normalize_event_excludes_tool_call():
+def test_normalize_system_event_skipped():
watcher, _, _ = _make_watcher()
- raw = {"role": "assistant", "type": "tool_call", "content": "call"}
+ raw = _make_system_event()
assert watcher.normalize_event(raw) is None
-def test_normalize_event_excludes_tool_result():
+def test_normalize_attachment_event_skipped():
watcher, _, _ = _make_watcher()
- raw = {"role": "assistant", "type": "tool_result", "content": "result"}
+ raw = _make_attachment_event()
assert watcher.normalize_event(raw) is None
+def test_normalize_no_message_field_skipped():
+ """Events with type=user/assistant but no message dict should be skipped."""
+ watcher, _, _ = _make_watcher()
+ assert watcher.normalize_event({"type": "user"}) is None
+ assert watcher.normalize_event({"type": "assistant", "message": "not a dict"}) is None
+
+
+# --- _post_normalize (project_name from path) ---
+
+def test_post_normalize_injects_project_name():
+ watcher, _, _ = _make_watcher()
+ event = {"role": "user", "content": "test", "project_name": None}
+ path = "C:/Users/test/.claude/projects/D--Develop-MyProject/abc123.jsonl"
+ result = watcher._post_normalize(event, path)
+ assert result["project_name"] == "D--Develop-MyProject"
+
+
+def test_post_normalize_windows_backslash():
+ watcher, _, _ = _make_watcher()
+ event = {"role": "user", "content": "test", "project_name": None}
+ path = "C:\\Users\\test\\.claude\\projects\\D--Develop-OpenViking\\session.jsonl"
+ result = watcher._post_normalize(event, path)
+ assert result["project_name"] == "D--Develop-OpenViking"
+
+
+def test_post_normalize_preserves_existing_project_name():
+ watcher, _, _ = _make_watcher()
+ event = {"role": "user", "content": "test", "project_name": "already-set"}
+ path = "C:/Users/test/.claude/projects/D--Develop-Other/abc.jsonl"
+ result = watcher._post_normalize(event, path)
+ assert result["project_name"] == "already-set"
+
+
+# --- filter_event ---
+
def test_filter_event_keeps_messages():
watcher, _, _ = _make_watcher()
event = {"role": "user", "type": "message", "content": "Hello", "tool_name": "claude_code"}
assert watcher.filter_event(event) is True
+# --- _process_file (integration of parse + normalize + buffer) ---
+
def test_process_file():
"""Test that _process_file reads, parses, normalizes, and buffers events."""
watcher, batches, cursor_mgr = _make_watcher(batch_trigger_lines=2)
- with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False, encoding="utf-8") as f:
- f.write(json.dumps({"role": "user", "type": "message", "content": "Hello"}) + "\n")
- f.write(json.dumps({"role": "assistant", "type": "message", "content": "Hi"}) + "\n")
- f.write(json.dumps({"role": "system", "type": "message", "content": "ignored"}) + "\n")
- tmp_path = f.name
+ # Build a realistic session file path
+ tmp_dir = tempfile.mkdtemp()
+ project_dir = os.path.join(tmp_dir, "projects", "D--Develop-TestProj")
+ os.makedirs(project_dir)
+ session_file = os.path.join(project_dir, "sess-001.jsonl")
+
+ with open(session_file, "w", encoding="utf-8") as f:
+ # queue-operation → should be skipped
+ f.write(json.dumps(_make_queue_event()) + "\n")
+ # user message → should be extracted
+ f.write(json.dumps(_make_user_event("Hello")) + "\n")
+ # attachment → should be skipped
+ f.write(json.dumps(_make_attachment_event()) + "\n")
+ # assistant text reply → should be extracted
+ f.write(json.dumps(_make_assistant_event("Hi there")) + "\n")
+ # system event → should be skipped
+ f.write(json.dumps(_make_system_event()) + "\n")
+ # tool_result user → should be skipped
+ f.write(json.dumps(_make_tool_result_user_event()) + "\n")
try:
- watcher._process_file(tmp_path)
+ watcher._process_file(session_file)
# batch_trigger_lines=2, so 2 valid events should trigger flush
assert len(batches) == 1
assert len(batches[0]) == 2
assert batches[0][0]["role"] == "user"
+ assert batches[0][0]["content"] == "Hello"
assert batches[0][1]["role"] == "assistant"
+ assert batches[0][1]["content"] == "Hi there"
assert all(e["tool_name"] == "claude_code" for e in batches[0])
+ # project_name should be derived from path
+ assert batches[0][0]["project_name"] == "D--Develop-TestProj"
# Cursor should have been updated
assert len(cursor_mgr.updates) == 1
- assert cursor_mgr.updates[0][0] == tmp_path
+ assert cursor_mgr.updates[0][0] == session_file
finally:
- os.unlink(tmp_path)
+ os.unlink(session_file)
+ os.rmdir(project_dir)
+ os.rmdir(os.path.join(tmp_dir, "projects"))
+ os.rmdir(tmp_dir)
def test_force_flush_empty_buffer():
diff --git a/tests/daemon/test_integration.py b/tests/daemon/test_integration.py
index c4b497092e..ee62964a01 100644
--- a/tests/daemon/test_integration.py
+++ b/tests/daemon/test_integration.py
@@ -134,7 +134,7 @@ def test_knowledge_router_all_categories():
assert "skills/claude_code" in router.route(skill)
mem_proj = ExtractedKnowledge(status="EXTRACTED", category="memories", title="Arch", content="...", project_name="myapp")
- assert "memories/projects/myapp" in router.route(mem_proj)
+ assert "memories/myapp" in router.route(mem_proj)
mem_global = ExtractedKnowledge(status="EXTRACTED", category="memories", title="General", content="...")
assert "memories/global" in router.route(mem_global)
diff --git a/tests/daemon/test_knowledge_router.py b/tests/daemon/test_knowledge_router.py
index a045dc4a5a..739703321b 100644
--- a/tests/daemon/test_knowledge_router.py
+++ b/tests/daemon/test_knowledge_router.py
@@ -18,7 +18,7 @@ def test_route_skills():
router = KnowledgeRouter()
k = _make(category="skills", title="PostgreSQL Config")
uri = router.route(k)
- assert uri.startswith("viking://skills/general/")
+ assert uri.startswith("viking://resources/skills/general/")
assert uri.endswith(".md")
@@ -26,7 +26,7 @@ def test_route_skills_with_source_tool():
router = KnowledgeRouter()
k = _make(category="skills", title="PostgreSQL Config", source_tool="claude_code")
uri = router.route(k)
- assert uri.startswith("viking://skills/claude_code/")
+ assert uri.startswith("viking://resources/skills/claude_code/")
assert uri.endswith(".md")
@@ -42,7 +42,7 @@ def test_route_memories_global():
router = KnowledgeRouter()
k = _make(category="memories", title="Global Memory")
uri = router.route(k)
- assert uri.startswith("viking://memories/global/")
+ assert uri.startswith("viking://resources/memories/global/")
def test_route_resources():
@@ -71,3 +71,11 @@ def test_sanitize_filename():
router = KnowledgeRouter()
assert router._sanitize_filename('file<>:name') == 'file___name'
assert router._sanitize_filename("a" * 100) == "a" * 50
+
+
+def test_sanitize_filename_non_ascii():
+ router = KnowledgeRouter()
+ result = router._sanitize_filename("中文标题测试")
+ assert len(result) == 16
+ assert result.isascii()
+ assert all(c in "0123456789abcdef" for c in result)
diff --git a/tests/daemon/test_multi_watcher_integration.py b/tests/daemon/test_multi_watcher_integration.py
index 8623a9e304..d3a797ffa0 100644
--- a/tests/daemon/test_multi_watcher_integration.py
+++ b/tests/daemon/test_multi_watcher_integration.py
@@ -91,8 +91,8 @@ def test_claude_code_events_have_tool_name(tmp_path):
test_file = tmp_path / "session.jsonl"
test_file.write_text(
- '{"role": "user", "content": "Hello from CC"}\n'
- '{"role": "assistant", "content": "Hi from CC"}\n'
+ '{"type": "user", "message": {"role": "user", "content": "Hello from CC"}}\n'
+ '{"type": "assistant", "message": {"role": "assistant", "content": "Hi from CC"}}\n'
)
watcher._process_file(str(test_file))
From 0136fbca4bea1ecc539bd7c126f751e6cd7cfd46 Mon Sep 17 00:00:00 2001
From: baobaodae <2014596548@qq.com>
Date: Sat, 20 Jun 2026 11:52:17 +0800
Subject: [PATCH 3/6] feat(daemon): add BasePollingWatcher and CursorDBWatcher
for SQLite-based AI tool monitoring
Extend daemon watcher framework to support SQLite database sources alongside
existing JSONL file watchers. BasePollingWatcher uses Thread + Event.wait()
polling instead of watchdog Observer, with cursor advancement on all raw
events (including filtered) to avoid infinite re-query loops.
CursorDBWatcher monitors Cursor IDE's dual-SQLite storage:
- Global DB (cursorDiskKV) for bubbleId:* conversation entries
- type 1=user, 2=assistant; filters empty text (tool calls/streaming)
Includes 28 new tests (12 polling base + 16 cursor_db) and 3 integration tests.
---
docs/daemon/db-watcher-plan.md | 752 ++++++++++++++++++
.../daemon/watchers/base_polling_watcher.py | 205 +++++
.../daemon/watchers/cursor_db_watcher.py | 193 +++++
openviking/daemon/watchers/registry.py | 6 +
tests/daemon/test_base_polling_watcher.py | 313 ++++++++
tests/daemon/test_cursor_db_watcher.py | 422 ++++++++++
.../daemon/test_multi_watcher_integration.py | 59 +-
7 files changed, 1948 insertions(+), 2 deletions(-)
create mode 100644 docs/daemon/db-watcher-plan.md
create mode 100644 openviking/daemon/watchers/base_polling_watcher.py
create mode 100644 openviking/daemon/watchers/cursor_db_watcher.py
create mode 100644 tests/daemon/test_base_polling_watcher.py
create mode 100644 tests/daemon/test_cursor_db_watcher.py
diff --git a/docs/daemon/db-watcher-plan.md b/docs/daemon/db-watcher-plan.md
new file mode 100644
index 0000000000..49ae875c41
--- /dev/null
+++ b/docs/daemon/db-watcher-plan.md
@@ -0,0 +1,752 @@
+## Daemon 数据库源工具适配方案(v2 — 详细执行计划)
+
+### 一、现状分析
+
+#### 1.1 当前架构全景
+
+```
+ BaseWatcher Protocol
+ (tool_name, start, stop, flush)
+ │
+ BaseFileWatcher (ABC)
+ ┌─────────┼──────────────────────┐
+ │ │ │
+ watchdog CursorManager BatchBuffer
+ Observer (byte offset) (lines + time)
+ │ │ │
+ └─────────┴──────────┬───────────┘
+ │
+ _process_file()
+ parse_line() → normalize_event() → _post_normalize() → filter_event()
+ │
+ batch_callback → DaemonService._enqueue_batch()
+ │
+ asyncio.Queue → _etl_loop() → BatchETLPipeline
+ │
+ Filter → Reconstruct → Extract(LLM) → Dedup
+ │
+ VikingStorageAdapter → viking://resources/
+```
+
+#### 1.2 现有 5 个文件 Watcher
+
+| Watcher | tool_name | 文件格式 | 继承 | 特殊点 |
+|---------|-----------|----------|------|--------|
+| ClaudeCodeWatcher | `claude_code` | `*.jsonl` | BaseFileWatcher | `_post_normalize` 从路径推导 project_name |
+| CursorWatcher | `cursor` | `*.log` | BaseFileWatcher | 灵活 role 映射(user/human/human_turn) |
+| AiderWatcher | `aider` | `.aider.chat.history.md` | BaseFileWatcher | **覆盖** `_process_file`,多行 Markdown 解析 |
+| ContinueDevWatcher | `continue_dev` | `*.json` | BaseFileWatcher | 标准 JSON 行格式 |
+| GenericJSONLWatcher | `generic_jsonl` | `*.jsonl` | BaseFileWatcher | `extra` dict 自定义字段映射 |
+
+#### 1.3 关键接口约束
+
+- **BaseWatcher Protocol**(`watchers/__init__.py`):4 个方法,天然支持非文件型 watcher
+- **Registry**(`registry.py`):`create_watcher(tool_name, **kwargs)` 工厂模式,已有 `extra` 参数传递链
+- **CursorManager**(`cursor_manager.py`):`file_path TEXT PK` + `last_position INTEGER`,后者是通用整数——文件 watcher 存字节偏移,DB watcher 可存 rowid/timestamp
+- **DaemonService**(`service.py`):`_enqueue_batch` 已用 `loop.call_soon_threadsafe`,线程安全——polling thread 可直接调用
+- **WatcherConfig**(`config.py`):已有 `extra: Dict[str, Any]` 字段,无需改 config schema
+
+#### 1.4 不需要改动的部分
+
+- `BatchBuffer` / `BatchETLPipeline` / `ConversationReconstructor` / `KnowledgeExtractor` / `KnowledgeDeduplicator` / `VikingStorageAdapter` / `KnowledgeRouter` ——全部与数据源无关,只消费 normalized events
+- `WatcherConfig` / `DaemonConfig` —— schema 已够用
+- `DaemonService.start()` / `_etl_loop()` —— 已支持多 watcher 并行
+
+---
+
+### 二、需要适配的数据库源工具(含格式调研)
+
+#### 2.1 Cursor(P0)— 双库架构
+
+**⚠️ 关键发现:Cursor 使用两个独立的 SQLite 数据库**
+
+| 数据库 | 路径 (Windows) | 表名 | 用途 |
+|--------|---------------|------|------|
+| 工作区 DB | `%APPDATA%\Cursor\User\workspaceStorage\<hash>\state.vscdb` | `ItemTable`(PascalCase) | 会话元数据 |
+| 全局 DB | `%APPDATA%\Cursor\User\globalStorage\state.vscdb` | `cursorDiskKV`(camelCase) | 对话内容 |
+
+**踩坑点:表名大小写不同!混用会报 `no such table` 错误。**
+
+**工作区 DB — 会话元数据**:
+```sql
+-- 查询所有会话列表
+SELECT value FROM ItemTable WHERE [key] = 'composer.composerData';
+```
+返回 JSON,结构为 `{ "allComposers": [ { "id": "<composerId>", "createdAt": "...", ... } ] }`
+
+**全局 DB — 对话内容(Bubble 数据)**:
+```sql
+-- 查询某会话的所有消息
+SELECT [key], value FROM cursorDiskKV WHERE [key] LIKE 'bubbleId:<composerId>:%' ORDER BY rowid ASC;
+
+-- 查询所有 composerId
+SELECT DISTINCT SUBSTR([key], 10, INSTR(SUBSTR([key], 10), ':') - 1)
+FROM cursorDiskKV WHERE [key] LIKE 'bubbleId:%';
+```
+
+**Key 格式**:`bubbleId:<composerId>:<bubbleId>`(冒号分隔三段)
+
+**Value JSON 结构(单条消息)**:
+```json
+{
+ "_v": 3, // schema 版本号
+ "type": 1, // 1=用户消息, 2=助手消息
+ "text": "用户的问题内容", // 对话文本
+ "createdAt": "2026-...", // 时间戳
+ "isAgentic": false, // 是否为 Agent 模式
+ "toolResults": [], // 工具调用结果
+ "codeBlocks": [], // 代码块
+ "allThinkingBlocks": [ // AI 思考过程(仅 type=2)
+ { "thinking": "..." }
+ ]
+}
+```
+
+**解析注意事项**:
+1. 流式传输会产生 `text` 为空的中间态 assistant 消息,必须过滤
+2. `_v` 版本号可能随更新递增,遇到未知版本应仅打 warning
+3. 工作区路径有 `file:///` 前缀且 URL 编码,需解码
+4. 项目文件夹删除后,工作区 DB 丢失但全局 DB 数据仍在("孤立对话")
+5. 数据物理隔离在两个 DB,必须先从工作区获取 composerId,再跨库查询全局内容
+
+#### 2.2 Trae(P1)— 单库架构
+
+| 路径 (Windows) | 表名 | Key 模式 |
+|---------------|------|----------|
+| `%APPDATA%\Trae\User\workspaceStorage\<hash>\state.vscdb` | `ItemTable` | `memento/icube-ai-ng-chat-storage` 或 `memento/icube-ai-agent-storage` |
+
+**Value JSON 结构**:
+```json
+{
+ "list": [
+ {
+ "messages": [
+ {
+ "role": "user",
+ "content": "用户的问题",
+ "timestamp": "...",
+ "status": "active" // "deleted" 的需跳过
+ },
+ {
+ "role": "assistant",
+ "content": "", // 注意:可能为空!
+ "agentTaskContent": { // 实际内容可能在这里
+ "proposalText": "实际回复内容",
+ "proposalReasoningContent": "推理过程"
+ }
+ }
+ ]
+ }
+ ]
+}
+```
+
+**踩坑点**:助手消息的 `content` 字段经常为空,核心内容在 `agentTaskContent` 中。提取优先级:`content` > `agentTaskContent.proposalText` > `finish` 工具结果。
+
+#### 2.3 Windsurf(P2)
+
+- 存储格式:类似 VS Code 的 SQLite `state.vscdb`
+- 路径:`%APPDATA%\Windsurf\User\` 下
+- Cascade 会话文件:`~/.codeium/windsurf/cascade/`
+- **具体表名和 key 模式尚无公开文档**,需实际 dump 确认
+- 使用 WAL 模式(存在 `-wal` 和 `-shm` 文件)
+
+#### 2.4 GitHub Copilot(P2)
+
+**⚠️ 关键发现:Copilot 不用 SQLite,用的是 JSON/JSONL 文件**
+
+| 路径 | 格式 | 说明 |
+|------|------|------|
+| VS Code `workspaceStorage/<hash>/chatSessions/` | `.json` + `.jsonl` | 项目级 |
+| VS Code `globalStorage/chatSessions/` | `.json` + `.jsonl` | 全局 |
+
+**JSON/JSONL 结构**:
+- `.json` 文件包含 `sessionId`、`timestamp`、`customTitle`
+- `.jsonl` 文件首行 `kind: 0` 为完整快照,后续行为 UI 补丁(应忽略)
+- 快照内 `requests` 数组嵌套用户输入和助手响应
+- 响应通过 `kind` 字段区分(文本/思考/工具调用)
+
+→ 这意味着 GitHub Copilot 更适合用现有的 `BaseFileWatcher`(文件追加模式),而非 `BasePollingWatcher`。需要新建一个 `CopilotChatWatcher(BaseFileWatcher)` 子类。
+
+#### 2.5 优先级排序
+
+| 优先级 | 工具 | Watcher 类型 | 理由 |
+|--------|------|-------------|------|
+| **P0** | Cursor | BasePollingWatcher(双库 SQLite) | 用户量最大,格式已调研 |
+| P1 | Trae | BasePollingWatcher(单库 SQLite) | 格式已明确,VS Code fork |
+| P2 | Windsurf | BasePollingWatcher(SQLite) | 与 Cursor/Trae 类似,需 dump 确认 |
+| P2 | GitHub Copilot | **BaseFileWatcher**(JSON/JSONL 文件) | 非 SQLite!用文件追加模式 |
+| P3 | Warp Terminal | BasePollingWatcher | 命令历史,低优先级 |
+| P4 | Tabnine | BasePollingWatcher | JSON+SQLite 混合,低优先级 |
+
+---
+
+### 三、架构设计
+
+#### 3.1 新增 BasePollingWatcher 基类
+
+**文件**:`openviking/daemon/watchers/base_polling_watcher.py`
+
+不继承 `BaseFileWatcher`,直接实现 `BaseWatcher` Protocol。
+
+```python
+class BasePollingWatcher(ABC):
+ """Base class for database/API-based watchers that use periodic polling."""
+
+ def __init__(
+ self,
+ tool_name: str,
+ watch_dir: str, # DB 文件所在目录(或 DB 文件路径)
+ cursor_manager: CursorManager,
+ batch_callback: Callable[[List[Dict]], None],
+ poll_interval: int = 30, # 轮询间隔(秒)
+ batch_trigger_lines: int = 50,
+ batch_trigger_seconds: int = 300,
+ extra: Optional[Dict] = None,
+ **kwargs, # 兼容 create_watcher 工厂的其他参数
+ ):
+ self._tool_name = tool_name
+ self.watch_dir = os.path.expanduser(watch_dir)
+ self.cursor_manager = cursor_manager
+ self.batch_callback = batch_callback
+ self.poll_interval = poll_interval
+ self.extra = extra or {}
+
+ self._buffer = BatchBuffer()
+ self.batch_trigger_lines = batch_trigger_lines
+ self.batch_trigger_seconds = batch_trigger_seconds
+ self._poll_thread: Optional[Thread] = None
+ self._stop_event = threading.Event()
+
+ # ─── BaseWatcher Protocol ───
+ @property
+ def tool_name(self) -> str:
+ return self._tool_name
+
+ def start(self) -> None:
+ self._stop_event.clear()
+ self._poll_thread = Thread(target=self._poll_loop, daemon=True, name=f"poll-{self._tool_name}")
+ self._poll_thread.start()
+ logger.info("[%s] Polling watcher started (interval=%ds)", self._tool_name, self.poll_interval)
+
+ def stop(self) -> None:
+ self._stop_event.set()
+ if self._poll_thread:
+ self._poll_thread.join(timeout=5)
+ logger.info("[%s] Polling watcher stopped", self._tool_name)
+
+ def flush(self) -> None:
+ self._flush_buffer()
+
+ # ─── 子类必须实现 ───
+ @abstractmethod
+ def query_new_events(self, last_cursor: int) -> List[Dict]:
+ """查询数据源中 cursor 之后的新事件。
+ Args:
+ last_cursor: 上次处理到的位置(rowid/timestamp/offset)
+ Returns:
+ raw event dicts,每个必须包含 '_cursor_position' 字段
+ """
+ ...
+
+ @abstractmethod
+ def normalize_event(self, raw_event: Dict) -> Optional[Dict]:
+ """将 raw DB row 转换为标准 normalized event dict。
+ 返回 None 表示跳过。
+ 输出格式同文件 watcher:{role, content, type, timestamp, session_id, project_name}
+ """
+ ...
+
+ # ─── 可选覆盖 ───
+ def filter_event(self, event: Dict) -> bool:
+ """额外过滤。默认保留全部。"""
+ return True
+
+ def resolve_db_path(self) -> Optional[str]:
+ """解析 DB 文件完整路径。
+ 默认:在 watch_dir 下查找 extra['db_filename']。
+ 子类可覆盖以实现更复杂的发现逻辑。
+ """
+ db_filename = self.extra.get("db_filename", "state.vscdb")
+ candidate = os.path.join(self.watch_dir, db_filename)
+ if os.path.exists(candidate):
+ return candidate
+ return None
+
+ # ─── 内部方法 ───
+ def _poll_loop(self):
+ """主轮询循环。"""
+ # cursor key 使用 watch_dir(与文件 watcher 的 file_path 区分)
+ cursor_key = self.watch_dir
+
+ while not self._stop_event.is_set():
+ try:
+ db_path = self.resolve_db_path()
+ if db_path is None:
+ logger.debug("[%s] DB not found, retrying...", self._tool_name)
+ self._stop_event.wait(self.poll_interval)
+ continue
+
+ cursor = self.cursor_manager.get_cursor(cursor_key)
+ raw_events = self.query_new_events(cursor.last_position)
+
+ if raw_events:
+ new_position = cursor.last_position
+                    for raw in raw_events:
+                        # 追踪最大 cursor position(被 normalize/filter 丢弃的行也要推进,否则每轮都会被重复查询)
+                        pos = raw.get("_cursor_position", 0)
+                        if pos > new_position:
+                            new_position = pos
+
+                        normalized = self.normalize_event(raw)
+                        if normalized is None:
+                            continue
+                        if not self.filter_event(normalized):
+                            continue
+                        normalized["tool_name"] = self._tool_name
+                        self._buffer.add_line(normalized, byte_size=0)
+
+ # 更新 cursor
+ if new_position > cursor.last_position:
+ self.cursor_manager.update_cursor(cursor_key, new_position)
+
+ self._check_batch_trigger()
+
+ except Exception as e:
+ logger.error("[%s] Poll error: %s", self._tool_name, e, exc_info=True)
+
+ self._stop_event.wait(self.poll_interval)
+
+ def _check_batch_trigger(self):
+ if self._buffer.is_empty():
+ return
+ line_count = len(self._buffer.lines)
+ age = time.time() - self._buffer.created_at if self._buffer.created_at > 0 else 0
+ if line_count >= self.batch_trigger_lines or age >= self.batch_trigger_seconds:
+ self._flush_buffer()
+
+ def _flush_buffer(self):
+ if self._buffer.is_empty():
+ return
+ events = self._buffer.lines.copy()
+ logger.info("[%s] Flushing %d events", self._tool_name, len(events))
+ try:
+ self.batch_callback(events)
+ self._buffer.clear()
+ except Exception as e:
+ logger.error("[%s] Batch callback failed: %s", self._tool_name, e, exc_info=True)
+```
+
+**关键设计决策:**
+
+| 决策点 | 方案 | 理由 |
+|--------|------|------|
+| 轮询机制 | `Thread` + `Event.wait(interval)` | 与 watchdog Observer 平级,DaemonService 已处理线程安全 |
+| cursor key | 使用 `watch_dir` 字符串 | 与文件 watcher 的 `file_path` 互不冲突,CursorManager 无需改动 |
+| DB 连接管理 | 子类在 `query_new_events` 内自行 open/close | SQLite `?mode=ro` 只读,避免锁冲突;CursorManager 已有 per-call connect 模式 |
+| `_cursor_position` 约定 | raw event dict 中必须携带 | 让子类灵活定义位置语义(rowid/timestamp/offset),基类只取 max |
+| `BatchBuffer` 复用 | 直接用现有实现 | `byte_size=0`(DB 无字节偏移概念),time trigger 正常工作 |
+| `extra` 参数 | 传递 `poll_interval`、`db_filename` 等 | 利用已有 WatcherConfig.extra 字段 |
+
+**注意:`BatchBuffer` 与 trigger 值存储**
+
+当前 `BatchBuffer` dataclass 不接受 trigger 参数——`batch_trigger_lines` 和 `batch_trigger_seconds` 由 `BaseFileWatcher` 作为实例属性持有,在 `_check_batch_trigger()` 中读取。
+
+`BasePollingWatcher` 采用同样模式:`self.batch_trigger_lines` / `self.batch_trigger_seconds` 存在 watcher 实例上,`BatchBuffer()` 无参构造。与 `BaseFileWatcher` 完全一致,零改动。
+
+#### 3.2 CursorDBWatcher 实现(P0)— 基于真实格式调研
+
+**文件**:`openviking/daemon/watchers/cursor_db_watcher.py`
+
+Cursor 使用**双库架构**:工作区 DB 存会话元数据,全局 DB 存对话内容。CursorDBWatcher 的 `watch_dir` 指向 Cursor 用户数据根目录(如 `%APPDATA%\Cursor\User`),内部同时访问两个 state.vscdb。
+
+```python
+@register_watcher("cursor_db")
+class CursorDBWatcher(BasePollingWatcher):
+ """Watches Cursor IDE's dual-SQLite storage for AI conversations.
+
+ Architecture:
+      - Workspace DB: workspaceStorage/<hash>/state.vscdb → ItemTable → composer.composerData (session metadata)
+      - Global DB: globalStorage/state.vscdb → cursorDiskKV → bubbleId:<composerId>:<bubbleId> (message content)
+ """
+
+ def __init__(self, watch_dir, cursor_manager, batch_callback,
+ poll_interval=30, batch_trigger_lines=50, batch_trigger_seconds=300,
+ extra=None, **kwargs):
+ super().__init__(
+ tool_name="cursor_db",
+ watch_dir=watch_dir, # e.g. %APPDATA%\Cursor\User
+ cursor_manager=cursor_manager,
+ batch_callback=batch_callback,
+ poll_interval=poll_interval,
+ batch_trigger_lines=batch_trigger_lines,
+ batch_trigger_seconds=batch_trigger_seconds,
+ extra=extra,
+ )
+ self._global_db_path = os.path.join(self.watch_dir, "globalStorage", "state.vscdb")
+ self._workspace_storage_dir = os.path.join(self.watch_dir, "workspaceStorage")
+
+ def resolve_db_path(self) -> Optional[str]:
+ """返回全局 DB 路径(主要数据源)。"""
+ if os.path.exists(self._global_db_path):
+ return self._global_db_path
+ return None
+
+ def _discover_composer_ids(self) -> List[str]:
+ """扫描所有工作区 DB,收集 composerId 列表。
+ 用于关联全局 DB 中的 bubble 数据。
+ """
+ composer_ids = []
+ if not os.path.isdir(self._workspace_storage_dir):
+ return composer_ids
+
+ for ws_hash in os.listdir(self._workspace_storage_dir):
+ ws_db = os.path.join(self._workspace_storage_dir, ws_hash, "state.vscdb")
+ if not os.path.exists(ws_db):
+ continue
+ try:
+ conn = sqlite3.connect(f"file:{ws_db}?mode=ro", uri=True)
+ try:
+ row = conn.execute(
+ "SELECT value FROM ItemTable WHERE [key] = 'composer.composerData'"
+ ).fetchone()
+ if row and row[0]:
+ data = json.loads(row[0])
+ all_composers = data.get("allComposers", [])
+ for c in all_composers:
+ cid = c.get("id")
+ if cid:
+ composer_ids.append(cid)
+ finally:
+ conn.close()
+ except Exception:
+ continue
+ return composer_ids
+
+ def query_new_events(self, last_cursor: int) -> List[Dict]:
+ """从全局 DB 的 cursorDiskKV 表查询新 bubble 数据。
+
+ 策略:直接扫描所有 bubbleId:* key(rowid > last_cursor),
+ 不依赖工作区 DB 的 composerId 列表(支持发现孤立对话)。
+ """
+ db_path = self.resolve_db_path()
+ if not db_path:
+ return []
+
+ try:
+ conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
+ conn.execute("PRAGMA busy_timeout = 3000")
+ try:
+ rows = conn.execute(
+ "SELECT rowid, [key], value FROM cursorDiskKV "
+ "WHERE rowid > ? AND [key] LIKE 'bubbleId:%' "
+ "ORDER BY rowid ASC LIMIT 500",
+ (last_cursor,)
+ ).fetchall()
+
+ events = []
+ for rowid, key, value in rows:
+                    # 解析 key: bubbleId:<composerId>:<bubbleId>
+ parts = key.split(":", 2)
+ composer_id = parts[1] if len(parts) >= 3 else None
+
+ try:
+ parsed_value = json.loads(value) if isinstance(value, str) else value
+ except (json.JSONDecodeError, TypeError):
+ continue
+
+ events.append({
+ "rowid": rowid,
+ "key": key,
+ "value": parsed_value,
+ "composer_id": composer_id,
+ "_cursor_position": rowid,
+ })
+ return events
+ finally:
+ conn.close()
+ except sqlite3.OperationalError as e:
+ logger.warning("[cursor_db] SQLite error (DB may be locked): %s", e)
+ return []
+
+ def normalize_event(self, raw_event: Dict) -> Optional[Dict]:
+ """解析 Cursor bubble 格式。
+
+ Value JSON:
+ - _v: schema version (currently 3)
+ - type: 1=user, 2=assistant
+ - text: message content
+ - createdAt: timestamp
+ - allThinkingBlocks: AI reasoning (assistant only)
+ """
+ value = raw_event.get("value")
+ if not isinstance(value, dict):
+ return None
+
+ # Schema version check — warn but don't crash
+ schema_version = value.get("_v", 0)
+ if schema_version > 3:
+ logger.debug("[cursor_db] Unknown bubble schema v%d", schema_version)
+
+ # type: 1=user, 2=assistant
+ bubble_type = value.get("type")
+ if bubble_type == 1:
+ role = "user"
+ elif bubble_type == 2:
+ role = "assistant"
+ else:
+ return None
+
+ # text: message content
+ content = value.get("text", "")
+ if not content or not content.strip():
+ return None # 过滤流式传输的空壳消息
+
+ return {
+ "role": role,
+ "content": content,
+ "type": "message",
+ "timestamp": value.get("createdAt"),
+ "session_id": raw_event.get("composer_id"),
+ }
+
+ def filter_event(self, event: Dict) -> bool:
+ """过滤过短内容。"""
+ content = event.get("content", "")
+ return len(content.strip()) >= 10
+```
+
+**与原计划的关键差异**:
+
+| 原假设 | 实际格式 |
+|--------|----------|
+| 单库 `ItemTable` + `WHERE key LIKE '%chat%'` | 双库:`ItemTable`(元数据)+ `cursorDiskKV`(内容) |
+| value 有 `role` 字段 | `type` 字段:1=用户, 2=助手 |
+| value 有 `message`/`content` 字段 | `text` 字段 |
+| `session_id` 在 value 内 | 从 key 的冒号分隔中解析(`composerId`) |
+| 无 schema 版本 | `_v` 字段标识版本(当前主流 v3) |
+
+**调研验证命令**(在用户机器上确认):
+```bash
+# 确认全局 DB 表名
+sqlite3 "%APPDATA%/Cursor/User/globalStorage/state.vscdb" ".tables"
+# 预期输出: cursorDiskKV
+
+# 查看 bubble 数据样例
+sqlite3 "%APPDATA%/Cursor/User/globalStorage/state.vscdb" \
+ "SELECT [key], value FROM cursorDiskKV WHERE [key] LIKE 'bubbleId:%' LIMIT 3"
+```
+
+#### 3.3 WindsurfDBWatcher(P2,待调研)
+
+**⚠️ Windsurf 的具体表名和 key 模式尚无公开文档。** Phase 4 需先 dump 真实 state.vscdb 确认:
+- 是否使用 `cursorDiskKV` 还是 `ItemTable`
+- bubble key 模式是否与 Cursor 相同
+- 是否有独特的 Cascade 会话格式
+
+如果格式与 Cursor 高度相似,可继承 CursorDBWatcher:
+
+```python
+@register_watcher("windsurf_db")
+class WindsurfDBWatcher(CursorDBWatcher):
+ """Windsurf — format TBD after dump analysis."""
+
+ @property
+ def tool_name(self) -> str:
+ return "windsurf_db"
+```
+
+如果格式差异大,则独立实现。
+
+#### 3.4 Registry 更新
+
+在 `_register_builtins()` 中添加:
+
+```python
+try:
+ from openviking.daemon.watchers.cursor_db_watcher import CursorDBWatcher
+ _WATCHER_REGISTRY["cursor_db"] = CursorDBWatcher
+except ImportError:
+ pass
+
+try:
+ from openviking.daemon.watchers.trae_db_watcher import TraeDBWatcher
+ _WATCHER_REGISTRY["trae_db"] = TraeDBWatcher
+except ImportError:
+ pass
+
+try:
+ from openviking.daemon.watchers.windsurf_db_watcher import WindsurfDBWatcher
+ _WATCHER_REGISTRY["windsurf_db"] = WindsurfDBWatcher
+except ImportError:
+ pass
+```
+
+#### 3.5 WatcherConfig 示例
+
+```json
+{
+ "server": {
+ "daemon": {
+ "enabled": true,
+ "watchers": [
+ {
+ "tool_name": "claude_code",
+ "watch_dir": "C:\\Users\\xxx\\.claude\\projects",
+ "batch_trigger_lines": 5
+ },
+ {
+ "tool_name": "cursor_db",
+ "watch_dir": "C:\\Users\\xxx\\AppData\\Roaming\\Cursor\\User",
+ "batch_trigger_lines": 5,
+ "extra": {
+ "poll_interval": 30
+ }
+ },
+ {
+ "tool_name": "trae_db",
+ "watch_dir": "C:\\Users\\xxx\\AppData\\Roaming\\Trae\\User",
+ "batch_trigger_lines": 5,
+ "extra": {
+ "poll_interval": 30
+ }
+ },
+ {
+ "tool_name": "windsurf_db",
+ "watch_dir": "C:\\Users\\xxx\\AppData\\Roaming\\Windsurf\\User",
+ "batch_trigger_lines": 5,
+ "extra": {
+ "poll_interval": 30
+ }
+ }
+ ]
+ }
+ }
+}
+```
+
+**注意**:`watch_dir` 现在指向各 IDE 的 `User` 根目录(而非 `globalStorage`),因为 CursorDBWatcher 内部需要同时访问 `workspaceStorage/` 和 `globalStorage/` 两个子目录。
+
+#### 3.6 CursorManager 兼容性
+
+**无需改动**。现有表结构完全适用:
+
+```sql
+CREATE TABLE IF NOT EXISTS file_cursors (
+ file_path TEXT PRIMARY KEY, -- DB watcher: watch_dir 路径
+ last_position INTEGER NOT NULL, -- DB watcher: rowid / timestamp
+ last_read_time REAL NOT NULL,
+ updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+)
+```
+
+文件 watcher 和 DB watcher 以 `file_path` 字段值(文件路径 vs 目录路径)自然区分,互不干扰。
+
+#### 3.7 DaemonService 兼容性
+
+**无需改动**。`service.py` 的 `start()` 方法已经:
+- 通过 `create_watcher(**kwargs)` 工厂创建 watcher,自动适配 BasePollingWatcher
+- 通过 `_enqueue_batch` → `call_soon_threadsafe` 处理来自 polling thread 的回调
+- 通过 `watcher.start()` / `stop()` / `flush()` 统一生命周期管理
+
+唯一需要确认:`DaemonService.start()` 中 `Path(watch_dir).mkdir(parents=True, exist_ok=True)` 对 DB watcher 是否安全——DB watcher 的 watch_dir 是已存在的 Cursor 安装目录,`mkdir(exist_ok=True)` 不会出错。✓
+
+---
+
+### 四、实施任务清单
+
+#### Phase 1:基础设施(BasePollingWatcher + 测试框架)
+
+| # | 任务 | 文件 | 预估 |
+|---|------|------|------|
+| 1.1 | 创建 `BasePollingWatcher` 基类 | `watchers/base_polling_watcher.py` | ~120 行 |
+| 1.2 | 创建 `test_base_polling_watcher.py` 单元测试 | `tests/daemon/` | mock poll loop / buffer / flush / Protocol 验证 |
+
+#### Phase 2:CursorDBWatcher(P0 核心)
+
+| # | 任务 | 文件 | 预估 |
+|---|------|------|------|
+| 2.1 | ~~调研~~:dump 真实 Cursor state.vscdb 结构 | — | **已完成**(见 §2.1) |
+| 2.2 | 创建 `CursorDBWatcher`(双库架构、cursorDiskKV、bubbleId 解析) | `watchers/cursor_db_watcher.py` | ~150 行 |
+| 2.3 | 单元测试 `test_cursor_db_watcher.py` | `tests/daemon/` | 创建临时双库 SQLite → mock bubble 数据 → normalize 验证 |
+| 2.4 | 集成测试:完整 poll 周期 → batch_callback 验证 | 同上 | |
+| 2.5 | Registry 注册 `cursor_db` | `watchers/registry.py` | 5 行 |
+
+#### Phase 3:TraeDBWatcher(P1)
+
+| # | 任务 | 文件 | 预估 |
+|---|------|------|------|
+| 3.1 | 创建 `TraeDBWatcher`(单库 ItemTable、`memento/icube-ai-ng-chat-storage` key) | `watchers/trae_db_watcher.py` | ~120 行 |
+| 3.2 | `normalize_event`:处理 `content` 为空的 fallback(`agentTaskContent.proposalText`) | 同上 | |
+| 3.3 | 单元测试 + Registry 注册 `trae_db` | `tests/daemon/test_trae_db_watcher.py` | |
+
+#### Phase 4:WindsurfDBWatcher(P2,需先 dump 确认格式)
+
+| # | 任务 | 文件 | 预估 |
+|---|------|------|------|
+| 4.1 | **调研**:dump 真实 Windsurf state.vscdb 确认表名和 key 模式 | 手动 | 可能继承 CursorDBWatcher |
+| 4.2 | 创建 `WindsurfDBWatcher` | `watchers/windsurf_db_watcher.py` | ~30 行(若格式同 Cursor) |
+| 4.3 | 单元测试 + Registry 注册 `windsurf_db` | | |
+
+#### Phase 5:集成验证
+
+| # | 任务 | 文件 | 预估 |
+|---|------|------|------|
+| 5.1 | 更新 `test_multi_watcher_integration.py`:加入 DB watcher 测试 | `tests/daemon/` | |
+| 5.2 | 端到端:真实 Cursor 对话 → watcher → ETL → viking:// | 手动验证 | |
+| 5.3 | 更新 `test_all_watchers_registered` 验证新增 watcher | | 2 行 |
+
+---
+
+### 五、风险点与应对
+
+| 风险 | 影响 | 应对 |
+|------|------|------|
+| Cursor 锁住 SQLite(WAL mode) | `query_new_events` 读超时 | `?mode=ro` + `PRAGMA busy_timeout=3000`;读失败时跳过本轮 |
+| Cursor 更新 `cursorDiskKV` 格式 / `_v` 版本升级 | normalize 失效 | normalize_event 已含 `_v` 版本检查;加 debug 日志记录未知格式 |
+| watch_dir 不存在或 DB 文件被删 | resolve_db_path 返回 None | _poll_loop 已有 None 检查,优雅跳过 |
+| polling thread 与 asyncio event loop 竞争 | batch_callback 死锁 | DaemonService._enqueue_batch 已用 call_soon_threadsafe,无改动 |
+| CursorManager 的 `file_path` 字段语义混淆 | 调试困难 | 在 BasePollingWatcher 文档注释中明确说明 cursor_key = watch_dir |
+| Trae 助手消息 `content` 为空 | 丢失有效内容 | 三级 fallback:`content` → `agentTaskContent.proposalText` → `finish` 工具结果 |
+| Windsurf 格式与 Cursor 不同 | 不能简单继承 | Phase 4 调研确认后再决定实现方式 |
+
+---
+
+### 六、验证计划
+
+每个新 DB watcher 的三级验证:
+
+1. **单元测试**:创建临时 SQLite → 写入 mock 数据(模拟真实 bubble/Trae 格式)→ 调用 `query_new_events(0)` → 验证返回值 → 调用 `normalize_event` → 验证 normalized 格式
+2. **集成测试**:`BasePollingWatcher.start()` → 等待 1-2 个 poll 周期 → 验证 `batch_callback` 被调用且 events 格式正确 → `stop()`
+3. **端到端**:真实工具产生对话 → watcher 检测 → ETL pipeline → 知识写入 `viking://resources/`
+
+---
+
+### 七、文件变更清单
+
+| 操作 | 文件路径 |
+|------|----------|
+| **新增** | `openviking/daemon/watchers/base_polling_watcher.py` |
+| **新增** | `openviking/daemon/watchers/cursor_db_watcher.py`(P0) |
+| **新增** | `openviking/daemon/watchers/trae_db_watcher.py`(P1) |
+| **新增** | `openviking/daemon/watchers/windsurf_db_watcher.py`(P2) |
+| **新增** | `tests/daemon/test_base_polling_watcher.py` |
+| **新增** | `tests/daemon/test_cursor_db_watcher.py` |
+| **新增** | `tests/daemon/test_trae_db_watcher.py` |
+| **修改** | `openviking/daemon/watchers/registry.py` — 添加 cursor_db / trae_db / windsurf_db 注册 |
+| **修改** | `tests/daemon/test_multi_watcher_integration.py` — 添加 DB watcher 测试用例 |
+| **不改** | `models.py` / `cursor_manager.py` / `service.py` / `etl_pipeline.py` / `config.py` / `base_file_watcher.py` |
+
+---
+
+### 八、参考资料
+
+| 来源 | 链接 |
+|------|------|
+| Cursor state.vscdb 解析踩坑记 | https://article.juejin.cn/post/7640063917021167625 |
+| Cursor 对话导入:解析 SQLite 里的宝藏 | https://juejin.cn/post/7640053666326741030 |
+| 5 种 AI 对话数据格式全解析 | https://blog.csdn.net/2201_75708499/article/details/161789991 |
+| Codex CLI / Trae / Copilot 数据源接入 | https://juejin.cn/post/7640357289836281882 |
+| AI编程助手数据提取终极指南 | https://www.xugj520.cn/archives/ai-coding-assistant-data-extraction-2.html |
+| Trae 对话记录导出脚本 | https://juejin.cn/post/7472786501598101523 |
+| windsurf-monitor (GitHub) | https://github.com/bjfwan/windsurf-monitor |
diff --git a/openviking/daemon/watchers/base_polling_watcher.py b/openviking/daemon/watchers/base_polling_watcher.py
new file mode 100644
index 0000000000..29059b7a93
--- /dev/null
+++ b/openviking/daemon/watchers/base_polling_watcher.py
@@ -0,0 +1,205 @@
+"""
+Abstract base class for database/API-based watchers that use periodic polling.
+Unlike BaseFileWatcher (watchdog + file cursor), this uses Thread + Event.wait(interval).
+Subclasses implement query_new_events() and normalize_event().
+"""
+import os
+import time
+import threading
+from abc import ABC, abstractmethod
+from threading import Thread
+from typing import Callable, Dict, List, Optional
+
+from openviking.daemon.models import BatchBuffer
+from openviking.daemon.cursor_manager import CursorManager
+from openviking_cli.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+class BasePollingWatcher(ABC):
+    """Base class for database/API-based watchers that use periodic polling.
+
+    Implements BaseWatcher Protocol (tool_name, start, stop, flush) without
+    inheriting BaseFileWatcher. Uses a daemon Thread for polling instead of
+    watchdog Observer.
+
+    Subclasses must implement:
+        - query_new_events(last_cursor): Query data source for new events
+        - normalize_event(raw_event): Convert raw event to normalized format
+
+    Optional overrides:
+        - filter_event(event): Additional filtering
+        - resolve_db_path(): Custom DB file discovery logic
+
+    ``extra`` keys: ``poll_interval`` (seconds; overrides the constructor
+    argument) and ``db_filename`` (looked up under watch_dir by resolve_db_path).
+    """
+
+    def __init__(
+        self,
+        tool_name: str,
+        watch_dir: str,
+        cursor_manager: CursorManager,
+        batch_callback: Callable[[List[Dict]], None],
+        poll_interval: int = 30,
+        batch_trigger_lines: int = 50,
+        batch_trigger_seconds: int = 300,
+        extra: Optional[Dict] = None,
+        **kwargs,
+    ):
+        self._tool_name = tool_name
+        self.watch_dir = os.path.expanduser(watch_dir)
+        self.cursor_manager = cursor_manager
+        self.batch_callback = batch_callback
+        self.extra = extra or {}
+        self.poll_interval = self.extra.get("poll_interval", poll_interval)
+
+        self._buffer = BatchBuffer()
+        self.batch_trigger_lines = batch_trigger_lines
+        self.batch_trigger_seconds = batch_trigger_seconds
+        self._poll_thread: Optional[Thread] = None
+        self._stop_event = threading.Event()
+
+    # --- BaseWatcher Protocol ---
+
+    @property
+    def tool_name(self) -> str:
+        return self._tool_name
+
+    def start(self) -> None:
+        self._stop_event.clear()
+        self._poll_thread = Thread(
+            target=self._poll_loop, daemon=True, name=f"poll-{self._tool_name}"
+        )
+        self._poll_thread.start()
+        logger.info(
+            "[%s] Polling watcher started (interval=%ds, dir=%s)",
+            self._tool_name, self.poll_interval, self.watch_dir,
+        )
+
+    def stop(self) -> None:
+        self._stop_event.set()
+        if self._poll_thread:
+            self._poll_thread.join(timeout=5)
+        logger.info("[%s] Polling watcher stopped", self._tool_name)
+
+    def flush(self) -> None:
+        self._flush_buffer()
+
+    # --- Subclasses must implement ---
+
+    @abstractmethod
+    def query_new_events(self, last_cursor: int) -> List[Dict]:
+        """Query data source for events newer than last_cursor.
+
+        Args:
+            last_cursor: Last processed position (rowid/timestamp/offset)
+
+        Returns:
+            List of raw event dicts. Each MUST include '_cursor_position' field.
+        """
+        ...
+
+    @abstractmethod
+    def normalize_event(self, raw_event: Dict) -> Optional[Dict]:
+        """Convert raw event to normalized format.
+
+        Returns None to skip. Output must have at minimum:
+        {role, content, type, timestamp, session_id}
+        """
+        ...
+
+    # --- Optional overrides ---
+
+    def filter_event(self, event: Dict) -> bool:
+        """Additional filtering. Return True to keep, False to skip."""
+        return True
+
+    def resolve_db_path(self) -> Optional[str]:
+        """Resolve DB file path. Default: watch_dir/extra['db_filename'].
+        Subclasses can override for more complex discovery.
+        """
+        db_filename = self.extra.get("db_filename", "state.vscdb")
+        candidate = os.path.join(self.watch_dir, db_filename)
+        if os.path.exists(candidate):
+            return candidate
+        return None
+
+    # --- Internal ---
+
+    def _poll_loop(self):
+        """Poll until stopped: query, normalize, buffer, then persist the cursor."""
+        cursor_key = self.watch_dir
+
+        while not self._stop_event.is_set():
+            try:
+                db_path = self.resolve_db_path()
+                if db_path is None:
+                    logger.debug("[%s] DB not found, retrying...", self._tool_name)
+                    self._stop_event.wait(self.poll_interval)
+                    continue
+
+                cursor = self.cursor_manager.get_cursor(cursor_key)
+                raw_events = self.query_new_events(cursor.last_position)
+
+                if raw_events:
+                    new_position = cursor.last_position
+                    for raw in raw_events:
+                        # Always advance cursor for every raw event seen,
+                        # even if normalize/filter drops it. Otherwise filtered
+                        # rows would be re-queried on every poll cycle.
+                        pos = raw.get("_cursor_position", 0)
+                        if pos > new_position:
+                            new_position = pos
+
+                        try:
+                            normalized = self.normalize_event(raw)
+                            if normalized is None or not self.filter_event(normalized):
+                                continue
+                            normalized["tool_name"] = self._tool_name
+                            self._buffer.add_line(normalized, byte_size=0)
+                        except Exception as e:
+                            # Skip the bad row; aborting keeps the cursor stale and
+                            # re-buffers (duplicates) the earlier rows on every poll.
+                            logger.warning(
+                                "[%s] Skipping event at cursor %s: %s",
+                                self._tool_name, pos, e, exc_info=True,
+                            )
+
+                    if new_position > cursor.last_position:
+                        self.cursor_manager.update_cursor(cursor_key, new_position)
+
+                self._check_batch_trigger()
+
+            except Exception as e:
+                logger.error("[%s] Poll error: %s", self._tool_name, e, exc_info=True)
+
+            self._stop_event.wait(self.poll_interval)
+
+    def _check_batch_trigger(self):
+        """Flush when the buffer reaches the line-count or age threshold."""
+        if self._buffer.is_empty():
+            return
+
+        line_count = len(self._buffer.lines)
+        created_at = self._buffer.created_at
+        age = time.time() - created_at if created_at > 0 else 0
+
+        if line_count >= self.batch_trigger_lines or age >= self.batch_trigger_seconds:
+            self._flush_buffer()
+
+    def _flush_buffer(self):
+        """Flush buffered events via batch_callback; keep the buffer if it raises."""
+        if self._buffer.is_empty():
+            return
+
+        events = self._buffer.lines.copy()
+        logger.info("[%s] Flushing %d events", self._tool_name, len(events))
+        try:
+            self.batch_callback(events)
+            self._buffer.clear()
+        except Exception as e:
+            logger.error(
+                "[%s] Batch callback failed: %s", self._tool_name, e, exc_info=True
+            )
diff --git a/openviking/daemon/watchers/cursor_db_watcher.py b/openviking/daemon/watchers/cursor_db_watcher.py
new file mode 100644
index 0000000000..77a6444318
--- /dev/null
+++ b/openviking/daemon/watchers/cursor_db_watcher.py
@@ -0,0 +1,193 @@
+"""
+Cursor IDE SQLite database watcher.
+Monitors Cursor's dual-SQLite storage for AI conversations:
+- Workspace DB: workspaceStorage/{hash}/state.vscdb -> ItemTable (session metadata)
+- Global DB: globalStorage/state.vscdb -> cursorDiskKV (bubble message content)
+
+Key format: bubbleId:{composerId}:{bubbleId}
+Value JSON: {_v, type(1=user/2=assistant), text, createdAt, allThinkingBlocks, ...}
+"""
+import json
+import os
+import sqlite3
+from typing import Dict, List, Optional
+
+from openviking.daemon.watchers.base_polling_watcher import BasePollingWatcher
+from openviking.daemon.watchers.registry import register_watcher
+from openviking_cli.utils.logger import get_logger
+
+logger = get_logger(__name__)
+
+
+@register_watcher("cursor_db")
+class CursorDBWatcher(BasePollingWatcher):
+    """Poll Cursor IDE's dual-SQLite storage for AI conversation bubbles.
+
+    Architecture:
+    - Workspace DB: workspaceStorage/{hash}/state.vscdb -> ItemTable -> composer.composerData
+    - Global DB: globalStorage/state.vscdb -> cursorDiskKV -> bubbleId:{composerId}:{bubbleId}
+
+    watch_dir should point to the Cursor User root:
+    - Windows: %APPDATA%\\Cursor\\User
+    - macOS: ~/Library/Application Support/Cursor/User
+    - Linux: ~/.config/Cursor/User
+    """
+
+    def __init__(self, watch_dir, cursor_manager, batch_callback,
+                 poll_interval=30, batch_trigger_lines=50, batch_trigger_seconds=300,
+                 extra=None, **kwargs):
+        """Configure the base poller and derive both DB paths; **kwargs is ignored."""
+        super().__init__(
+            tool_name="cursor_db",
+            watch_dir=watch_dir,
+            cursor_manager=cursor_manager,
+            batch_callback=batch_callback,
+            poll_interval=poll_interval,
+            batch_trigger_lines=batch_trigger_lines,
+            batch_trigger_seconds=batch_trigger_seconds,
+            extra=extra,
+        )
+        self._global_db_path = os.path.join(self.watch_dir, "globalStorage", "state.vscdb")
+        self._workspace_storage_dir = os.path.join(self.watch_dir, "workspaceStorage")
+
+    @property
+    def tool_name(self) -> str:
+        return "cursor_db"
+
+    def resolve_db_path(self) -> Optional[str]:
+        """Return the global DB path (primary data source), or None if it is absent."""
+        return self._global_db_path if os.path.exists(self._global_db_path) else None
+
+    def _discover_composer_ids(self) -> List[str]:
+        """Collect composer ids from each workspace DB's composer.composerData.
+
+        Helper only; ``query_new_events`` does not use it. Unreadable or
+        malformed workspace DBs are skipped. NOTE(review): the db-watcher plan
+        doc reports real installs keep this data in the global DB instead.
+        """
+        composer_ids: List[str] = []
+        if not os.path.isdir(self._workspace_storage_dir):
+            return composer_ids
+
+        for ws_hash in os.listdir(self._workspace_storage_dir):
+            ws_db = os.path.join(self._workspace_storage_dir, ws_hash, "state.vscdb")
+            if not os.path.exists(ws_db):
+                continue
+            try:
+                conn = sqlite3.connect(f"file:{ws_db}?mode=ro", uri=True)
+                try:
+                    row = conn.execute(
+                        "SELECT value FROM ItemTable "
+                        "WHERE [key] = 'composer.composerData'"
+                    ).fetchone()
+                finally:
+                    conn.close()
+                data = json.loads(row[0]) if row and row[0] else None
+            except (sqlite3.Error, ValueError, TypeError) as e:
+                logger.debug("[cursor_db] Skipping workspace DB %s: %s", ws_db, e)
+                continue
+            composers = data.get("allComposers") if isinstance(data, dict) else None
+            for composer in composers if isinstance(composers, list) else []:
+                cid = composer.get("id") if isinstance(composer, dict) else None
+                if cid:
+                    composer_ids.append(cid)
+        return composer_ids
+
+    def query_new_events(self, last_cursor: int) -> List[Dict]:
+        """Return cursorDiskKV bubble rows whose rowid is above ``last_cursor``.
+
+        Scans bubbleId:* keys in the global DB (at most 500 per call, ascending
+        rowid) without consulting the workspace DBs, so orphan conversations
+        are included. Every fetched row yields an event with ``_cursor_position``;
+        a row whose value cannot be decoded is emitted with ``value=None`` (and
+        later dropped by ``normalize_event``) so the poller's cursor can move
+        past it instead of re-reading it on every poll.
+        Returns [] when the DB is missing or SQLite raises OperationalError.
+        """
+        db_path = self.resolve_db_path()
+        if not db_path:
+            return []
+
+        try:
+            conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
+            try:
+                # Run the PRAGMA inside the try so a failure still closes conn.
+                conn.execute("PRAGMA busy_timeout = 3000")
+                rows = conn.execute(
+                    "SELECT rowid, [key], value FROM cursorDiskKV "
+                    "WHERE rowid > ? AND [key] LIKE 'bubbleId:%' "
+                    "ORDER BY rowid ASC LIMIT 500",
+                    (last_cursor,),
+                ).fetchall()
+            finally:
+                conn.close()
+        except sqlite3.OperationalError as e:
+            logger.warning("[cursor_db] SQLite error (DB may be locked): %s", e)
+            return []
+
+        events = []
+        for rowid, key, value in rows:
+            # Key layout: bubbleId:{composerId}:{bubbleId}
+            parts = key.split(":", 2)
+            try:
+                # json.loads accepts str as well as bytes-like (BLOB) input.
+                is_json = isinstance(value, (str, bytes, bytearray))
+                parsed_value = json.loads(value) if is_json else value
+            except (ValueError, TypeError):
+                parsed_value = None
+            events.append({
+                "rowid": rowid,
+                "key": key,
+                "value": parsed_value,
+                "composer_id": parts[1] if len(parts) >= 3 else None,
+                "_cursor_position": rowid,
+            })
+        return events
+
+    def normalize_event(self, raw_event: Dict) -> Optional[Dict]:
+        """Convert one raw bubble row into a normalized message event.
+
+        Value JSON fields:
+        - _v: schema version (currently 3)
+        - type: 1=user, 2=assistant
+        - text: message content
+        - createdAt: timestamp
+        - allThinkingBlocks: AI reasoning (assistant only; not extracted here)
+
+        Returns None for non-dict values, unknown bubble types and missing,
+        non-string or blank text. Malformed fields are skipped rather than
+        raised on, since an exception would abort the caller's poll batch.
+        """
+        value = raw_event.get("value")
+        if not isinstance(value, dict):
+            return None
+
+        # Newer schema versions are still parsed; only leave a debug trace.
+        schema_version = value.get("_v", 0)
+        if isinstance(schema_version, (int, float)) and schema_version > 3:
+            logger.debug("[cursor_db] Unknown bubble schema v%d", schema_version)
+
+        bubble_type = value.get("type")
+        if bubble_type == 1:
+            role = "user"
+        elif bubble_type == 2:
+            role = "assistant"
+        else:
+            return None
+
+        content = value.get("text", "")
+        if not isinstance(content, str) or not content.strip():
+            return None  # Empty streaming artifacts and non-text payloads
+
+        return {
+            "role": role,
+            "content": content,
+            "type": "message",
+            "timestamp": value.get("createdAt"),
+            "session_id": raw_event.get("composer_id"),
+        }
+
+    def filter_event(self, event: Dict) -> bool:
+        """Keep only events whose stripped string content has at least 10 characters."""
+        content = event.get("content", "")
+        return isinstance(content, str) and len(content.strip()) >= 10
diff --git a/openviking/daemon/watchers/registry.py b/openviking/daemon/watchers/registry.py
index 18a436347d..b76b9624d3 100644
--- a/openviking/daemon/watchers/registry.py
+++ b/openviking/daemon/watchers/registry.py
@@ -68,5 +68,11 @@ def _register_builtins():
except ImportError:
pass
+ try:
+ from openviking.daemon.watchers.cursor_db_watcher import CursorDBWatcher
+ _WATCHER_REGISTRY["cursor_db"] = CursorDBWatcher
+ except ImportError:
+ pass
+
_register_builtins()
diff --git a/tests/daemon/test_base_polling_watcher.py b/tests/daemon/test_base_polling_watcher.py
new file mode 100644
index 0000000000..cf4a819251
--- /dev/null
+++ b/tests/daemon/test_base_polling_watcher.py
@@ -0,0 +1,313 @@
+"""
+Unit tests for BasePollingWatcher.
+Tests poll loop, buffer management, batch trigger, and BaseWatcher Protocol compliance.
+"""
+import time
+import pytest
+from typing import Dict, List, Optional
+from unittest.mock import MagicMock
+
+from openviking.daemon.watchers.base_polling_watcher import BasePollingWatcher
+from openviking.daemon.watchers import BaseWatcher
+from openviking.daemon.models import FileCursor
+
+
+class FakeCursorManager:
+    """Dict-backed stand-in for the daemon's cursor manager (tests only)."""
+
+    def __init__(self):
+        self.cursors = {}
+
+    def get_cursor(self, file_path):
+        fallback = FileCursor(file_path=file_path)  # returned when nothing is stored
+        return self.cursors.get(file_path, fallback)
+
+    def update_cursor(self, file_path, position):
+        now = time.time()
+        self.cursors[file_path] = FileCursor(
+            file_path=file_path, last_position=position, last_read_time=now
+        )
+
+
+class ConcretePollingWatcher(BasePollingWatcher):
+    """Minimal BasePollingWatcher subclass fed from an in-memory event list."""
+
+    def __init__(self, mock_events=None, **kwargs):
+        super().__init__(**kwargs)
+        self._mock_events = mock_events if mock_events else []
+        self._query_count = 0  # number of query_new_events calls so far
+
+    def query_new_events(self, last_cursor: int) -> List[Dict]:
+        self._query_count += 1
+        positions = ((ev.get("_cursor_position", 0), ev) for ev in self._mock_events)
+        return [ev for pos, ev in positions if pos > last_cursor]
+
+    def normalize_event(self, raw_event: Dict) -> Optional[Dict]:
+        role, content = raw_event.get("role"), raw_event.get("content", "")
+        if role and content:
+            return {
+                "role": role,
+                "content": content,
+                "type": "message",
+                "timestamp": raw_event.get("timestamp"),
+                "session_id": raw_event.get("session_id"),
+            }
+        return None
+
+
+# --- Protocol Compliance ---
+
+def test_implements_base_watcher_protocol():
+    """A BasePollingWatcher subclass instance passes the BaseWatcher isinstance check."""
+    watcher_kwargs = {
+        "tool_name": "test",
+        "watch_dir": "/tmp/test",
+        "cursor_manager": FakeCursorManager(),
+        "batch_callback": lambda e: None,
+    }
+    watcher = ConcretePollingWatcher(**watcher_kwargs)
+    assert isinstance(watcher, BaseWatcher)
+
+
+def test_tool_name_property():
+    """tool_name reports the name given to the constructor."""
+    watcher = ConcretePollingWatcher(
+        tool_name="my_tool",
+        watch_dir="/tmp",
+        cursor_manager=FakeCursorManager(),
+        batch_callback=lambda e: None,
+    )
+    assert "my_tool" == watcher.tool_name
+
+
+# --- Buffer and Batch Trigger ---
+
+def test_flush_empty_buffer_no_callback():
+    """flush() on a watcher with nothing buffered never invokes batch_callback."""
+    delivered = []
+    cursor_store = FakeCursorManager()
+    watcher = ConcretePollingWatcher(
+        tool_name="test",
+        watch_dir="/tmp",
+        cursor_manager=cursor_store,
+        batch_callback=delivered.append,
+    )
+    watcher.flush()
+    assert delivered == []
+
+
+def test_flush_nonempty_buffer_calls_callback():
+    """flush() delivers the buffered events in one batch and empties the buffer."""
+    delivered = []
+    watcher = ConcretePollingWatcher(
+        tool_name="test",
+        watch_dir="/tmp",
+        cursor_manager=FakeCursorManager(),
+        batch_callback=delivered.append,
+    )
+    greeting = {"role": "user", "content": "hello"}
+    watcher._buffer.add_line(greeting, byte_size=0)
+    watcher.flush()
+
+    assert len(delivered) == 1
+    assert delivered[0][0]["role"] == "user"
+    assert watcher._buffer.is_empty()
+
+
+def test_batch_trigger_by_line_count():
+    """Reaching batch_trigger_lines buffered events makes the trigger check flush."""
+    delivered = []
+    cursor_store = FakeCursorManager()
+    watcher = ConcretePollingWatcher(
+        tool_name="test",
+        watch_dir="/tmp",
+        cursor_manager=cursor_store,
+        batch_callback=delivered.append,
+        batch_trigger_lines=3,
+    )
+
+    for text in ("msg 0", "msg 1", "msg 2"):
+        watcher._buffer.add_line({"role": "user", "content": text}, byte_size=0)
+
+    watcher._check_batch_trigger()
+    assert len(delivered) == 1
+    assert len(delivered[0]) == 3
+
+
+def test_batch_trigger_by_time():
+    """A buffer older than batch_trigger_seconds is flushed by the trigger check."""
+    delivered = []
+    watcher = ConcretePollingWatcher(
+        tool_name="test",
+        watch_dir="/tmp",
+        cursor_manager=FakeCursorManager(),
+        batch_callback=delivered.append,
+        batch_trigger_lines=100,  # far above the single buffered event
+        batch_trigger_seconds=1,  # well below the simulated age
+    )
+
+    stale_event = {"role": "user", "content": "old msg"}
+    watcher._buffer.add_line(stale_event, byte_size=0)
+    # Backdate the buffer so it looks five seconds old.
+    watcher._buffer.created_at = time.time() - 5
+
+    watcher._check_batch_trigger()
+    assert len(delivered) == 1
+
+
+def test_no_trigger_below_thresholds():
+    """One fresh event, under both thresholds, must stay buffered."""
+    delivered = []
+    cursor_store = FakeCursorManager()
+    watcher = ConcretePollingWatcher(
+        tool_name="test",
+        watch_dir="/tmp",
+        cursor_manager=cursor_store,
+        batch_callback=delivered.append,
+        batch_trigger_lines=100,
+        batch_trigger_seconds=300,
+    )
+
+    watcher._buffer.add_line({"role": "user", "content": "msg"}, byte_size=0)
+    watcher._check_batch_trigger()
+    assert not delivered
+
+
+# --- Poll Loop ---
+
+def test_poll_loop_processes_events(tmp_path):
+    """A started watcher queries, normalizes, tags and delivers the mock events."""
+    delivered = []
+    cursor_store = FakeCursorManager()
+
+    conversation = [
+        {"role": "user", "content": "Hello", "_cursor_position": 1},
+        {"role": "assistant", "content": "Hi there", "_cursor_position": 2},
+    ]
+
+    watcher = ConcretePollingWatcher(
+        mock_events=conversation,
+        tool_name="test",
+        watch_dir=str(tmp_path),
+        cursor_manager=cursor_store,
+        batch_callback=delivered.append,
+        poll_interval=1,
+        batch_trigger_lines=2,
+    )
+    # Report a database path so the poll loop does not skip the cycle.
+    watcher.resolve_db_path = lambda: str(tmp_path / "fake.db")
+
+    watcher.start()
+    time.sleep(2.5)  # long enough for at least one poll cycle
+    watcher.stop()
+
+    assert len(delivered) >= 1
+    first_batch = delivered[0]
+    assert all(event["tool_name"] == "test" for event in first_batch)
+    assert [event["role"] for event in first_batch[:2]] == ["user", "assistant"]
+
+
+def test_poll_loop_updates_cursor(tmp_path):
+    """After polling, the cursor stored under watch_dir equals the event position."""
+    cursor_store = FakeCursorManager()
+    single_event = {"role": "user", "content": "msg", "_cursor_position": 42}
+    watch_root = str(tmp_path)
+    fake_db = str(tmp_path / "fake.db")
+
+    watcher = ConcretePollingWatcher(
+        mock_events=[single_event],
+        tool_name="test",
+        watch_dir=watch_root,
+        cursor_manager=cursor_store,
+        batch_callback=lambda e: None,
+        poll_interval=1,
+        batch_trigger_lines=100,
+    )
+    watcher.resolve_db_path = lambda: fake_db
+
+    watcher.start()
+    time.sleep(1.5)
+    watcher.stop()
+
+    stored = cursor_store.get_cursor(watch_root)
+    assert stored.last_position == 42
+
+
+def test_poll_loop_skips_when_db_not_found(tmp_path):
+    """With no database to resolve, the poll loop must neither query nor store a cursor."""
+    cm = FakeCursorManager()
+    w = ConcretePollingWatcher(
+        tool_name="test",
+        watch_dir=str(tmp_path / "nonexistent"),
+        cursor_manager=cm,
+        batch_callback=lambda e: None,
+        poll_interval=1,
+    )
+    # Default resolve_db_path returns None for nonexistent dir
+    w.start()
+    time.sleep(1.5)
+    w.stop()
+    assert w._query_count == 0  # skipped cycles must not reach query_new_events
+    assert not cm.cursors  # and must not persist any cursor
+
+
+def test_filter_event_skips_unwanted(tmp_path):
+    """Events rejected by filter_event never reach batch_callback."""
+    delivered = []
+    cursor_store = FakeCursorManager()
+
+    mixed_events = [
+        {"role": "user", "content": "keep this", "_cursor_position": 1},
+        {"role": "user", "content": "skip", "_cursor_position": 2},
+    ]
+
+    class FilteringWatcher(ConcretePollingWatcher):
+        def filter_event(self, event):
+            return not event["content"] == "skip"
+
+    watcher = FilteringWatcher(
+        mock_events=mixed_events,
+        tool_name="test",
+        watch_dir=str(tmp_path),
+        cursor_manager=cursor_store,
+        batch_callback=delivered.append,
+        poll_interval=1,
+        batch_trigger_lines=10,
+    )
+    watcher.resolve_db_path = lambda: str(tmp_path / "fake.db")
+
+    watcher.start()
+    time.sleep(1.5)
+    watcher.stop()
+    watcher.flush()
+
+    kept = [event for batch in delivered for event in batch]
+    assert len(kept) == 1
+    assert kept[0]["content"] == "keep this"
+
+
+# --- Callback failure resilience ---
+
+def test_callback_failure_does_not_crash(tmp_path):
+    """A raising batch_callback is actually invoked and start/stop still complete."""
+    cm = FakeCursorManager()
+    attempts = []
+
+    def failing_callback(e):
+        attempts.append(e)  # record the call so the failure path is provably hit
+        raise RuntimeError("simulated failure")
+
+    w = ConcretePollingWatcher(
+        mock_events=[{"role": "user", "content": "msg", "_cursor_position": 1}],
+        tool_name="test",
+        watch_dir=str(tmp_path),
+        cursor_manager=cm,
+        batch_callback=failing_callback,
+        poll_interval=1,
+        batch_trigger_lines=1,
+    )
+    w.resolve_db_path = lambda: str(tmp_path / "fake.db")
+    w.start()
+    time.sleep(2)
+    w.stop()
+    assert attempts  # poll-thread errors never reach pytest, so check the call happened
diff --git a/tests/daemon/test_cursor_db_watcher.py b/tests/daemon/test_cursor_db_watcher.py
new file mode 100644
index 0000000000..1042e1fa92
--- /dev/null
+++ b/tests/daemon/test_cursor_db_watcher.py
@@ -0,0 +1,422 @@
+"""
+Unit tests for CursorDBWatcher.
+Tests dual-SQLite architecture: global DB (cursorDiskKV) + workspace DB (ItemTable).
+Uses temporary SQLite databases to simulate real Cursor storage.
+"""
+import json
+import os
+import sqlite3
+import time
+import pytest
+
+from openviking.daemon.watchers.cursor_db_watcher import CursorDBWatcher
+from openviking.daemon.watchers import BaseWatcher
+from openviking.daemon.models import FileCursor
+
+
+class FakeCursorManager:
+    """Dict-backed stand-in for the daemon's cursor manager (tests only)."""
+
+    def __init__(self):
+        self.cursors = {}
+
+    def get_cursor(self, file_path):
+        fallback = FileCursor(file_path=file_path)  # returned when nothing is stored
+        return self.cursors.get(file_path, fallback)
+
+    def update_cursor(self, file_path, position):
+        now = time.time()
+        self.cursors[file_path] = FileCursor(
+            file_path=file_path, last_position=position, last_read_time=now
+        )
+
+
+def _create_global_db(db_path, bubbles):
+    """Create a mock global state.vscdb with a populated cursorDiskKV table.
+
+    Args:
+        db_path: Path for the SQLite file; parent directories are created.
+        bubbles: Iterable of (key, value_dict) tuples; values are stored as JSON text.
+    """
+    os.makedirs(os.path.dirname(db_path), exist_ok=True)
+    conn = sqlite3.connect(db_path)
+    try:
+        conn.execute("""
+            CREATE TABLE IF NOT EXISTS cursorDiskKV (
+                [key] TEXT PRIMARY KEY,
+                value TEXT
+            )
+        """)
+        rows = [(key, json.dumps(value)) for key, value in bubbles]
+        conn.executemany("INSERT INTO cursorDiskKV ([key], value) VALUES (?, ?)", rows)
+        conn.commit()
+    finally:
+        # Close even when an insert fails so the connection is not leaked.
+        conn.close()
+
+
+def _create_workspace_db(db_path, composers):
+    """Create a mock workspace state.vscdb whose ItemTable holds composer data.
+
+    Args:
+        db_path: Path for the SQLite file; parent directories are created.
+        composers: List of composer dicts with 'id' field, stored under allComposers.
+    """
+    os.makedirs(os.path.dirname(db_path), exist_ok=True)
+    conn = sqlite3.connect(db_path)
+    try:
+        conn.execute("""
+            CREATE TABLE IF NOT EXISTS ItemTable (
+                [key] TEXT PRIMARY KEY,
+                value TEXT
+            )
+        """)
+        row = ("composer.composerData", json.dumps({"allComposers": composers}))
+        conn.execute("INSERT INTO ItemTable ([key], value) VALUES (?, ?)", row)
+        conn.commit()
+    finally:
+        # Close even when the insert fails so the connection is not leaked.
+        conn.close()
+
+
+def _make_cursor_user_dir(tmp_path):
+    """Create and return tmp_path/Cursor/User, a mock Cursor User root."""
+    cursor_user_root = tmp_path.joinpath("Cursor", "User")
+    cursor_user_root.mkdir(parents=True, exist_ok=True)
+    return cursor_user_root
+
+
+# --- Protocol ---
+
+def test_implements_base_watcher():
+    """CursorDBWatcher passes the BaseWatcher check and is named cursor_db."""
+    watcher = CursorDBWatcher(
+        watch_dir="/tmp/fake",
+        cursor_manager=FakeCursorManager(),
+        batch_callback=lambda e: None,
+    )
+    assert isinstance(watcher, BaseWatcher)
+    assert watcher.tool_name == "cursor_db"
+
+
+# --- resolve_db_path ---
+
+def test_resolve_db_path_found(tmp_path):
+    """resolve_db_path returns the global state.vscdb path once that file exists."""
+    user_dir = _make_cursor_user_dir(tmp_path)
+    expected_path = str(user_dir / "globalStorage" / "state.vscdb")
+    _create_global_db(expected_path, [])
+
+    watcher = CursorDBWatcher(
+        watch_dir=str(user_dir),
+        cursor_manager=FakeCursorManager(),
+        batch_callback=lambda e: None,
+    )
+    assert watcher.resolve_db_path() == expected_path
+
+
+def test_resolve_db_path_not_found(tmp_path):
+    """resolve_db_path returns None when the User dir has no global database."""
+    bare_user_dir = tmp_path / "empty"
+    bare_user_dir.mkdir()
+
+    watcher = CursorDBWatcher(
+        watch_dir=str(bare_user_dir),
+        cursor_manager=FakeCursorManager(),
+        batch_callback=lambda e: None,
+    )
+    assert watcher.resolve_db_path() is None
+
+
+# --- query_new_events ---
+
+def test_query_new_events_empty_db(tmp_path):
+    """An existing but empty cursorDiskKV table yields no events."""
+    user_dir = _make_cursor_user_dir(tmp_path)
+    db_file = str(user_dir / "globalStorage" / "state.vscdb")
+    _create_global_db(db_file, [])
+
+    watcher = CursorDBWatcher(
+        watch_dir=str(user_dir),
+        cursor_manager=FakeCursorManager(),
+        batch_callback=lambda e: None,
+    )
+    found = watcher.query_new_events(0)
+    assert found == []
+
+
+def test_query_new_events_returns_bubbles(tmp_path):
+    """Only bubbleId:* rows come back, each tagged with composer id and position."""
+    user_dir = _make_cursor_user_dir(tmp_path)
+    db_file = str(user_dir / "globalStorage" / "state.vscdb")
+
+    stored_rows = [
+        ("bubbleId:comp1:bub1", {
+            "_v": 3, "type": 1, "text": "Hello Cursor",
+            "createdAt": "2026-06-20T10:00:00Z",
+        }),
+        ("bubbleId:comp1:bub2", {
+            "_v": 3, "type": 2, "text": "Hi! How can I help?",
+            "createdAt": "2026-06-20T10:00:01Z",
+        }),
+        ("nonBubbleKey", {"some": "other data"}),
+    ]
+    _create_global_db(db_file, stored_rows)
+
+    watcher = CursorDBWatcher(
+        watch_dir=str(user_dir),
+        cursor_manager=FakeCursorManager(),
+        batch_callback=lambda e: None,
+    )
+    found = watcher.query_new_events(0)
+
+    # The nonBubbleKey row must be excluded by the key pattern.
+    assert len(found) == 2
+    assert all(event["key"].startswith("bubbleId:") for event in found)
+    assert found[0]["composer_id"] == "comp1"
+    assert found[0]["_cursor_position"] > 0
+
+
+def test_query_new_events_respects_cursor(tmp_path):
+    """Querying from the last seen rowid returns nothing new."""
+    user_dir = _make_cursor_user_dir(tmp_path)
+    db_file = str(user_dir / "globalStorage" / "state.vscdb")
+
+    stored_rows = [
+        ("bubbleId:c1:b1", {"_v": 3, "type": 1, "text": "old msg", "createdAt": "t1"}),
+        ("bubbleId:c1:b2", {"_v": 3, "type": 2, "text": "new msg", "createdAt": "t2"}),
+    ]
+    _create_global_db(db_file, stored_rows)
+
+    watcher = CursorDBWatcher(
+        watch_dir=str(user_dir),
+        cursor_manager=FakeCursorManager(),
+        batch_callback=lambda e: None,
+    )
+
+    # A cursor of 0 precedes every row.
+    first_pass = watcher.query_new_events(0)
+    assert len(first_pass) == 2
+
+    # Resuming from the highest rowid seen leaves nothing to fetch.
+    resume_from = first_pass[-1]["rowid"]
+    second_pass = watcher.query_new_events(resume_from)
+    assert len(second_pass) == 0
+
+
+# --- normalize_event ---
+
+def test_normalize_user_message():
+    """A type-1 bubble becomes a user message event."""
+    watcher = CursorDBWatcher(
+        watch_dir="/tmp/fake",
+        cursor_manager=FakeCursorManager(),
+        batch_callback=lambda e: None,
+    )
+
+    bubble = {
+        "_v": 3, "type": 1,
+        "text": "How do I use FastAPI?",
+        "createdAt": "2026-06-20T10:00:00Z",
+    }
+    raw = {
+        "value": bubble,
+        "composer_id": "comp-abc",
+        "_cursor_position": 5,
+    }
+    result = watcher.normalize_event(raw)
+
+    assert result is not None
+    assert result["role"] == "user"
+    assert result["content"] == "How do I use FastAPI?"
+    assert result["type"] == "message"
+    assert result["timestamp"] == "2026-06-20T10:00:00Z"
+    assert result["session_id"] == "comp-abc"
+
+
+def test_normalize_assistant_message():
+    """A type-2 bubble becomes an assistant message event carrying its text."""
+    watcher = CursorDBWatcher(
+        watch_dir="/tmp/fake",
+        cursor_manager=FakeCursorManager(),
+        batch_callback=lambda e: None,
+    )
+
+    bubble = {
+        "_v": 3, "type": 2,
+        "text": "FastAPI is a modern Python framework.",
+        "createdAt": "2026-06-20T10:00:01Z",
+        "allThinkingBlocks": [{"thinking": "Let me think..."}],
+    }
+    raw = {
+        "value": bubble,
+        "composer_id": "comp-abc",
+        "_cursor_position": 6,
+    }
+    result = watcher.normalize_event(raw)
+
+    assert result is not None
+    assert result["role"] == "assistant"
+    assert result["content"] == "FastAPI is a modern Python framework."
+
+
+def test_normalize_skips_empty_text():
+    """A bubble whose text is empty (a streaming artifact) normalizes to None."""
+    cursor_store = FakeCursorManager()
+    watcher = CursorDBWatcher(
+        watch_dir="/tmp/fake",
+        cursor_manager=cursor_store,
+        batch_callback=lambda e: None,
+    )
+
+    empty_bubble = {
+        "value": {"_v": 3, "type": 2, "text": "", "createdAt": "t"},
+        "composer_id": "c1",
+        "_cursor_position": 7,
+    }
+    assert watcher.normalize_event(empty_bubble) is None
+
+
+def test_normalize_skips_unknown_type():
+    """A bubble type other than 1 or 2 normalizes to None."""
+    watcher = CursorDBWatcher(
+        watch_dir="/tmp/fake",
+        cursor_manager=FakeCursorManager(),
+        batch_callback=lambda e: None,
+    )
+
+    odd_bubble = {
+        "value": {"_v": 3, "type": 99, "text": "unknown", "createdAt": "t"},
+        "composer_id": "c1",
+        "_cursor_position": 8,
+    }
+    assert watcher.normalize_event(odd_bubble) is None
+
+
+def test_normalize_skips_non_dict_value():
+    """A raw event whose value is not a dict normalizes to None."""
+    watcher = CursorDBWatcher(
+        watch_dir="/tmp/fake",
+        cursor_manager=FakeCursorManager(),
+        batch_callback=lambda e: None,
+    )
+
+    string_valued = {"value": "not a dict", "composer_id": "c1", "_cursor_position": 9}
+    assert watcher.normalize_event(string_valued) is None
+
+
+def test_normalize_future_schema_version():
+    """A bubble with a newer _v is still normalized instead of being rejected."""
+    cursor_store = FakeCursorManager()
+    watcher = CursorDBWatcher(
+        watch_dir="/tmp/fake",
+        cursor_manager=cursor_store,
+        batch_callback=lambda e: None,
+    )
+
+    future_bubble = {
+        "value": {"_v": 99, "type": 1, "text": "future format", "createdAt": "t"},
+        "composer_id": "c1",
+        "_cursor_position": 10,
+    }
+    normalized = watcher.normalize_event(future_bubble)
+    assert normalized is not None  # processed despite the unknown version
+
+
+# --- filter_event ---
+
+def test_filter_short_content():
+    """filter_event rejects content under 10 characters and accepts longer content."""
+    watcher = CursorDBWatcher(
+        watch_dir="/tmp/fake",
+        cursor_manager=FakeCursorManager(),
+        batch_callback=lambda e: None,
+    )
+    assert watcher.filter_event({"content": "ok"}) is False  # only 2 chars
+    assert watcher.filter_event({"content": "This is long enough content"}) is True
+
+
+# --- _discover_composer_ids ---
+
+def test_discover_composer_ids(tmp_path):
+    """Composer ids from every workspace DB under workspaceStorage are collected."""
+    user_dir = _make_cursor_user_dir(tmp_path)
+    workspaces = {
+        "hash1": [
+            {"id": "comp-1", "createdAt": "t1"},
+            {"id": "comp-2", "createdAt": "t2"},
+        ],
+        "hash2": [{"id": "comp-3", "createdAt": "t3"}],
+    }
+
+    # One state.vscdb per workspace hash directory.
+    for ws_hash, composers in workspaces.items():
+        ws_db = user_dir / "workspaceStorage" / ws_hash / "state.vscdb"
+        _create_workspace_db(str(ws_db), composers)
+
+    watcher = CursorDBWatcher(
+        watch_dir=str(user_dir),
+        cursor_manager=FakeCursorManager(),
+        batch_callback=lambda e: None,
+    )
+    ids = watcher._discover_composer_ids()
+    assert set(ids) == {"comp-1", "comp-2", "comp-3"}
+
+
+def test_discover_composer_ids_no_workspaces(tmp_path):
+    """Without a workspaceStorage directory no composer ids are found."""
+    user_dir = _make_cursor_user_dir(tmp_path)
+    watcher = CursorDBWatcher(
+        watch_dir=str(user_dir),
+        cursor_manager=FakeCursorManager(),
+        batch_callback=lambda e: None,
+    )
+    found_ids = watcher._discover_composer_ids()
+    assert found_ids == []
+
+
+# --- Integration: full poll cycle ---
+
+def test_full_poll_cycle(tmp_path):
+    """End-to-end: a started watcher delivers the mock DB's two bubbles and stores a cursor."""
+    delivered = []
+    user_dir = _make_cursor_user_dir(tmp_path)
+    db_file = str(user_dir / "globalStorage" / "state.vscdb")
+
+    conversation = [
+        ("bubbleId:comp1:b1", {
+            "_v": 3, "type": 1,
+            "text": "How do I implement a binary search in Python?",
+            "createdAt": "2026-06-20T10:00:00Z",
+        }),
+        ("bubbleId:comp1:b2", {
+            "_v": 3, "type": 2,
+            "text": "Here is a binary search implementation using iterative approach...",
+            "createdAt": "2026-06-20T10:00:01Z",
+        }),
+    ]
+    _create_global_db(db_file, conversation)
+
+    cursor_store = FakeCursorManager()
+    watcher = CursorDBWatcher(
+        watch_dir=str(user_dir),
+        cursor_manager=cursor_store,
+        batch_callback=delivered.append,
+        poll_interval=1,
+        batch_trigger_lines=2,
+    )
+
+    watcher.start()
+    time.sleep(2.5)
+    watcher.stop()
+
+    assert len(delivered) >= 1
+    first_batch = delivered[0]
+    assert len(first_batch) == 2
+    assert first_batch[0]["role"] == "user"
+    assert first_batch[0]["tool_name"] == "cursor_db"
+    assert first_batch[1]["role"] == "assistant"
+
+    # The cursor stored under the watch dir must have moved off its default.
+    stored = cursor_store.get_cursor(str(user_dir))
+    assert stored.last_position > 0
diff --git a/tests/daemon/test_multi_watcher_integration.py b/tests/daemon/test_multi_watcher_integration.py
index d3a797ffa0..ad641c39d9 100644
--- a/tests/daemon/test_multi_watcher_integration.py
+++ b/tests/daemon/test_multi_watcher_integration.py
@@ -33,9 +33,9 @@ def update_cursor(self, file_path, position):
# --- Registry Tests ---
def test_all_watchers_registered():
- """All 5 built-in watchers should be registered."""
+ """All 6 built-in watchers should be registered."""
available = list_available_watchers()
- expected = {"claude_code", "generic_jsonl", "aider", "cursor", "continue_dev"}
+ expected = {"claude_code", "generic_jsonl", "aider", "cursor", "continue_dev", "cursor_db"}
assert expected.issubset(set(available)), f"Missing: {expected - set(available)}"
@@ -328,3 +328,58 @@ def test_knowledge_router_uses_source_tool():
)
uri2 = router.route(k2)
assert "general" in uri2
+
+
+# --- CursorDBWatcher Integration ---
+
+def test_cursor_db_watcher_via_factory(tmp_path):
+    """create_watcher with tool_name cursor_db builds a BaseWatcher of that name."""
+    received = []
+    factory_kwargs = {
+        "tool_name": "cursor_db",
+        "watch_dir": str(tmp_path),
+        "cursor_manager": FakeCursorManager(),
+        "batch_callback": received.append,
+        "poll_interval": 60,
+    }
+
+    built = create_watcher(**factory_kwargs)
+    assert isinstance(built, BaseWatcher)
+    assert built.tool_name == "cursor_db"
+
+
+def test_cursor_db_normalize_compatible_with_reconstructor():
+    """Normalized cursor_db events reconstruct into a single user/assistant turn."""
+    from openviking.daemon.conversation_reconstructor import ConversationReconstructor
+
+    user_event = {
+        "role": "user", "content": "How to use Cursor effectively?",
+        "tool_name": "cursor_db", "timestamp": "2026-06-20T10:00:00Z",
+        "session_id": "comp-123",
+    }
+    assistant_event = {
+        "role": "assistant", "content": "Here are some tips for using Cursor...",
+        "tool_name": "cursor_db", "timestamp": "2026-06-20T10:00:01Z",
+        "session_id": "comp-123",
+    }
+    turns = ConversationReconstructor().reconstruct([user_event, assistant_event])
+
+    assert len(turns) == 1
+    assert turns[0].user_prompt == "How to use Cursor effectively?"
+    assert turns[0].source_tool == "cursor_db"
+
+
+def test_cursor_db_events_compatible_with_filter():
+    """LowValueFilter keeps both substantive cursor_db events."""
+    from openviking.daemon.filters import LowValueFilter
+
+    contents = (
+        ("user", "A meaningful question about architecture design"),
+        ("assistant", "Here is a detailed explanation of the pattern"),
+    )
+    events = [
+        {"role": role, "content": text, "tool_name": "cursor_db"} for role, text in contents
+    ]
+
+    survivors = LowValueFilter().apply(events)
+    assert len(survivors) == 2
From f6f260f1c4b0fd37ab388cc7e0849fd6a11e4da0 Mon Sep 17 00:00:00 2001
From: baobaodae <2014596548@qq.com>
Date: Sat, 20 Jun 2026 12:37:31 +0800
Subject: [PATCH 4/6] docs(daemon): update db-watcher plan with E2E results and
cursor fix
- Mark Phase 1-2 as complete (BasePollingWatcher + CursorDBWatcher)
- Add section 4.1: real data E2E validation results against state.vscdb
- Add section 4.2: cursor advancement bug fix documentation
- Document _discover_composer_ids real data finding (cursorDiskKV, not ItemTable)
- Update task checklist and file change list with status
---
docs/daemon/db-watcher-plan.md | 167 ++++++++++++++++++++++++++-------
1 file changed, 132 insertions(+), 35 deletions(-)
diff --git a/docs/daemon/db-watcher-plan.md b/docs/daemon/db-watcher-plan.md
index 49ae875c41..2e07e7fc9c 100644
--- a/docs/daemon/db-watcher-plan.md
+++ b/docs/daemon/db-watcher-plan.md
@@ -293,6 +293,13 @@ class BasePollingWatcher(ABC):
if raw_events:
new_position = cursor.last_position
for raw in raw_events:
+ # ⚠️ 关键:先推进 cursor,再做 normalize/filter。
+ # 否则被过滤掉的条目(如空文本工具调用)如果恰好是最高 rowid,
+ # 会导致 cursor 永远不越过它,每次 poll 都重复查出。
+ pos = raw.get("_cursor_position", 0)
+ if pos > new_position:
+ new_position = pos
+
normalized = self.normalize_event(raw)
if normalized is None:
continue
@@ -301,11 +308,6 @@ class BasePollingWatcher(ABC):
normalized["tool_name"] = self._tool_name
self._buffer.add_line(normalized, byte_size=0)
- # 追踪最大 cursor position
- pos = raw.get("_cursor_position", 0)
- if pos > new_position:
- new_position = pos
-
# 更新 cursor
if new_position > cursor.last_position:
self.cursor_manager.update_cursor(cursor_key, new_position)
@@ -343,6 +345,7 @@ class BasePollingWatcher(ABC):
|--------|------|------|
| 轮询机制 | `Thread` + `Event.wait(interval)` | 与 watchdog Observer 平级,DaemonService 已处理线程安全 |
| cursor key | 使用 `watch_dir` 字符串 | 与文件 watcher 的 `file_path` 互不冲突,CursorManager 无需改动 |
+| cursor 推进时机 | **先推进再 normalize/filter** | ⚠️ 实测发现:被过滤掉的条目(如空文本工具调用)如果是最高 rowid,不先推进会导致无限重复查询 |
| DB 连接管理 | 子类在 `query_new_events` 内自行 open/close | SQLite `?mode=ro` 只读,避免锁冲突;CursorManager 已有 per-call connect 模式 |
| `_cursor_position` 约定 | raw event dict 中必须携带 | 让子类灵活定义位置语义(rowid/timestamp/offset),基类只取 max |
| `BatchBuffer` 复用 | 直接用现有实现 | `byte_size=0`(DB 无字节偏移概念),time trigger 正常工作 |
@@ -537,6 +540,36 @@ sqlite3 "%APPDATA%/Cursor/User/globalStorage/state.vscdb" \
"SELECT [key], value FROM cursorDiskKV WHERE [key] LIKE 'bubbleId:%' LIMIT 3"
```
+**⚠️ `_discover_composer_ids` 待修正(实测发现)**:
+
+真实数据验证发现 `composer.composerData` **不在** 工作区 ItemTable 中,而是以 `composerData:{composerId}` 的形式存在全局 DB 的 `cursorDiskKV` 表中(13 条记录)。工作区 ItemTable 只有 `composer.composerHeaders`。
+
+当前实现(查 workspace ItemTable)不会报错但收集不到数据。主流程不依赖此辅助方法(直接扫描 bubbleId:*),但如需使用应改为:
+
+```python
+def _discover_composer_ids(self) -> List[str]:
+ """从全局 DB cursorDiskKV 查询 composerData:* keys。"""
+ composer_ids = []
+ db_path = self.resolve_db_path()
+ if not db_path:
+ return composer_ids
+ try:
+ conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
+ try:
+ rows = conn.execute(
+ "SELECT [key] FROM cursorDiskKV WHERE [key] LIKE 'composerData:%'"
+ ).fetchall()
+ for (key,) in rows:
+ cid = key.split(":", 1)[1]
+ if cid:
+ composer_ids.append(cid)
+ finally:
+ conn.close()
+ except Exception:
+ pass
+ return composer_ids
+```
+
#### 3.3 WindsurfDBWatcher(P2,待调研)
**⚠️ Windsurf 的具体表名和 key 模式尚无公开文档。** Phase 4 需先 dump 真实 state.vscdb 确认:
@@ -655,24 +688,26 @@ CREATE TABLE IF NOT EXISTS file_cursors (
### 四、实施任务清单
-#### Phase 1:基础设施(BasePollingWatcher + 测试框架)
+#### Phase 1:基础设施(BasePollingWatcher + 测试框架)— ✅ 已完成
-| # | 任务 | 文件 | 预估 |
+| # | 任务 | 文件 | 状态 |
|---|------|------|------|
-| 1.1 | 创建 `BasePollingWatcher` 基类 | `watchers/base_polling_watcher.py` | ~120 行 |
-| 1.2 | 创建 `test_base_polling_watcher.py` 单元测试 | `tests/daemon/` | mock poll loop / buffer / flush / Protocol 验证 |
+| 1.1 | 创建 `BasePollingWatcher` 基类 | `watchers/base_polling_watcher.py` | ✅ ~170 行 |
+| 1.2 | 创建 `test_base_polling_watcher.py` 单元测试 | `tests/daemon/` | ✅ 12 tests passing |
-#### Phase 2:CursorDBWatcher(P0 核心)
+#### Phase 2:CursorDBWatcher(P0 核心)— ✅ 已完成
-| # | 任务 | 文件 | 预估 |
+| # | 任务 | 文件 | 状态 |
|---|------|------|------|
-| 2.1 | ~~调研~~:dump 真实 Cursor state.vscdb 结构 | — | **已完成**(见 §2.1) |
-| 2.2 | 创建 `CursorDBWatcher`(双库架构、cursorDiskKV、bubbleId 解析) | `watchers/cursor_db_watcher.py` | ~150 行 |
-| 2.3 | 单元测试 `test_cursor_db_watcher.py` | `tests/daemon/` | 创建临时双库 SQLite → mock bubble 数据 → normalize 验证 |
-| 2.4 | 集成测试:完整 poll 周期 → batch_callback 验证 | 同上 | |
-| 2.5 | Registry 注册 `cursor_db` | `watchers/registry.py` | 5 行 |
+| 2.1 | dump 真实 Cursor state.vscdb 结构 | — | ✅ 见 §2.1 + §4.1 |
+| 2.2 | 创建 `CursorDBWatcher`(双库架构、cursorDiskKV、bubbleId 解析) | `watchers/cursor_db_watcher.py` | ✅ ~170 行 |
+| 2.3 | 单元测试 `test_cursor_db_watcher.py` | `tests/daemon/` | ✅ 16 tests passing |
+| 2.4 | 集成测试:完整 poll 周期 → batch_callback 验证 | 同上 | ✅ `test_full_poll_cycle` |
+| 2.5 | Registry 注册 `cursor_db` | `watchers/registry.py` | ✅ |
+| 2.6 | **E2E 验证**:真实 state.vscdb 端到端 | — | ✅ 见 §4.1 |
+| 2.7 | **Bug 修复**:cursor 推进时机(先于 normalize) | `base_polling_watcher.py` | ✅ 见 §4.2 |
-#### Phase 3:TraeDBWatcher(P1)
+#### Phase 3:TraeDBWatcher(P1)— 🔲 待实施
| # | 任务 | 文件 | 预估 |
|---|------|------|------|
@@ -680,7 +715,7 @@ CREATE TABLE IF NOT EXISTS file_cursors (
| 3.2 | `normalize_event`:处理 `content` 为空的 fallback(`agentTaskContent.proposalText`) | 同上 | |
| 3.3 | 单元测试 + Registry 注册 `trae_db` | `tests/daemon/test_trae_db_watcher.py` | |
-#### Phase 4:WindsurfDBWatcher(P2,需先 dump 确认格式)
+#### Phase 4:WindsurfDBWatcher(P2,需先 dump 确认格式)— 🔲 待实施
| # | 任务 | 文件 | 预估 |
|---|------|------|------|
@@ -688,13 +723,74 @@ CREATE TABLE IF NOT EXISTS file_cursors (
| 4.2 | 创建 `WindsurfDBWatcher` | `watchers/windsurf_db_watcher.py` | ~30 行(若格式同 Cursor) |
| 4.3 | 单元测试 + Registry 注册 `windsurf_db` | | |
-#### Phase 5:集成验证
+#### Phase 5:集成验证 — ✅ 部分完成
-| # | 任务 | 文件 | 预估 |
+| # | 任务 | 文件 | 状态 |
|---|------|------|------|
-| 5.1 | 更新 `test_multi_watcher_integration.py`:加入 DB watcher 测试 | `tests/daemon/` | |
-| 5.2 | 端到端:真实 Cursor 对话 → watcher → ETL → viking:// | 手动验证 | |
-| 5.3 | 更新 `test_all_watchers_registered` 验证新增 watcher | | 2 行 |
+| 5.1 | 更新 `test_multi_watcher_integration.py`:加入 DB watcher 测试 | `tests/daemon/` | ✅ 3 new tests |
+| 5.2 | 端到端:真实 Cursor 对话 → watcher → ETL → viking:// | 手动验证 | 🔲 需 `serve --with-daemon` |
+| 5.3 | 更新 `test_all_watchers_registered` 验证新增 watcher | | ✅ 6 watchers |
+| 5.4 | `_discover_composer_ids` 修正为查 cursorDiskKV | `cursor_db_watcher.py` | 🔲 低优先级 |
+| 5.5 | 全量回归测试(53 tests) | — | ✅ all passing |
+
+---
+
+### 四.一、实施记录
+
+#### 4.1 真实数据 E2E 验证(2026-06-20)
+
+使用本机 `%APPDATA%\Cursor\User\globalStorage\state.vscdb`(1.4MB)对 CursorDBWatcher 进行端到端验证。
+
+**DB 概况**:
+
+| 表 | 行数 | 说明 |
+|----|------|------|
+| `ItemTable` | 128 | workspace 元数据(`composer.composerHeaders` 等) |
+| `cursorDiskKV` | 104 | 对话内容 + composer 数据 |
+
+**cursorDiskKV key 分布**:
+
+| 前缀 | 数量 | 说明 |
+|------|------|------|
+| `agentKv:blob:*` | 66 | Agent 知识片段 |
+| `bubbleId:*` | 20 | 对话消息(5 条有文本,15 条空文本/工具调用) |
+| `composerData:*` | 13 | 会话元数据(⚠️ 不在 ItemTable 中) |
+| `checkpointId:*` | 1 | 检查点 |
+| `inlineDiffs-*` | 4 | 内联 diff |
+
+**E2E 测试结果**:
+
+| 检查项 | 结果 |
+|--------|------|
+| DB 发现 & 连接 | ✅ PASS |
+| 首次 poll 查出 20 条 bubbleId | ✅ PASS |
+| normalize 保留 5 条(过滤 15 条空文本工具调用) | ✅ PASS |
+| batch 回调正确投递 5 条事件 | ✅ PASS |
+| cursor 持久化到 rowid=91 | ✅ PASS |
+| 二次 poll 幂等(0 条新数据) | ✅ PASS |
+| cursor 二次 poll 不变 | ✅ PASS |
+| 角色分布 user=1, assistant=4 | ✅ PASS |
+| 53 项单元测试全部通过,0 回归 | ✅ PASS |
+
+**真实 bubble 格式确认**:
+- `_v: 3` ✓
+- `type: 1`(user)/ `2`(assistant) ✓
+- `text` 字段包含对话内容 ✓
+- `createdAt` ISO-8601 时间戳 ✓
+- `toolFormerData` 对象(工具调用,text 为空)✓
+- `capabilityType: 15`(工具调用标记)✓
+
+#### 4.2 Cursor 推进 Bug 修复
+
+**问题**:`_poll_loop` 中 cursor 只在 normalize 保留事件后才推进。被过滤掉的条目(如空文本工具调用 bubble,`text: ""`)如果恰好是批次中最高 rowid,会导致 cursor 永远不越过它,每次 poll 都重复查出该条目。
+
+**症状**:E2E 验证中首次 poll 后 cursor 停在 rowid=90(最后一条有文本的 bubble),但 rowid=91 是一条空文本工具调用 bubble。第二次 poll 返回 1 条 raw event(虽然 normalize 正确返回 None 不产生输出),但浪费查询且 cursor 不推进。
+
+**修复**:将 `_cursor_position` 追踪提前到 normalize/filter 之前。无论事件是否被保留,cursor 都推进到已扫描的最大 rowid。
+
+**影响范围**:`base_polling_watcher.py` 的 `_poll_loop` 方法。所有 BasePollingWatcher 子类(包括未来的 Trae、Windsurf watcher)均受益。
+
+**验证**:修复后二次 poll 返回 0 条 raw event,cursor 稳定在 rowid=91。
---
@@ -724,18 +820,19 @@ CREATE TABLE IF NOT EXISTS file_cursors (
### 七、文件变更清单
-| 操作 | 文件路径 |
-|------|----------|
-| **新增** | `openviking/daemon/watchers/base_polling_watcher.py` |
-| **新增** | `openviking/daemon/watchers/cursor_db_watcher.py`(P0) |
-| **新增** | `openviking/daemon/watchers/trae_db_watcher.py`(P1) |
-| **新增** | `openviking/daemon/watchers/windsurf_db_watcher.py`(P2) |
-| **新增** | `tests/daemon/test_base_polling_watcher.py` |
-| **新增** | `tests/daemon/test_cursor_db_watcher.py` |
-| **新增** | `tests/daemon/test_trae_db_watcher.py` |
-| **修改** | `openviking/daemon/watchers/registry.py` — 添加 cursor_db / trae_db / windsurf_db 注册 |
-| **修改** | `tests/daemon/test_multi_watcher_integration.py` — 添加 DB watcher 测试用例 |
-| **不改** | `models.py` / `cursor_manager.py` / `service.py` / `etl_pipeline.py` / `config.py` / `base_file_watcher.py` |
+| 操作 | 文件路径 | 状态 |
+|------|----------|------|
+| ✅ 新增 | `openviking/daemon/watchers/base_polling_watcher.py` | ~170 行,含 cursor 推进 bug fix |
+| ✅ 新增 | `openviking/daemon/watchers/cursor_db_watcher.py`(P0) | ~170 行 |
+| 🔲 新增 | `openviking/daemon/watchers/trae_db_watcher.py`(P1) | 待 Phase 3 |
+| 🔲 新增 | `openviking/daemon/watchers/windsurf_db_watcher.py`(P2) | 待 Phase 4 |
+| ✅ 新增 | `tests/daemon/test_base_polling_watcher.py` | 12 tests |
+| ✅ 新增 | `tests/daemon/test_cursor_db_watcher.py` | 16 tests |
+| 🔲 新增 | `tests/daemon/test_trae_db_watcher.py` | 待 Phase 3 |
+| ✅ 修改 | `openviking/daemon/watchers/registry.py` | +cursor_db 注册 |
+| 🔲 修改 | `openviking/daemon/watchers/registry.py` | +trae_db / +windsurf_db 注册 |
+| ✅ 修改 | `tests/daemon/test_multi_watcher_integration.py` | +3 cursor_db 测试 |
+| ✅ 不改 | `models.py` / `cursor_manager.py` / `service.py` / `etl_pipeline.py` / `config.py` / `base_file_watcher.py` | 验证无需改动 |
---
From 17fd773d29acf0954ab059301a5e661dfc884457 Mon Sep 17 00:00:00 2001
From: baobaodae <2014596548@qq.com>
Date: Thu, 25 Jun 2026 21:36:21 +0800
Subject: [PATCH 5/6] chore(daemon): add license headers, trim watchers to
verified set, remove Echo2.md
- Add SPDX/AGPL license headers to all 21 openviking/daemon/**/*.py files
- Remove unverified watchers: aider, cursor (file), continue_dev
- Keep only verified watchers: claude_code, generic_jsonl, cursor_db
- Update registry, tests, and docs to reflect the trimmed watcher set
- Remove Echo2.md (to be published as [RFC] GitHub Discussion)
- Fix viking:// URI paths in daemon README
---
Echo2.md | 429 ------------------
docs/daemon/README.md | 10 +-
docs/daemon/Watcher.md | 32 --
docs/daemon/configuration.md | 47 +-
openviking/daemon/__init__.py | 2 +
.../daemon/conversation_reconstructor.py | 2 +
openviking/daemon/cursor_manager.py | 2 +
openviking/daemon/deduplicator.py | 2 +
openviking/daemon/etl_pipeline.py | 2 +
openviking/daemon/filters.py | 2 +
openviking/daemon/knowledge_extractor.py | 2 +
openviking/daemon/knowledge_router.py | 2 +
openviking/daemon/models.py | 2 +
openviking/daemon/service.py | 2 +
openviking/daemon/storage_adapter.py | 2 +
openviking/daemon/watchers/__init__.py | 2 +
openviking/daemon/watchers/aider_watcher.py | 148 ------
.../daemon/watchers/base_file_watcher.py | 2 +
.../daemon/watchers/base_polling_watcher.py | 2 +
.../daemon/watchers/claude_code_watcher.py | 2 +
.../daemon/watchers/continue_dev_watcher.py | 68 ---
.../daemon/watchers/cursor_db_watcher.py | 2 +
openviking/daemon/watchers/cursor_watcher.py | 87 ----
.../daemon/watchers/generic_jsonl_watcher.py | 2 +
openviking/daemon/watchers/registry.py | 20 +-
tests/daemon/test_aider_watcher.py | 183 --------
tests/daemon/test_continue_dev_watcher.py | 193 --------
tests/daemon/test_cursor_watcher.py | 187 --------
.../daemon/test_multi_watcher_integration.py | 123 +----
tests/daemon/test_service_multi.py | 2 +-
30 files changed, 105 insertions(+), 1458 deletions(-)
delete mode 100644 Echo2.md
delete mode 100644 docs/daemon/Watcher.md
delete mode 100644 openviking/daemon/watchers/aider_watcher.py
delete mode 100644 openviking/daemon/watchers/continue_dev_watcher.py
delete mode 100644 openviking/daemon/watchers/cursor_watcher.py
delete mode 100644 tests/daemon/test_aider_watcher.py
delete mode 100644 tests/daemon/test_continue_dev_watcher.py
delete mode 100644 tests/daemon/test_cursor_watcher.py
diff --git a/Echo2.md b/Echo2.md
deleted file mode 100644
index 3f18cb473d..0000000000
--- a/Echo2.md
+++ /dev/null
@@ -1,429 +0,0 @@
-# Echo II: Multi-Source Watcher Extension
-
-> 从单一 Claude Code 监听到多工具 AI 编码助手日志的统一监听与知识提取框架
-
----
-
-## 一、动机与背景
-
-Active Daemon 的 v1 实现仅支持 Claude Code JSONL 日志监听。然而现实中,开发者的日常工具栈远不止于此:
-
-- **Claude Code CLI** — JSONL 日志(已实现)
-- **Cursor** — 日志文件 + SQLite 数据库
-- **Aider** — `.aider.chat.history.md` Markdown 文件
-- **GitHub Copilot** — VS Code workspace 日志
-- **Continue.dev** — `~/.continue/` 下的 JSON 日志
-- **Windsurf / Cascade** — 类 Cursor 的日志目录
-
-Echo.md 原文已明确提到需要支持 Cursor(SQLite polling)和 OpenCode(rotating text logs),但 v1 仅完成了最简单的 Claude Code 路径。本次扩展的核心目标是:**将 Daemon 从单源监听器升级为多源监听框架**,使开发者可以自由配置任意数量的工具监听路径。
-
----
-
-## 二、目标工具可行性分析
-
-| 工具 | 存储格式 | 驱动模式 | 可行性 | 优先级 |
-|------|----------|----------|--------|--------|
-| **Claude Code** | JSONL append-only | 文件增量监听 | ★★★★★ 已实现 | P0 |
-| **Aider** | Markdown `.aider.chat.history.md` | 文件增量监听 | ★★★★☆ | P1 |
-| **Cursor** | SQLite `state.vscdb` + 日志 | DB polling + 文件监听 | ★★★☆☆ | P1 |
-| **Continue.dev** | JSON 日志 | 文件增量监听 | ★★★★☆ | P2 |
-| **GitHub Copilot** | VS Code workspace logs | 文件增量监听 | ★★★☆☆ | P2 |
-| **Generic JSONL** | 任意 JSONL | 文件增量监听 | ★★★★★ | P1 |
-
-### 驱动模式分类
-
-根据日志格式差异,Watcher 分为两种驱动模式:
-
-**File-Append Watcher**(文件追加监听)
-- 使用 watchdog 监听文件变更事件
-- 通过 FileCursor(byte offset)实现增量读取
-- 适用于:Claude Code、Aider、Continue.dev、Generic JSONL
-
-**Database Polling Watcher**(数据库轮询监听)
-- 定时轮询 SQLite 数据库
-- 通过 timestamp cursor 实现增量查询
-- 适用于:Cursor(state.vscdb)
-
----
-
-## 三、架构设计
-
-### 3.1 核心原则
-
-**每个 Watcher 负责将自己的原始事件归一化为统一格式**,下游 ETL pipeline 完全不需要感知工具差异。
-
-```
- ┌─ ClaudeCodeWatcher ──┐
- │ │
-AiderWatcher ────────┤ ├──→ normalized events ──→ ETL Pipeline ──→ viking://
- │ (shared queue) │
-GenericJSONLWatcher──┘ │
- │
-CursorWatcher (polling) ────────────────────┘
-```
-
-### 3.2 归一化事件格式 (NormalizedEvent)
-
-所有 Watcher 将原始日志转换为以下统一格式的 dict:
-
-```python
-{
- "role": "user" | "assistant", # 必须
- "content": "...", # 必须,消息正文
- "type": "message", # 必须,固定 "message"
- "timestamp": "2026-06-16T10:30:00Z", # 可选
- "session_id": "...", # 可选,工具特定
- "project_name": "...", # 可选
- "tool_name": "claude_code", # 必须,来源工具标识
-}
-```
-
-新增 `tool_name` 字段用于区分来源。现有的 `ConversationReconstructor`、`LowValueFilter`、`KnowledgeExtractor` 均通过 `event.get("role")` / `event.get("content")` 读取,**完全兼容**。
-
-### 3.3 BaseWatcher Protocol
-
-```python
-# openviking/daemon/watchers/__init__.py
-
-from typing import Protocol, runtime_checkable
-
-@runtime_checkable
-class BaseWatcher(Protocol):
- """All tool-specific watchers must implement this interface."""
-
- @property
- def tool_name(self) -> str: ...
-
- def start(self) -> None: ...
- def stop(self) -> None: ...
- def flush(self) -> None: ...
-```
-
-### 3.4 BaseFileWatcher 抽象基类
-
-为 File-Append 模式的 Watcher 提取公共逻辑(watchdog Observer + 增量读取 + 游标管理 + 批处理缓冲):
-
-```python
-# openviking/daemon/watchers/base_file_watcher.py
-
-class BaseFileWatcher(ABC):
- """Abstract base for file-append based watchers."""
-
- def __init__(self, watch_dir, cursor_manager, batch_callback,
- file_pattern, batch_trigger_lines, batch_trigger_seconds):
- ...
-
- @abstractmethod
- def parse_line(self, line: str) -> Optional[Dict]:
- """Parse a raw line into a raw event dict. Tool-specific."""
- ...
-
- @abstractmethod
- def normalize_event(self, raw_event: Dict) -> Optional[Dict]:
- """Convert raw event to NormalizedEvent format. Tool-specific."""
- ...
-
- def filter_event(self, event: Dict) -> bool:
- """Optional: additional tool-specific filtering. Default: True."""
- return True
-
- # --- 公共逻辑(不需要子类重写) ---
- def start(self) -> None: ... # 创建 Observer + Handler
- def stop(self) -> None: ...
- def flush(self) -> None: ...
-```
-
-子类只需实现 `parse_line()` 和 `normalize_event()` 两个方法。
-
-### 3.5 Watcher 注册表 (WatcherRegistry)
-
-```python
-# openviking/daemon/watchers/registry.py
-
-WATCHER_REGISTRY: Dict[str, Type[BaseWatcher]] = {
- "claude_code": ClaudeCodeWatcher,
- "aider": AiderWatcher,
- "cursor": CursorWatcher,
- "continue_dev": ContinueDevWatcher,
- "generic_jsonl": GenericJSONLWatcher,
-}
-
-def create_watcher(tool_name: str, **kwargs) -> BaseWatcher:
- """Factory function to create a watcher by tool name."""
- cls = WATCHER_REGISTRY.get(tool_name)
- if not cls:
- raise ValueError(f"Unknown tool: {tool_name}. Available: {list(WATCHER_REGISTRY.keys())}")
- return cls(**kwargs)
-```
-
----
-
-## 四、配置设计
-
-### 4.1 新配置结构
-
-`DaemonConfig` 从单一 `watch_dir` 演进为支持 `watchers` 列表:
-
-```python
-class WatcherConfig(BaseModel):
- tool_name: str # 必填:工具标识
- watch_dir: str # 必填:监听路径
- file_pattern: str = "*.jsonl" # 可选:文件匹配模式
- enabled: bool = True # 可选:是否启用
- batch_trigger_lines: int = Field(50, gt=0) # 可选:批处理行数
- batch_trigger_seconds: int = Field(300, gt=0) # 可选:批处理秒数
- extra: Dict[str, Any] = Field(default_factory=dict) # 可选:工具特定参数
-
-class DaemonConfig(BaseModel):
- enabled: bool = False
- db_path: Optional[str] = None
- watchers: List[WatcherConfig] = Field(default_factory=list)
-
- # 向后兼容:保留旧字段
- watch_dir: Optional[str] = None # deprecated
- batch_trigger_lines: int = Field(50, gt=0) # deprecated, 全局默认值
- batch_trigger_seconds: int = Field(300, gt=0) # deprecated, 全局默认值
-
- def get_effective_watchers(self) -> List[WatcherConfig]:
- """向后兼容:如果 watchers 为空但 watch_dir 存在,自动构造单个 Claude Code watcher."""
- if self.watchers:
- return [w for w in self.watchers if w.enabled]
- elif self.watch_dir:
- return [WatcherConfig(
- tool_name="claude_code",
- watch_dir=self.watch_dir,
- batch_trigger_lines=self.batch_trigger_lines,
- batch_trigger_seconds=self.batch_trigger_seconds,
- )]
- else:
- return [WatcherConfig(
- tool_name="claude_code",
- watch_dir=str(Path.home() / ".claude" / "projects"),
- batch_trigger_lines=self.batch_trigger_lines,
- batch_trigger_seconds=self.batch_trigger_seconds,
- )]
-```
-
-### 4.2 ov.conf 示例
-
-```json
-{
- "server": {
- "daemon": {
- "enabled": true,
- "db_path": "~/.openviking/daemon/cursors.db",
- "watchers": [
- {
- "tool_name": "claude_code",
- "watch_dir": "~/.claude/projects",
- "file_pattern": "*.jsonl"
- },
- {
- "tool_name": "aider",
- "watch_dir": "~/Projects",
- "file_pattern": ".aider.chat.history.md"
- },
- {
- "tool_name": "cursor",
- "watch_dir": "~/.cursor",
- "extra": {"poll_interval_seconds": 60}
- },
- {
- "tool_name": "generic_jsonl",
- "watch_dir": "~/ai-logs/copilot",
- "file_pattern": "*.jsonl",
- "extra": {"user_role_field": "author", "assistant_role_value": "copilot"}
- }
- ]
- }
- }
-}
-```
-
-### 4.3 环境变量
-
-| 变量 | 说明 |
-|------|------|
-| `OV_DAEMON_ENABLED` | 全局开关 |
-| `OV_DAEMON_WATCHERS` | JSON 格式的 watchers 列表(覆盖 ov.conf) |
-
----
-
-## 五、实现计划
-
-### Batch 1:基础抽象层(3 文件 + 6 tests)
-
-| 任务 | 文件 | 说明 |
-|------|------|------|
-| 1.1 | `watchers/__init__.py` | 定义 `BaseWatcher` Protocol |
-| 1.2 | `watchers/base_file_watcher.py` | 抽象基类,提取 ClaudeCodeWatcher 的公共逻辑 |
-| 1.3 | `watchers/registry.py` | Watcher 注册表 + `create_watcher()` 工厂 |
-| 1.4 | `tests/daemon/test_base_file_watcher.py` | BaseFileWatcher 的 parse/normalize/buffer 测试 |
-| 1.5 | `tests/daemon/test_registry.py` | 注册表查找、未知工具报错测试 |
-
-### Batch 2:重构 ClaudeCodeWatcher + GenericJSONLWatcher(3 文件 + 8 tests)
-
-| 任务 | 文件 | 说明 |
-|------|------|------|
-| 2.1 | `watchers/claude_code_watcher.py` | 重构为继承 BaseFileWatcher,实现 parse_line/normalize_event |
-| 2.2 | `watchers/generic_jsonl_watcher.py` | 通用 JSONL Watcher,支持自定义字段映射 |
-| 2.3 | `models.py` | 在 BatchBuffer 中增加 `tool_name` 字段 |
-| 2.4 | `tests/daemon/test_claude_code_watcher.py` | 更新已有测试(保持兼容) |
-| 2.5 | `tests/daemon/test_generic_jsonl_watcher.py` | GenericJSONLWatcher 的解析/归一化/字段映射测试 |
-
-### Batch 3:AiderWatcher + CursorWatcher(4 文件 + 10 tests)
-
-| 任务 | 文件 | 说明 |
-|------|------|------|
-| 3.1 | `watchers/aider_watcher.py` | Aider Markdown 历史解析 → 归一化事件 |
-| 3.2 | `watchers/cursor_watcher.py` | Cursor SQLite polling + 日志文件监听 |
-| 3.3 | `watchers/continue_dev_watcher.py` | Continue.dev JSON 日志监听 |
-| 3.4 | `tests/daemon/test_aider_watcher.py` | Markdown 解析、对话重组、增量读取测试 |
-| 3.5 | `tests/daemon/test_cursor_watcher.py` | SQLite polling、增量查询测试 |
-| 3.6 | `tests/daemon/test_continue_dev_watcher.py` | JSON 日志解析测试 |
-
-### Batch 4:配置 + 服务层重构(4 文件 + 6 tests)
-
-| 任务 | 文件 | 说明 |
-|------|------|------|
-| 4.1 | `server/config.py` | 新增 WatcherConfig,重构 DaemonConfig |
-| 4.2 | `daemon/service.py` | 单 watcher → watchers 列表,使用 registry 创建 |
-| 4.3 | `daemon/knowledge_router.py` | URI 中包含 tool_name:`viking://skills/{tool_name}/...` |
-| 4.4 | `daemon/storage_adapter.py` | Markdown 模板中 Source 字段参数化 |
-| 4.5 | `daemon/models.py` | ExtractedKnowledge 增加 `source_tool` 字段 |
-| 4.6 | `server/bootstrap.py` | 环境变量 OV_DAEMON_WATCHERS 支持 |
-| 4.7 | `tests/daemon/test_config.py` | WatcherConfig 验证 + 向后兼容测试 |
-| 4.8 | `tests/daemon/test_service_multi.py` | 多 watcher 启动/停止/路由测试 |
-
-### Batch 5:API + Web Studio(3 文件 + 0 tests)
-
-| 任务 | 文件 | 说明 |
-|------|------|------|
-| 5.1 | `server/routers/daemon.py` | 扩展 status 端点:返回每个 watcher 的状态 |
-| 5.2 | `web-studio/.../daemon-status-card.tsx` | 重构为多 watcher 视图:表格 + 状态 |
-| 5.3 | `web-studio/.../i18n` | 多 watcher 相关翻译 |
-
-### Batch 6:集成测试 + 收尾(2 文件)
-
-| 任务 | 文件 | 说明 |
-|------|------|------|
-| 6.1 | `tests/daemon/test_multi_watcher_integration.py` | 多 watcher → ETL → storage 全链路 |
-| 6.2 | 文档更新 | README、Echo.md 引用更新 |
-
----
-
-## 六、关键设计决策
-
-### 6.1 归一化 vs 分支管道
-
-**选择:Watcher 层归一化**,而非 ETL 层分支。
-
-理由:
-- ETL Pipeline 的四个阶段(Filter → Reconstruct → Extract → Deduplicate)已经设计为通用的
-- 如果在 ETL 层引入工具分支,每个阶段都需要 if/else,复杂度 O(tools × stages)
-- Watcher 层归一化后,新增工具只需实现 `parse_line()` + `normalize_event()`,ETL 零修改
-
-### 6.2 向后兼容策略
-
-旧的 `DaemonConfig` 字段(`watch_dir`、`batch_trigger_lines`、`batch_trigger_seconds`)保留为 deprecated,`get_effective_watchers()` 方法自动将旧配置转换为单个 `WatcherConfig`。这确保现有用户的 ov.conf 不需要修改。
-
-### 6.3 Cursor SQLite Polling
-
-Cursor 使用 SQLite `state.vscdb` 存储对话历史。与文件监听不同,需要:
-- 定时轮询(默认 60 秒)
-- 通过 `last_timestamp` 游标实现增量查询
-- 复用 `CursorManager` 存储轮询位置
-
-这是一个独立于 watchdog 的驱动模式,但通过实现相同的 `BaseWatcher` Protocol,对 DaemonService 完全透明。
-
-### 6.4 GenericJSONLWatcher 字段映射
-
-为了支持任意 JSONL 日志(Copilot、自研工具等),GenericJSONLWatcher 支持通过 `extra` 配置自定义字段映射:
-
-```json
-{
- "tool_name": "generic_jsonl",
- "watch_dir": "~/ai-logs/my-tool",
- "extra": {
- "role_field": "author",
- "user_role_value": "human",
- "assistant_role_value": "ai",
- "content_field": "text",
- "timestamp_field": "ts"
- }
-}
-```
-
-默认值兼容 Claude Code 格式(`role` / `user` / `assistant` / `content` / `timestamp`)。
-
----
-
-## 七、API 变更
-
-### 7.1 GET /api/v1/daemon/status(增强)
-
-```json
-{
- "enabled": true,
- "running": true,
- "watchers": [
- {
- "tool_name": "claude_code",
- "watch_dir": "/Users/xxx/.claude/projects",
- "enabled": true,
- "running": true,
- "cursor_count": 12,
- "batch_trigger_lines": 50,
- "batch_trigger_seconds": 300
- },
- {
- "tool_name": "aider",
- "watch_dir": "/Users/xxx/Projects",
- "enabled": true,
- "running": true,
- "cursor_count": 3,
- "batch_trigger_lines": 50,
- "batch_trigger_seconds": 300
- }
- ],
- "db_path": "...",
- "available_tools": ["claude_code", "aider", "cursor", "continue_dev", "generic_jsonl"]
-}
-```
-
----
-
-## 八、Web Studio 变更
-
-Home 页面 DaemonStatusCard 从单状态卡片升级为多 Watcher 表格视图:
-
-- 顶部:全局状态(enabled/running)+ watcher 数量统计
-- 中部:Watcher 列表表格(工具名、路径、状态、游标数、最后活动)
-- 底部:可用工具列表(available_tools)
-
-每 30 秒自动刷新。
-
----
-
-## 九、风险与缓解
-
-| 风险 | 影响 | 缓解措施 |
-|------|------|----------|
-| Cursor SQLite schema 变更 | 解析失败 | 防御性编程 + 版本检测 + 降级到日志文件监听 |
-| 多 watcher 并发写入 SQLite cursor DB | 竞态条件 | SQLite WAL mode + 每个 watcher 独立 cursor key |
-| Aider Markdown 格式不稳定 | 解析错误 | 宽松正则 + 跳过不可解析段落 + 详细日志 |
-| 大量 watcher 导致 ETL 队列拥堵 | 延迟增加 | Queue 大小限制 + 背压 + 优先级队列 |
-
----
-
-## 十、里程碑
-
-| 阶段 | 内容 | 预计 commits |
-|------|------|-------------|
-| Batch 1 | 基础抽象层 | 3-4 |
-| Batch 2 | 重构 + GenericJSONL | 4-5 |
-| Batch 3 | Aider + Cursor + Continue | 5-6 |
-| Batch 4 | 配置 + 服务层 | 4-5 |
-| Batch 5 | API + Web Studio | 3-4 |
-| Batch 6 | 集成 + 收尾 | 2-3 |
-| **总计** | | **21-27 commits** |
diff --git a/docs/daemon/README.md b/docs/daemon/README.md
index a7e5173c1f..874b210eac 100644
--- a/docs/daemon/README.md
+++ b/docs/daemon/README.md
@@ -29,17 +29,19 @@ Daemon 会在服务器启动时自动运行(如果 `OV_DAEMON_ENABLED=true`)
2. **增量读取** — 文件游标技术,只处理新增内容
3. **批量处理** — 累积 50 行或 5 分钟后触发 ETL 管道
4. **知识提取** — 调用 LLM 过滤噪声,提取有价值的知识
-5. **自动存储** — 写入 `viking://skills/`、`viking://memories/`、`viking://resources/`
+5. **自动存储** — 写入 `viking://resources/skills/`、`viking://resources/memories/`、`viking://resources/`
## 知识分类
| 类型 | 目标路径 | 说明 |
|------|---------|------|
-| Skills | `viking://skills/claude_code/.md` | 可复用的操作指南 |
-| Memories (有项目) | `viking://memories/projects//decisions.md` | 项目决策日志 |
-| Memories (无项目) | `viking://memories/global/.md` | 全局记忆 |
+| Skills | `viking://resources/skills/{tool_name}/{name}.md` | 可复用的操作指南 |
+| Memories (有项目) | `viking://resources/memories/{project}/decisions.md` | 项目决策日志 |
+| Memories (无项目) | `viking://resources/memories/global/{name}.md` | 全局记忆 |
| Resources | `viking://resources//.md` | 参考资源 |
+> **注意**:viking:// URI 只支持 `resources/`、`user/`、`agent/` 三个顶级 scope。所有知识统一路由到 `resources/` 下。
+
## 架构
```
diff --git a/docs/daemon/Watcher.md b/docs/daemon/Watcher.md
deleted file mode 100644
index 5b672d74f2..0000000000
--- a/docs/daemon/Watcher.md
+++ /dev/null
@@ -1,32 +0,0 @@
-# Daemon Watcher 排查计划
-
-## 状态:148 tests pass,服务器在 1966 端口运行中,daemon 已处理 3 个文件
-
-## 待完成
-
-### 1. 确认端到端写入
-目标:验证知识是否真正写入 `viking://resources/...`
-
-方法:
-- 服务器日志当前输出到后台进程 stdout,无法查看。需要改为写文件:在 `ov.conf` 中加 `"log": {"level": "INFO", "output": "D:\\Develop\\ov-test\\server.log"}`
-- 杀掉当前服务器,清 cursor DB,重启
-- touch JSONL 触发处理,等 ETL 跑完
-- 查看 `server.log` 中的写入日志(成功/失败)
-
-预期日志链路:
-```
-[claude_code] Flushing N events
-_enqueue_batch: received N events
-ETL loop: dequeued N events
-Processing batch with N events → After filtering: X → Reconstructed Y turns → Extracted Z items
-Knowledge ingested: viking://resources/... ← 关键确认点
-```
-
-### 2. 根据结果处理
-- 写入成功 → 提交 commit 和 PR
-- 写入失败 → 根据日志修 `storage_adapter.py`(可能是 `resource_service.add_resource()` 调用参数或 URI 格式问题)
-
-### 3. 提交 PR
-- 分支基于 `upstream/main`
-- 只包含 daemon 相关文件变更
-- PR 目标:`volcengine/OpenViking:main`
diff --git a/docs/daemon/configuration.md b/docs/daemon/configuration.md
index 169bf341e1..422c50e1a1 100644
--- a/docs/daemon/configuration.md
+++ b/docs/daemon/configuration.md
@@ -10,19 +10,54 @@
| `OV_DAEMON_BATCH_LINES` | 批处理触发行数 | `50` |
| `OV_DAEMON_BATCH_SECONDS` | 批处理触发秒数 | `300` |
-## YAML 配置 (ov.conf)
+## JSON 配置 (ov.conf)
+
+> **重要**:daemon 配置必须放在 `"server"` 节内,不是顶层配置。ov.conf 的 JSON 解析器不支持 `#` 注释。
+
+单 watcher 配置:
+
+```json
+{
+ "server": {
+ "port": 1988,
+ "daemon": {
+ "enabled": true,
+ "watch_dir": "~/.claude/projects",
+ "batch_trigger_lines": 50,
+ "batch_trigger_seconds": 300
+ }
+ }
+}
+```
+
+多 watcher 配置(推荐,支持同时监听多个 AI 工具):
```json
{
- "daemon": {
- "enabled": true,
- "watch_dir": "~/.claude/projects",
- "batch_trigger_lines": 50,
- "batch_trigger_seconds": 300
+ "server": {
+ "port": 1988,
+ "daemon": {
+ "enabled": true,
+ "watchers": [
+ {
+ "tool_name": "claude_code",
+ "watch_dir": "C:\\Users\\xxx\\.claude\\projects",
+ "batch_trigger_lines": 5,
+ "batch_trigger_seconds": 60
+ },
+ {
+ "tool_name": "cursor_db",
+ "watch_dir": "C:\\Users\\xxx\\AppData\\Roaming\\Cursor\\User\\globalStorage",
+ "poll_interval": 60
+ }
+ ]
+ }
}
}
```
+`watchers` 数组中每项支持 `tool_name`、`watch_dir`、`batch_trigger_lines`、`batch_trigger_seconds`、`extra` 字段;轮询型 watcher(如 `cursor_db`)还可设置 `poll_interval`(轮询间隔,单位秒)。当 `watchers` 存在时,顶层的 `watch_dir` 被忽略。
+
## Docker 部署
```yaml
diff --git a/openviking/daemon/__init__.py b/openviking/daemon/__init__.py
index ba94f0d050..5042a04371 100644
--- a/openviking/daemon/__init__.py
+++ b/openviking/daemon/__init__.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
"""
OpenViking Active Daemon package.
Monitors AI tool logs and automatically extracts knowledge into viking:// storage.
diff --git a/openviking/daemon/conversation_reconstructor.py b/openviking/daemon/conversation_reconstructor.py
index 179873467d..fdfbcdc1e0 100644
--- a/openviking/daemon/conversation_reconstructor.py
+++ b/openviking/daemon/conversation_reconstructor.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
"""
Reconstruct conversation turns from flat event lists.
Pairs user prompts with assistant responses into structured ConversationTurn objects.
diff --git a/openviking/daemon/cursor_manager.py b/openviking/daemon/cursor_manager.py
index 6b76bc2565..e5009d6082 100644
--- a/openviking/daemon/cursor_manager.py
+++ b/openviking/daemon/cursor_manager.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
"""
Cursor Manager for tracking file read positions.
Persists state in SQLite so Daemon can resume after restart.
diff --git a/openviking/daemon/deduplicator.py b/openviking/daemon/deduplicator.py
index 9066b57bb4..5721fc2a67 100644
--- a/openviking/daemon/deduplicator.py
+++ b/openviking/daemon/deduplicator.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
"""
Knowledge deduplication based on content hashing.
Prevents duplicate knowledge from being written to viking:// storage.
diff --git a/openviking/daemon/etl_pipeline.py b/openviking/daemon/etl_pipeline.py
index b0f35d58d1..1f580b43d4 100644
--- a/openviking/daemon/etl_pipeline.py
+++ b/openviking/daemon/etl_pipeline.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
"""
Batch ETL pipeline for processing conversation events into structured knowledge.
Orchestrates: Filter -> Reconstruct -> Extract -> Deduplicate
diff --git a/openviking/daemon/filters.py b/openviking/daemon/filters.py
index 4a3a58bbe2..04f32cb198 100644
--- a/openviking/daemon/filters.py
+++ b/openviking/daemon/filters.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
"""
Rule-based filters for low-value conversations.
Removes noise before LLM processing to save cost and improve quality.
diff --git a/openviking/daemon/knowledge_extractor.py b/openviking/daemon/knowledge_extractor.py
index 260ca2622d..5e69d3b96c 100644
--- a/openviking/daemon/knowledge_extractor.py
+++ b/openviking/daemon/knowledge_extractor.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
"""
LLM-based knowledge extraction from conversations.
Uses OpenViking's existing VLM configuration for intelligent filtering and summarization.
diff --git a/openviking/daemon/knowledge_router.py b/openviking/daemon/knowledge_router.py
index e2adcb112f..ea67490510 100644
--- a/openviking/daemon/knowledge_router.py
+++ b/openviking/daemon/knowledge_router.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
"""
Route extracted knowledge to appropriate viking:// URIs based on category and project.
"""
diff --git a/openviking/daemon/models.py b/openviking/daemon/models.py
index ea88df8953..78a041ae00 100644
--- a/openviking/daemon/models.py
+++ b/openviking/daemon/models.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
"""
Data models for OpenViking Active Daemon.
"""
diff --git a/openviking/daemon/service.py b/openviking/daemon/service.py
index bd6bc8bebe..bbb07dbd6c 100644
--- a/openviking/daemon/service.py
+++ b/openviking/daemon/service.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
"""
OpenViking Active Daemon main service.
Orchestrates multi-tool file watching, ETL processing, and knowledge storage.
diff --git a/openviking/daemon/storage_adapter.py b/openviking/daemon/storage_adapter.py
index e1a76684ef..5678fae925 100644
--- a/openviking/daemon/storage_adapter.py
+++ b/openviking/daemon/storage_adapter.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
"""
Adapter for writing extracted knowledge to OpenViking via ResourceService.
Generates temporary Markdown files and ingests them through the standard resource pipeline.
diff --git a/openviking/daemon/watchers/__init__.py b/openviking/daemon/watchers/__init__.py
index 450f68faf4..9e47c6fe8f 100644
--- a/openviking/daemon/watchers/__init__.py
+++ b/openviking/daemon/watchers/__init__.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
"""
Watcher abstractions for OpenViking Active Daemon.
Provides BaseWatcher protocol and watcher registry for multi-tool support.
diff --git a/openviking/daemon/watchers/aider_watcher.py b/openviking/daemon/watchers/aider_watcher.py
deleted file mode 100644
index cd2e20dc95..0000000000
--- a/openviking/daemon/watchers/aider_watcher.py
+++ /dev/null
@@ -1,148 +0,0 @@
-"""
-Aider chat history watcher.
-Monitors .aider.chat.history.md files in project directories.
-"""
-import re
-import time
-import os
-from typing import Dict, List, Optional
-
-from openviking.daemon.watchers.base_file_watcher import BaseFileWatcher
-from openviking.daemon.watchers.registry import register_watcher
-from openviking.daemon.models import FileCursor
-from openviking_cli.utils.logger import get_logger
-
-logger = get_logger(__name__)
-
-
-@register_watcher("aider")
-class AiderWatcher(BaseFileWatcher):
- """Watches Aider .aider.chat.history.md files."""
-
- # Regex patterns
- HEADER_RE = re.compile(r'^# aider chat started at (.+)$')
- PROJECT_RE = re.compile(r'^> (.+)$')
- USER_RE = re.compile(r'^#### user:\s*$')
- ASSISTANT_RE = re.compile(r'^#### assistant:\s*$')
-
- def __init__(self, watch_dir, cursor_manager, batch_callback,
- file_pattern=".aider.chat.history.md",
- batch_trigger_lines=50, batch_trigger_seconds=300, **kwargs):
- super().__init__(
- watch_dir=watch_dir,
- cursor_manager=cursor_manager,
- batch_callback=batch_callback,
- file_pattern=file_pattern,
- batch_trigger_lines=batch_trigger_lines,
- batch_trigger_seconds=batch_trigger_seconds,
- )
-
- @property
- def tool_name(self) -> str:
- return "aider"
-
- def matches_file_pattern(self, file_path: str) -> bool:
- """Match .aider.chat.history.md files."""
- filename = os.path.basename(file_path)
- return filename == ".aider.chat.history.md"
-
- def parse_line(self, line: str) -> Optional[Dict]:
- """Not used - Aider uses multi-line parsing via _process_file override."""
- return None
-
- def normalize_event(self, raw_event: Dict) -> Optional[Dict]:
- """Not used directly - _process_file creates normalized events."""
- return raw_event
-
- def _process_file(self, file_path: str):
- """
- Override: Parse Aider's multi-line markdown format.
- Extracts user/assistant conversation blocks from .aider.chat.history.md.
- """
- try:
- cursor = self.cursor_manager.get_cursor(file_path)
- file_size = os.path.getsize(file_path)
-
- if file_size <= cursor.last_position:
- return
-
- with open(file_path, "r", encoding="utf-8", errors="replace") as f:
- f.seek(cursor.last_position)
- new_content = f.read()
-
- new_position = cursor.last_position + len(new_content.encode("utf-8"))
-
- # Parse conversation blocks
- events = self._parse_aider_content(new_content)
-
- for event in events:
- event["tool_name"] = self.tool_name
- byte_size = len(event.get("content", "").encode("utf-8"))
- self._buffer.add_line(event, byte_size)
-
- self.cursor_manager.update_cursor(file_path, new_position)
- self._check_batch_trigger()
-
- except Exception as e:
- logger.error("[%s] Error processing %s: %s", self.tool_name, file_path, e)
-
- def _parse_aider_content(self, content: str) -> List[Dict]:
- """Parse Aider markdown content into normalized events."""
- events = []
- lines = content.splitlines()
-
- current_role = None
- current_content = []
- current_timestamp = None
- project_name = None
-
- for line in lines:
- # Check for timestamp header
- header_match = self.HEADER_RE.match(line)
- if header_match:
- current_timestamp = header_match.group(1).strip()
- continue
-
- # Check for project path
- project_match = self.PROJECT_RE.match(line)
- if project_match:
- project_name = project_match.group(1).strip()
- continue
-
- # Check for role markers
- if self.USER_RE.match(line):
- # Flush previous block
- if current_role and current_content:
- events.append(self._make_event(current_role, current_content, current_timestamp, project_name))
- current_role = "user"
- current_content = []
- continue
-
- if self.ASSISTANT_RE.match(line):
- if current_role and current_content:
- events.append(self._make_event(current_role, current_content, current_timestamp, project_name))
- current_role = "assistant"
- current_content = []
- continue
-
- # Accumulate content for current role
- if current_role:
- current_content.append(line)
-
- # Flush last block
- if current_role and current_content:
- events.append(self._make_event(current_role, current_content, current_timestamp, project_name))
-
- return events
-
- def _make_event(self, role: str, content_lines: List[str],
- timestamp: Optional[str], project_name: Optional[str]) -> Dict:
- """Create a normalized event dict from parsed content."""
- content = "\n".join(content_lines).strip()
- return {
- "role": role,
- "content": content,
- "type": "message",
- "timestamp": timestamp,
- "project_name": project_name,
- }
diff --git a/openviking/daemon/watchers/base_file_watcher.py b/openviking/daemon/watchers/base_file_watcher.py
index 1f3e71147d..2ea3760b76 100644
--- a/openviking/daemon/watchers/base_file_watcher.py
+++ b/openviking/daemon/watchers/base_file_watcher.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
"""
Abstract base class for file-append based watchers.
Handles watchdog Observer lifecycle, cursor management, and batch buffering.
diff --git a/openviking/daemon/watchers/base_polling_watcher.py b/openviking/daemon/watchers/base_polling_watcher.py
index 29059b7a93..5e07760187 100644
--- a/openviking/daemon/watchers/base_polling_watcher.py
+++ b/openviking/daemon/watchers/base_polling_watcher.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
"""
Abstract base class for database/API-based watchers that use periodic polling.
Unlike BaseFileWatcher (watchdog + file cursor), this uses Thread + Event.wait(interval).
diff --git a/openviking/daemon/watchers/claude_code_watcher.py b/openviking/daemon/watchers/claude_code_watcher.py
index 88ffae8c87..ca8b9c4a88 100644
--- a/openviking/daemon/watchers/claude_code_watcher.py
+++ b/openviking/daemon/watchers/claude_code_watcher.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
"""
Claude Code JSONL log watcher.
Monitors ~/.claude/projects//.jsonl files.
diff --git a/openviking/daemon/watchers/continue_dev_watcher.py b/openviking/daemon/watchers/continue_dev_watcher.py
deleted file mode 100644
index 0d87b64324..0000000000
--- a/openviking/daemon/watchers/continue_dev_watcher.py
+++ /dev/null
@@ -1,68 +0,0 @@
-"""
-Continue.dev log watcher.
-Monitors ~/.continue/ JSON log files for AI conversation events.
-"""
-import json
-from typing import Dict, List, Optional
-
-from openviking.daemon.watchers.base_file_watcher import BaseFileWatcher
-from openviking.daemon.watchers.registry import register_watcher
-from openviking_cli.utils.logger import get_logger
-
-logger = get_logger(__name__)
-
-
-@register_watcher("continue_dev")
-class ContinueDevWatcher(BaseFileWatcher):
- """Watches Continue.dev JSON log files."""
-
- def __init__(self, watch_dir, cursor_manager, batch_callback,
- file_pattern="*.json",
- batch_trigger_lines=50, batch_trigger_seconds=300, **kwargs):
- super().__init__(
- watch_dir=watch_dir,
- cursor_manager=cursor_manager,
- batch_callback=batch_callback,
- file_pattern=file_pattern,
- batch_trigger_lines=batch_trigger_lines,
- batch_trigger_seconds=batch_trigger_seconds,
- )
-
- @property
- def tool_name(self) -> str:
- return "continue_dev"
-
- def parse_line(self, line: str) -> Optional[Dict]:
- """Parse a Continue.dev JSON log line."""
- if not line:
- return None
- try:
- data = json.loads(line)
- if not isinstance(data, dict):
- return None
- return data
- except (json.JSONDecodeError, ValueError):
- return None
-
- def normalize_event(self, raw_event: Dict) -> Optional[Dict]:
- """
- Normalize Continue.dev event.
- Continue.dev uses format:
- {"role": "user"|"assistant", "content": "...", "timestamp": "..."}
- """
- role = raw_event.get("role", "")
- content = raw_event.get("content", "")
-
- if role not in ("user", "assistant"):
- return None
- if not content:
- return None
-
- return {
- "role": role,
- "content": content,
- "type": "message",
- "timestamp": raw_event.get("timestamp"),
- "session_id": raw_event.get("sessionId") or raw_event.get("session_id"),
- "project_name": raw_event.get("workspaceDirectory"),
- }
diff --git a/openviking/daemon/watchers/cursor_db_watcher.py b/openviking/daemon/watchers/cursor_db_watcher.py
index 77a6444318..4796510da0 100644
--- a/openviking/daemon/watchers/cursor_db_watcher.py
+++ b/openviking/daemon/watchers/cursor_db_watcher.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
"""
Cursor IDE SQLite database watcher.
Monitors Cursor's dual-SQLite storage for AI conversations:
diff --git a/openviking/daemon/watchers/cursor_watcher.py b/openviking/daemon/watchers/cursor_watcher.py
deleted file mode 100644
index 6224fc67d7..0000000000
--- a/openviking/daemon/watchers/cursor_watcher.py
+++ /dev/null
@@ -1,87 +0,0 @@
-"""
-Cursor IDE log watcher.
-Monitors Cursor log files (JSON format) for AI conversation events.
-"""
-import json
-from typing import Dict, List, Optional
-
-from openviking.daemon.watchers.base_file_watcher import BaseFileWatcher
-from openviking.daemon.watchers.registry import register_watcher
-from openviking_cli.utils.logger import get_logger
-
-logger = get_logger(__name__)
-
-
-@register_watcher("cursor")
-class CursorWatcher(BaseFileWatcher):
- """
- Watches Cursor IDE log files.
-
- Cursor stores AI conversations in log files under its workspace storage.
- The format varies by version, so we implement flexible parsing.
- """
-
- def __init__(self, watch_dir, cursor_manager, batch_callback,
- file_pattern="*.log",
- batch_trigger_lines=50, batch_trigger_seconds=300,
- extra=None, **kwargs):
- super().__init__(
- watch_dir=watch_dir,
- cursor_manager=cursor_manager,
- batch_callback=batch_callback,
- file_pattern=file_pattern,
- batch_trigger_lines=batch_trigger_lines,
- batch_trigger_seconds=batch_trigger_seconds,
- )
- self.extra = extra or {}
-
- @property
- def tool_name(self) -> str:
- return "cursor"
-
- def parse_line(self, line: str) -> Optional[Dict]:
- """Parse a Cursor log line (JSON format)."""
- if not line:
- return None
- try:
- data = json.loads(line)
- if not isinstance(data, dict):
- return None
- return data
- except (json.JSONDecodeError, ValueError):
- return None
-
- def normalize_event(self, raw_event: Dict) -> Optional[Dict]:
- """
- Normalize Cursor log event.
- Cursor logs use various schemas. We look for common patterns:
- - {"type": "chat", "role": "user"|"assistant", "message": "..."}
- - {"event": "ai_response", "content": "..."}
- """
- # Try standard chat format
- event_type = raw_event.get("type", "")
- role = raw_event.get("role", "")
-
- if role in ("user", "human", "human_turn"):
- content = raw_event.get("message") or raw_event.get("content") or raw_event.get("text") or ""
- if content:
- return {
- "role": "user",
- "content": content,
- "type": "message",
- "timestamp": raw_event.get("timestamp") or raw_event.get("ts"),
- "session_id": raw_event.get("conversationId") or raw_event.get("session_id"),
- }
-
- if role in ("assistant", "ai", "ai_response", "bot"):
- content = raw_event.get("message") or raw_event.get("content") or raw_event.get("text") or ""
- if content:
- return {
- "role": "assistant",
- "content": content,
- "type": "message",
- "timestamp": raw_event.get("timestamp") or raw_event.get("ts"),
- "session_id": raw_event.get("conversationId") or raw_event.get("session_id"),
- }
-
- return None
diff --git a/openviking/daemon/watchers/generic_jsonl_watcher.py b/openviking/daemon/watchers/generic_jsonl_watcher.py
index 1bc1b1da62..a0607fdc2a 100644
--- a/openviking/daemon/watchers/generic_jsonl_watcher.py
+++ b/openviking/daemon/watchers/generic_jsonl_watcher.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
"""
Generic JSONL log watcher.
Supports arbitrary JSONL logs with customizable field mapping.
diff --git a/openviking/daemon/watchers/registry.py b/openviking/daemon/watchers/registry.py
index b76b9624d3..137d881d1a 100644
--- a/openviking/daemon/watchers/registry.py
+++ b/openviking/daemon/watchers/registry.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
"""
Watcher registry for creating tool-specific watchers by name.
"""
@@ -50,24 +52,6 @@ def _register_builtins():
except ImportError:
pass
- try:
- from openviking.daemon.watchers.aider_watcher import AiderWatcher
- _WATCHER_REGISTRY["aider"] = AiderWatcher
- except ImportError:
- pass
-
- try:
- from openviking.daemon.watchers.cursor_watcher import CursorWatcher
- _WATCHER_REGISTRY["cursor"] = CursorWatcher
- except ImportError:
- pass
-
- try:
- from openviking.daemon.watchers.continue_dev_watcher import ContinueDevWatcher
- _WATCHER_REGISTRY["continue_dev"] = ContinueDevWatcher
- except ImportError:
- pass
-
try:
from openviking.daemon.watchers.cursor_db_watcher import CursorDBWatcher
_WATCHER_REGISTRY["cursor_db"] = CursorDBWatcher
diff --git a/tests/daemon/test_aider_watcher.py b/tests/daemon/test_aider_watcher.py
deleted file mode 100644
index 6ea7721d08..0000000000
--- a/tests/daemon/test_aider_watcher.py
+++ /dev/null
@@ -1,183 +0,0 @@
-"""Tests for AiderWatcher parsing logic."""
-import time
-from typing import Dict, Optional
-
-from openviking.daemon.watchers.aider_watcher import AiderWatcher
-
-
-class FakeCursorManager:
- """Minimal stub for testing."""
- def __init__(self):
- self.cursors = {}
- self.updates = []
-
- def get_cursor(self, file_path):
- from openviking.daemon.models import FileCursor
- return self.cursors.get(file_path, FileCursor(file_path=file_path))
-
- def update_cursor(self, file_path, position):
- from openviking.daemon.models import FileCursor
- self.cursors[file_path] = FileCursor(
- file_path=file_path,
- last_position=position,
- last_read_time=time.time(),
- )
- self.updates.append((file_path, position))
-
-
-def _make_watcher(tmp_path, batch_trigger_lines=50, batch_trigger_seconds=300):
- batches = []
- cursor_mgr = FakeCursorManager()
- watcher = AiderWatcher(
- watch_dir=str(tmp_path),
- cursor_manager=cursor_mgr,
- batch_callback=lambda events: batches.append(events),
- batch_trigger_lines=batch_trigger_lines,
- batch_trigger_seconds=batch_trigger_seconds,
- )
- return watcher, batches, cursor_mgr
-
-
-def test_matches_file_pattern(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- assert w.matches_file_pattern("/project/.aider.chat.history.md")
- assert not w.matches_file_pattern("/project/other.md")
- assert not w.matches_file_pattern("/project/aider.chat.history.md")
-
-
-def test_parse_single_user_assistant_block(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- content = (
- "# aider chat started at 2024-01-15 10:30:00\n"
- "\n"
- "> /path/to/project\n"
- "\n"
- "#### user:\n"
- "How do I implement a REST API in Flask?\n"
- "\n"
- "#### assistant:\n"
- "Here's how to create a basic Flask REST API:\n"
- "Use Flask and add routes.\n"
- )
- events = w._parse_aider_content(content)
- assert len(events) == 2
- assert events[0]["role"] == "user"
- assert events[0]["content"] == "How do I implement a REST API in Flask?"
- assert events[1]["role"] == "assistant"
- assert "Flask REST API" in events[1]["content"]
-
-
-def test_parse_multiple_blocks(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- content = (
- "# aider chat started at 2024-01-15 10:30:00\n"
- "> /my/project\n"
- "#### user:\n"
- "First question\n"
- "#### assistant:\n"
- "First answer\n"
- "#### user:\n"
- "Second question\n"
- "#### assistant:\n"
- "Second answer\n"
- )
- events = w._parse_aider_content(content)
- assert len(events) == 4
- assert events[0]["role"] == "user"
- assert events[0]["content"] == "First question"
- assert events[1]["role"] == "assistant"
- assert events[1]["content"] == "First answer"
- assert events[2]["role"] == "user"
- assert events[2]["content"] == "Second question"
- assert events[3]["role"] == "assistant"
- assert events[3]["content"] == "Second answer"
-
-
-def test_timestamp_and_project_extraction(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- content = (
- "# aider chat started at 2024-01-15 10:30:00\n"
- "> /home/user/myproject\n"
- "#### user:\n"
- "Hello\n"
- )
- events = w._parse_aider_content(content)
- assert len(events) == 1
- assert events[0]["timestamp"] == "2024-01-15 10:30:00"
- assert events[0]["project_name"] == "/home/user/myproject"
-
-
-def test_empty_content_handling(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- events = w._parse_aider_content("")
- assert events == []
-
- events = w._parse_aider_content("# aider chat started at 2024-01-15 10:30:00\n")
- assert events == []
-
-
-def test_multiline_content(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- content = (
- "#### user:\n"
- "Line one\n"
- "Line two\n"
- "Line three\n"
- )
- events = w._parse_aider_content(content)
- assert len(events) == 1
- assert events[0]["content"] == "Line one\nLine two\nLine three"
-
-
-def test_process_file_integration(tmp_path):
- w, batches, _ = _make_watcher(tmp_path, batch_trigger_lines=2)
-
- test_file = tmp_path / ".aider.chat.history.md"
- test_file.write_text(
- "# aider chat started at 2024-01-15 10:30:00\n"
- "> /project\n"
- "#### user:\n"
- "Hello\n"
- "#### assistant:\n"
- "Hi there\n",
- encoding="utf-8",
- )
-
- w._process_file(str(test_file))
-
- assert len(batches) == 1
- assert len(batches[0]) == 2
- assert batches[0][0]["role"] == "user"
- assert batches[0][0]["tool_name"] == "aider"
- assert batches[0][1]["role"] == "assistant"
-
-
-def test_incremental_read(tmp_path):
- w, batches, _ = _make_watcher(tmp_path, batch_trigger_lines=100)
-
- test_file = tmp_path / ".aider.chat.history.md"
- test_file.write_text(
- "#### user:\n"
- "First message\n",
- encoding="utf-8",
- )
- w._process_file(str(test_file))
-
- # Append more content
- with open(str(test_file), "a", encoding="utf-8") as f:
- f.write(
- "#### assistant:\n"
- "Response\n"
- )
- w._process_file(str(test_file))
-
- w.flush()
- assert len(batches) == 1
- assert len(batches[0]) == 2
- assert batches[0][0]["role"] == "user"
- assert batches[0][1]["role"] == "assistant"
-
-
-def test_tool_name(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- assert w.tool_name == "aider"
diff --git a/tests/daemon/test_continue_dev_watcher.py b/tests/daemon/test_continue_dev_watcher.py
deleted file mode 100644
index 2a6e01dc14..0000000000
--- a/tests/daemon/test_continue_dev_watcher.py
+++ /dev/null
@@ -1,193 +0,0 @@
-"""Tests for ContinueDevWatcher parsing and normalization logic."""
-import time
-from typing import Dict, Optional
-
-from openviking.daemon.watchers.continue_dev_watcher import ContinueDevWatcher
-
-
-class FakeCursorManager:
- """Minimal stub for testing."""
- def __init__(self):
- self.cursors = {}
- self.updates = []
-
- def get_cursor(self, file_path):
- from openviking.daemon.models import FileCursor
- return self.cursors.get(file_path, FileCursor(file_path=file_path))
-
- def update_cursor(self, file_path, position):
- from openviking.daemon.models import FileCursor
- self.cursors[file_path] = FileCursor(
- file_path=file_path,
- last_position=position,
- last_read_time=time.time(),
- )
- self.updates.append((file_path, position))
-
-
-def _make_watcher(tmp_path, batch_trigger_lines=50, batch_trigger_seconds=300):
- batches = []
- cursor_mgr = FakeCursorManager()
- watcher = ContinueDevWatcher(
- watch_dir=str(tmp_path),
- cursor_manager=cursor_mgr,
- batch_callback=lambda events: batches.append(events),
- batch_trigger_lines=batch_trigger_lines,
- batch_trigger_seconds=batch_trigger_seconds,
- )
- return watcher, batches, cursor_mgr
-
-
-def test_parse_line_valid_json(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- line = '{"role": "user", "content": "Hello"}'
- result = w.parse_line(line)
- assert result is not None
- assert result["role"] == "user"
- assert result["content"] == "Hello"
-
-
-def test_parse_line_invalid_json(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- assert w.parse_line("not valid json") is None
- assert w.parse_line("") is None
- assert w.parse_line(" ") is None
-
-
-def test_parse_line_non_dict(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- assert w.parse_line('"just a string"') is None
- assert w.parse_line("[1, 2, 3]") is None
- assert w.parse_line("42") is None
-
-
-def test_normalize_standard_format(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- raw = {"role": "user", "content": "How do I sort a list?"}
- event = w.normalize_event(raw)
- assert event is not None
- assert event["role"] == "user"
- assert event["content"] == "How do I sort a list?"
- assert event["type"] == "message"
-
-
-def test_normalize_assistant(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- raw = {"role": "assistant", "content": "Use the sorted() function."}
- event = w.normalize_event(raw)
- assert event is not None
- assert event["role"] == "assistant"
- assert event["content"] == "Use the sorted() function."
-
-
-def test_normalize_missing_content_filtered(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- # Empty content
- raw = {"role": "user", "content": ""}
- assert w.normalize_event(raw) is None
-
- # Missing content key entirely
- raw = {"role": "user"}
- assert w.normalize_event(raw) is None
-
-
-def test_normalize_invalid_role_filtered(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- raw = {"role": "system", "content": "System message"}
- assert w.normalize_event(raw) is None
-
- raw = {"role": "tool", "content": "Tool output"}
- assert w.normalize_event(raw) is None
-
- raw = {"content": "No role at all"}
- assert w.normalize_event(raw) is None
-
-
-def test_normalize_session_id_mapping(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- # sessionId field
- raw = {"role": "user", "content": "test", "sessionId": "session-abc-123"}
- event = w.normalize_event(raw)
- assert event is not None
- assert event["session_id"] == "session-abc-123"
-
- # session_id field (alternative)
- raw = {"role": "user", "content": "test", "session_id": "session-xyz-789"}
- event = w.normalize_event(raw)
- assert event is not None
- assert event["session_id"] == "session-xyz-789"
-
-
-def test_normalize_workspace_directory_to_project_name(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- raw = {
- "role": "user",
- "content": "test",
- "workspaceDirectory": "/home/user/my-project",
- }
- event = w.normalize_event(raw)
- assert event is not None
- assert event["project_name"] == "/home/user/my-project"
-
-
-def test_normalize_timestamp(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- raw = {"role": "user", "content": "test", "timestamp": "2024-01-15T10:30:00Z"}
- event = w.normalize_event(raw)
- assert event is not None
- assert event["timestamp"] == "2024-01-15T10:30:00Z"
-
-
-def test_normalize_optional_fields_none(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- raw = {"role": "user", "content": "minimal event"}
- event = w.normalize_event(raw)
- assert event is not None
- assert event["timestamp"] is None
- assert event["session_id"] is None
- assert event["project_name"] is None
-
-
-def test_tool_name(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- assert w.tool_name == "continue_dev"
-
-
-def test_process_file_integration(tmp_path):
- w, batches, _ = _make_watcher(tmp_path, batch_trigger_lines=2)
-
- test_file = tmp_path / "continue.json"
- test_file.write_text(
- '{"role": "user", "content": "Hello"}\n'
- '{"role": "assistant", "content": "Hi there"}\n',
- encoding="utf-8",
- )
-
- w._process_file(str(test_file))
-
- assert len(batches) == 1
- assert len(batches[0]) == 2
- assert batches[0][0]["role"] == "user"
- assert batches[0][0]["tool_name"] == "continue_dev"
- assert batches[0][1]["role"] == "assistant"
-
-
-def test_process_file_filters_non_chat(tmp_path):
- w, batches, _ = _make_watcher(tmp_path, batch_trigger_lines=100)
-
- test_file = tmp_path / "continue.json"
- test_file.write_text(
- '{"role": "user", "content": "Hello"}\n'
- '{"role": "system", "content": "System prompt"}\n'
- '{"role": "assistant", "content": "Hi"}\n'
- 'not json at all\n',
- encoding="utf-8",
- )
-
- w._process_file(str(test_file))
- w.flush()
-
- assert len(batches) == 1
- assert len(batches[0]) == 2 # only user + assistant, system filtered
- assert batches[0][0]["role"] == "user"
- assert batches[0][1]["role"] == "assistant"
diff --git a/tests/daemon/test_cursor_watcher.py b/tests/daemon/test_cursor_watcher.py
deleted file mode 100644
index e3982608f4..0000000000
--- a/tests/daemon/test_cursor_watcher.py
+++ /dev/null
@@ -1,187 +0,0 @@
-"""Tests for CursorWatcher parsing and normalization logic."""
-import time
-from typing import Dict, Optional
-
-from openviking.daemon.watchers.cursor_watcher import CursorWatcher
-
-
-class FakeCursorManager:
- """Minimal stub for testing."""
- def __init__(self):
- self.cursors = {}
- self.updates = []
-
- def get_cursor(self, file_path):
- from openviking.daemon.models import FileCursor
- return self.cursors.get(file_path, FileCursor(file_path=file_path))
-
- def update_cursor(self, file_path, position):
- from openviking.daemon.models import FileCursor
- self.cursors[file_path] = FileCursor(
- file_path=file_path,
- last_position=position,
- last_read_time=time.time(),
- )
- self.updates.append((file_path, position))
-
-
-def _make_watcher(tmp_path, batch_trigger_lines=50, batch_trigger_seconds=300):
- batches = []
- cursor_mgr = FakeCursorManager()
- watcher = CursorWatcher(
- watch_dir=str(tmp_path),
- cursor_manager=cursor_mgr,
- batch_callback=lambda events: batches.append(events),
- batch_trigger_lines=batch_trigger_lines,
- batch_trigger_seconds=batch_trigger_seconds,
- )
- return watcher, batches, cursor_mgr
-
-
-def test_parse_line_valid_json(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- line = '{"type": "chat", "role": "user", "message": "Hello"}'
- result = w.parse_line(line)
- assert result is not None
- assert result["role"] == "user"
- assert result["message"] == "Hello"
-
-
-def test_parse_line_invalid_json(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- assert w.parse_line("not valid json") is None
- assert w.parse_line("") is None
- assert w.parse_line(" ") is None
-
-
-def test_parse_line_non_dict(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- assert w.parse_line('"just a string"') is None
- assert w.parse_line("[1, 2, 3]") is None
- assert w.parse_line("42") is None
-
-
-def test_normalize_standard_chat_format(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- raw = {"type": "chat", "role": "user", "message": "How do I sort a list?"}
- event = w.normalize_event(raw)
- assert event is not None
- assert event["role"] == "user"
- assert event["content"] == "How do I sort a list?"
- assert event["type"] == "message"
-
-
-def test_normalize_human_role(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- raw = {"role": "human", "content": "What is Python?"}
- event = w.normalize_event(raw)
- assert event is not None
- assert event["role"] == "user"
- assert event["content"] == "What is Python?"
-
-
-def test_normalize_human_turn_role(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- raw = {"role": "human_turn", "text": "Explain recursion"}
- event = w.normalize_event(raw)
- assert event is not None
- assert event["role"] == "user"
- assert event["content"] == "Explain recursion"
-
-
-def test_normalize_ai_response(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- raw = {"role": "ai_response", "text": "Recursion is when a function calls itself."}
- event = w.normalize_event(raw)
- assert event is not None
- assert event["role"] == "assistant"
- assert event["content"] == "Recursion is when a function calls itself."
-
-
-def test_normalize_assistant_role(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- raw = {"role": "assistant", "message": "Here is the answer."}
- event = w.normalize_event(raw)
- assert event is not None
- assert event["role"] == "assistant"
- assert event["content"] == "Here is the answer."
-
-
-def test_normalize_ai_role(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- raw = {"role": "ai", "content": "AI generated response"}
- event = w.normalize_event(raw)
- assert event is not None
- assert event["role"] == "assistant"
- assert event["content"] == "AI generated response"
-
-
-def test_normalize_bot_role(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- raw = {"role": "bot", "message": "Bot reply"}
- event = w.normalize_event(raw)
- assert event is not None
- assert event["role"] == "assistant"
- assert event["content"] == "Bot reply"
-
-
-def test_normalize_non_chat_event_filtered(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- # System message should be filtered
- raw = {"role": "system", "content": "System prompt"}
- assert w.normalize_event(raw) is None
-
- # No role
- raw = {"type": "info", "content": "some log"}
- assert w.normalize_event(raw) is None
-
- # Empty content
- raw = {"role": "user", "message": ""}
- assert w.normalize_event(raw) is None
-
-
-def test_normalize_timestamp_fields(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- raw = {"role": "user", "message": "test", "timestamp": "2024-01-15T10:30:00Z"}
- event = w.normalize_event(raw)
- assert event["timestamp"] == "2024-01-15T10:30:00Z"
-
- # Also check ts field
- raw = {"role": "user", "message": "test", "ts": "2024-01-15T11:00:00Z"}
- event = w.normalize_event(raw)
- assert event["timestamp"] == "2024-01-15T11:00:00Z"
-
-
-def test_normalize_session_id_mapping(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- raw = {"role": "user", "message": "test", "conversationId": "conv-123"}
- event = w.normalize_event(raw)
- assert event["session_id"] == "conv-123"
-
- raw = {"role": "user", "message": "test", "session_id": "sess-456"}
- event = w.normalize_event(raw)
- assert event["session_id"] == "sess-456"
-
-
-def test_tool_name(tmp_path):
- w, _, _ = _make_watcher(tmp_path)
- assert w.tool_name == "cursor"
-
-
-def test_process_file_integration(tmp_path):
- w, batches, _ = _make_watcher(tmp_path, batch_trigger_lines=2)
-
- test_file = tmp_path / "cursor.log"
- test_file.write_text(
- '{"role": "user", "message": "Hello"}\n'
- '{"role": "assistant", "message": "Hi there"}\n',
- encoding="utf-8",
- )
-
- w._process_file(str(test_file))
-
- assert len(batches) == 1
- assert len(batches[0]) == 2
- assert batches[0][0]["role"] == "user"
- assert batches[0][0]["tool_name"] == "cursor"
- assert batches[0][1]["role"] == "assistant"
diff --git a/tests/daemon/test_multi_watcher_integration.py b/tests/daemon/test_multi_watcher_integration.py
index ad641c39d9..23ffe2f95c 100644
--- a/tests/daemon/test_multi_watcher_integration.py
+++ b/tests/daemon/test_multi_watcher_integration.py
@@ -33,9 +33,9 @@ def update_cursor(self, file_path, position):
# --- Registry Tests ---
def test_all_watchers_registered():
- """All 6 built-in watchers should be registered."""
+ """All 3 built-in watchers should be registered."""
available = list_available_watchers()
- expected = {"claude_code", "generic_jsonl", "aider", "cursor", "continue_dev", "cursor_db"}
+ expected = {"claude_code", "generic_jsonl", "cursor_db"}
assert expected.issubset(set(available)), f"Missing: {expected - set(available)}"
@@ -47,19 +47,20 @@ def test_create_all_watchers():
file_patterns = {
"claude_code": "*.jsonl",
"generic_jsonl": "*.jsonl",
- "aider": ".aider.chat.history.md",
- "cursor": "*.log",
- "continue_dev": "*.json",
}
for tool_name in list_available_watchers():
- watcher = create_watcher(
+ kwargs = dict(
tool_name=tool_name,
watch_dir="/tmp/test",
cursor_manager=cm,
batch_callback=lambda e: batches.append(e),
- file_pattern=file_patterns.get(tool_name, "*.jsonl"),
)
+ if tool_name in file_patterns:
+ kwargs["file_pattern"] = file_patterns[tool_name]
+ if tool_name == "cursor_db":
+ kwargs["poll_interval"] = 60
+ watcher = create_watcher(**kwargs)
assert isinstance(watcher, BaseWatcher)
assert watcher.tool_name == tool_name
@@ -128,88 +129,6 @@ def test_generic_jsonl_custom_mapping(tmp_path):
assert batches[0][0]["tool_name"] == "generic_jsonl"
-def test_aider_multiline_parsing(tmp_path):
- """Aider markdown format should produce user+assistant events."""
- batches = []
- cm = FakeCursorManager()
- watcher = create_watcher(
- tool_name="aider",
- watch_dir=str(tmp_path),
- cursor_manager=cm,
- batch_callback=lambda e: batches.append(e),
- batch_trigger_lines=100,
- )
-
- test_file = tmp_path / ".aider.chat.history.md"
- test_file.write_text(
- "# aider chat started at 2026-01-15 10:30:00\n\n"
- "> /my/project\n\n"
- "#### user:\n"
- "How do I use fastapi?\n\n"
- "#### assistant:\n"
- "FastAPI is great for building APIs.\n"
- )
- watcher._process_file(str(test_file))
- watcher.flush()
-
- assert len(batches) == 1
- events = batches[0]
- assert len(events) == 2
- assert events[0]["role"] == "user"
- assert events[0]["content"] == "How do I use fastapi?"
- assert events[0]["tool_name"] == "aider"
- assert events[1]["role"] == "assistant"
-
-
-def test_cursor_log_parsing(tmp_path):
- """Cursor log format should be normalized correctly."""
- batches = []
- cm = FakeCursorManager()
- watcher = create_watcher(
- tool_name="cursor",
- watch_dir=str(tmp_path),
- cursor_manager=cm,
- batch_callback=lambda e: batches.append(e),
- file_pattern="*.log",
- batch_trigger_lines=1,
- )
-
- test_file = tmp_path / "cursor.log"
- test_file.write_text(
- '{"type": "chat", "role": "user", "message": "Fix this bug"}\n'
- )
- watcher._process_file(str(test_file))
-
- assert len(batches) == 1
- assert batches[0][0]["role"] == "user"
- assert batches[0][0]["content"] == "Fix this bug"
- assert batches[0][0]["tool_name"] == "cursor"
-
-
-def test_continue_dev_log_parsing(tmp_path):
- """Continue.dev JSON format should be normalized."""
- batches = []
- cm = FakeCursorManager()
- watcher = create_watcher(
- tool_name="continue_dev",
- watch_dir=str(tmp_path),
- cursor_manager=cm,
- batch_callback=lambda e: batches.append(e),
- file_pattern="*.json",
- batch_trigger_lines=1,
- )
-
- test_file = tmp_path / "continue.json"
- test_file.write_text(
- '{"role": "user", "content": "Explain this code", "sessionId": "abc123"}\n'
- )
- watcher._process_file(str(test_file))
-
- assert len(batches) == 1
- assert batches[0][0]["role"] == "user"
- assert batches[0][0]["tool_name"] == "continue_dev"
- assert batches[0][0]["session_id"] == "abc123"
-
# --- Cross-Watcher ETL Compatibility ---
@@ -222,9 +141,9 @@ def test_normalized_events_compatible_with_reconstructor():
"timestamp": "2026-01-15T10:00:00Z", "session_id": "s1", "project_name": "proj"},
{"role": "assistant", "content": "Answer from CC", "tool_name": "claude_code",
"timestamp": "2026-01-15T10:00:01Z", "session_id": "s1", "project_name": "proj"},
- {"role": "user", "content": "Question from Aider", "tool_name": "aider",
+ {"role": "user", "content": "Question from CursorDB", "tool_name": "cursor_db",
"timestamp": "2026-01-15T10:00:02Z"},
- {"role": "assistant", "content": "Answer from Aider", "tool_name": "aider",
+ {"role": "assistant", "content": "Answer from CursorDB", "tool_name": "cursor_db",
"timestamp": "2026-01-15T10:00:03Z"},
]
@@ -233,7 +152,7 @@ def test_normalized_events_compatible_with_reconstructor():
assert len(turns) == 2
assert turns[0].user_prompt == "Question from CC"
- assert turns[1].user_prompt == "Question from Aider"
+ assert turns[1].user_prompt == "Question from CursorDB"
def test_normalized_events_compatible_with_filter():
@@ -244,9 +163,9 @@ def test_normalized_events_compatible_with_filter():
{"role": "user", "content": "A meaningful question about architecture",
"tool_name": "claude_code"},
{"role": "user", "content": "npm install express",
- "tool_name": "cursor"},
+ "tool_name": "cursor_db"},
{"role": "assistant", "content": "Here is a detailed explanation of the design pattern",
- "tool_name": "aider"},
+ "tool_name": "generic_jsonl"},
]
f = LowValueFilter()
@@ -261,9 +180,9 @@ def test_source_tool_propagated_through_pipeline():
from openviking.daemon.conversation_reconstructor import ConversationReconstructor
events = [
- {"role": "user", "content": "How to use FastAPI?", "tool_name": "cursor",
+ {"role": "user", "content": "How to use FastAPI?", "tool_name": "cursor_db",
"timestamp": "2026-01-15T10:00:00Z"},
- {"role": "assistant", "content": "Install FastAPI with pip...", "tool_name": "cursor",
+ {"role": "assistant", "content": "Install FastAPI with pip...", "tool_name": "cursor_db",
"timestamp": "2026-01-15T10:00:01Z"},
]
@@ -271,7 +190,7 @@ def test_source_tool_propagated_through_pipeline():
turns = reconstructor.reconstruct(events)
assert len(turns) == 1
- assert turns[0].source_tool == "cursor"
+ assert turns[0].source_tool == "cursor_db"
def test_multi_watcher_config_effective_watchers():
@@ -283,14 +202,14 @@ def test_multi_watcher_config_effective_watchers():
enabled=True,
watchers=[
WatcherConfig(tool_name="claude_code", watch_dir="/a"),
- WatcherConfig(tool_name="aider", watch_dir="/b"),
- WatcherConfig(tool_name="cursor", watch_dir="/c", enabled=False),
+ WatcherConfig(tool_name="generic_jsonl", watch_dir="/b"),
+ WatcherConfig(tool_name="cursor_db", watch_dir="/c", enabled=False),
],
)
effective = cfg.get_effective_watchers()
assert len(effective) == 2 # disabled watcher filtered out
assert effective[0].tool_name == "claude_code"
- assert effective[1].tool_name == "aider"
+ assert effective[1].tool_name == "generic_jsonl"
# Backward compat: watch_dir only
cfg2 = DaemonConfig(enabled=True, watch_dir="~/.claude/projects")
@@ -315,10 +234,10 @@ def test_knowledge_router_uses_source_tool():
# With source_tool
k1 = ExtractedKnowledge(
status="EXTRACTED", category="skills", title="FastAPI Tips",
- content="...", confidence=0.9, source_tool="cursor",
+ content="...", confidence=0.9, source_tool="cursor_db",
)
uri1 = router.route(k1)
- assert "cursor" in uri1
+ assert "cursor_db" in uri1
assert "claude_code" not in uri1
# Without source_tool (fallback)
diff --git a/tests/daemon/test_service_multi.py b/tests/daemon/test_service_multi.py
index 60fe888eff..a84b997cdd 100644
--- a/tests/daemon/test_service_multi.py
+++ b/tests/daemon/test_service_multi.py
@@ -9,7 +9,7 @@ def test_service_stores_watcher_configs():
from openviking.daemon.service import DaemonService
configs = [
WatcherConfig(tool_name="claude_code", watch_dir="/tmp/cc"),
- WatcherConfig(tool_name="aider", watch_dir="/tmp/aider"),
+ WatcherConfig(tool_name="cursor_db", watch_dir="/tmp/cursor"),
]
svc = DaemonService(
resource_service=MagicMock(),
From 3b8a63d0cbda66cb06efc2e24b114af3d07fb6a1 Mon Sep 17 00:00:00 2001
From: baobaodae <2014596548@qq.com>
Date: Thu, 25 Jun 2026 21:57:23 +0800
Subject: [PATCH 6/6] fix(daemon): resolve CRLF cursor drift, file truncation,
and quiet-session flush gap
- Switch _process_file() to binary mode read for exact byte-offset cursors,
eliminating CRLF drift on Windows (universal newline translation stripped \r)
- Detect file truncation/rotation (file_size < cursor) and reset cursor to 0
instead of silently skipping the file forever
- Add independent periodic flush timer thread so buffered events are flushed
after batch_trigger_seconds even when no new file events arrive
- Ensure final flush on stop() to prevent data loss during shutdown
- Add 3 new tests: CRLF accuracy, truncation reset, periodic flush on quiet session
---
.../daemon/watchers/base_file_watcher.py | 50 ++++++++--
tests/daemon/test_base_file_watcher.py | 95 +++++++++++++++++++
2 files changed, 138 insertions(+), 7 deletions(-)
diff --git a/openviking/daemon/watchers/base_file_watcher.py b/openviking/daemon/watchers/base_file_watcher.py
index 2ea3760b76..4f62408c8d 100644
--- a/openviking/daemon/watchers/base_file_watcher.py
+++ b/openviking/daemon/watchers/base_file_watcher.py
@@ -6,6 +6,7 @@
Subclasses only need to implement parse_line() and normalize_event().
"""
import os
+import threading
import time
from abc import ABC, abstractmethod
from pathlib import Path
@@ -112,16 +113,33 @@ def matches_file_pattern(self, file_path: str) -> bool:
return filename == self.file_pattern
def start(self) -> None:
- """Start the watchdog Observer."""
+ """Start the watchdog Observer and periodic flush timer."""
self._handler = _FileHandler(self)
self._observer = Observer()
self._observer.schedule(self._handler, self.watch_dir, recursive=True)
self._observer.daemon = True
self._observer.start()
+
+ # Periodic flush timer for time-based batch trigger
+ self._stop_event = threading.Event()
+ self._flush_thread = threading.Thread(
+ target=self._periodic_flush_loop, daemon=True
+ )
+ self._flush_thread.start()
+
logger.info("[%s] Watcher started on %s", self.tool_name, self.watch_dir)
def stop(self) -> None:
- """Stop the watchdog Observer."""
+ """Stop the watchdog Observer and periodic flush timer, flushing remaining data."""
+ # Signal flush thread to stop and wait for it
+ if hasattr(self, "_stop_event"):
+ self._stop_event.set()
+ if hasattr(self, "_flush_thread") and self._flush_thread:
+ self._flush_thread.join(timeout=5)
+
+ # Final flush to avoid losing buffered events on shutdown
+ self._flush_buffer()
+
if self._observer:
self._observer.stop()
self._observer.join(timeout=5)
@@ -131,23 +149,41 @@ def flush(self) -> None:
"""Force flush the buffer."""
self._flush_buffer()
+ def _periodic_flush_loop(self):
+ """Background thread that periodically flushes the buffer based on time threshold."""
+ while not self._stop_event.wait(timeout=self.batch_trigger_seconds):
+ if not self._buffer.is_empty():
+ age = time.time() - self._buffer.created_at
+ if age >= self.batch_trigger_seconds:
+ logger.debug("[%s] Periodic flush: %d events (age %.0fs)",
+ self.tool_name, len(self._buffer.lines), age)
+ self._flush_buffer()
+
def _process_file(self, file_path: str):
"""Read new content from file using cursor, parse, normalize, buffer."""
try:
cursor = self.cursor_manager.get_cursor(file_path)
file_size = os.path.getsize(file_path)
- if file_size <= cursor.last_position:
+ if file_size < cursor.last_position:
+ # File truncated or rotated — reset cursor to beginning
+ logger.warning("[%s] File truncated (size %d < cursor %d), resetting",
+ self.tool_name, file_size, cursor.last_position)
+ self.cursor_manager.update_cursor(file_path, 0)
+ return
+ if file_size == cursor.last_position:
return
- with open(file_path, "r", encoding="utf-8", errors="replace") as f:
+ # Use binary mode to get exact byte offsets (avoids CRLF drift on Windows)
+ with open(file_path, "rb") as f:
f.seek(cursor.last_position)
- new_content = f.read()
+ raw_bytes = f.read()
- new_position = cursor.last_position + len(new_content.encode("utf-8"))
+ new_position = cursor.last_position + len(raw_bytes)
+ new_content = raw_bytes.decode("utf-8", errors="replace")
logger.info("[%s] Processing %s: %d bytes new content from pos %d",
- self.tool_name, file_path, len(new_content.encode("utf-8")), cursor.last_position)
+ self.tool_name, file_path, len(raw_bytes), cursor.last_position)
event_count = 0
for line in new_content.splitlines():
diff --git a/tests/daemon/test_base_file_watcher.py b/tests/daemon/test_base_file_watcher.py
index badefba21f..8ab404435b 100644
--- a/tests/daemon/test_base_file_watcher.py
+++ b/tests/daemon/test_base_file_watcher.py
@@ -161,3 +161,98 @@ def filter_event(self, event):
assert len(batches) == 1
assert len(batches[0]) == 1
assert batches[0][0]["content"] == "keep this"
+
+
+def test_crlf_byte_offset_accuracy(tmp_path):
+ """Binary mode read ensures cursor byte offset is exact even with CRLF line endings."""
+ w, batches, cm = _make_watcher(tmp_path, batch_trigger_lines=100)
+
+ test_file = tmp_path / "test.jsonl"
+ # Write CRLF-terminated lines (simulating Windows line endings)
+ with open(str(test_file), "wb") as f:
+ f.write(b'{"role": "user", "content": "hello"}\r\n')
+ f.write(b'{"role": "assistant", "content": "hi"}\r\n')
+
+ w._process_file(str(test_file))
+
+ # Cursor should point to exact end of file (including \r\n bytes)
+ cursor = cm.get_cursor(str(test_file))
+ actual_size = os.path.getsize(str(test_file))
+ assert cursor.last_position == actual_size, (
+ f"Cursor {cursor.last_position} != file size {actual_size} (CRLF drift)"
+ )
+
+ # Append more and verify incremental read still works
+ with open(str(test_file), "ab") as f:
+ f.write(b'{"role": "user", "content": "second"}\r\n')
+ w._process_file(str(test_file))
+
+ cursor2 = cm.get_cursor(str(test_file))
+ assert cursor2.last_position == os.path.getsize(str(test_file))
+
+ w.flush()
+ assert len(batches) == 1
+ assert len(batches[0]) == 3
+
+
+def test_file_truncation_resets_cursor(tmp_path):
+ """When a file is truncated/rotated (size < cursor), cursor resets to 0."""
+ w, batches, cm = _make_watcher(tmp_path, batch_trigger_lines=100)
+
+ test_file = tmp_path / "test.jsonl"
+ test_file.write_text(
+ '{"role": "user", "content": "first line"}\n'
+ '{"role": "assistant", "content": "first response"}\n'
+ )
+ w._process_file(str(test_file))
+
+ # Cursor should be at end of file
+ cursor = cm.get_cursor(str(test_file))
+ assert cursor.last_position > 0
+
+ # Flush to clear buffer from first read
+ w.flush()
+ batches.clear()
+
+ # Simulate file truncation/rotation: rewrite with shorter content
+ test_file.write_text('{"role": "user", "content": "new"}\n')
+
+ # First call detects truncation and resets cursor to 0
+ w._process_file(str(test_file))
+ cursor_reset = cm.get_cursor(str(test_file))
+ assert cursor_reset.last_position == 0
+
+ # Second call reads from beginning
+ w._process_file(str(test_file))
+ cursor2 = cm.get_cursor(str(test_file))
+ assert cursor2.last_position == os.path.getsize(str(test_file))
+
+ w.flush()
+ assert len(batches) == 1
+ assert batches[0][0]["content"] == "new"
+
+
+def test_periodic_flush_on_quiet_session(tmp_path):
+    """The periodic flush thread must deliver buffered events on its own, before stop()."""
+    w, batches, _ = _make_watcher(
+        tmp_path, batch_trigger_lines=100, batch_trigger_seconds=1
+    )
+
+    test_file = tmp_path / "test.jsonl"
+    test_file.write_text('{"role": "user", "content": "lonely message"}\n')
+    w._process_file(str(test_file))
+
+    # Not flushed yet (line threshold not reached)
+    assert len(batches) == 0
+
+    # Start the watcher (launches periodic flush thread)
+    w.start()
+    try:
+        # Wait for the periodic flush (1 second trigger + buffer)
+        time.sleep(2.5)
+        # Assert BEFORE stop(): stop() performs a final flush of its own, which would
+        # deliver the batch even if the periodic flush thread never ran.
+        assert len(batches) == 1
+        assert batches[0][0]["content"] == "lonely message"
+    finally:
+        w.stop()