-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnotion_sync.py
More file actions
executable file
·789 lines (669 loc) · 31 KB
/
notion_sync.py
File metadata and controls
executable file
·789 lines (669 loc) · 31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
#!/usr/bin/env python3
"""
MEMORY.md → Notion Knowledge Base Sync

Automatically syncs local OpenClaw memory files (MEMORY.md and the daily
memory/<date>.md notes) to a Notion knowledge-base database.
"""
import argparse
import hashlib
import html
import json
import logging
import os
import re
import subprocess
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
# Configuration
NOTION_KEY_PATH = os.path.expanduser('~/.config/notion/api_key')
WORKSPACE = Path('/home/ubuntu/.openclaw/workspace')
MEMORY_DIR = WORKSPACE / 'memory'
MEMORY_FILE = WORKSPACE / 'MEMORY.md'
# Notion database ID (read from the environment, or hard-code it here)
NOTION_DATABASE_ID = os.getenv('NOTION_DATABASE_ID', 'YOUR_NOTION_DATABASE_ID_HERE')
# Notion property limits (rich_text values are truncated to 2000 chars below;
# presumably matching the Notion API's per-value cap — TODO confirm)
MAX_BODY_LENGTH = 2000
MAX_TITLE_LENGTH = 100
MAX_TAGS = 7
# Logging setup
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s [%(levelname)s] %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)
class NotionClient:
    """Thin Notion REST API client that shells out to the ``curl`` binary.

    Every call goes through :meth:`_curl_request`, which returns the parsed
    JSON response dict, or ``None`` on any transport / decode failure.
    """

    def __init__(self, api_key: str, database_id: str):
        self.api_key = api_key
        # Normalize to the hyphenated UUID form up front.
        self.database_id = self._format_uuid(database_id)

    def _format_uuid(self, id_str: str) -> str:
        """Convert a 32-char hex ID to hyphenated UUID format if needed."""
        if '-' in id_str:
            return id_str
        if len(id_str) == 32:
            return f"{id_str[:8]}-{id_str[8:12]}-{id_str[12:16]}-{id_str[16:20]}-{id_str[20:]}"
        return id_str

    def _curl_request(self, method: str, endpoint: str, data: Optional[Dict] = None) -> Optional[Dict]:
        """Run one Notion API request via curl; return the decoded JSON body.

        Returns None when curl fails, times out, or the body is not JSON.
        """
        cmd = [
            "curl", "-s", "-X", method, endpoint,
            "-H", f"Authorization: Bearer {self.api_key}",
            "-H", "Notion-Version: 2022-06-28",
            "-H", "Content-Type: application/json",
        ]
        if data:
            cmd.extend(["-d", json.dumps(data)])
        try:
            # Without a timeout, one hung connection blocks the entire sync run.
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
        except subprocess.TimeoutExpired:
            logger.error(f"curl request timed out: {method} {endpoint}")
            return None
        if result.returncode != 0:
            err = result.stderr.strip() if result.stderr else "Unknown error"
            logger.error(f"curl error (code {result.returncode}): {err}")
            return None
        try:
            return json.loads(result.stdout)
        except json.JSONDecodeError:
            logger.error(f"JSON decode error: stdout={result.stdout[:500]}, stderr={result.stderr[:500]}")
            return None

    def create_page(self, properties: Dict, children: Optional[List[Dict]] = None) -> Optional[str]:
        """Create a page in the database; return its page ID, or None on failure."""
        payload = {"parent": {"database_id": self.database_id}, "properties": properties}
        if children:
            payload["children"] = children
        resp = self._curl_request("POST", "https://api.notion.com/v1/pages", payload)
        if resp and resp.get("object") == "page":
            page_id = resp.get("id")
            logger.info(f"✅ Created page: {page_id}")
            return page_id
        logger.error(f"Failed to create page: {resp.get('message') if resp else 'No response'}")
        return None

    def update_page(self, page_id: str, properties: Optional[Dict] = None) -> bool:
        """Patch page properties; a no-op (returns True) when none are given."""
        if not properties:
            return True
        payload = {"properties": properties}
        resp = self._curl_request("PATCH", f"https://api.notion.com/v1/pages/{page_id}", payload)
        # bool() so callers always get True/False, never None (matches the annotation;
        # the old `resp and ...` leaked None when the request failed).
        return bool(resp and resp.get("object") == "page")

    def query_by_source_file(self, source_file: str) -> Optional[str]:
        """Find a page by its "Source File" property (deprecated — local state is used instead)."""
        query = {
            "filter": {
                "property": "Source File",
                "rich_text": {"contains": source_file}
            }
        }
        resp = self._curl_request("POST", f"https://api.notion.com/v1/databases/{self.database_id}/query", query)
        if resp and resp.get("results"):
            return resp["results"][0].get("id")
        return None

    def archive_page(self, page_id: str) -> bool:
        """Archive (delete) a page by setting archived=True."""
        payload = {"archived": True}
        resp = self._curl_request("PATCH", f"https://api.notion.com/v1/pages/{page_id}", payload)
        if resp and resp.get("object") == "page":
            logger.info(f"✅ Archived page: {page_id}")
            return True
        err_msg = resp.get('message', 'Unknown error') if resp else 'No response'
        logger.error(f"❌ Failed to archive page {page_id}: {err_msg}")
        return False

    def list_all_pages(self) -> List[Dict]:
        """Return every page in the database, following pagination cursors."""
        pages: List[Dict] = []
        cursor = None
        logger.debug(f"Querying database {self.database_id} for all pages")
        while True:
            payload = {"page_size": 100}
            if cursor:
                payload["start_cursor"] = cursor
            resp = self._curl_request("POST", f"https://api.notion.com/v1/databases/{self.database_id}/query", payload)
            if resp is None:
                logger.error("No response from query (resp is None)")
                break
            if not isinstance(resp, dict):
                logger.error(f"Invalid response type: {type(resp)}")
                break
            if resp.get("object") == "error":
                logger.error(f"Notion API error: {resp.get('code')} - {resp.get('message')}")
                break
            results = resp.get("results", [])
            logger.debug(f"Query returned {len(results)} pages in this batch")
            pages.extend(results)
            if not resp.get("has_more"):
                break
            cursor = resp.get("next_cursor")
        logger.info(f"Total pages retrieved: {len(pages)}")
        return pages
class MemoryParser:
    """Parses MEMORY.md and the daily memory files into knowledge entries.

    Both formats share the same shape: '## Section' headings containing
    '### Entry' sub-headings; every line under a '###' until the next heading
    becomes that entry's body.
    """

    # MEMORY.md sections worth syncing (matched case-sensitively, as before).
    MEMORY_SECTION_KEYWORDS = ['Standard', 'Protocol', 'Lesson', 'Framework']
    # Daily-file sections worth syncing (matched case-insensitively).
    DAILY_SECTION_KEYWORDS = ['research', 'finding', 'lesson', 'decision',
                              'insight', 'pattern', 'key takeaway', 'benchmark']

    def __init__(self, workspace: Path):
        self.workspace = workspace
        self.memory_dir = workspace / 'memory'

    def _parse_sections(self, content: str, is_relevant_section, make_entry) -> List[Dict]:
        """Shared '## section / ### entry' parser for both file formats.

        Args:
            content: the full markdown text of one file.
            is_relevant_section: predicate(section_title) deciding whether
                entries under a '## ' heading should be collected.
            make_entry: callback(title, section) -> base entry dict; the body
                is filled in here.
        """
        entries: List[Dict] = []
        current_section = None
        current_entry = None
        entry_buffer: List[str] = []

        def _flush():
            # Close the currently-open entry (if it has any body lines) and
            # reset. Resetting current_entry fixes a bug in the original code:
            # without it, stray text under a later section was appended to an
            # already-collected entry, overwriting its body and appending the
            # same dict to the result a second time.
            nonlocal current_entry, entry_buffer
            if current_entry and entry_buffer:
                current_entry['body'] = '\n'.join(entry_buffer).strip()
                entries.append(current_entry)
            current_entry = None
            entry_buffer = []

        for line in content.split('\n'):
            if line.startswith('## '):
                _flush()
                section_title = line[3:].strip()
                current_section = section_title if is_relevant_section(section_title) else None
                continue
            if line.startswith('### ') and current_section:
                _flush()
                current_entry = make_entry(line[4:].strip(), current_section)
                continue
            if current_entry is not None:
                entry_buffer.append(line)
        _flush()
        return entries

    def parse_memory_file(self) -> List[Dict]:
        """Parse MEMORY.md; returns [] when the file is missing."""
        memory_path = self.workspace / 'MEMORY.md'
        if not memory_path.exists():
            logger.warning("MEMORY.md not found")
            return []

        def make_entry(title: str, section: str) -> Dict:
            return {'title': title, 'source': 'MEMORY.md',
                    'section': section, 'file': 'MEMORY.md'}

        entries = self._parse_sections(
            memory_path.read_text(),
            lambda title: any(kw in title for kw in self.MEMORY_SECTION_KEYWORDS),
            make_entry,
        )
        logger.info(f"Parsed {len(entries)} entries from MEMORY.md")
        return entries

    def parse_daily_files(self, days_back: int = 7) -> List[Dict]:
        """Parse memory/<YYYY-MM-DD>.md for each of the last ``days_back`` days."""
        entries: List[Dict] = []
        today = datetime.now().date()

        def relevant(title: str) -> bool:
            lowered = title.lower()
            return any(kw in lowered for kw in self.DAILY_SECTION_KEYWORDS)

        for offset in range(days_back):
            date_str = (today - timedelta(days=offset)).strftime('%Y-%m-%d')
            file_path = self.memory_dir / f'{date_str}.md'
            if not file_path.exists():
                continue

            def make_entry(title: str, section: str, date_str: str = date_str) -> Dict:
                # date_str bound as a default to avoid the late-binding-closure trap.
                return {'title': title, 'source': 'daily', 'file': f'{date_str}.md',
                        'date': date_str, 'section': section}

            entries.extend(self._parse_sections(file_path.read_text(), relevant, make_entry))
        logger.info(f"Parsed {len(entries)} knowledge entries from daily files (last {days_back} days)")
        return entries

    def extract_all_entries(self, days_back: int = 7) -> List[Dict]:
        """All entries from MEMORY.md + daily files, deduplicated by content hash.

        The hash (title + date + first 200 body chars) is stored on each entry
        as 'content_hash' so the sync layer can track pages across runs.
        """
        all_entries = self.parse_memory_file() + self.parse_daily_files(days_back)
        seen = set()
        unique_entries = []
        for entry in all_entries:
            content_hash = hashlib.md5(
                f"{entry['title']}|{entry.get('date', '')}|{entry['body'][:200]}".encode()
            ).hexdigest()[:16]
            if content_hash not in seen:
                seen.add(content_hash)
                entry['content_hash'] = content_hash
                unique_entries.append(entry)
        logger.info(f"Total unique knowledge entries: {len(unique_entries)}")
        return unique_entries
class EntryClassifier:
    """Assigns Notion properties (type, domain, certainty, impact, tags) to entries.

    Classification is keyword-driven: for each mapping, the first category
    whose keyword list hits the entry text wins, so dict insertion order
    doubles as match priority.
    """

    TYPE_KEYWORDS = {
        'Research': ['research', 'benchmark', 'analysis', 'comparison', 'technical deep dive', 'performance', 'detailed breakdown'],
        'Lesson': ['lesson', 'learned', 'mistake', 'error', 'issue', 'problem', 'fixed', 'resolved', 'blocker'],
        'Decision': ['decision', 'choose', 'selected', 'opted', 'concluded', 'determined', 'agreed', 'strategy'],
        'Pattern': ['pattern', 'trend', 'recurring', 'common', 'usually', 'typically', 'observation'],
        'Tutorial': ['how to', 'tutorial', 'guide', 'step', 'instruction', 'walkthrough', 'setup', 'configure'],
        'Reference': ['reference', 'cheatsheet', 'spec', 'specification', 'documentation', 'api', 'quick reference'],
        'Insight': ['insight', 'realized', 'noticed', 'observed', 'thought', 'idea', 'aha', 'epiphany'],
    }
    DOMAIN_KEYWORDS = {
        'AI Models': ['model', 'llm', 'gpt', 'claude', 'gemini', 'stepflash', 'deepseek', 'mimo', 'devstral', 'openrouter', 'free tier', 'notion'],
        'OpenClaw': ['openclaw', 'agent', 'workflow', 'skill', 'tool', 'automation', 'sync', 'database'],
        'Cost Optimization': ['cost', 'price', '$', 'budget', 'free', 'tier', 'routing', 'saving', 'optimization', 'value'],
        'Trading': ['trading', 'invest', 'stock', 'crypto', 'nft', 'web3', 'defi', 'bitcoin', 'ethereum'],
        'Learning': ['learn', 'study', 'japanese', 'language', 'course', 'tutorial', 'duolingo'],
        'Process': ['process', 'workflow', 'method', 'procedure', 'system', 'framework'],
    }
    CERTAINTY_PHRASES = {
        'Verified': ['proven', 'confirmed', 'tested', 'verified', 'measured', 'data shows', 'benchmark result'],
        'Likely': ['likely', 'probably', 'most likely', 'seems', 'appears', 'suggest'],
        'Speculative': ['maybe', 'might', 'could', 'possibly', 'hypothesis', 'guess', 'uncertain'],
        'Opinion': ['i think', 'believe', 'feel', 'in my view', 'personally', 'prefer']
    }
    IMPACT_INDICATORS = {
        'High': ['critical', 'important', 'must', 'essential', 'key', 'major', 'significant', 'game changer'],
        'Medium': ['relevant', 'useful', 'helpful', 'worth', 'good', 'beneficial'],
        'Low': ['minor', 'small', 'slight', 'marginal', 'nice to have'],
        'Negligible': ['negligible', 'tiny', 'minimal', 'barely', 'insignificant']
    }

    @staticmethod
    def _first_match(text: str, mapping: Dict[str, List[str]], fallback: str) -> str:
        """Return the first mapping key whose keyword list hits ``text``, else ``fallback``."""
        for label, needles in mapping.items():
            if any(needle in text for needle in needles):
                return label
        return fallback

    def classify_type(self, title: str, body: str) -> str:
        """Pick a Content Type; defaults to 'Insight' when nothing matches."""
        return self._first_match(f"{title} {body}".lower(), self.TYPE_KEYWORDS, 'Insight')

    def classify_domain(self, title: str, body: str) -> str:
        """Pick a Domain; defaults to 'General' when nothing matches."""
        return self._first_match(f"{title} {body}".lower(), self.DOMAIN_KEYWORDS, 'General')

    def classify_certainty(self, body: str) -> str:
        """Pick a Certainty level from the body alone; defaults to 'Verified'."""
        return self._first_match(body.lower(), self.CERTAINTY_PHRASES, 'Verified')

    def classify_impact(self, title: str, body: str) -> str:
        """Pick an Impact level; defaults to 'Medium' when nothing matches."""
        return self._first_match(f"{title} {body}".lower(), self.IMPACT_INDICATORS, 'Medium')

    def extract_tags(self, title: str, body: str, section: str) -> List[str]:
        """Collect up to MAX_TAGS tags whose keywords appear in the entry text."""
        haystack = f"{title} {body} {section}".lower()
        keyword_map = {
            'AI': ['ai', 'artificial intelligence', 'ml', 'machine learning', 'model'],
            'OpenRouter': ['openrouter', 'router', 'provider', 'stepfun', 'moonshot', 'xiaomi', 'mistral'],
            'FreeTier': ['free', 'free tier', 'no cost'],
            'Benchmark': ['benchmark', 'test', 'score', 'performance', 'swe-bench', 'aime'],
            'Cost': ['cost', 'price', '$', 'pricing', 'budget', 'optimization'],
            'Automation': ['automation', 'auto', 'script', 'workflow', 'agent', 'tool'],
            'Coding': ['code', 'programming', 'development', 'swe', 'coding'],
            'Notion': ['notion', 'database', 'knowledge base', 'sync'],
            'Decision': ['decision', 'choose', 'selected', 'strategy'],
        }
        matched = [tag for tag, words in keyword_map.items()
                   if any(w in haystack for w in words)]
        # Deduplicate exactly as before (via a set, so ordering is unspecified).
        return list(set(matched))[:MAX_TAGS]

    def classify(self, entry: Dict) -> Dict:
        """Attach a 'metadata' dict of classified properties to ``entry`` in place."""
        title = entry.get('title', '')
        body = entry.get('body', '')
        entry['metadata'] = {
            'content_type': self.classify_type(title, body),
            'domain': self.classify_domain(title, body),
            'certainty': self.classify_certainty(body),
            'impact': self.classify_impact(title, body),
            'confidence_score': self._estimate_confidence(title, body, entry.get('source')),
            'tags': self.extract_tags(title, body, entry.get('section', '')),
            'source': entry.get('source', 'Manual'),
        }
        return entry

    def _estimate_confidence(self, title: str, body: str, source: str) -> int:
        """Heuristic 1-10 score: base 7, +1 each for trusted source, length, data wording."""
        score = 7
        score += source == 'MEMORY.md'
        score += len(body) > 500
        score += any(w in body.lower() for w in ['data', 'benchmark', 'measured', 'tested'])
        return min(10, max(1, score))
class MarkdownToNotion:
    """Converts markdown-ish text into a list of Notion block objects."""

    def convert(self, text: str) -> List[Dict]:
        """Convert ``text`` into Notion blocks (headings, lists, code, quotes...).

        Tables are not converted to native Notion tables; they are preserved
        verbatim inside a code block instead.
        """
        blocks = []
        lines = text.split('\n')
        i = 0
        while i < len(lines):
            line = lines[i].rstrip()
            if not line.strip():
                i += 1
                continue
            # Headings
            if line.startswith('# '):
                blocks.append(self._create_heading(1, line[2:]))
            elif line.startswith('## '):
                blocks.append(self._create_heading(2, line[3:]))
            elif line.startswith('### '):
                blocks.append(self._create_heading(3, line[4:]))
            # Bulleted lists ('-' or '*' markers)
            elif line.startswith('- ') or line.startswith('* '):
                blocks.extend(self._parse_list(lines, i, '-'))
                i = self._skip_list(lines, i, '-')
                continue
            # Numbered lists
            elif re.match(r'^\d+\. ', line):
                blocks.extend(self._parse_list(lines, i, 'numbered'))
                i = self._skip_list(lines, i, 'numbered')
                continue
            # Quotes
            elif line.startswith('> '):
                blocks.append(self._create_quote(line[2:]))
            # Fenced code blocks
            elif line.startswith('```'):
                code_lines, new_i = self._parse_code_block(lines, i)
                lang = lines[i][3:].strip() if len(lines[i]) > 3 else ''
                blocks.append(self._create_code('\n'.join(code_lines), language=lang if lang else None))
                i = new_i
                continue
            # Dividers
            elif line.strip() in ['---', '***', '___']:
                blocks.append(self._create_divider())
            # Tables (kept verbatim in a code block)
            elif '|' in line and line.count('|') >= 3:
                table_text = line
                j = i + 1
                while j < len(lines) and '|' in lines[j]:
                    table_text += '\n' + lines[j]
                    j += 1
                blocks.append(self._create_code(table_text, language='markdown-table'))
                i = j
                continue
            # Plain paragraphs
            else:
                blocks.append(self._create_paragraph(line))
            i += 1
        return blocks

    def _create_heading(self, level: int, text: str) -> Dict:
        text = text[:2000]  # safety truncation for rich_text content
        return {"object": "block", "type": f"heading_{level}", f"heading_{level}": {"rich_text": [{"text": {"content": text}}]}}

    def _create_paragraph(self, text: str) -> Dict:
        text = text[:2000]
        return {"object": "block", "type": "paragraph", "paragraph": {"rich_text": [{"text": {"content": text}}]}}

    def _create_list_item(self, text: str) -> Dict:
        text = text[:2000]
        return {"object": "block", "type": "bulleted_list_item", "bulleted_list_item": {"rich_text": [{"text": {"content": text}}]}}

    def _list_prefix(self, line: str, list_type: str) -> Optional[str]:
        """Return the marker prefix if ``line`` is an item of ``list_type``, else None.

        Bug fix: convert() dispatches both '- ' and '* ' bullets to the '-'
        list type, but the old parser matched only '- '. For a '* ' list,
        _skip_list then returned its start index unchanged and convert()
        looped forever.
        """
        if list_type == '-':
            if line.startswith('- ') or line.startswith('* '):
                return line[:2]
        elif list_type == 'numbered':
            m = re.match(r'^\d+\. ', line)
            if m:
                return m.group(0)
        return None

    def _parse_list(self, lines: List[str], start: int, list_type: str) -> List[Dict]:
        """Collect consecutive list items (blank lines are skipped) into blocks."""
        blocks = []
        i = start
        while i < len(lines):
            line = lines[i].rstrip()
            if not line.strip():
                i += 1
                continue
            prefix = self._list_prefix(line, list_type)
            if prefix is None:
                break
            blocks.append(self._create_list_item(line[len(prefix):].strip()))
            i += 1
        return blocks

    def _skip_list(self, lines: List[str], start: int, list_type: str) -> int:
        """Return the index of the first line past the list starting at ``start``."""
        i = start
        while i < len(lines):
            line = lines[i].rstrip()
            if not line.strip():
                i += 1
                continue
            if self._list_prefix(line, list_type) is None:
                break
            i += 1
        return i

    def _parse_code_block(self, lines: List[str], start: int) -> Tuple[List[str], int]:
        """Collect the lines inside a ``` fence; returns (code_lines, next_index).

        If the fence is never closed, everything to EOF becomes the code body.
        (Tuple must be imported from typing — the original module referenced
        it without importing it, which raised NameError on import.)
        """
        code_lines = []
        i = start + 1
        while i < len(lines):
            if lines[i].strip() == '```':
                return code_lines, i + 1
            code_lines.append(lines[i].rstrip())
            i += 1
        return code_lines, i

    def _create_code(self, code: str, language: str = None) -> Dict:
        code = code[:2000]
        return {
            "object": "block",
            "type": "code",
            "code": {
                "rich_text": [{"text": {"content": code}}],
                "caption": [],
                "language": language if language else "plain text"
            }
        }

    def _create_quote(self, text: str) -> Dict:
        text = text[:2000]
        return {"object": "block", "type": "quote", "quote": {"rich_text": [{"text": {"content": text}}]}}

    def _create_divider(self) -> Dict:
        return {"object": "block", "type": "divider", "divider": {}}
class SyncOrchestrator:
    """Drives the end-to-end sync: parse → classify → convert → push to Notion.

    A local JSON state file maps each entry's content hash to the Notion page
    ID it was synced to, so entries are only created once across runs.
    """

    def __init__(self, notion_client: NotionClient, memory_parser: MemoryParser, classifier: EntryClassifier, converter: MarkdownToNotion):
        self.notion = notion_client
        self.parser = memory_parser
        self.classifier = classifier
        self.converter = converter
        self.sync_log_path = WORKSPACE / 'memory' / 'sync-log.md'
        self.state_path = WORKSPACE / 'memory' / 'notion-sync-state.json'
        # Ensure the log/state directory exists before we append to it.
        self.sync_log_path.parent.mkdir(parents=True, exist_ok=True)
        self.state = self.load_state()

    def load_state(self) -> Dict[str, str]:
        """Load sync state from file (maps content_hash -> page_id)."""
        if self.state_path.exists():
            try:
                with open(self.state_path, 'r') as f:
                    state = json.load(f)
                if isinstance(state, dict):
                    return state
                logger.warning("State file did not contain a dict; ignoring it")
            except Exception as e:
                logger.warning(f"Failed to load state file: {e}")
        return {}

    def save_state(self):
        """Persist the content_hash -> page_id map to disk."""
        try:
            with open(self.state_path, 'w') as f:
                json.dump(self.state, f, indent=2)
        except Exception as e:
            logger.error(f"Failed to save state: {e}")

    def log_action(self, action: str, details: str):
        """Append one timestamped line to the sync log and mirror it to the logger."""
        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        log_entry = f"- {timestamp}: {action} - {details}\n"
        with open(self.sync_log_path, 'a') as f:
            f.write(log_entry)
        logger.info(f"📝 {action}: {details}")

    def cleanup_orphans(self, dry_run: bool = False):
        """Archive (remove) pages not in sync state."""
        logger.info("🔍 Scanning for orphan pages...")
        all_pages = self.notion.list_all_pages()
        all_page_ids = {p.get("id") for p in all_pages if p.get("id")}
        state_page_ids = set(self.state.values())
        orphan_ids = all_page_ids - state_page_ids
        logger.info(f"Database contains {len(all_page_ids)} pages")
        logger.info(f"State tracks {len(state_page_ids)} pages")
        logger.info(f"Found {len(orphan_ids)} orphan pages to archive")
        if dry_run:
            logger.info("DRY-RUN: would archive these pages:")
            for pid in orphan_ids:
                logger.info(f"  - {pid}")
            return
        for pid in orphan_ids:
            try:
                if self.notion.archive_page(pid):
                    self.log_action("ARCHIVED_ORPHAN", f"page {pid}")
                else:
                    self.log_action("ARCHIVE_FAILED", f"page {pid}")
            except Exception as e:
                logger.error(f"Error archiving page {pid}: {e}")
                self.log_action("ARCHIVE_ERROR", f"page {pid}: {e}")

    def _truncate_text(self, text: str, max_len: int) -> str:
        """Truncate ``text`` to ``max_len``, preferring a sentence boundary."""
        if len(text) <= max_len:
            return text
        # Try to cut at a sentence/paragraph boundary first.
        for sep in ['\n\n', '. ', '! ', '? ', '; ']:
            idx = text.rfind(sep, 0, max_len)
            if idx > max_len * 0.5:  # only if at least half the budget is kept
                return text[:idx+1].strip()
        # Otherwise hard-truncate with an ellipsis.
        return text[:max_len-3].strip() + '...'

    def process_entry(self, entry: Dict, dry_run: bool = False) -> Optional[str]:
        """Classify one entry and create its Notion page (unless already synced).

        Returns the page ID (existing or newly created), or None when no page
        exists for it afterwards (dry run preview, or a failed API call).
        """
        entry = self.classifier.classify(entry)
        meta = entry['metadata']
        source_file = entry.get('file', 'unknown')
        content_hash = entry.get('content_hash')
        if not content_hash:
            # Fallback: compute the same hash extract_all_entries() would have.
            content_hash = hashlib.md5(
                f"{entry['title']}|{entry.get('date', '')}|{entry['body'][:200]}".encode()
            ).hexdigest()[:16]
        # Already synced? (Dry and real runs skip identically, so no branch needed.)
        if content_hash in self.state:
            existing_page_id = self.state[content_hash]
            logger.info(f"SKIP (already synced): {entry['title']} (page: {existing_page_id})")
            return existing_page_id
        # Build properties — names contain spaces to match the database schema.
        properties = {
            "Name": {"title": [{"text": {"content": entry['title'][:MAX_TITLE_LENGTH]}}]},
            "Content Type": {"select": {"name": meta['content_type']}},
            "Domain": {"select": {"name": meta['domain']}},
            "Certainty": {"select": {"name": meta['certainty']}},
            "Source": {"select": {"name": meta['source']}},
            "Confidence Score": {"number": meta['confidence_score']},
            "Impact": {"select": {"name": meta['impact']}},
            "Source File": {"rich_text": [{"text": {"content": source_file}}]}
        }
        if meta['tags']:
            properties["Tags"] = {"multi_select": [{"name": tag} for tag in meta['tags']]}
        body_content = entry.get('body', '')
        has_body = bool(body_content) and len(body_content) > 50
        if has_body:
            # The "Body" property is capped at MAX_BODY_LENGTH characters.
            truncated_body = self._truncate_text(body_content, MAX_BODY_LENGTH)
            properties["Body"] = {"rich_text": [{"text": {"content": truncated_body}}]}
        # The full body is rendered as page-content blocks.
        children = self.converter.convert(body_content) if has_body else None
        if dry_run:
            self.log_action("DRY-RUN", f"Would create: {entry['title']}")
            logger.info(f"[DRY-RUN] Title: {entry['title']}")
            logger.info(f"  Type: {meta['content_type']}, Domain: {meta['domain']}")
            logger.info(f"  Confidence: {meta['confidence_score']}, Impact: {meta['impact']}")
            logger.info(f"  Tags: {meta['tags']}")
            logger.info(f"  Body length: {len(body_content)} (truncated to {MAX_BODY_LENGTH})")
            logger.info(f"  Children: {len(children) if children else 0} blocks")
            return None
        page_id = self.notion.create_page(properties, children=children)
        if page_id:
            self.log_action("CREATED", f"{entry['title']} (page: {page_id})")
            # Record in state so later runs skip this entry.
            self.state[content_hash] = page_id
            self.save_state()
            return page_id
        self.log_action("CREATE_FAILED", f"{entry['title']}")
        return None

    def sync(self, days_back: int = 7, dry_run: bool = False, limit: Optional[int] = None):
        """Sync all recent entries to Notion and log created/skipped/failed counts."""
        logger.info("=" * 60)
        logger.info(f"Starting sync: days_back={days_back}, dry_run={dry_run}, limit={limit}")
        logger.info("=" * 60)
        entries = self.parser.extract_all_entries(days_back)
        if limit:
            entries = entries[:limit]
        logger.info(f"Processing {len(entries)} entries...")
        # Snapshot the state before the run so we can tell "created now" from
        # "already synced" locally — the old code issued a Notion query per
        # entry just for this, which was slow and misclassified skips.
        previously_synced = set(self.state.values())
        stats = {'created': 0, 'skipped': 0, 'failed': 0}
        for i, entry in enumerate(entries, 1):
            logger.info(f"[{i}/{len(entries)}] Processing: {entry.get('title', 'Unknown')}")
            try:
                page_id = self.process_entry(entry, dry_run)
                if page_id is None:
                    # Dry-run previews also return None; those are not failures.
                    stats['skipped' if dry_run else 'failed'] += 1
                elif page_id in previously_synced:
                    stats['skipped'] += 1
                else:
                    stats['created'] += 1
            except Exception as e:
                logger.error(f"Error processing entry: {e}", exc_info=True)
                stats['failed'] += 1
        logger.info("=" * 60)
        logger.info("Sync completed!")
        logger.info(f"  Created: {stats['created']}")
        logger.info(f"  Skipped: {stats['skipped']}")
        logger.info(f"  Failed: {stats['failed']}")
        logger.info("=" * 60)
def main() -> int:
    """CLI entry point for the sync script.

    Returns 0 on success, 1 on configuration errors (missing API key,
    unconfigured database ID, or a malformed --since date).
    """
    parser_cli = argparse.ArgumentParser(description='Sync MEMORY.md to Notion Knowledge Base')
    parser_cli.add_argument('--dry-run', action='store_true', help='Preview changes without making them')
    parser_cli.add_argument('--verbose', action='store_true', help='Show debug logs')
    parser_cli.add_argument('--since', type=str, help='Only sync entries since YYYY-MM-DD')
    parser_cli.add_argument('--limit', type=int, help='Limit number of entries to process')
    parser_cli.add_argument('--cleanup', action='store_true', help='Cleanup orphan pages not in sync state')
    args = parser_cli.parse_args()
    if args.verbose:
        logger.setLevel(logging.DEBUG)
    if not os.path.exists(NOTION_KEY_PATH):
        logger.error(f"Notion API key not found at {NOTION_KEY_PATH}")
        return 1
    # Fail fast if the database ID was never configured — otherwise every API
    # call would fail against the placeholder string.
    if NOTION_DATABASE_ID == 'YOUR_NOTION_DATABASE_ID_HERE':
        logger.error("NOTION_DATABASE_ID is not configured (export it or edit the script)")
        return 1
    with open(NOTION_KEY_PATH) as f:
        api_key = f.read().strip()
    notion = NotionClient(api_key, NOTION_DATABASE_ID)
    memory_parser = MemoryParser(WORKSPACE)
    classifier = EntryClassifier()
    converter = MarkdownToNotion()
    days_back = 7
    if args.since:
        try:
            since_date = datetime.strptime(args.since, '%Y-%m-%d').date()
        except ValueError:
            logger.error("Invalid date format for --since. Use YYYY-MM-DD")
            return 1
        # Clamp so a future --since date still scans at least today.
        days_back = max(1, (datetime.now().date() - since_date).days + 1)
    orchestrator = SyncOrchestrator(notion, memory_parser, classifier, converter)
    if args.cleanup:
        orchestrator.cleanup_orphans(dry_run=args.dry_run)
    else:
        orchestrator.sync(days_back=days_back, dry_run=args.dry_run, limit=args.limit)
    return 0
if __name__ == '__main__':
    # Raise SystemExit directly instead of calling exit(): the exit() builtin
    # is a site-module convenience that is absent under `python -S`.
    raise SystemExit(main())