diff --git a/plugins/curaitor/scripts/recycle-reindex.py b/plugins/curaitor/scripts/recycle-reindex.py
new file mode 100644
index 0000000..fc68a01
--- /dev/null
+++ b/plugins/curaitor/scripts/recycle-reindex.py
@@ -0,0 +1,205 @@
+#!/usr/bin/env python3
+"""Build or rebuild the Recycle.md TSV index.
+
+Scans the Obsidian vault's live Recycle.md + most-recent N monthly archives
+(Curaitor/Archive/Recycle-YYYY-MM.md) and writes a TSV at
+`.curaitor/recycle-index.tsv` with one row per unique normalized URL:
+
+    <url_normalized>\t<source_file>\t<title>
+
+This is the fast-path dedup index that `has_recycled.py` reads. Each triage
+run can cache the whole thing in memory (370 rows today, bounded growth) and
+do O(1) lookups without parsing markdown at all.
+
+Idempotent. Safe to re-run. The TSV is fully derived from the markdown — if
+it drifts from the markdown (user hand-edits Recycle.md), just rerun this
+script or let the checksum watchdog trigger it (future work).
+
+Usage:
+    python3 scripts/recycle-reindex.py                 # auto-discover vault
+    python3 scripts/recycle-reindex.py --vault <path>  # explicit vault path
+    python3 scripts/recycle-reindex.py --dry-run       # count only, no write
+
+Exit codes:
+    0 — success (or dry-run completed)
+    1 — vault not found / invalid (also used if triage-write.py cannot be loaded)
+    2 — IO error writing TSV
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import importlib.util
+import json
+import os
+import sys
+from pathlib import Path
+
+SCRIPT_DIR = Path(__file__).resolve().parent
+
+# Reuse triage-write.py's URL normalization + recycle-parsing helpers so this
+# script can't drift out of sync with what the live dedup code does.
+_spec = importlib.util.spec_from_file_location('_tw', SCRIPT_DIR / 'triage-write.py')
+if _spec is None or _spec.loader is None:
+    print('ERROR: cannot load triage-write.py from script dir', file=sys.stderr)
+    sys.exit(1)
+triage_write = importlib.util.module_from_spec(_spec)
+_spec.loader.exec_module(triage_write)
+
+
+# Matches the tagged-recycle-line formats:
+#   - [title](url) (duplicate)
+#   - [title](url) (duplicate from Recycle)
+#   - [title](url)
+# First group = title, second group = url. Same regex as triage-write's
+# _RECYCLE_LINE, but we also want the title for the TSV.
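+# For example (hypothetical title and URL, shown only to illustrate the two
+# capture groups), the line "- [Some post](https://example.com/a) (duplicate)"
+# yields group(1) == 'Some post' and group(2) == 'https://example.com/a'.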
+import re  # noqa: E402
+_RECYCLE_LINE_WITH_TITLE = re.compile(
+    r'^\s*-\s+\[([^\]]*)\]\(\s*<?([^)\s>]+)>?\s*\)',
+)
+
+
+def parse_recycle_file(path: Path) -> list[tuple[str, str, str]]:
+    """Return list of (normalized_url, source_file_rel, title) tuples."""
+    rows: list[tuple[str, str, str]] = []
+    if not path.is_file():
+        return rows
+    try:
+        with path.open(encoding='utf-8') as fh:
+            for line in fh:
+                m = _RECYCLE_LINE_WITH_TITLE.match(line)
+                if not m:
+                    continue
+                title = m.group(1).strip()
+                url = m.group(2).strip()
+                norm = triage_write.normalize_url(url)
+                if norm:
+                    rows.append((norm, path.name, title))
+    except (OSError, UnicodeDecodeError) as e:
+        print(f'WARN: cannot read {path}: {e}', file=sys.stderr)
+    return rows
+
+
+def collect_sources(vault: Path, archive_window: int) -> list[Path]:
+    """Return live Recycle.md + most recent `archive_window` monthly archives."""
+    sources = []
+    live = vault / 'Curaitor' / 'Recycle.md'
+    if live.is_file():
+        sources.append(live)
+    archive_dir = vault / 'Curaitor' / 'Archive'
+    if archive_dir.is_dir():
+        archives = sorted(
+            (f for f in archive_dir.iterdir()
+             if f.name.startswith('Recycle-') and f.name.endswith('.md')),
+            reverse=True,
+        )
+        sources.extend(archives[:archive_window])
+    return sources
+
+
+def _content_checksum(sources: list[Path]) -> str:
+    """SHA-256 over the concatenation of every source file, for drift detection."""
+    h = hashlib.sha256()
+    for p in sources:
+        try:
+            h.update(p.read_bytes())
+        except OSError:
+            # Don't silently include a missing file in the checksum.
+            h.update(b'__MISSING__')
+    return h.hexdigest()
+
+
+def write_tsv(tsv_path: Path, rows: list[tuple[str, str, str]], checksum: str) -> None:
+    """Atomic-ish TSV write. First line is a header with the source checksum."""
+    tsv_path.parent.mkdir(parents=True, exist_ok=True)
+    tmp = tsv_path.with_suffix(tsv_path.suffix + '.tmp')
+    with tmp.open('w', encoding='utf-8') as fh:
+        fh.write(f'# recycle-index v1 checksum={checksum}\n')
+        fh.write('url_normalized\tsource_file\ttitle\n')
+        for norm, src, title in rows:
+            # Escape tabs/newlines in title just in case someone puts one there.
+            safe_title = title.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')
+            fh.write(f'{norm}\t{src}\t{safe_title}\n')
+    os.replace(tmp, tsv_path)
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description='Build or rebuild recycle-index.tsv')
+    parser.add_argument('--vault', help='Override auto-discovered vault path')
+    parser.add_argument('--archive-window', type=int, default=3,
+                        help='Number of monthly archives to include (default 3)')
+    parser.add_argument('--dry-run', action='store_true',
+                        help='Count rows, print summary, do not write TSV')
+    parser.add_argument('--json', action='store_true',
+                        help='Emit a machine-readable summary on stdout')
+    args = parser.parse_args()
+
+    vault_str = args.vault or triage_write.find_vault()
+    vault = Path(vault_str)
+    if not vault.is_dir():
+        print(f'ERROR: vault not found at {vault}', file=sys.stderr)
+        return 1
+
+    sources = collect_sources(vault, args.archive_window)
+    if not sources:
+        print(f'WARN: no Recycle.md or archives in {vault}/Curaitor/', file=sys.stderr)
+        # Still write an empty index — dedup callers expect a file to exist.
+
+    # Dedup on normalized URL across all sources, preferring the earliest (live
+    # file first, then most-recent archive, then older). This matches the live
+    # dedup's intent — if the same URL is in Recycle.md AND Recycle-2026-04.md
+    # we only keep one row in the index.
+    # Exactly which one wins is irrelevant for membership testing; we keep the
+    # first seen for stable output order.
+    seen = set()
+    rows: list[tuple[str, str, str]] = []
+    per_file: dict[str, int] = {}
+    duplicates_across_files = 0
+    for p in sources:
+        before = len(rows)
+        for norm, src, title in parse_recycle_file(p):
+            if norm in seen:
+                duplicates_across_files += 1
+                continue
+            seen.add(norm)
+            rows.append((norm, src, title))
+        per_file[p.name] = len(rows) - before
+
+    checksum = _content_checksum(sources)
+    tsv_path = vault / '.curaitor' / 'recycle-index.tsv'
+
+    summary = {
+        'vault': str(vault),
+        'tsv_path': str(tsv_path),
+        'sources_scanned': [p.name for p in sources],
+        'unique_urls': len(rows),
+        'per_file_new': per_file,
+        'duplicates_across_files': duplicates_across_files,
+        'checksum': checksum,
+        'dry_run': args.dry_run,
+    }
+
+    if not args.dry_run:
+        try:
+            write_tsv(tsv_path, rows, checksum)
+        except OSError as e:
+            print(f'ERROR: cannot write TSV: {e}', file=sys.stderr)
+            return 2
+
+    if args.json:
+        json.dump(summary, sys.stdout, indent=2)
+        sys.stdout.write('\n')
+    else:
+        print(f'Vault: {vault}')
+        print(f'TSV: {tsv_path}')
+        print(f'Sources: {", ".join(s.name for s in sources) or "(none)"}')
+        print(f'Unique URLs: {len(rows)}')
+        print(f'Cross-file dupes: {duplicates_across_files}')
+        print(f'Checksum: {checksum[:16]}...')
+        if args.dry_run:
+            print('(dry-run — no TSV written)')
+    return 0
+
+
+if __name__ == '__main__':
+    raise SystemExit(main())
diff --git a/plugins/curaitor/scripts/triage-write.py b/plugins/curaitor/scripts/triage-write.py
index fe89000..da0346f 100755
--- a/plugins/curaitor/scripts/triage-write.py
+++ b/plugins/curaitor/scripts/triage-write.py
@@ -274,8 +274,15 @@ def build_url_index(vault):
 
     Includes live note folders, the live Recycle.md, and the most recent
     monthly recycle archives. See `dedup_sources()` for the canonical list.
+
+    Uses `.curaitor/recycle-index.tsv` as a fast-path for the recycle portion
+    when it exists and is in sync with the markdown sources. Falls back to
+    line-by-line parse when the TSV is stale or missing.
     """
     known = set()
+    # Try the recycle fast-path once for all recycle sources. If it hits, we
+    # can skip per-file recycle parsing entirely.
+    cached_recycle = _load_recycle_tsv(vault)
     for src in dedup_sources(vault):
         if src['kind'] == 'folder':
             for f in os.listdir(src['path']):
@@ -286,13 +293,116 @@ def build_url_index(vault):
                 if m:
                     url = m.group(1).strip().strip('"').strip("'")
                     known.add(normalize_url(url))
-        elif src['kind'] == 'recycle':
+        elif src['kind'] == 'recycle' and cached_recycle is None:
+            # Only fall back to parsing markdown if the TSV wasn't usable.
+            # If we took the fast-path, cached_recycle already includes every
+            # recycle source's URLs (the TSV is built from the same file list).
             known |= _parse_recycle(src['path'])
+    if cached_recycle is not None:
+        known |= cached_recycle
     return known
 
 
+_RECYCLE_TSV_REL = os.path.join('.curaitor', 'recycle-index.tsv')
+
+
+def _recycle_tsv_path(vault):
+    return os.path.join(vault, _RECYCLE_TSV_REL)
+
+
+def _recycle_sources_checksum(vault):
+    """SHA-256 over Recycle.md + archive files included in dedup. Must match
+    the `_content_checksum` the reindex script writes, so keep it byte-exact.
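+
+    The reindex script stores the same digest in the TSV's first line, e.g.
+    (hypothetical digest value):
+
+        # recycle-index v1 checksum=3f7a9c0d...
+
+    `_load_recycle_tsv` compares that stored value against this function's
+    result to decide whether the fast-path TSV can be trusted.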
+    """
+    import hashlib
+    h = hashlib.sha256()
+    for src in dedup_sources(vault):
+        if src['kind'] != 'recycle':
+            continue
+        try:
+            with open(src['path'], 'rb') as fh:
+                h.update(fh.read())
+        except OSError:
+            h.update(b'__MISSING__')
+    return h.hexdigest()
+
+
+def _load_recycle_tsv(vault):
+    """Load URLs from the TSV fast-path if it's valid.
+
+    Returns a set of normalized URLs on success, or None if:
+      - TSV doesn't exist
+      - TSV header checksum doesn't match current Recycle.md + archives
+      - TSV is malformed
+    Callers fall back to the line-by-line markdown parse on None and
+    schedule a background reindex so the next run is fast.
+    """
+    path = _recycle_tsv_path(vault)
+    if not os.path.isfile(path):
+        return None
+    try:
+        with open(path, encoding='utf-8') as fh:
+            first = fh.readline()
+            if not first.startswith('# recycle-index v1 checksum='):
+                return None
+            stored_checksum = first.strip().split('checksum=', 1)[1]
+            if stored_checksum != _recycle_sources_checksum(vault):
+                return None  # markdown edited since last reindex; fall back
+            header = fh.readline()  # consume "url_normalized\t..." header
+            if not header.startswith('url_normalized\t'):
+                return None
+            urls = set()
+            for line in fh:
+                # First column is the normalized URL; tabs may appear in titles
+                # (we strip them on write, but be defensive on read).
+                tab = line.find('\t')
+                if tab <= 0:
+                    continue
+                urls.add(line[:tab])
+            return urls
+    except (OSError, UnicodeDecodeError):
+        return None
+
+
+def _rebuild_recycle_tsv_in_background(vault):
+    """Fire-and-forget background rebuild of the recycle TSV. Best-effort; a
+    failed rebuild is harmless because the fallback path works without it.
+
+    Runs async so a triage batch doesn't wait on the rebuild.
+    """
+    try:
+        import subprocess
+        script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'recycle-reindex.py')
+        if not os.path.isfile(script):
+            return
+        subprocess.Popen(
+            ['python3', script, '--vault', vault],
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+            start_new_session=True,
+        )
+    except (OSError, ImportError):
+        pass  # best-effort
+
+
 def build_recycle_index(vault):
-    """Return only the recycled URLs (live + archives), for distinguishing duplicate sources."""
+    """Return the set of normalized URLs in Recycle.md + most-recent archives.
+
+    Prefers the `.curaitor/recycle-index.tsv` fast-path when it exists and its
+    checksum matches the live markdown sources. Falls back to the line-by-line
+    markdown parse on a miss AND schedules a background rebuild so the next
+    call hits the fast path. The fallback is always correct; the fast path is
+    just an optimization whose benefit grows with the Recycle log.
+
+    See `scripts/recycle-reindex.py` for how the TSV is built / rebuilt.
+    """
+    cached = _load_recycle_tsv(vault)
+    if cached is not None:
+        return cached
+    # Fallback: parse markdown sources. Also kick off a rebuild so next run
+    # is fast (harmless if it races with another rebuilder; last-writer-wins
+    # via atomic tmp+rename in recycle-reindex.py).
+    _rebuild_recycle_tsv_in_background(vault)
     urls = set()
     for src in dedup_sources(vault):
         if src['kind'] == 'recycle':