plugins/curaitor/scripts/recycle-reindex.py (205 additions, 0 deletions)
@@ -0,0 +1,205 @@
#!/usr/bin/env python3
"""Build or rebuild the Recycle.md TSV index.

Scans the Obsidian vault's live Recycle.md + most-recent N monthly archives
(Curaitor/Archive/Recycle-YYYY-MM.md) and writes a TSV at
`.curaitor/recycle-index.tsv` with one row per unique normalized URL:

<url_normalized>\t<source_file>\t<title>
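
For example, a row might look like (illustrative values only):

    https://example.com/article\tRecycle.md\tExample article title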

This is the fast-path dedup index that `has_recycled.py` reads. Each triage
run can cache the whole thing in memory (370 rows today, bounded growth) and
do O(1) lookups without parsing markdown at all.

Idempotent. Safe to re-run. The TSV is fully derived from the markdown — if
it drifts from the markdown (user hand-edits Recycle.md), just rerun this
script or let the checksum watchdog trigger it (future work).

Usage:
python3 scripts/recycle-reindex.py # auto-discover vault
python3 scripts/recycle-reindex.py --vault <path> # explicit vault path
python3 scripts/recycle-reindex.py --dry-run # count only, no write

Exit codes:
0 — success (or dry-run completed)
1 — vault not found / invalid
2 — IO error writing TSV
"""

from __future__ import annotations

import argparse
import hashlib
import importlib.util
import json
import os
import sys
from pathlib import Path

SCRIPT_DIR = Path(__file__).resolve().parent

# Reuse triage-write.py's URL normalization + recycle-parsing helpers so this
# script can't drift out of sync with what the live dedup code does.
_spec = importlib.util.spec_from_file_location('_tw', SCRIPT_DIR / 'triage-write.py')
if _spec is None or _spec.loader is None:
print('ERROR: cannot load triage-write.py from script dir', file=sys.stderr)
sys.exit(1)
triage_write = importlib.util.module_from_spec(_spec)
_spec.loader.exec_module(triage_write)


# Matches the tagged-recycle-line format:
# - [title](url) (duplicate)
# - [title](url) (duplicate from Recycle)
# - [title](url)
# Group 1 is the title, group 2 the URL. Same shape as triage-write's
# _RECYCLE_LINE, except we also capture the title for the TSV.
import re # noqa: E402
_RECYCLE_LINE_WITH_TITLE = re.compile(
r'^\s*-\s+\[([^\]]*)\]\(\s*<?([^)\s>]+)>?\s*\)',
)
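# e.g. '- [Foo](https://ex.com/a) (duplicate)' -> groups ('Foo', 'https://ex.com/a')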


def parse_recycle_file(path: Path) -> list[tuple[str, str, str]]:
"""Return list of (normalized_url, source_file_rel, title) tuples."""
rows: list[tuple[str, str, str]] = []
if not path.is_file():
return rows
try:
with path.open(encoding='utf-8') as fh:
for line in fh:
m = _RECYCLE_LINE_WITH_TITLE.match(line)
if not m:
continue
title = m.group(1).strip()
url = m.group(2).strip()
norm = triage_write.normalize_url(url)
if norm:
rows.append((norm, path.name, title))
except (OSError, UnicodeDecodeError) as e:
print(f'WARN: cannot read {path}: {e}', file=sys.stderr)
return rows


def collect_sources(vault: Path, archive_window: int) -> list[Path]:
"""Return live Recycle.md + most recent `archive_window` monthly archives."""
sources = []
live = vault / 'Curaitor' / 'Recycle.md'
if live.is_file():
sources.append(live)
archive_dir = vault / 'Curaitor' / 'Archive'
if archive_dir.is_dir():
archives = sorted(
(f for f in archive_dir.iterdir()
if f.name.startswith('Recycle-') and f.name.endswith('.md')),
reverse=True,
)
sources.extend(archives[:archive_window])
return sources


def _content_checksum(sources: list[Path]) -> str:
"""SHA-256 over the concatenation of every source file, for drift detection."""
h = hashlib.sha256()
for p in sources:
try:
h.update(p.read_bytes())
except OSError:
# Don't silently include a missing file in the checksum.
h.update(b'__MISSING__')
return h.hexdigest()


def write_tsv(tsv_path: Path, rows: list[tuple[str, str, str]], checksum: str) -> None:
"""Atomic-ish TSV write. First line is a header with the source checksum."""
tsv_path.parent.mkdir(parents=True, exist_ok=True)
tmp = tsv_path.with_suffix(tsv_path.suffix + '.tmp')
with tmp.open('w', encoding='utf-8') as fh:
fh.write(f'# recycle-index v1 checksum={checksum}\n')
fh.write('url_normalized\tsource_file\ttitle\n')
for norm, src, title in rows:
# Replace tabs/newlines in the title with spaces, just in case someone puts one there.
safe_title = title.replace('\t', ' ').replace('\n', ' ').replace('\r', ' ')
fh.write(f'{norm}\t{src}\t{safe_title}\n')
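# os.replace() is an atomic rename on the same filesystem, so readers never
# see a half-written index.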
os.replace(tmp, tsv_path)


def main() -> int:
parser = argparse.ArgumentParser(description='Build or rebuild recycle-index.tsv')
parser.add_argument('--vault', help='Override auto-discovered vault path')
parser.add_argument('--archive-window', type=int, default=3,
help='Number of monthly archives to include (default 3)')
parser.add_argument('--dry-run', action='store_true',
help='Count rows, print summary, do not write TSV')
parser.add_argument('--json', action='store_true',
help='Emit a machine-readable summary on stdout')
args = parser.parse_args()

vault_str = args.vault or triage_write.find_vault()
vault = Path(vault_str)
if not vault.is_dir():
print(f'ERROR: vault not found at {vault}', file=sys.stderr)
return 1

sources = collect_sources(vault, args.archive_window)
if not sources:
print(f'WARN: no Recycle.md or archives in {vault}/Curaitor/', file=sys.stderr)
# Still write an empty index — dedup callers expect a file to exist.

# Dedup on normalized URL across all sources, preferring the earliest (live
# file first, then most-recent archive, then older). This matches the live
# dedup's intent — if the same URL is in Recycle.md AND Recycle-2026-04.md
# we only keep one row in the index. Exactly which one wins is irrelevant
# for membership testing; we keep the first seen for stable output order.
seen = set()
rows: list[tuple[str, str, str]] = []
per_file: dict[str, int] = {}
duplicates_across_files = 0
for p in sources:
before = len(rows)
for norm, src, title in parse_recycle_file(p):
if norm in seen:
duplicates_across_files += 1
continue
seen.add(norm)
rows.append((norm, src, title))
per_file[p.name] = len(rows) - before

checksum = _content_checksum(sources)
tsv_path = vault / '.curaitor' / 'recycle-index.tsv'

summary = {
'vault': str(vault),
'tsv_path': str(tsv_path),
'sources_scanned': [p.name for p in sources],
'unique_urls': len(rows),
'per_file_new': per_file,
'duplicates_across_files': duplicates_across_files,
'checksum': checksum,
'dry_run': args.dry_run,
}

if not args.dry_run:
try:
write_tsv(tsv_path, rows, checksum)
except OSError as e:
print(f'ERROR: cannot write TSV: {e}', file=sys.stderr)
return 2

if args.json:
json.dump(summary, sys.stdout, indent=2)
sys.stdout.write('\n')
else:
print(f'Vault: {vault}')
print(f'TSV: {tsv_path}')
print(f'Sources: {", ".join(s.name for s in sources) or "(none)"}')
print(f'Unique URLs: {len(rows)}')
print(f'Cross-file dupes: {duplicates_across_files}')
print(f'Checksum: {checksum[:16]}...')
if args.dry_run:
print('(dry-run — no TSV written)')
return 0


if __name__ == '__main__':
raise SystemExit(main())
plugins/curaitor/scripts/triage-write.py (112 additions, 2 deletions)
@@ -274,8 +274,15 @@ def build_url_index(vault):

Includes live note folders, the live Recycle.md, and the most recent
monthly recycle archives. See `dedup_sources()` for the canonical list.

Uses `.curaitor/recycle-index.tsv` as a fast-path for the recycle portion
when it exists and is in sync with the markdown sources. Falls back to
line-by-line parse when the TSV is stale or missing.
"""
known = set()
# Try the recycle fast-path once for all recycle sources. If it hits, we
# can skip per-file recycle parsing entirely.
cached_recycle = _load_recycle_tsv(vault)
for src in dedup_sources(vault):
if src['kind'] == 'folder':
for f in os.listdir(src['path']):
@@ -286,13 +293,116 @@ def build_url_index(vault):
if m:
url = m.group(1).strip().strip('"').strip("'")
known.add(normalize_url(url))
elif src['kind'] == 'recycle':
elif src['kind'] == 'recycle' and cached_recycle is None:
# Only fall back to parsing markdown if the TSV wasn't usable.
# If we took the fast-path, cached_recycle already includes every
# recycle source's URLs (the TSV is built from the same file list).
known |= _parse_recycle(src['path'])
if cached_recycle is not None:
known |= cached_recycle
return known


_RECYCLE_TSV_REL = os.path.join('.curaitor', 'recycle-index.tsv')


def _recycle_tsv_path(vault):
return os.path.join(vault, _RECYCLE_TSV_REL)


def _recycle_sources_checksum(vault):
"""SHA-256 over Recycle.md + archive files included in dedup. Must match
the `_content_checksum` the reindex script writes, so keep it byte-exact.
"""
import hashlib
h = hashlib.sha256()
for src in dedup_sources(vault):
if src['kind'] != 'recycle':
continue
try:
with open(src['path'], 'rb') as fh:
h.update(fh.read())
except OSError:
h.update(b'__MISSING__')
return h.hexdigest()


def _load_recycle_tsv(vault):
"""Load URLs from the TSV fast-path if it's valid.

Returns a set of normalized URLs on success, or None if:
- TSV doesn't exist
- TSV header checksum doesn't match current Recycle.md + archives
- TSV is malformed
Callers fall back to the line-by-line markdown parse on None and
schedule a background reindex so next run is fast.
"""
path = _recycle_tsv_path(vault)
if not os.path.isfile(path):
return None
try:
with open(path, encoding='utf-8') as fh:
first = fh.readline()
if not first.startswith('# recycle-index v1 checksum='):
return None
stored_checksum = first.strip().split('checksum=', 1)[1]
if stored_checksum != _recycle_sources_checksum(vault):
return None # markdown edited since last reindex; fall back
header = fh.readline() # consume "url_normalized\t..." header
if not header.startswith('url_normalized\t'):
return None
urls = set()
for line in fh:
# First column is the normalized URL; tabs may appear in titles
# (we strip them on write, but be defensive on read).
tab = line.find('\t')
if tab <= 0:
continue
urls.add(line[:tab])
return urls
except (OSError, UnicodeDecodeError):
return None


def _rebuild_recycle_tsv_in_background(vault):
"""Fire-and-forget background rebuild of the recycle TSV. Best-effort; a
failed rebuild is harmless because the fallback path works without it.

Runs async so a triage batch doesn't wait on the rebuild.
"""
try:
import subprocess
script = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'recycle-reindex.py')
if not os.path.isfile(script):
return
subprocess.Popen(
['python3', script, '--vault', vault],
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
start_new_session=True,
)
except (OSError, ImportError):
pass # best-effort


def build_recycle_index(vault):
"""Return only the recycled URLs (live + archives), for distinguishing duplicate sources."""
"""Return the set of normalized URLs in Recycle.md + most-recent archives.

Prefers the `.curaitor/recycle-index.tsv` fast-path when it exists and its
checksum matches the live markdown sources. Falls back to the line-by-line
markdown parse on a miss AND schedules a background rebuild so the next
call hits the fast path. The fallback is always correct; the fast-path is
just an optimization that scales as the Recycle log grows.

See `scripts/recycle-reindex.py` for how the TSV is built / rebuilt.
"""
cached = _load_recycle_tsv(vault)
if cached is not None:
return cached
# Fallback: parse markdown sources. Also kick off a rebuild so next run
# is fast (harmless if it races with another rebuilder; last-writer-wins
# via atomic tmp+rename in recycle-reindex.py).
_rebuild_recycle_tsv_in_background(vault)
urls = set()
for src in dedup_sources(vault):
if src['kind'] == 'recycle':