diff --git a/.env.example b/.env.example index 9c332b8..82dd14b 100644 --- a/.env.example +++ b/.env.example @@ -25,3 +25,21 @@ CADDY_HTTPS_PORT=8443 # CADDY_BIND=0.0.0.0 # CADDY_HTTP_PORT=80 # CADDY_HTTPS_PORT=443 + +# --- Auto-ingest loop --- +# Background loop inside the container scans /vault/raw/ every +# INGEST_POLL_INTERVAL seconds and ingests files that pass all filters: +# - mtime older than INGEST_STABILITY_SECS (avoids catching Web Clipper +# mid-write or files you're actively typing) +# - size >= 50 bytes (skips empty stubs) +# - no `status: draft` in YAML frontmatter (use this on daily notes +# you're still working on; remove it when ready) +# - sha256 not already logged as ingested in /vault/log.md +# Files are grouped into batches up to INGEST_BATCH_BYTES; each batch +# runs in a fresh `claude -p` process. On failure the loop backs off +# to 1 hour. Set INGEST_ENABLED=0 to turn the loop off. +INGEST_ENABLED=1 +INGEST_POLL_INTERVAL=300 +INGEST_STABILITY_SECS=60 +INGEST_BATCH_BYTES=200000 +INGEST_MAX_BATCHES_PER_WAKE=50 diff --git a/autoblog/Dockerfile b/autoblog/Dockerfile index 84bb936..8ce5508 100644 --- a/autoblog/Dockerfile +++ b/autoblog/Dockerfile @@ -29,7 +29,7 @@ RUN echo 'if [ -n "$SSH_CONNECTION" ] && [ -z "$TMUX" ]; then exec tmux new-sess # sshd config + entrypoint scripts COPY sshd_config /etc/ssh/sshd_config -COPY entrypoint.sh bootstrap-volumes.sh /opt/autoblog/bin/ +COPY entrypoint.sh bootstrap-volumes.sh ingest-loop.sh migrate-log-sha.sh /opt/autoblog/bin/ RUN chmod +x /opt/autoblog/bin/*.sh # Templates (read at first boot; never mutated at runtime) diff --git a/autoblog/bootstrap-volumes.sh b/autoblog/bootstrap-volumes.sh index 56cb2b7..2508079 100644 --- a/autoblog/bootstrap-volumes.sh +++ b/autoblog/bootstrap-volumes.sh @@ -70,6 +70,7 @@ if ! 
git -C /vault-remote.git rev-parse --verify HEAD >/dev/null 2>&1; then git init --bare -b main /vault-remote.git git push /vault-remote.git main + cd / # leave /tmp/vault-seed before deleting it rm -rf /tmp/vault-seed git clone /vault-remote.git /vault @@ -110,6 +111,9 @@ chown -R autoblog:autoblog \ /agent /site /vault /vault-remote.git \ /home/autoblog/.claude /home/autoblog/.ssh +# root's safe.directory setting doesn't apply to autoblog's git calls from the loop +sudo -u autoblog git config --global --add safe.directory '*' + # Readiness marker for the healthcheck touch /var/run/autoblog-ready echo "[bootstrap] ready" diff --git a/autoblog/entrypoint.sh b/autoblog/entrypoint.sh index e98bcab..51a6952 100644 --- a/autoblog/entrypoint.sh +++ b/autoblog/entrypoint.sh @@ -14,5 +14,20 @@ start-stop-daemon --start --background --chuid autoblog:autoblog \ --startas /bin/bash -- \ -c "exec npm run dev -- --host 0.0.0.0 --port 4321 >> /var/log/astro-dev.log 2>&1" +echo "[autoblog] starting ingest-loop" +touch /var/log/ingest-loop.log +chown autoblog:autoblog /var/log/ingest-loop.log +start-stop-daemon --start --background --chuid autoblog:autoblog \ + --chdir /agent \ + --make-pidfile --pidfile /var/run/ingest-loop.pid \ + --startas /bin/bash -- \ + -c "export ANTHROPIC_API_KEY='${ANTHROPIC_API_KEY:-}' \ + INGEST_ENABLED='${INGEST_ENABLED:-1}' \ + INGEST_POLL_INTERVAL='${INGEST_POLL_INTERVAL:-300}' \ + INGEST_STABILITY_SECS='${INGEST_STABILITY_SECS:-60}' \ + INGEST_BATCH_BYTES='${INGEST_BATCH_BYTES:-200000}' \ + INGEST_MAX_BATCHES_PER_WAKE='${INGEST_MAX_BATCHES_PER_WAKE:-50}'; \ + exec /opt/autoblog/bin/ingest-loop.sh >> /var/log/ingest-loop.log 2>&1" + echo "[autoblog] starting sshd" exec /usr/sbin/sshd -D -e diff --git a/autoblog/ingest-loop.sh b/autoblog/ingest-loop.sh new file mode 100755 index 0000000..233342d --- /dev/null +++ b/autoblog/ingest-loop.sh @@ -0,0 +1,235 @@ +#!/bin/bash +# set -uo pipefail, NOT -e — loop must survive individual batch failures +set -uo 
pipefail + +POLL_INTERVAL="${INGEST_POLL_INTERVAL:-300}" +STABILITY_SECS="${INGEST_STABILITY_SECS:-60}" +BATCH_BYTES="${INGEST_BATCH_BYTES:-200000}" +MAX_BATCHES_PER_WAKE="${INGEST_MAX_BATCHES_PER_WAKE:-50}" +ENABLED="${INGEST_ENABLED:-1}" +FAILURE_BACKOFF_SECS=3600 + +# Must be verified against the installed `claude --help` output; newer versions +# use `--permission-mode bypassPermissions` instead of the flag below. +CLAUDE_PERM_FLAGS="--dangerously-skip-permissions" + +log_line() { + printf '%s %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$*" +} + +has_draft_frontmatter() { + awk 'BEGIN{in_fm=0} + NR==1 && $0=="---"{in_fm=1; next} + in_fm && $0=="---"{exit} + in_fm && /^status:[[:space:]]*draft[[:space:]]*$/{print "Y"; exit} + NR>40{exit}' "$1" | grep -q Y +} + +classify_op() { + local path="$1" sha="$2" + if grep -qF "| $path sha:$sha" /vault/log.md 2>/dev/null; then + printf '' + return + fi + if grep -qF "| $path sha:" /vault/log.md 2>/dev/null; then + printf 'reingest' + return + fi + printf 'new' +} + +build_pending_list() { + find /vault/raw -type f \ + -not -path '/vault/raw/assets/*' \ + -not -name '.DS_Store' \ + -not -name '.gitkeep' \ + -print0 \ + | while IFS= read -r -d '' path; do + # GNU stat only (container-only execution) + mtime=$(stat -c %Y "$path") + [ $(( $(date +%s) - mtime )) -lt "$STABILITY_SECS" ] && continue + + size=$(stat -c %s "$path") + [ "$size" -lt 50 ] && continue + + case "$path" in + *.md|*.markdown) + if has_draft_frontmatter "$path"; then continue; fi + ;; + esac + + sha=$(sha256sum "$path" | cut -c1-12) + relpath="${path#/vault/}" + op=$(classify_op "$relpath" "$sha") + [ -z "$op" ] && continue + + printf '%s\t%s\t%s\t%s\n' "$relpath" "$sha" "$size" "$op" + done +} + +# NUL-delimited output so consumer reads whole batches via `read -r -d ''`. +# printf "%c", 0 (not "\0") — mawk silently drops \0; %c,0 works in mawk+gawk. 
+build_batches() { + local budget="$1" + awk -v budget="$budget" ' + function flush() { + if (have_lines) { printf "%s%c", buf, 0 } + buf=""; have_lines=0; acc=0 + } + { + size = $3 + 0 + # Oversize file: flush current, emit it alone, reset. + if (size > budget && have_lines) flush() + if (buf) buf = buf "\n" $0; else buf = $0 + have_lines = 1 + acc += size + if (acc >= budget) flush() + } + END { flush() } + ' +} + +verify_and_patch_log() { + local batch="$1" had_miss=0 + local path sha size op + while IFS=$'\t' read -r path sha size op; do + [ -z "$path" ] && continue + if grep -qF "| $path sha:$sha" /vault/log.md; then continue; fi + # Look for an unsuffixed log line for this exact path. + if grep -qE "^## \[.*\] (ingest|re-ingest) \| $(printf '%s' "$path" | sed 's/[]\\/&.*^$[]/\\&/g')$" /vault/log.md; then + awk -v path="$path" -v sfx=" sha:$sha" ' + { + if (match($0, "^## \\[.*\\] (ingest|re-ingest) \\| " path "$")) { + last = NR + } + lines[NR] = $0 + } + END { + for (i = 1; i <= NR; i++) { + if (i == last) print lines[i] sfx + else print lines[i] + } + } + ' /vault/log.md > /vault/log.md.tmp && mv /vault/log.md.tmp /vault/log.md + had_miss=1 + else + log_line "WARNING: no log entry for $path after claude exit 0; will reprocess next poll" + fi + done <<< "$batch" + [ "$had_miss" = "1" ] && return 1 || return 0 +} + +run_claude_on_batch() { + local batch="$1" + [ -z "$batch" ] && { log_line "empty batch; skipping"; return 0; } + + local files_md="" path sha size op + while IFS=$'\t' read -r path sha size op; do + [ -z "$path" ] && continue + files_md+="- \`${path}\` (sha:${sha}, op:${op})"$'\n' + done <<< "$batch" + + if [ -z "$files_md" ]; then + log_line "batch produced no parseable entries; skipping" + return 0 + fi + + local prompt + prompt="$(cat <, where + is exactly the 12-char prefix provided above for that file. 
Example: + ## [YYYY-MM-DD] ingest | raw/foo.md sha:abc123def456 + ## [YYYY-MM-DD] re-ingest | raw/bar.md sha:789abc012345 + The auto-ingest loop reads this suffix for dedup; omitting it causes infinite + re-processing. + +3. Follow all normal skill steps (git pull, group attachments, write wiki pages, + update index.md, append log.md, commit, push). +EOF + )" + + # claude -p discovers CLAUDE.md + skills from cwd; /agent is the agent root. + cd /agent + local rc + # shellcheck disable=SC2086 # CLAUDE_PERM_FLAGS intentionally word-splits (one or two tokens) + claude -p "$prompt" $CLAUDE_PERM_FLAGS + rc=$? + [ $rc -ne 0 ] && return $rc + + verify_and_patch_log "$batch" || log_line "WARNING: post-check patched one or more log entries" + return 0 +} + +# One-shot first-run backfill of sha: suffixes on existing log entries. +/opt/autoblog/bin/migrate-log-sha.sh || log_line "migration failed (non-fatal)" + +in_backoff=0 + +while true; do + if [ "$ENABLED" != "1" ]; then + sleep "$POLL_INTERVAL" + continue + fi + + if [ "$in_backoff" = "1" ]; then + next_sleep="$FAILURE_BACKOFF_SECS" + else + next_sleep="$POLL_INTERVAL" + fi + + if ! git -C /vault pull --rebase --quiet; then + log_line "pull failed; sleeping $FAILURE_BACKOFF_SECS" + in_backoff=1 + sleep "$FAILURE_BACKOFF_SECS" + continue + fi + + pending="$(build_pending_list)" + if [ -z "$pending" ]; then + in_backoff=0 + sleep "$next_sleep" + continue + fi + + hit_cap=0 + failed=0 + batch_count=0 + while IFS= read -r -d '' batch; do + [ -z "$batch" ] && continue + batch_count=$(( batch_count + 1 )) + if [ "$batch_count" -gt "$MAX_BATCHES_PER_WAKE" ]; then + log_line "hit MAX_BATCHES_PER_WAKE=$MAX_BATCHES_PER_WAKE; remainder deferred" + hit_cap=1 + break + fi + if ! 
run_claude_on_batch "$batch"; then + log_line "batch failed; entering 1-hour backoff" + failed=1 + break + fi + done < <(build_batches "$BATCH_BYTES" <<< "$pending") + + if [ "$failed" = "1" ]; then + in_backoff=1 + sleep "$FAILURE_BACKOFF_SECS" + else + in_backoff=0 + # When capped but more pending, shorten next sleep to drain the queue. + if [ "$hit_cap" = "1" ] && [ "$POLL_INTERVAL" -gt 60 ]; then + sleep 60 + else + sleep "$POLL_INTERVAL" + fi + fi +done diff --git a/autoblog/migrate-log-sha.sh b/autoblog/migrate-log-sha.sh new file mode 100755 index 0000000..78a857d --- /dev/null +++ b/autoblog/migrate-log-sha.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# One-shot first-run backfill of sha: suffixes on pre-existing log +# entries. Idempotent: marker file short-circuits re-runs; awk also skips +# lines that already carry a sha suffix. +set -uo pipefail + +MARKER=/vault/.ingest-migration-done +[ -f "$MARKER" ] && exit 0 + +# log.md may not exist on a fresh vault — nothing to migrate. +[ -f /vault/log.md ] || { touch "$MARKER"; exit 0; } + +awk ' + /^## \[.*\] (ingest|re-ingest) \| raw\// { + if ($0 ~ / sha:[0-9a-f]+$/) { print; next } + idx = index($0, "| raw/") + path = substr($0, idx + 2) + full = "/vault/" path + cmd = "[ -f \"" full "\" ] && sha256sum \"" full "\" | cut -c1-12 || true" + sha = ""; cmd | getline sha + close(cmd) + if (sha ~ /^[0-9a-f]+$/) print $0 " sha:" sha + else print $0 + next + } + { print } +' /vault/log.md > /vault/log.md.tmp && mv /vault/log.md.tmp /vault/log.md + +touch "$MARKER" diff --git a/autoblog/templates/agent-template/.claude/skills/ingest-source.md b/autoblog/templates/agent-template/.claude/skills/ingest-source.md index 97db31e..07c821e 100644 --- a/autoblog/templates/agent-template/.claude/skills/ingest-source.md +++ b/autoblog/templates/agent-template/.claude/skills/ingest-source.md @@ -108,6 +108,11 @@ Do NOT try to compute a diff from previous versions. If the user needs that level of sophistication, they'll tell you and we'll build it.
For now: re-ingest means "read the current file and fold it into the wiki." +For auto-loop re-ingest, find the existing wiki/sources/-.md +whose frontmatter `sources:` list includes the raw path being re-processed, +and update that file in place. Do not create a new dated file. The date +in the filename reflects first ingest, not the re-ingest. + ## Batch handling On "process all pending": do the detection once, show the list, then process one @@ -117,6 +122,24 @@ If a single source fails partway (e.g. the file is corrupt): log a line `## [YYYY-MM-DD] skip | raw/` with the reason and move on. Commit what succeeded. Report which files failed at the end. +## Auto-ingest contract + +When invoked by the auto-ingest loop (not by a user), the prompt contains +an explicit list of files, each annotated with (sha:<12hex>, op:new|reingest). +Behavior in this mode: + +1. Do NOT scan raw/ yourself — use exactly the provided list, in order. +2. op:new → normal ingest per Steps above. + op:reingest → FIND the existing wiki/sources page whose frontmatter + `sources:` list includes this raw path, UPDATE it in place (do not create + a new dated file), and revise dependent entity/concept pages as needed. +3. Every log entry you append MUST end with ` sha:` using exactly + the 12-char prefix from the prompt. Example: + ## [YYYY-MM-DD] ingest | raw/foo.md sha:abc123def456 + ## [YYYY-MM-DD] re-ingest | raw/foo.md sha:789abc012345 + The loop uses this suffix to dedupe future scans. Omitting it causes + infinite re-processing. + ## Conventions the agent maintains - Wiki subdirs are content-type, not subject-matter: `daily/`, `notes/`, `ideas/`, `entities/`, `sources/`. @@ -136,3 +159,4 @@ Commit what succeeded. Report which files failed at the end. - No creating a new `wiki/daily/.md` if one already exists — APPEND a section. - No agent-authored page without `ai-generated:` and `sources:` frontmatter. - No skipping the pull-before-read or push-after-commit steps. Every run does both. 
+- No omitting the ` sha:` suffix on log entries. The auto-ingest loop depends on it.