18 changes: 18 additions & 0 deletions .env.example
@@ -25,3 +25,21 @@ CADDY_HTTPS_PORT=8443
# CADDY_BIND=0.0.0.0
# CADDY_HTTP_PORT=80
# CADDY_HTTPS_PORT=443

# --- Auto-ingest loop ---
# Background loop inside the container scans /vault/raw/ every
# INGEST_POLL_INTERVAL seconds and ingests files that pass all filters:
# - mtime older than INGEST_STABILITY_SECS (avoids catching Web Clipper
# mid-write or files you're actively typing)
# - size >= 50 bytes (skips empty stubs)
# - no `status: draft` in YAML frontmatter (use this on daily notes
# you're still working on; remove it when ready)
# - sha256 not already logged as ingested in /vault/log.md
# Files are grouped into batches up to INGEST_BATCH_BYTES; each batch
# runs in a fresh `claude -p` process. On failure the loop backs off
# to 1 hour. Set INGEST_ENABLED=0 to turn the loop off.
INGEST_ENABLED=1
INGEST_POLL_INTERVAL=300
INGEST_STABILITY_SECS=60
INGEST_BATCH_BYTES=200000
INGEST_MAX_BATCHES_PER_WAKE=50
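# For reference, the draft filter keys off YAML frontmatter at the top of a
# note. A file beginning like this (hypothetical content) is left alone until
# the status line is removed:
#   ---
#   status: draft
#   ---
#   Rough notes, not ready for the wiki yet.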
2 changes: 1 addition & 1 deletion autoblog/Dockerfile
@@ -29,7 +29,7 @@ RUN echo 'if [ -n "$SSH_CONNECTION" ] && [ -z "$TMUX" ]; then exec tmux new-sess

# sshd config + entrypoint scripts
COPY sshd_config /etc/ssh/sshd_config
COPY entrypoint.sh bootstrap-volumes.sh /opt/autoblog/bin/
COPY entrypoint.sh bootstrap-volumes.sh ingest-loop.sh migrate-log-sha.sh /opt/autoblog/bin/
RUN chmod +x /opt/autoblog/bin/*.sh

# Templates (read at first boot; never mutated at runtime)
4 changes: 4 additions & 0 deletions autoblog/bootstrap-volumes.sh
@@ -70,6 +70,7 @@ if ! git -C /vault-remote.git rev-parse --verify HEAD >/dev/null 2>&1; then

git init --bare -b main /vault-remote.git
git push /vault-remote.git main
cd / # leave /tmp/vault-seed before deleting it
rm -rf /tmp/vault-seed

git clone /vault-remote.git /vault
@@ -110,6 +111,9 @@ chown -R autoblog:autoblog \
/agent /site /vault /vault-remote.git \
/home/autoblog/.claude /home/autoblog/.ssh

# root's safe.directory setting doesn't apply to autoblog's git calls from the loop
sudo -u autoblog git config --global --add safe.directory '*'

# Readiness marker for the healthcheck
touch /var/run/autoblog-ready
echo "[bootstrap] ready"
15 changes: 15 additions & 0 deletions autoblog/entrypoint.sh
@@ -14,5 +14,20 @@ start-stop-daemon --start --background --chuid autoblog:autoblog \
--startas /bin/bash -- \
-c "exec npm run dev -- --host 0.0.0.0 --port 4321 >> /var/log/astro-dev.log 2>&1"

echo "[autoblog] starting ingest-loop"
touch /var/log/ingest-loop.log
chown autoblog:autoblog /var/log/ingest-loop.log
start-stop-daemon --start --background --chuid autoblog:autoblog \
--chdir /agent \
--make-pidfile --pidfile /var/run/ingest-loop.pid \
--startas /bin/bash -- \
-c "export ANTHROPIC_API_KEY='${ANTHROPIC_API_KEY:-}' \
INGEST_ENABLED='${INGEST_ENABLED:-1}' \
INGEST_POLL_INTERVAL='${INGEST_POLL_INTERVAL:-300}' \
INGEST_STABILITY_SECS='${INGEST_STABILITY_SECS:-60}' \
INGEST_BATCH_BYTES='${INGEST_BATCH_BYTES:-200000}' \
INGEST_MAX_BATCHES_PER_WAKE='${INGEST_MAX_BATCHES_PER_WAKE:-50}'; \
exec /opt/autoblog/bin/ingest-loop.sh >> /var/log/ingest-loop.log 2>&1"
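# Spot-check after boot (a possible manual check from inside the container,
# not part of the contract):
#   kill -0 "$(cat /var/run/ingest-loop.pid)" && tail -n 5 /var/log/ingest-loop.log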

echo "[autoblog] starting sshd"
exec /usr/sbin/sshd -D -e
235 changes: 235 additions & 0 deletions autoblog/ingest-loop.sh
@@ -0,0 +1,235 @@
#!/bin/bash
# set -uo pipefail, NOT -e — loop must survive individual batch failures
set -uo pipefail

POLL_INTERVAL="${INGEST_POLL_INTERVAL:-300}"
STABILITY_SECS="${INGEST_STABILITY_SECS:-60}"
BATCH_BYTES="${INGEST_BATCH_BYTES:-200000}"
MAX_BATCHES_PER_WAKE="${INGEST_MAX_BATCHES_PER_WAKE:-50}"
ENABLED="${INGEST_ENABLED:-1}"
FAILURE_BACKOFF_SECS=3600

# Must be verified against the installed `claude --help` output; newer versions
# use `--permission-mode bypassPermissions` instead of the flag below.
CLAUDE_PERM_FLAGS="--dangerously-skip-permissions"
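# e.g. on newer CLIs, per the note above: CLAUDE_PERM_FLAGS="--permission-mode bypassPermissions"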

log_line() {
printf '%s %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$*"
}

has_draft_frontmatter() {
awk 'BEGIN{in_fm=0}
NR==1 && $0=="---"{in_fm=1; next}
in_fm && $0=="---"{exit}
in_fm && /^status:[[:space:]]*draft[[:space:]]*$/{print "Y"; exit}
NR>40{exit}' "$1" | grep -q Y
}

classify_op() {
local path="$1" sha="$2"
if grep -qF "| $path sha:$sha" /vault/log.md 2>/dev/null; then
printf ''
return
fi
if grep -qF "| $path sha:" /vault/log.md 2>/dev/null; then
printf 'reingest'
return
fi
printf 'new'
}
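# Illustration (hypothetical log.md containing
# "## [2024-05-01] ingest | raw/foo.md sha:abc123def456"):
#   classify_op "raw/foo.md" "abc123def456"  -> ""          (already ingested, skip)
#   classify_op "raw/foo.md" "0123456789ab"  -> "reingest"  (same path, new content)
#   classify_op "raw/bar.md" "abc123def456"  -> "new"       (path never logged)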

build_pending_list() {
find /vault/raw -type f \
-not -path '/vault/raw/assets/*' \
-not -name '.DS_Store' \
-not -name '.gitkeep' \
-print0 \
| while IFS= read -r -d '' path; do
# GNU stat only (container-only execution)
mtime=$(stat -c %Y "$path")
[ $(( $(date +%s) - mtime )) -lt "$STABILITY_SECS" ] && continue

size=$(stat -c %s "$path")
[ "$size" -lt 50 ] && continue

case "$path" in
*.md|*.markdown)
if has_draft_frontmatter "$path"; then continue; fi
;;
esac

sha=$(sha256sum "$path" | cut -c1-12)
relpath="${path#/vault/}"
op=$(classify_op "$relpath" "$sha")
[ -z "$op" ] && continue

printf '%s\t%s\t%s\t%s\n' "$relpath" "$sha" "$size" "$op"
done
}
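# Output is one TSV line per pending file: relpath \t sha12 \t size \t op.
# Example line (hypothetical values):
#   raw/foo.md	abc123def456	1234	new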

# NUL-delimited output so consumer reads whole batches via `read -r -d ''`.
# printf "%c", 0 (not "\0") — mawk silently drops \0; %c,0 works in mawk+gawk.
build_batches() {
local budget="$1"
awk -v budget="$budget" '
function flush() {
if (have_lines) { printf "%s%c", buf, 0 }
buf=""; have_lines=0; acc=0
}
{
size = $3 + 0
# Oversize file: flush current, emit it alone, reset.
if (size > budget && have_lines) flush()
if (buf) buf = buf "\n" $0; else buf = $0
have_lines = 1
acc += size
if (acc >= budget) flush()
}
END { flush() }
'
}
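# Worked example with budget=200000 (hypothetical sizes): inputs of 150000,
# 120000, and 30000 bytes yield two batches. File 1 alone stays under budget;
# adding file 2 pushes acc to 270000 >= budget, so both flush together (a
# batch may overshoot by up to one file); file 3 flushes alone at END.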

verify_and_patch_log() {
local batch="$1" had_miss=0
local path sha size op
while IFS=$'\t' read -r path sha size op; do
[ -z "$path" ] && continue
if grep -qF "| $path sha:$sha" /vault/log.md; then continue; fi
# Look for an unsuffixed log line for this exact path.
if grep -qE "^## \[.*\] (ingest|re-ingest) \| $(printf '%s' "$path" | sed 's/[]\\/&.*^$[]/\\&/g')$" /vault/log.md; then
awk -v path="$path" -v sfx=" sha:$sha" '
{
# Compare the text after " | " literally; splicing path into a dynamic
# regex (as the grep above does with escaping) would let unescaped
# metacharacters in filenames overmatch here.
if ($0 ~ /^## \[.*\] (ingest|re-ingest) \| /) {
if (substr($0, index($0, " | ") + 3) == path) last = NR
}
lines[NR] = $0
}
END {
for (i = 1; i <= NR; i++) {
if (i == last) print lines[i] sfx
else print lines[i]
}
}
' /vault/log.md > /vault/log.md.tmp && mv /vault/log.md.tmp /vault/log.md
had_miss=1
else
log_line "WARNING: no log entry for $path after claude exit 0; will reprocess next poll"
fi
done <<< "$batch"
[ "$had_miss" = "1" ] && return 1 || return 0
}
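# Illustration: for batch line "raw/foo.md\tabc123def456\t1234\tnew", if
# log.md has "## [2024-05-01] ingest | raw/foo.md" with no suffix, the awk
# pass appends " sha:abc123def456" to the LAST such line and the function
# returns 1, so the caller logs the "post-check patched" warning. A line
# already carrying the right suffix is left untouched.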

run_claude_on_batch() {
local batch="$1"
[ -z "$batch" ] && { log_line "empty batch; skipping"; return 0; }

local files_md="" path sha size op
while IFS=$'\t' read -r path sha size op; do
[ -z "$path" ] && continue
files_md+="- \`${path}\` (sha:${sha}, op:${op})"$'\n'
done <<< "$batch"

if [ -z "$files_md" ]; then
log_line "batch produced no parseable entries; skipping"
return 0
fi

local prompt
prompt="$(cat <<EOF
Run the ingest-source skill NOW on exactly these files, in this order. Do not
scan raw/ for other files — use only this list:

${files_md}
CONTRACT (must follow exactly, no exceptions):

1. For each file: op:new means normal ingest; op:reingest means rewrite the
EXISTING wiki/sources page (do not create a new dated file — find the source
page whose frontmatter sources: list includes this raw path and update in place)
and revise dependent entity/concept pages.

2. Every log entry you append must end with a space then sha:<prefix>, where
<prefix> is exactly the 12-char prefix provided above for that file. Example:
## [YYYY-MM-DD] ingest | raw/foo.md sha:abc123def456
## [YYYY-MM-DD] re-ingest | raw/bar.md sha:789abc012345
The auto-ingest loop reads this suffix for dedup; omitting it causes infinite
re-processing.

3. Follow all normal skill steps (git pull, group attachments, write wiki pages,
update index.md, append log.md, commit, push).
EOF
)"

# claude -p discovers CLAUDE.md + skills from cwd; /agent is the agent root.
cd /agent
local rc
# shellcheck disable=SC2086 # CLAUDE_PERM_FLAGS intentionally word-splits (one or two tokens)
claude -p "$prompt" $CLAUDE_PERM_FLAGS
rc=$?
[ $rc -ne 0 ] && return $rc

verify_and_patch_log "$batch" || log_line "WARNING: post-check patched one or more log entries"
return 0
}

# One-shot first-run backfill of sha: suffixes on existing log entries.
/opt/autoblog/bin/migrate-log-sha.sh || log_line "migration failed (non-fatal)"

in_backoff=0

while true; do
if [ "$ENABLED" != "1" ]; then
sleep "$POLL_INTERVAL"
continue
fi

if [ "$in_backoff" = "1" ]; then
next_sleep="$FAILURE_BACKOFF_SECS"
else
next_sleep="$POLL_INTERVAL"
fi

if ! git -C /vault pull --rebase --quiet; then
log_line "pull failed; sleeping $FAILURE_BACKOFF_SECS"
in_backoff=1
sleep "$FAILURE_BACKOFF_SECS"
continue
fi

pending="$(build_pending_list)"
if [ -z "$pending" ]; then
in_backoff=0
sleep "$next_sleep"
continue
fi

hit_cap=0
failed=0
batch_count=0
while IFS= read -r -d '' batch; do
[ -z "$batch" ] && continue
batch_count=$(( batch_count + 1 ))
if [ "$batch_count" -gt "$MAX_BATCHES_PER_WAKE" ]; then
log_line "hit MAX_BATCHES_PER_WAKE=$MAX_BATCHES_PER_WAKE; remainder deferred"
hit_cap=1
break
fi
if ! run_claude_on_batch "$batch"; then
log_line "batch failed; entering 1-hour backoff"
failed=1
break
fi
done < <(build_batches "$BATCH_BYTES" <<< "$pending")

if [ "$failed" = "1" ]; then
in_backoff=1
sleep "$FAILURE_BACKOFF_SECS"
else
in_backoff=0
# When capped but more pending, shorten next sleep to drain the queue.
if [ "$hit_cap" = "1" ] && [ "$POLL_INTERVAL" -gt 60 ]; then
sleep 60
else
sleep "$POLL_INTERVAL"
fi
fi
done
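# To watch the loop in operation (container/service name is an assumption;
# the log path comes from entrypoint.sh above):
#   docker exec -it autoblog tail -f /var/log/ingest-loop.log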
29 changes: 29 additions & 0 deletions autoblog/migrate-log-sha.sh
@@ -0,0 +1,29 @@
#!/bin/bash
# One-shot first-run backfill of sha:<prefix> suffixes on pre-existing log
# entries. Idempotent: marker file short-circuits re-runs; awk also skips
# lines that already carry a sha suffix.
set -uo pipefail

MARKER=/vault/.ingest-migration-done
[ -f "$MARKER" ] && exit 0

# log.md may not exist on a fresh vault — nothing to migrate.
[ -f /vault/log.md ] || { touch "$MARKER"; exit 0; }

awk '
/^## \[.*\] (ingest|re-ingest) \| raw\// {
if ($0 ~ / sha:[0-9a-f]+$/) { print; next }
idx = index($0, "| raw/")
path = substr($0, idx + 2)
full = "/vault/" path
cmd = "[ -f \"" full "\" ] && sha256sum \"" full "\" | cut -c1-12 || true"
sha = ""  # reset first: if cmd prints nothing, getline would leave sha holding the previous file's value
cmd | getline sha
close(cmd)
if (sha ~ /^[0-9a-f]+$/) print $0 " sha:" sha
else print $0
next
}
{ print }
' /vault/log.md > /vault/log.md.tmp && mv /vault/log.md.tmp /vault/log.md

touch "$MARKER"
24 changes: 24 additions & 0 deletions autoblog/templates/agent-template/.claude/skills/ingest-source.md
@@ -108,6 +108,11 @@ Do NOT try to compute a diff from previous versions. If the user needs that
level of sophistication, they'll tell you and we'll build it. For now: re-ingest
means "read the current file and fold it into the wiki."

For auto-loop re-ingest, find the existing wiki/sources/<date>-<slug>.md
whose frontmatter `sources:` list includes the raw path being re-processed,
and update that file in place. Do not create a new dated file. The date
in the filename reflects first ingest, not the re-ingest.
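For illustration, a matching sources page might carry frontmatter like this
(values hypothetical; the field names follow the conventions below):

    ---
    ai-generated: true
    sources:
      - raw/foo.md
    ---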

## Batch handling

On "process all pending": do the detection once, show the list, then process one
@@ -117,6 +122,24 @@ If a single source fails partway (e.g. the file is corrupt): log a line
`## [YYYY-MM-DD] skip | raw/<path> — <reason>` with the reason and move on.
Commit what succeeded. Report which files failed at the end.

## Auto-ingest contract

When invoked by the auto-ingest loop (not by a user), the prompt contains
an explicit list of files, each annotated with (sha:<12hex>, op:new|reingest).
Behavior in this mode:

1. Do NOT scan raw/ yourself — use exactly the provided list, in order.
2. op:new → normal ingest per Steps above.
op:reingest → FIND the existing wiki/sources page whose frontmatter
`sources:` list includes this raw path, UPDATE it in place (do not create
a new dated file), and revise dependent entity/concept pages as needed.
3. Every log entry you append MUST end with ` sha:<prefix>` using exactly
the 12-char prefix from the prompt. Example:
## [YYYY-MM-DD] ingest | raw/foo.md sha:abc123def456
## [YYYY-MM-DD] re-ingest | raw/foo.md sha:789abc012345
The loop uses this suffix to dedupe future scans. Omitting it causes
infinite re-processing.

## Conventions the agent maintains

- Wiki subdirs are content-type, not subject-matter: `daily/`, `notes/`, `ideas/`, `entities/`, `sources/`.
@@ -136,3 +159,4 @@ Commit what succeeded. Report which files failed at the end.
- No creating a new `wiki/daily/<date>.md` if one already exists — APPEND a section.
- No agent-authored page without `ai-generated:` and `sources:` frontmatter.
- No skipping the pull-before-read or push-after-commit steps. Every run does both.
- No omitting the ` sha:<prefix>` suffix on log entries. The auto-ingest loop depends on it.