18 changes: 18 additions & 0 deletions .env.example
@@ -25,3 +25,21 @@ CADDY_HTTPS_PORT=8443
# CADDY_BIND=0.0.0.0
# CADDY_HTTP_PORT=80
# CADDY_HTTPS_PORT=443

# --- Auto-ingest loop ---
# Background loop inside the container scans /vault/raw/ every
# INGEST_POLL_INTERVAL seconds and ingests files that pass all filters:
# - mtime older than INGEST_STABILITY_SECS (avoids catching Web Clipper
# mid-write or files you're actively typing)
# - size >= 50 bytes (skips empty stubs)
# - no `status: draft` in YAML frontmatter (use this on daily notes
# you're still working on; remove it when ready)
# - sha256 not already logged as ingested in /vault/log.md
# Files are grouped into batches up to INGEST_BATCH_BYTES; each batch
# runs in a fresh `claude -p` process. On failure the loop backs off
# to 1 hour. Set INGEST_ENABLED=0 to turn the loop off.
INGEST_ENABLED=1
INGEST_POLL_INTERVAL=300
INGEST_STABILITY_SECS=60
INGEST_BATCH_BYTES=200000
INGEST_MAX_BATCHES_PER_WAKE=50
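# For reference, the draft filter keys off YAML frontmatter at the top of a
# note. A file beginning like this (hypothetical content) is left alone until
# the status line is removed:
#   ---
#   status: draft
#   ---
#   Rough notes, not ready for the wiki yet.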
2 changes: 1 addition & 1 deletion autoblog/Dockerfile
@@ -29,7 +29,7 @@ RUN echo 'if [ -n "$SSH_CONNECTION" ] && [ -z "$TMUX" ]; then exec tmux new-sess

# sshd config + entrypoint scripts
COPY sshd_config /etc/ssh/sshd_config
COPY entrypoint.sh bootstrap-volumes.sh /opt/autoblog/bin/
COPY entrypoint.sh bootstrap-volumes.sh ingest-loop.sh migrate-log-sha.sh /opt/autoblog/bin/
RUN chmod +x /opt/autoblog/bin/*.sh

# Templates (read at first boot; never mutated at runtime)
4 changes: 4 additions & 0 deletions autoblog/bootstrap-volumes.sh
@@ -70,6 +70,7 @@ if ! git -C /vault-remote.git rev-parse --verify HEAD >/dev/null 2>&1; then

git init --bare -b main /vault-remote.git
git push /vault-remote.git main
cd / # leave /tmp/vault-seed before deleting it
rm -rf /tmp/vault-seed

git clone /vault-remote.git /vault
@@ -110,6 +111,9 @@ chown -R autoblog:autoblog \
/agent /site /vault /vault-remote.git \
/home/autoblog/.claude /home/autoblog/.ssh

# root's safe.directory setting doesn't apply to autoblog's git calls from the loop
sudo -u autoblog git config --global --add safe.directory '*'

# Readiness marker for the healthcheck
touch /var/run/autoblog-ready
echo "[bootstrap] ready"
15 changes: 15 additions & 0 deletions autoblog/entrypoint.sh
@@ -14,5 +14,20 @@ start-stop-daemon --start --background --chuid autoblog:autoblog \
--startas /bin/bash -- \
-c "exec npm run dev -- --host 0.0.0.0 --port 4321 >> /var/log/astro-dev.log 2>&1"

echo "[autoblog] starting ingest-loop"
touch /var/log/ingest-loop.log
chown autoblog:autoblog /var/log/ingest-loop.log
start-stop-daemon --start --background --chuid autoblog:autoblog \
--chdir /agent \
--make-pidfile --pidfile /var/run/ingest-loop.pid \
--startas /bin/bash -- \
-c "export ANTHROPIC_API_KEY='${ANTHROPIC_API_KEY:-}' \
INGEST_ENABLED='${INGEST_ENABLED:-1}' \
INGEST_POLL_INTERVAL='${INGEST_POLL_INTERVAL:-300}' \
INGEST_STABILITY_SECS='${INGEST_STABILITY_SECS:-60}' \
INGEST_BATCH_BYTES='${INGEST_BATCH_BYTES:-200000}' \
INGEST_MAX_BATCHES_PER_WAKE='${INGEST_MAX_BATCHES_PER_WAKE:-50}'; \
exec /opt/autoblog/bin/ingest-loop.sh >> /var/log/ingest-loop.log 2>&1"
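# Spot-check after boot (a possible manual check from inside the container,
# not part of the contract):
#   kill -0 "$(cat /var/run/ingest-loop.pid)" && tail -n 5 /var/log/ingest-loop.log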

echo "[autoblog] starting sshd"
exec /usr/sbin/sshd -D -e
235 changes: 235 additions & 0 deletions autoblog/ingest-loop.sh
@@ -0,0 +1,235 @@
#!/bin/bash
# set -uo pipefail, NOT -e — loop must survive individual batch failures
set -uo pipefail

POLL_INTERVAL="${INGEST_POLL_INTERVAL:-300}"
STABILITY_SECS="${INGEST_STABILITY_SECS:-60}"
BATCH_BYTES="${INGEST_BATCH_BYTES:-200000}"
MAX_BATCHES_PER_WAKE="${INGEST_MAX_BATCHES_PER_WAKE:-50}"
ENABLED="${INGEST_ENABLED:-1}"
FAILURE_BACKOFF_SECS=3600

# Must be verified against the installed `claude --help` output; newer versions
# use `--permission-mode bypassPermissions` instead of the flag below.
CLAUDE_PERM_FLAGS="--dangerously-skip-permissions"
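# e.g. on newer CLIs, per the note above: CLAUDE_PERM_FLAGS="--permission-mode bypassPermissions"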

log_line() {
printf '%s %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$*"
}

has_draft_frontmatter() {
awk 'BEGIN{in_fm=0}
NR==1 && $0=="---"{in_fm=1; next}
in_fm && $0=="---"{exit}
in_fm && /^status:[[:space:]]*draft[[:space:]]*$/{print "Y"; exit}
NR>40{exit}' "$1" | grep -q Y
}

classify_op() {
local path="$1" sha="$2"
if grep -qF "| $path sha:$sha" /vault/log.md 2>/dev/null; then
printf ''
return
fi
if grep -qF "| $path sha:" /vault/log.md 2>/dev/null; then
printf 'reingest'
return
fi
printf 'new'
}
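# Illustration (hypothetical log.md containing
# "## [2024-05-01] ingest | raw/foo.md sha:abc123def456"):
#   classify_op "raw/foo.md" "abc123def456"  -> ""          (already ingested, skip)
#   classify_op "raw/foo.md" "0123456789ab"  -> "reingest"  (same path, new content)
#   classify_op "raw/bar.md" "abc123def456"  -> "new"       (path never logged)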

build_pending_list() {
find /vault/raw -type f \
-not -path '/vault/raw/assets/*' \
-not -name '.DS_Store' \
-not -name '.gitkeep' \
-print0 \
| while IFS= read -r -d '' path; do
# GNU stat only (container-only execution)
mtime=$(stat -c %Y "$path")
[ $(( $(date +%s) - mtime )) -lt "$STABILITY_SECS" ] && continue

size=$(stat -c %s "$path")
[ "$size" -lt 50 ] && continue

case "$path" in
*.md|*.markdown)
if has_draft_frontmatter "$path"; then continue; fi
;;
esac

sha=$(sha256sum "$path" | cut -c1-12)
relpath="${path#/vault/}"
op=$(classify_op "$relpath" "$sha")
[ -z "$op" ] && continue

printf '%s\t%s\t%s\t%s\n' "$relpath" "$sha" "$size" "$op"
done
}
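# Output is one TSV line per pending file: relpath \t sha12 \t size \t op.
# Example line (hypothetical values):
#   raw/foo.md	abc123def456	1234	new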

# NUL-delimited output so consumer reads whole batches via `read -r -d ''`.
# printf "%c", 0 (not "\0") — mawk silently drops \0; %c,0 works in mawk+gawk.
build_batches() {
local budget="$1"
awk -v budget="$budget" '
function flush() {
if (have_lines) { printf "%s%c", buf, 0 }
buf=""; have_lines=0; acc=0
}
{
size = $3 + 0
# Oversize file: flush current, emit it alone, reset.
if (size > budget && have_lines) flush()
if (buf) buf = buf "\n" $0; else buf = $0
have_lines = 1
acc += size
if (acc >= budget) flush()
}
END { flush() }
'
}
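# Worked example with budget=200000 (hypothetical sizes): inputs of 150000,
# 120000, and 30000 bytes yield two batches. File 1 alone stays under budget;
# adding file 2 pushes acc to 270000 >= budget, so both flush together (a
# batch may overshoot by up to one file); file 3 flushes alone at END.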

verify_and_patch_log() {
local batch="$1" had_miss=0
local path sha size op
while IFS=$'\t' read -r path sha size op; do
[ -z "$path" ] && continue
if grep -qF "| $path sha:$sha" /vault/log.md; then continue; fi
# Look for an unsuffixed log line for this exact path.
if grep -qE "^## \[.*\] (ingest|re-ingest) \| $(printf '%s' "$path" | sed 's/[]\\/&.*^$[]/\\&/g')$" /vault/log.md; then
awk -v path="$path" -v sfx=" sha:$sha" '
{
# Compare the text after " | " literally; splicing path into a dynamic
# regex (as the grep above does with escaping) would let unescaped
# metacharacters in filenames overmatch here.
if ($0 ~ /^## \[.*\] (ingest|re-ingest) \| /) {
if (substr($0, index($0, " | ") + 3) == path) last = NR
}
lines[NR] = $0
}
END {
for (i = 1; i <= NR; i++) {
if (i == last) print lines[i] sfx
else print lines[i]
}
}
' /vault/log.md > /vault/log.md.tmp && mv /vault/log.md.tmp /vault/log.md
had_miss=1
else
log_line "WARNING: no log entry for $path after claude exit 0; will reprocess next poll"
fi
done <<< "$batch"
[ "$had_miss" = "1" ] && return 1 || return 0
}
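# Illustration: for batch line "raw/foo.md\tabc123def456\t1234\tnew", if
# log.md has "## [2024-05-01] ingest | raw/foo.md" with no suffix, the awk
# pass appends " sha:abc123def456" to the LAST such line and the function
# returns 1, so the caller logs the "post-check patched" warning. A line
# already carrying the right suffix is left untouched.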

run_claude_on_batch() {
local batch="$1"
[ -z "$batch" ] && { log_line "empty batch; skipping"; return 0; }

local files_md="" path sha size op
while IFS=$'\t' read -r path sha size op; do
[ -z "$path" ] && continue
files_md+="- \`${path}\` (sha:${sha}, op:${op})"$'\n'
done <<< "$batch"

if [ -z "$files_md" ]; then
log_line "batch produced no parseable entries; skipping"
return 0
fi

local prompt
prompt="$(cat <<EOF
Run the ingest-source skill NOW on exactly these files, in this order. Do not
scan raw/ for other files — use only this list:

${files_md}
CONTRACT (must follow exactly, no exceptions):

1. For each file: op:new means normal ingest; op:reingest means rewrite the
EXISTING wiki/sources page (do not create a new dated file — find the source
page whose frontmatter sources: list includes this raw path and update in place)
and revise dependent entity/concept pages.

2. Every log entry you append must end with a space then sha:<prefix>, where
<prefix> is exactly the 12-char prefix provided above for that file. Example:
## [YYYY-MM-DD] ingest | raw/foo.md sha:abc123def456
## [YYYY-MM-DD] re-ingest | raw/bar.md sha:789abc012345
The auto-ingest loop reads this suffix for dedup; omitting it causes infinite
re-processing.

3. Follow all normal skill steps (git pull, group attachments, write wiki pages,
update index.md, append log.md, commit, push).
EOF
)"

# claude -p discovers CLAUDE.md + skills from cwd; /agent is the agent root.
cd /agent
local rc
# shellcheck disable=SC2086 # CLAUDE_PERM_FLAGS intentionally word-splits (one or two tokens)
claude -p "$prompt" $CLAUDE_PERM_FLAGS
rc=$?
[ $rc -ne 0 ] && return $rc

verify_and_patch_log "$batch" || log_line "WARNING: post-check patched one or more log entries"
return 0
}

# One-shot first-run backfill of sha: suffixes on existing log entries.
/opt/autoblog/bin/migrate-log-sha.sh || log_line "migration failed (non-fatal)"

in_backoff=0

while true; do
if [ "$ENABLED" != "1" ]; then
sleep "$POLL_INTERVAL"
continue
fi

if [ "$in_backoff" = "1" ]; then
next_sleep="$FAILURE_BACKOFF_SECS"
else
next_sleep="$POLL_INTERVAL"
fi

if ! git -C /vault pull --rebase --quiet; then
log_line "pull failed; sleeping $FAILURE_BACKOFF_SECS"
in_backoff=1
sleep "$FAILURE_BACKOFF_SECS"
continue
fi

pending="$(build_pending_list)"
if [ -z "$pending" ]; then
in_backoff=0
sleep "$next_sleep"
continue
fi

hit_cap=0
failed=0
batch_count=0
while IFS= read -r -d '' batch; do
[ -z "$batch" ] && continue
batch_count=$(( batch_count + 1 ))
if [ "$batch_count" -gt "$MAX_BATCHES_PER_WAKE" ]; then
log_line "hit MAX_BATCHES_PER_WAKE=$MAX_BATCHES_PER_WAKE; remainder deferred"
hit_cap=1
break
fi
if ! run_claude_on_batch "$batch"; then
log_line "batch failed; entering 1-hour backoff"
failed=1
break
fi
done < <(build_batches "$BATCH_BYTES" <<< "$pending")

if [ "$failed" = "1" ]; then
in_backoff=1
sleep "$FAILURE_BACKOFF_SECS"
else
in_backoff=0
# When capped but more pending, shorten next sleep to drain the queue.
if [ "$hit_cap" = "1" ] && [ "$POLL_INTERVAL" -gt 60 ]; then
sleep 60
else
sleep "$POLL_INTERVAL"
fi
fi
done
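# To watch the loop in operation (container/service name is an assumption;
# the log path comes from entrypoint.sh above):
#   docker exec -it autoblog tail -f /var/log/ingest-loop.log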
29 changes: 29 additions & 0 deletions autoblog/migrate-log-sha.sh
@@ -0,0 +1,29 @@
#!/bin/bash
# One-shot first-run backfill of sha:<prefix> suffixes on pre-existing log
# entries. Idempotent: marker file short-circuits re-runs; awk also skips
# lines that already carry a sha suffix.
set -uo pipefail

MARKER=/vault/.ingest-migration-done
[ -f "$MARKER" ] && exit 0

# log.md may not exist on a fresh vault — nothing to migrate.
[ -f /vault/log.md ] || { touch "$MARKER"; exit 0; }

awk '
/^## \[.*\] (ingest|re-ingest) \| raw\// {
if ($0 ~ / sha:[0-9a-f]+$/) { print; next }
idx = index($0, "| raw/")
path = substr($0, idx + 2)
full = "/vault/" path
cmd = "[ -f \"" full "\" ] && sha256sum \"" full "\" | cut -c1-12 || true"
sha = ""  # reset first: if cmd prints nothing, getline would leave sha holding the previous file's value
cmd | getline sha
close(cmd)
if (sha ~ /^[0-9a-f]+$/) print $0 " sha:" sha
else print $0
next
}
{ print }
' /vault/log.md > /vault/log.md.tmp && mv /vault/log.md.tmp /vault/log.md

touch "$MARKER"
24 changes: 24 additions & 0 deletions autoblog/templates/agent-template/.claude/skills/ingest-source.md
@@ -108,6 +108,11 @@ Do NOT try to compute a diff from previous versions. If the user needs that
level of sophistication, they'll tell you and we'll build it. For now: re-ingest
means "read the current file and fold it into the wiki."

For auto-loop re-ingest, find the existing wiki/sources/<date>-<slug>.md
whose frontmatter `sources:` list includes the raw path being re-processed,
and update that file in place. Do not create a new dated file. The date
in the filename reflects first ingest, not the re-ingest.
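For illustration, a matching sources page might carry frontmatter like this
(values hypothetical; the field names follow the conventions below):

    ---
    ai-generated: true
    sources:
      - raw/foo.md
    ---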

## Batch handling

On "process all pending": do the detection once, show the list, then process one
@@ -117,6 +122,24 @@ If a single source fails partway (e.g. the file is corrupt): log a line
`## [YYYY-MM-DD] skip | raw/<path> — <reason>` with the reason and move on.
Commit what succeeded. Report which files failed at the end.

## Auto-ingest contract

When invoked by the auto-ingest loop (not by a user), the prompt contains
an explicit list of files, each annotated with (sha:<12hex>, op:new|reingest).
Behavior in this mode:

1. Do NOT scan raw/ yourself — use exactly the provided list, in order.
2. op:new → normal ingest per Steps above.
op:reingest → FIND the existing wiki/sources page whose frontmatter
`sources:` list includes this raw path, UPDATE it in place (do not create
a new dated file), and revise dependent entity/concept pages as needed.
3. Every log entry you append MUST end with ` sha:<prefix>` using exactly
the 12-char prefix from the prompt. Example:
## [YYYY-MM-DD] ingest | raw/foo.md sha:abc123def456
## [YYYY-MM-DD] re-ingest | raw/foo.md sha:789abc012345
The loop uses this suffix to dedupe future scans. Omitting it causes
infinite re-processing.

## Conventions the agent maintains

- Wiki subdirs are content-type, not subject-matter: `daily/`, `notes/`, `ideas/`, `entities/`, `sources/`.
@@ -136,3 +159,4 @@ Commit what succeeded. Report which files failed at the end.
- No creating a new `wiki/daily/<date>.md` if one already exists — APPEND a section.
- No agent-authored page without `ai-generated:` and `sources:` frontmatter.
- No skipping the pull-before-read or push-after-commit steps. Every run does both.
- No omitting the ` sha:<prefix>` suffix on log entries. The auto-ingest loop depends on it.