diff --git a/src/clawops/recovery/backends.py b/src/clawops/recovery/backends.py index 3dc181d..257a807 100644 --- a/src/clawops/recovery/backends.py +++ b/src/clawops/recovery/backends.py @@ -12,7 +12,7 @@ type RunCommandFunc = Callable[..., ExecResult] type TarWriter = Callable[ [pathlib.Path], - None, + dict[str, object], ] @@ -21,7 +21,9 @@ class BackupBackend(Protocol): name: str - def create(self, archive_tmp_path: pathlib.Path) -> tuple[bool, str | None]: + def create( + self, archive_tmp_path: pathlib.Path + ) -> tuple[bool, str | None, dict[str, object] | None]: """Create one archive and return status + optional failure reason.""" ... @@ -39,16 +41,18 @@ def is_available(self) -> bool: """Return whether the OpenClaw executable is available.""" return self._which("openclaw") is not None - def create(self, archive_tmp_path: pathlib.Path) -> tuple[bool, str | None]: + def create( + self, archive_tmp_path: pathlib.Path + ) -> tuple[bool, str | None, dict[str, object] | None]: """Create one archive through OpenClaw.""" result = self._run_command( ["openclaw", "backup", "create", str(archive_tmp_path)], timeout_seconds=600, ) if result.ok: - return True, None + return True, None, None detail = result.stderr.strip() or result.stdout.strip() or "openclaw backup create failed" - return False, detail + return False, detail, None class TarBackupBackend: @@ -59,7 +63,9 @@ class TarBackupBackend: def __init__(self, *, writer: TarWriter) -> None: self._writer = writer - def create(self, archive_tmp_path: pathlib.Path) -> tuple[bool, str | None]: + def create( + self, archive_tmp_path: pathlib.Path + ) -> tuple[bool, str | None, dict[str, object] | None]: """Create one archive through the fallback tar writer.""" - self._writer(archive_tmp_path) - return True, None + manifest = self._writer(archive_tmp_path) + return True, None, manifest diff --git a/src/clawops/recovery/models.py b/src/clawops/recovery/models.py index 2389476..8844333 100644 --- a/src/clawops/recovery/models.py +++ b/src/clawops/recovery/models.py @@ -49,3 +49,4 @@ class BackupCreateExecution: mode: str | None = None fallback_used: bool = False fallback_reason: str | None = None + manifest: dict[str, object] | None = None diff --git a/src/clawops/recovery/orchestrator.py b/src/clawops/recovery/orchestrator.py index 6b6fb97..87ee332 100644 --- a/src/clawops/recovery/orchestrator.py +++ b/src/clawops/recovery/orchestrator.py @@ -13,7 +13,7 @@ from clawops.strongclaw_runtime import CommandError, ExecResult type RunCommandFunc = Callable[..., ExecResult] -type TarWriter = Callable[[pathlib.Path], None] +type TarWriter = Callable[[pathlib.Path], dict[str, object]] type SafeUnlink = Callable[[pathlib.Path], None] @@ -49,7 +49,7 @@ def create_backup_execution( fallback_reason: str | None = None if openclaw_backend.is_available(): safe_unlink(archive_tmp_path) - backend_ok, backend_error = openclaw_backend.create(archive_tmp_path) + backend_ok, backend_error, _ = openclaw_backend.create(archive_tmp_path) if backend_ok: archive_tmp_path.replace(archive_path) return BackupCreateExecution( @@ -65,8 +65,9 @@ def create_backup_execution( safe_unlink(archive_tmp_path) fallback_backend = TarBackupBackend(writer=tar_writer) + fallback_manifest: dict[str, object] | None = None try: - fallback_backend.create(archive_tmp_path) + _, _, fallback_manifest = fallback_backend.create(archive_tmp_path) archive_tmp_path.replace(archive_path) except (OSError, tarfile.TarError) as exc: safe_unlink(archive_tmp_path) @@ -79,4 +80,5 @@ def create_backup_execution( mode="tar-fallback", fallback_used=fallback_reason is not None, fallback_reason=fallback_reason, + manifest=fallback_manifest, ) diff --git a/src/clawops/recovery/telemetry.py b/src/clawops/recovery/telemetry.py index 4ca5555..a493519 100644 --- a/src/clawops/recovery/telemetry.py +++ b/src/clawops/recovery/telemetry.py @@ -4,7 +4,9 @@ from collections.abc import Mapping +from clawops.observability import TelemetryValue -def event_payload(event: str, fields: Mapping[str, object]) -> dict[str, object]: + +def event_payload(event: str, fields: Mapping[str, TelemetryValue]) -> dict[str, TelemetryValue]: """Build one structured recovery telemetry payload.""" return {"event": event, **dict(fields)} diff --git a/src/clawops/strongclaw_recovery.py b/src/clawops/strongclaw_recovery.py index b800c7e..b3abc4b 100644 --- a/src/clawops/strongclaw_recovery.py +++ b/src/clawops/strongclaw_recovery.py @@ -3,14 +3,18 @@ from __future__ import annotations import argparse +import hashlib +import io import json import pathlib import shutil import tarfile import time +from typing import cast from clawops.app_paths import strongclaw_state_dir from clawops.cli_roots import add_ignored_repo_root_alias, warn_ignored_repo_root_argument +from clawops.observability import emit_structured_log from clawops.recovery.models import BackupCreateExecution from clawops.recovery.orchestrator import create_backup_execution from clawops.recovery.policy import ( @@ -18,6 +22,7 @@ RECOVERY_PROFILES, ensure_recovery_profile, ) +from clawops.recovery.telemetry import event_payload from clawops.strongclaw_runtime import ( CommandError, resolve_home_dir, @@ -25,6 +30,16 @@ ) _OPENCLAW_VERIFY_MANIFEST_MISMATCH = "Expected exactly one backup manifest entry" +_FALLBACK_MANIFEST_PATH = ".strongclaw/backup-manifest.json" +_FALLBACK_MANIFEST_REQUIRED_FIELDS: tuple[str, ...] = ( + "profile", + "include_roots", + "exclude_roots", + "file_count", + "bytes", + "content_sha256", + "backend", +) def backups_dir(*, home_dir: pathlib.Path | None = None) -> pathlib.Path: @@ -74,8 +89,14 @@ def _write_tar_archive( state_dir: pathlib.Path, include_root: pathlib.Path, exclude_roots: tuple[pathlib.Path, ...], -) -> None: + profile: str, +) -> dict[str, object]: """Write a fallback tar archive with explicit path exclusions.""" + file_count = 0 + total_bytes = 0 + hash_summary = hashlib.sha256() + include_root_resolved = include_root.resolve() + exclude_root_strings = [root.as_posix() for root in exclude_roots] with tarfile.open(archive_path, "w:gz") as archive: for path in state_dir.rglob("*"): if path.is_symlink(): @@ -85,7 +106,39 @@ def _write_tar_archive( resolved_path = path.resolve() if any(_is_path_within(resolved_path, root) for root in exclude_roots): continue - archive.add(path, arcname=path.relative_to(include_root), recursive=False) + arcname = path.relative_to(include_root_resolved).as_posix() + archive.add(path, arcname=arcname, recursive=False) + if not path.is_file(): + continue + file_count += 1 + size = path.stat().st_size + total_bytes += size + hash_summary.update(arcname.encode("utf-8")) + hash_summary.update(b"\0") + hash_summary.update(str(size).encode("utf-8")) + hash_summary.update(b"\0") + with path.open("rb") as handle: + while True: + chunk = handle.read(65536) + if not chunk: + break + hash_summary.update(chunk) + manifest: dict[str, object] = { + "profile": profile, + "include_roots": [include_root_resolved.as_posix()], + "exclude_roots": exclude_root_strings, + "file_count": file_count, + "bytes": total_bytes, + "content_sha256": hash_summary.hexdigest(), + "backend": "tar-fallback", + } + encoded_manifest = json.dumps(manifest, sort_keys=True, separators=(",", ":")).encode( + "utf-8" + ) + manifest_info = tarfile.TarInfo(name=_FALLBACK_MANIFEST_PATH) + manifest_info.size = len(encoded_manifest) + archive.addfile(manifest_info, io.BytesIO(encoded_manifest)) + return manifest def _create_backup_result( @@ -117,6 +170,7 @@ def _create_backup_result( state_dir=state_dir, include_root=resolved_home_dir, exclude_roots=exclude_roots, + profile=selected_profile, ), safe_unlink=_safe_unlink, which=shutil.which, @@ -173,10 +227,32 @@ def verify_backup( def _verify_tar_archive(archive_path: pathlib.Path) -> pathlib.Path: - """Verify one fallback tar archive by ensuring members can be enumerated.""" + """Verify one fallback tar archive by validating its embedded manifest.""" with tarfile.open(archive_path, "r:gz") as archive: - # Simply ensure the archive can be opened and its members enumerated. - archive.getmembers() + members = archive.getmembers() + manifest_members = [member for member in members if member.name == _FALLBACK_MANIFEST_PATH] + if len(manifest_members) != 1: + raise CommandError( + f"invalid fallback backup manifest count in {archive_path}: expected 1, got {len(manifest_members)}" + ) + manifest_handle = archive.extractfile(manifest_members[0]) + if manifest_handle is None: + raise CommandError(f"fallback backup manifest is unreadable in {archive_path}") + try: + manifest_payload = json.loads(manifest_handle.read().decode("utf-8")) + except (UnicodeDecodeError, json.JSONDecodeError) as exc: + raise CommandError( + f"fallback backup manifest is invalid JSON in {archive_path}" + ) from exc + if not isinstance(manifest_payload, dict): + raise CommandError(f"fallback backup manifest payload is invalid in {archive_path}") + missing_fields = [ + field for field in _FALLBACK_MANIFEST_REQUIRED_FIELDS if field not in manifest_payload + ] + if missing_fields: + raise CommandError( + f"fallback backup manifest missing required fields in {archive_path}: {', '.join(missing_fields)}" + ) return archive_path @@ -306,46 +382,146 @@ def main(argv: list[str] | None = None) -> int: ) home_dir = resolve_home_dir(args.home_dir) if args.command == "backup-create": + create_started_ms = int(time.time() * 1000) + emit_structured_log( + "clawops.recovery.backup_create_started", + event_payload( + "clawops.recovery.backup_create_started", + { + "profile": args.profile or DEFAULT_RECOVERY_PROFILE, + "mode": "automation", + "fallback_used": False, + }, + ), + ) execution = _create_backup_result( home_dir=home_dir, profile=args.profile or DEFAULT_RECOVERY_PROFILE, allow_fallback=bool(args.allow_fallback), dry_run=bool(args.dry_run), ) + duration_ms = int(time.time() * 1000) - create_started_ms + plan_payload = execution.plan.to_payload() payload: dict[str, object] = { "ok": True, - **execution.plan.to_payload(), + **plan_payload, "dry_run": execution.dry_run, } + emit_structured_log( + "clawops.recovery.backup_plan_built", + event_payload( + "clawops.recovery.backup_plan_built", + { + "profile": execution.plan.profile, + "backend": ",".join(execution.plan.backend_candidates), + "file_count": execution.plan.estimated_file_count, + "bytes": execution.plan.estimated_bytes, + "excluded_paths_count": len(execution.plan.exclude_roots), + "result": "ok", + }, + ), + ) if execution.archive_path is not None and execution.mode is not None: payload["archive"] = str(execution.archive_path) payload["mode"] = execution.mode payload["fallback_used"] = execution.fallback_used if execution.fallback_reason is not None: payload["fallback_reason"] = execution.fallback_reason + reported_file_count = execution.plan.estimated_file_count + reported_bytes = execution.plan.estimated_bytes + if execution.manifest is not None: + payload["manifest"] = execution.manifest + manifest_file_count = execution.manifest.get("file_count") + manifest_bytes = execution.manifest.get("bytes") + if isinstance(manifest_file_count, int): + reported_file_count = manifest_file_count + payload["file_count"] = reported_file_count + if isinstance(manifest_bytes, int): + reported_bytes = manifest_bytes + payload["bytes"] = reported_bytes + emit_structured_log( + "clawops.recovery.backup_create_completed", + event_payload( + "clawops.recovery.backup_create_completed", + { + "backup_id": execution.archive_path.name, + "profile": execution.plan.profile, + "backend": execution.mode, + "mode": "operator" if args.allow_fallback else "automation", + "file_count": reported_file_count, + "bytes": reported_bytes, + "duration_ms": duration_ms, + "result": "ok", + "fallback_used": execution.fallback_used, + "excluded_paths_count": len(execution.plan.exclude_roots), + }, + ), + ) elif args.command == "backup-verify": - payload = {"ok": True, "archive": str(verify_backup(args.target, home_dir=home_dir))} + verify_started_ms = int(time.time() * 1000) + verified_archive = verify_backup(args.target, home_dir=home_dir) + payload = {"ok": True, "archive": str(verified_archive)} + emit_structured_log( + "clawops.recovery.backup_verify_completed", + event_payload( + "clawops.recovery.backup_verify_completed", + { + "backup_id": verified_archive.name, + "duration_ms": int(time.time() * 1000) - verify_started_ms, + "result": "ok", + }, + ), + ) elif args.command == "restore": + restore_started_ms = int(time.time() * 1000) destination = ( pathlib.Path(args.destination).expanduser().resolve() if args.destination is not None else home_dir.parent / ".openclaw-restore" ) + archive_path = pathlib.Path(args.archive).expanduser().resolve() payload = { "ok": True, "destination": str( restore_backup( - pathlib.Path(args.archive).expanduser().resolve(), + archive_path, destination=destination, home_dir=home_dir, ) ), } + emit_structured_log( + "clawops.recovery.restore_completed", + event_payload( + "clawops.recovery.restore_completed", + { + "backup_id": archive_path.name, + "duration_ms": int(time.time() * 1000) - restore_started_ms, + "result": "ok", + }, + ), + ) elif args.command == "prune-retention": + prune_started_ms = int(time.time() * 1000) payload = prune_retention( home_dir=home_dir, include_shared_tmp=bool(args.include_shared_tmp), ) + deleted_paths = payload.get("deleted") + deleted_count = ( + len(cast(list[object], deleted_paths)) if isinstance(deleted_paths, list) else 0 + ) + emit_structured_log( + "clawops.recovery.backup_prune_completed", + event_payload( + "clawops.recovery.backup_prune_completed", + { + "duration_ms": int(time.time() * 1000) - prune_started_ms, + "result": "ok", + "file_count": deleted_count, + }, + ), + ) else: payload = rotation_guidance() print(json.dumps(payload, indent=2, sort_keys=True)) diff --git a/tests/suites/unit/clawops/test_strongclaw_recovery.py b/tests/suites/unit/clawops/test_strongclaw_recovery.py index cb24a29..8d92aef 100644 --- a/tests/suites/unit/clawops/test_strongclaw_recovery.py +++ b/tests/suites/unit/clawops/test_strongclaw_recovery.py @@ -16,6 +16,8 @@ from clawops.strongclaw_runtime import CommandError, ExecResult from tests.plugins.infrastructure.context import TestContext +FALLBACK_MANIFEST_PATH = ".strongclaw/backup-manifest.json" + def _init_openclaw_home(home_dir: Path) -> Path: """Create a minimal OpenClaw home tree for recovery tests.""" @@ -29,10 +31,28 @@ def _init_openclaw_home(home_dir: Path) -> Path: def _write_payload_member(archive_path: Path, member_name: str) -> None: """Write one tar archive containing a single regular-file member.""" payload = b"unsafe\n" + manifest_payload = json.dumps( + { + "profile": "control-plane", + "include_roots": ["/tmp/home"], + "exclude_roots": [ + "/tmp/home/.local/state/strongclaw/backups", + "/tmp/home/.openclaw/backups", + ], + "file_count": 1, + "bytes": len(payload), + "content_sha256": "dummy", + "backend": "tar-fallback", + }, + sort_keys=True, + ).encode("utf-8") member = tarfile.TarInfo(member_name) member.size = len(payload) with tarfile.open(archive_path, "w:gz") as archive: archive.addfile(member, io.BytesIO(payload)) + manifest_member = tarfile.TarInfo(FALLBACK_MANIFEST_PATH) + manifest_member.size = len(manifest_payload) + archive.addfile(manifest_member, io.BytesIO(manifest_payload)) def _missing_tool(_command: str, _path: str | None = None) -> str | None: @@ -68,6 +88,11 @@ def test_backup_create_cli_reports_tar_fallback_and_round_trips( assert exit_code == 0 assert payload["mode"] == "tar-fallback" + assert payload["manifest"]["backend"] == "tar-fallback" + assert payload["manifest"]["profile"] == "control-plane" + assert payload["manifest"]["file_count"] >= 1 + assert payload["manifest"]["bytes"] >= 1 + assert payload["manifest"]["content_sha256"] assert archive_path.is_file() assert strongclaw_recovery.verify_backup("latest", home_dir=home_dir) == archive_path @@ -255,6 +280,24 @@ def test_restore_backup_rejects_link_members( link_member.linkname = "target" with tarfile.open(archive_path, "w:gz") as archive: archive.addfile(link_member) + manifest_payload = json.dumps( + { + "profile": "control-plane", + "include_roots": ["/tmp/home"], + "exclude_roots": [ + "/tmp/home/.local/state/strongclaw/backups", + "/tmp/home/.openclaw/backups", + ], + "file_count": 0, + "bytes": 0, + "content_sha256": "dummy", + "backend": "tar-fallback", + }, + sort_keys=True, + ).encode("utf-8") + manifest_member = tarfile.TarInfo(FALLBACK_MANIFEST_PATH) + manifest_member.size = len(manifest_payload) + archive.addfile(manifest_member, io.BytesIO(manifest_payload)) test_context.patch.patch_object( strongclaw_recovery.shutil, "which", @@ -382,3 +425,66 @@ def _run_command(command: list[str], **_kwargs: object) -> ExecResult: ) assert strongclaw_recovery.verify_backup(archive_path, home_dir=home_dir) == archive_path + + +def test_verify_backup_fails_when_fallback_manifest_is_missing( + tmp_path: Path, + test_context: TestContext, +) -> None: + """Fallback archive verification should fail when the embedded manifest is absent.""" + archive_path = tmp_path / "missing-manifest.tar.gz" + payload = b"data\n" + with tarfile.open(archive_path, "w:gz") as archive: + member = tarfile.TarInfo(name=".openclaw/config.json") + member.size = len(payload) + archive.addfile(member, io.BytesIO(payload)) + + test_context.patch.patch_object(strongclaw_recovery.shutil, "which", new=_missing_tool) + with pytest.raises(CommandError, match="invalid fallback backup manifest count"): + strongclaw_recovery.verify_backup(archive_path, home_dir=tmp_path / "home") + + +def test_recovery_emits_structured_events_for_backup_and_prune( + tmp_path: Path, + capsys: pytest.CaptureFixture[str], + test_context: TestContext, +) -> None: + """Recovery CLI should emit structured events with required fields when enabled.""" + home_dir = tmp_path / "home" + _init_openclaw_home(home_dir) + test_context.patch.patch_object(strongclaw_recovery.shutil, "which", new=_missing_tool) + previous_structured_logs = os.environ.get("CLAWOPS_STRUCTURED_LOGS") + try: + os.environ["CLAWOPS_STRUCTURED_LOGS"] = "1" + strongclaw_recovery.main(["--home-dir", str(home_dir), "backup-create"]) + strongclaw_recovery.main(["--home-dir", str(home_dir), "backup-verify", "latest"]) + strongclaw_recovery.main(["--home-dir", str(home_dir), "prune-retention"]) + finally: + if previous_structured_logs is None: + os.environ.pop("CLAWOPS_STRUCTURED_LOGS", None) + else: + os.environ["CLAWOPS_STRUCTURED_LOGS"] = previous_structured_logs + + stderr_lines = [line for line in capsys.readouterr().err.splitlines() if line.strip()] + events = [json.loads(line) for line in stderr_lines] + by_event: dict[str, dict[str, object]] = {} + for event in events: + event_name = event.get("event") + if isinstance(event_name, str): + by_event[event_name] = event + + plan_event = by_event["clawops.recovery.backup_plan_built"] + create_event = by_event["clawops.recovery.backup_create_completed"] + verify_event = by_event["clawops.recovery.backup_verify_completed"] + prune_event = by_event["clawops.recovery.backup_prune_completed"] + + assert plan_event["profile"] == "control-plane" + assert plan_event["excluded_paths_count"] == 2 + assert create_event["backend"] == "tar-fallback" + assert create_event["result"] == "ok" + assert isinstance(create_event["file_count"], int) + assert create_event["file_count"] >= 1 + assert isinstance(create_event["bytes"], int) + assert create_event["bytes"] >= 1 + assert verify_event["result"] == "ok" + assert prune_event["result"] == "ok"