Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions autobot-slm-backend/api/code_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
CodeSyncStatusResponse,
CodeVersionNotification,
CodeVersionNotificationResponse,
FileDriftReport,
FleetSyncJobStatus,
FleetSyncNodeStatus,
FleetSyncRequest,
Expand All @@ -50,6 +51,12 @@
from services.auth import get_current_user
from services.code_distributor import get_code_distributor
from services.database import get_db
from services.drift_checker import (
ALLOWED_COMPONENTS,
build_drift_report,
get_default_deployed_dir,
get_default_source_dir,
)
from services.fleet_sync_guard import assert_no_running_sync, fleet_sync_lock
from services.git_tracker import DEFAULT_BRANCH, DEFAULT_REPO_PATH, get_git_tracker
from services.playbook_executor import get_playbook_executor
Expand Down Expand Up @@ -395,6 +402,56 @@ async def get_sync_status(
)


@router.get("/drift", response_model=FileDriftReport)
async def get_file_drift(
    _: Annotated[dict, Depends(get_current_user)],
    component: str = "autobot-slm-backend",
) -> FileDriftReport:
    """
    Compare file checksums between code_source and the deployed directory (Issue #2834).

    Detects files that have drifted because of manual patches or incomplete
    Ansible deploys. Only Python, config, and script files are compared;
    .pyc, __pycache__, venv, and .git directories are always excluded.

    Query params:
        component: Sub-directory to compare (default: autobot-slm-backend).
            Must be one of the allowed components (Issue #3427).

    Returns:
        A FileDriftReport with the list of drifted files and their checksums.
    """
    # Allow-list check first — rejects anything that could be used for
    # path traversal (Issue #3427).
    if component not in ALLOWED_COMPONENTS:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid component '{component}'. Must be one of: {sorted(ALLOWED_COMPONENTS)}",
        )

    # A missing source component directory is a server-side configuration
    # problem, so it surfaces as a 500 rather than a client error.
    try:
        source_dir = get_default_source_dir(component)
    except ValueError as exc:
        raise HTTPException(status_code=500, detail=str(exc)) from exc

    deployed_dir = get_default_deployed_dir(component)

    logger.info(
        "drift check: comparing source=%s deployed=%s", source_dir, deployed_dir
    )

    # Checksumming walks the filesystem and hashes every file; run it on the
    # default thread pool so the event loop stays responsive.
    loop = asyncio.get_running_loop()
    raw_report = await loop.run_in_executor(
        None,
        build_drift_report,
        source_dir,
        deployed_dir,
    )

    logger.info(
        "drift check: %d drifted files out of %d compared",
        len(raw_report["drifted_files"]),
        raw_report["total_compared"],
    )

    return FileDriftReport(**raw_report)


@router.post("/refresh", response_model=CodeSyncRefreshResponse)
async def refresh_version(
db: Annotated[AsyncSession, Depends(get_db)],
Expand Down
22 changes: 21 additions & 1 deletion autobot-slm-backend/models/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Literal, Optional

from pydantic import BaseModel, Field, field_validator, model_validator

Expand Down Expand Up @@ -1557,6 +1557,26 @@ class CodeSyncRefreshResponse(BaseModel):
has_update: bool = False


class DriftedFile(BaseModel):
    """A file whose checksum differs between code_source and deployed (Issue #2834)."""

    # POSIX-style path relative to the compared component directory.
    path: str
    # SHA-256 hex digest of the code_source copy; None when the file exists
    # only in the deployed tree.
    source_checksum: Optional[str] = None
    # SHA-256 hex digest of the deployed copy; None when the file exists
    # only in code_source.
    deployed_checksum: Optional[str] = None
    # "modified": present in both trees with differing checksums;
    # "source_only" / "deployed_only": present in exactly one tree.
    status: Literal["modified", "source_only", "deployed_only"]


class FileDriftReport(BaseModel):
    """Result of comparing code_source vs deployed file checksums (Issue #2834)."""

    # Absolute path of the authoritative code_source directory compared.
    source_dir: str
    # Absolute path of the deployed component directory compared.
    deployed_dir: str
    # Files whose checksums differ or that exist in only one tree.
    drifted_files: List[DriftedFile]
    # Total number of distinct files examined across both trees.
    total_compared: int
    # True when drifted_files is non-empty.
    drift_detected: bool
    # UTC ISO-8601 timestamp of when the comparison ran.
    checked_at: str


class PendingNodeResponse(BaseModel):
"""Node that needs code update."""

Expand Down
226 changes: 226 additions & 0 deletions autobot-slm-backend/services/drift_checker.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
# AutoBot - AI-Powered Automation Platform
# Copyright (c) 2025 mrveiss
# Author: mrveiss
"""
Drift Checker Service (Issue #2834).

Compares file checksums between the code_source directory and the deployed
directory to detect files that have been manually patched or missed by Ansible.
"""

import hashlib
import logging
import os
from datetime import datetime, timezone
from pathlib import Path
from typing import Dict, List, Tuple

# Module-level logger, named after this module per project convention.
logger = logging.getLogger(__name__)

# File extensions that are meaningful to compare (code, config, scripts).
# Anything else — compiled artifacts, binaries, data — is ignored.
_INCLUDE_EXTENSIONS = {".py", ".cfg", ".ini", ".toml", ".yaml", ".yml", ".sh", ".txt"}

# Permitted component names for the /drift endpoint (Issue #3427).
# Only these sub-directories may be requested to prevent path traversal.
ALLOWED_COMPONENTS = frozenset(
    {
        "autobot-slm-backend",
        "autobot-slm-frontend",
        "autobot-backend",
        "autobot-frontend",
    }
)

# Directory names to skip entirely during traversal (caches, VCS metadata,
# virtual environments, and build output are never deployed as-is).
_SKIP_DIRS = {
    "__pycache__",
    ".git",
    "venv",
    ".venv",
    "node_modules",
    ".mypy_cache",
    ".ruff_cache",
    "dist",
    "build",
}


def _file_checksum(path: Path, block_size: int = 65536) -> str:
"""Return the SHA-256 hex digest of a file.

Reads in blocks to avoid loading large files into memory at once.

Args:
path: Absolute path to the file.
block_size: Read chunk size in bytes.

Returns:
Lowercase hex SHA-256 digest string.
"""
h = hashlib.sha256()
with open(path, "rb") as fh:
while chunk := fh.read(block_size):
h.update(chunk)
return h.hexdigest()


def _collect_checksums(root: Path) -> Dict[str, str]:
    """Walk *root* and return a mapping of relative-path → SHA-256 checksum.

    Only files with extensions in ``_INCLUDE_EXTENSIONS`` are included.
    Directories in ``_SKIP_DIRS`` are pruned from the walk.

    Args:
        root: Directory to scan.

    Returns:
        Dict mapping POSIX-style relative path strings to hex digest strings.
    """
    checksums: Dict[str, str] = {}
    for dirpath, dirnames, filenames in os.walk(root):
        # Prune skip dirs in-place so os.walk does not descend into them.
        dirnames[:] = [d for d in dirnames if d not in _SKIP_DIRS]

        for filename in filenames:
            filepath = Path(dirpath) / filename
            if filepath.suffix not in _INCLUDE_EXTENSIONS:
                continue
            try:
                rel = filepath.relative_to(root).as_posix()
                checksums[rel] = _file_checksum(filepath)
            # IOError has been an alias of OSError since Python 3.3, so
            # catching OSError alone covers both.  Unreadable files
            # (permissions, races with an in-flight deploy) are skipped
            # rather than failing the whole scan.
            except OSError as exc:
                logger.warning("drift_checker: cannot read %s: %s", filepath, exc)

    return checksums


def compute_drift(
    source_dir: str,
    deployed_dir: str,
) -> Tuple[List[dict], int]:
    """Compare file checksums between *source_dir* and *deployed_dir*.

    Files present in both directories with identical checksums are omitted
    from the result; everything else is reported with one of three statuses:

        "modified"      – present in both trees, checksums differ
        "source_only"   – present only under *source_dir*
        "deployed_only" – present only under *deployed_dir*

    Each returned dict carries ``path`` (POSIX relative path),
    ``source_checksum``, ``deployed_checksum`` (either may be None), and
    ``status``.

    Args:
        source_dir: Absolute path to the authoritative code source directory.
        deployed_dir: Absolute path to the currently deployed directory.

    Returns:
        Tuple of (list_of_drift_dicts, total_files_compared).  Both elements
        are empty/zero when either directory is missing.
    """
    src_root = Path(source_dir)
    dep_root = Path(deployed_dir)

    # A missing directory yields an empty report instead of raising — the
    # caller logs and surfaces "nothing compared".
    if not src_root.is_dir():
        logger.warning("drift_checker: source_dir does not exist: %s", source_dir)
        return [], 0
    if not dep_root.is_dir():
        logger.warning("drift_checker: deployed_dir does not exist: %s", deployed_dir)
        return [], 0

    source_map = _collect_checksums(src_root)
    deployed_map = _collect_checksums(dep_root)

    # Union of relative paths seen in either tree; sorted for stable output.
    union = set(source_map) | set(deployed_map)
    drift_entries: List[dict] = []

    for rel in sorted(union):
        src_digest = source_map.get(rel)
        dep_digest = deployed_map.get(rel)

        if src_digest == dep_digest:
            # Present in both trees with matching content — no drift.
            continue

        if src_digest is None:
            state = "deployed_only"
        elif dep_digest is None:
            state = "source_only"
        else:
            state = "modified"

        drift_entries.append(
            {
                "path": rel,
                "source_checksum": src_digest,
                "deployed_checksum": dep_digest,
                "status": state,
            }
        )

    return drift_entries, len(union)


def build_drift_report(
    source_dir: str,
    deployed_dir: str,
) -> dict:
    """Build the full drift report dict for the API response (Issue #2834).

    Args:
        source_dir: Path to code_source directory.
        deployed_dir: Path to deployed component directory.

    Returns:
        Dict matching the ``FileDriftReport`` schema.
    """
    drifted_files, total_compared = compute_drift(source_dir, deployed_dir)

    # Timestamp the comparison in UTC so reports from different nodes are
    # directly comparable.
    checked_at = datetime.now(timezone.utc).isoformat()

    return {
        "source_dir": source_dir,
        "deployed_dir": deployed_dir,
        "drifted_files": drifted_files,
        "total_compared": total_compared,
        "drift_detected": bool(drifted_files),
        "checked_at": checked_at,
    }


def get_default_deployed_dir(component: str = "autobot-slm-backend") -> str:
    """Return the expected deployed path for *component* under /opt/autobot.

    The deployed root is taken from the ``SLM_DEPLOYED_ROOT`` environment
    variable so it can be relocated without code changes.

    Args:
        component: Sub-directory name under the deployed root.

    Returns:
        Absolute path string for the deployed component directory.
    """
    root = Path(os.environ.get("SLM_DEPLOYED_ROOT", "/opt/autobot"))
    return str(root / component)


def get_default_source_dir(component: str = "autobot-slm-backend") -> str:
    """Return the code_source sub-directory for *component*.

    Reads ``SLM_REPO_PATH`` from the environment (same var used by git_tracker).

    Args:
        component: Sub-directory name inside the code_source repository.

    Returns:
        Absolute path string for the source directory to compare against.

    Raises:
        ValueError: If the component sub-directory does not exist under the
            code_source root.  There is no fallback to the repository root;
            callers (the /drift endpoint) map this to an HTTP 500.
    """
    source_root = os.environ.get("SLM_REPO_PATH", "/opt/autobot/code_source")
    candidate = Path(source_root) / component
    if not candidate.is_dir():
        raise ValueError(
            f"drift_checker: source component directory does not exist: {candidate}"
        )
    return str(candidate)
Loading
Loading