From 01b32e6e72a6c58c7efd37f21103f212c113cbfb Mon Sep 17 00:00:00 2001 From: openhands Date: Fri, 22 May 2026 12:02:24 -0600 Subject: [PATCH 1/5] feat: add bash_events_retention_seconds config for automatic event cleanup Add a new Config field bash_events_retention_seconds (default None = keep forever) that triggers a background asyncio task running in the app lifespan to periodically delete bash event files older than the configured window. Changes: - config.py: add bash_events_retention_seconds: int | None field - bash_service.py: add delete_events_older_than(cutoff) for efficient age-based file deletion using the YYYYMMDDHHMMSS filename prefix for early-exit sorted scanning, and run_retention_cleanup_loop() as the cancellable background coroutine - api.py: start/cancel the retention task in api_lifespan using the existing suppress(CancelledError) pattern - test_bash_service.py: unit tests for delete_events_older_than (empty dir, removes old, keeps new, correct count) and an integration test for the loop using a short interval_seconds to avoid 60s wait Co-authored-by: openhands --- .../openhands/agent_server/api.py | 21 ++++- .../openhands/agent_server/bash_service.py | 58 +++++++++++- .../openhands/agent_server/config.py | 9 ++ tests/agent_server/test_bash_service.py | 90 +++++++++++++++++++ 4 files changed, 175 insertions(+), 3 deletions(-) diff --git a/openhands-agent-server/openhands/agent_server/api.py b/openhands-agent-server/openhands/agent_server/api.py index 070080ed32..e3589904b9 100644 --- a/openhands-agent-server/openhands/agent_server/api.py +++ b/openhands-agent-server/openhands/agent_server/api.py @@ -3,7 +3,7 @@ import tempfile import traceback from collections.abc import AsyncIterator, Sequence -from contextlib import asynccontextmanager +from contextlib import asynccontextmanager, suppress from pathlib import Path from typing import Any from urllib.parse import urlparse @@ -17,6 +17,7 @@ from openhands.agent_server.auth_router import auth_router from openhands.agent_server.bash_router import bash_router +from openhands.agent_server.bash_service import get_default_bash_event_service from openhands.agent_server.cloud_proxy_router import cloud_proxy_router from openhands.agent_server.config import ( Config, @@ -185,12 +186,30 @@ async def start_tool_preload_service(): mark_initialization_complete() logger.info("Server initialization complete - ready to serve requests") + config = get_default_config() + retention_task: asyncio.Task | None = None + if config.bash_events_retention_seconds is not None: + retention_task = asyncio.create_task( + get_default_bash_event_service().run_retention_cleanup_loop( + config.bash_events_retention_seconds + ) + ) + logger.info( + "Bash events retention cleanup started (retention: %ds)", + config.bash_events_retention_seconds, + ) + async with service: # Store the initialized service in app state for dependency injection api.state.conversation_service = service try: yield finally: + if retention_task is not None: + retention_task.cancel() + with suppress(asyncio.CancelledError): + await retention_task + # Define async functions for stopping each service async def stop_vscode_service(): if vscode_service is not None: diff --git a/openhands-agent-server/openhands/agent_server/bash_service.py b/openhands-agent-server/openhands/agent_server/bash_service.py index 4df7439012..c10d050d3c 100644 --- a/openhands-agent-server/openhands/agent_server/bash_service.py +++ b/openhands-agent-server/openhands/agent_server/bash_service.py @@ -4,7 +4,7 @@ import os import signal from dataclasses import dataclass, field -from datetime import datetime +from datetime import datetime, timedelta from pathlib import Path from uuid import UUID @@ -18,7 +18,7 @@ ) from openhands.agent_server.pub_sub import PubSub, Subscriber from openhands.sdk.logger import get_logger -from openhands.sdk.utils import sanitized_env +from openhands.sdk.utils import sanitized_env, utc_now logger = get_logger(__name__) @@ -344,6 +344,60 @@ async def read_stream(stream, is_stderr=False): self._save_event_to_file(error_output) await self._pub_sub(error_output) + async def delete_events_older_than(self, cutoff: datetime) -> int: + """Delete bash event files with a recorded timestamp older than ``cutoff``. + + File names are prefixed with ``YYYYMMDDHHMMSS`` in ascending sort order, + so scanning stops as soon as a file at or after the cutoff is reached. + + Returns: + int: The number of event files deleted. + """ + cutoff_str = self._timestamp_to_str(cutoff) + files = self._get_event_files_by_pattern("*") # ascending chronological order + count = 0 + for path in files: + if path.name >= cutoff_str: + break # remaining files are at or newer than cutoff + try: + path.unlink() + count += 1 + except Exception as e: + logger.warning("Failed to delete bash event file %s: %s", path, e) + if count: + logger.info( + "Deleted %d bash event file(s) older than %s", count, cutoff_str + ) + return count + + async def run_retention_cleanup_loop( + self, + retention_seconds: int, + interval_seconds: float | None = None, + ) -> None: + """Periodically purge bash event files older than ``retention_seconds``. + + Runs until cancelled (e.g. during application shutdown). + + Args: + retention_seconds: Age threshold in seconds; older files are deleted. + interval_seconds: How often to run the cleanup. Defaults to + ``max(60, retention_seconds / 2)``. Pass a smaller value in + tests to avoid long waits. + """ + interval = ( + interval_seconds + if interval_seconds is not None + else max(60.0, retention_seconds / 2) + ) + while True: + await asyncio.sleep(interval) + try: + cutoff = utc_now() - timedelta(seconds=retention_seconds) + await self.delete_events_older_than(cutoff) + except Exception as e: + logger.warning("Bash events retention cleanup error: %s", e) + async def subscribe_to_events(self, subscriber: Subscriber[BashEventBase]) -> UUID: """Subscribe to bash events. diff --git a/openhands-agent-server/openhands/agent_server/config.py b/openhands-agent-server/openhands/agent_server/config.py index 87d1a46be2..151ca66141 100644 --- a/openhands-agent-server/openhands/agent_server/config.py +++ b/openhands-agent-server/openhands/agent_server/config.py @@ -139,6 +139,15 @@ class Config(BaseModel): "Defaults to 'workspace/bash_events'." ), ) + bash_events_retention_seconds: int | None = Field( + default=None, + gt=0, + description=( + "How long bash event files are retained on disk, in seconds. " + "A background task purges events older than this window on a " + "rolling basis. None (default) retains events indefinitely." + ), + ) static_files_path: Path | None = Field( default=None, description=( diff --git a/tests/agent_server/test_bash_service.py b/tests/agent_server/test_bash_service.py index 2228f10a34..db54fd7a4c 100644 --- a/tests/agent_server/test_bash_service.py +++ b/tests/agent_server/test_bash_service.py @@ -1,8 +1,10 @@ """Tests for bash_service.py.""" import asyncio +import contextlib import time from collections.abc import AsyncIterator +from datetime import UTC, datetime from pathlib import Path from uuid import UUID @@ -14,6 +16,7 @@ from openhands.agent_server import bash_router as bash_router_module from openhands.agent_server.bash_service import BashEventService from openhands.agent_server.config import Config +from openhands.agent_server.models import BashCommand from openhands.agent_server.server_details_router import ( mark_initialization_complete, server_details_router, @@ -80,3 +83,90 @@ async def test_bash_timeout_runs_sigterm_trap( await asyncio.sleep(0.2) # let the trap's filesystem write land assert marker.exists(), "SIGTERM trap did not run; cleanup skipped." + + +# --------------------------------------------------------------------------- +# delete_events_older_than +# --------------------------------------------------------------------------- + +_OLD = datetime(2020, 1, 1, tzinfo=UTC) +_NEW = datetime(2022, 1, 1, tzinfo=UTC) +_CUTOFF = datetime(2021, 1, 1, tzinfo=UTC) + + +async def test_delete_events_older_than_removes_old_keeps_new(tmp_path: Path): + service = BashEventService(bash_events_dir=tmp_path / "bash_events") + + old_cmd = BashCommand(command="echo old", timestamp=_OLD) + new_cmd = BashCommand(command="echo new", timestamp=_NEW) + service._save_event_to_file(old_cmd) + service._save_event_to_file(new_cmd) + + count = await service.delete_events_older_than(_CUTOFF) + + assert count == 1 + remaining = service._get_event_files_by_pattern("*") + assert len(remaining) == 1 + assert new_cmd.id.hex in remaining[0].name + + +async def test_delete_events_older_than_empty_directory(tmp_path: Path): + service = BashEventService(bash_events_dir=tmp_path / "bash_events") + count = await service.delete_events_older_than(_CUTOFF) + assert count == 0 + + +async def test_delete_events_older_than_all_newer_are_skipped(tmp_path: Path): + service = BashEventService(bash_events_dir=tmp_path / "bash_events") + + new_cmd = BashCommand(command="echo new", timestamp=_NEW) + service._save_event_to_file(new_cmd) + + count = await service.delete_events_older_than(_CUTOFF) + + assert count == 0 + assert len(service._get_event_files_by_pattern("*")) == 1 + + +async def test_delete_events_older_than_returns_correct_count(tmp_path: Path): + service = BashEventService(bash_events_dir=tmp_path / "bash_events") + + for i in range(3): + service._save_event_to_file(BashCommand(command=f"echo {i}", timestamp=_OLD)) + service._save_event_to_file(BashCommand(command="echo new", timestamp=_NEW)) + + count = await service.delete_events_older_than(_CUTOFF) + + assert count == 3 + assert len(service._get_event_files_by_pattern("*")) == 1 + + +# --------------------------------------------------------------------------- +# run_retention_cleanup_loop +# --------------------------------------------------------------------------- + + +@pytest.mark.timeout(5) +async def test_run_retention_cleanup_loop_purges_old_events(tmp_path: Path): + service = BashEventService(bash_events_dir=tmp_path / "bash_events") + + # Write an event whose recorded timestamp is well in the past. + service._save_event_to_file(BashCommand(command="echo old", timestamp=_OLD)) + assert len(service._get_event_files_by_pattern("*")) == 1 + + # Run the loop with a 1-second retention window and a 50 ms tick so + # the test doesn't have to wait for the default 60-second interval. + task = asyncio.create_task( + service.run_retention_cleanup_loop(retention_seconds=1, interval_seconds=0.05) + ) + try: + # Give the loop time to fire at least once. + await asyncio.sleep(0.15) + finally: + task.cancel() + with contextlib.suppress(asyncio.CancelledError): + await task + + assert len(service._get_event_files_by_pattern("*")) == 0, ( + "Old event file should have been purged by the retention loop" + ) From 77c0e26792a00c1276b0e21545adb934174e6193 Mon Sep 17 00:00:00 2001 From: openhands Date: Sat, 23 May 2026 08:05:46 -0600 Subject: [PATCH 2/5] refactor: address review feedback on bash events retention - Make delete_events_older_than a plain sync method; all its operations are blocking filesystem I/O with no awaits - In run_retention_cleanup_loop, dispatch blocking work to a thread via asyncio.to_thread so the event loop is never stalled during glob/sort/unlink of a large bash_events directory - Move retention task creation inside 'async with service:' so the task is never spawned if service __aenter__ fails, eliminating the orphaned-task edge case - Run cleanup immediately on loop entry (before the first sleep) so files accumulated across a server restart are purged right away - Update tests: delete_events_older_than unit tests are now plain sync functions; loop test still async via asyncio.to_thread path Co-authored-by: openhands --- .../openhands/agent_server/api.py | 27 ++++++++++--------- .../openhands/agent_server/bash_service.py | 18 ++++++++++--- tests/agent_server/test_bash_service.py | 16 +++++------ 3 files changed, 36 insertions(+), 25 deletions(-) diff --git a/openhands-agent-server/openhands/agent_server/api.py b/openhands-agent-server/openhands/agent_server/api.py index e3589904b9..80a3031b82 100644 --- a/openhands-agent-server/openhands/agent_server/api.py +++ b/openhands-agent-server/openhands/agent_server/api.py @@ -186,22 +186,23 @@ async def start_tool_preload_service(): mark_initialization_complete() logger.info("Server initialization complete - ready to serve requests") - config = get_default_config() - retention_task: asyncio.Task | None = None - if config.bash_events_retention_seconds is not None: - retention_task = asyncio.create_task( - get_default_bash_event_service().run_retention_cleanup_loop( - config.bash_events_retention_seconds - ) - ) - logger.info( - "Bash events retention cleanup started (retention: %ds)", - config.bash_events_retention_seconds, - ) - async with service: # Store the initialized service in app state for dependency injection api.state.conversation_service = service + + config = get_default_config() + retention_task: asyncio.Task | None = None + if config.bash_events_retention_seconds is not None: + retention_task = asyncio.create_task( + get_default_bash_event_service().run_retention_cleanup_loop( + config.bash_events_retention_seconds + ) + ) + logger.info( + "Bash events retention cleanup started (retention: %ds)", + config.bash_events_retention_seconds, + ) + try: yield finally: diff --git a/openhands-agent-server/openhands/agent_server/bash_service.py b/openhands-agent-server/openhands/agent_server/bash_service.py index c10d050d3c..209b4e4a6e 100644 --- a/openhands-agent-server/openhands/agent_server/bash_service.py +++ b/openhands-agent-server/openhands/agent_server/bash_service.py @@ -344,9 +344,14 @@ async def read_stream(stream, is_stderr=False): self._save_event_to_file(error_output) await self._pub_sub(error_output) - async def delete_events_older_than(self, cutoff: datetime) -> int: + def delete_events_older_than(self, cutoff: datetime) -> int: """Delete bash event files with a recorded timestamp older than ``cutoff``. + This is a synchronous method — all operations are blocking filesystem + I/O. Callers on the asyncio event loop should use + ``await asyncio.to_thread(service.delete_events_older_than, cutoff)`` + to avoid stalling the loop. + File names are prefixed with ``YYYYMMDDHHMMSS`` in ascending sort order, so scanning stops as soon as a file at or after the cutoff is reached. @@ -377,7 +382,12 @@ async def run_retention_cleanup_loop( ) -> None: """Periodically purge bash event files older than ``retention_seconds``. - Runs until cancelled (e.g. during application shutdown). + Runs until cancelled (e.g. during application shutdown). Cleanup runs + immediately on entry so that files accumulated across a server restart + are purged without waiting for the first interval to elapse. + + Blocking filesystem work is dispatched to a thread via + ``asyncio.to_thread`` to keep the event loop free. Args: retention_seconds: Age threshold in seconds; older files are deleted. @@ -391,12 +401,12 @@ async def run_retention_cleanup_loop( else max(60.0, retention_seconds / 2) ) while True: - await asyncio.sleep(interval) try: cutoff = utc_now() - timedelta(seconds=retention_seconds) - await self.delete_events_older_than(cutoff) + await asyncio.to_thread(self.delete_events_older_than, cutoff) except Exception as e: logger.warning("Bash events retention cleanup error: %s", e) + await asyncio.sleep(interval) async def subscribe_to_events(self, subscriber: Subscriber[BashEventBase]) -> UUID: """Subscribe to bash events. diff --git a/tests/agent_server/test_bash_service.py b/tests/agent_server/test_bash_service.py index db54fd7a4c..033de166db 100644 --- a/tests/agent_server/test_bash_service.py +++ b/tests/agent_server/test_bash_service.py @@ -94,7 +94,7 @@ async def test_bash_timeout_runs_sigterm_trap( _CUTOFF = datetime(2021, 1, 1, tzinfo=UTC) -async def test_delete_events_older_than_removes_old_keeps_new(tmp_path: Path): +def test_delete_events_older_than_removes_old_keeps_new(tmp_path: Path): service = BashEventService(bash_events_dir=tmp_path / "bash_events") old_cmd = BashCommand(command="echo old", timestamp=_OLD) @@ -102,7 +102,7 @@ async def test_delete_events_older_than_removes_old_keeps_new(tmp_path: Path): service._save_event_to_file(old_cmd) service._save_event_to_file(new_cmd) - count = await service.delete_events_older_than(_CUTOFF) + count = service.delete_events_older_than(_CUTOFF) assert count == 1 remaining = service._get_event_files_by_pattern("*") @@ -110,32 +110,32 @@ async def test_delete_events_older_than_removes_old_keeps_new(tmp_path: Path): assert new_cmd.id.hex in remaining[0].name -async def test_delete_events_older_than_empty_directory(tmp_path: Path): +def test_delete_events_older_than_empty_directory(tmp_path: Path): service = BashEventService(bash_events_dir=tmp_path / "bash_events") - count = await service.delete_events_older_than(_CUTOFF) + count = service.delete_events_older_than(_CUTOFF) assert count == 0 -async def test_delete_events_older_than_all_newer_are_skipped(tmp_path: Path): +def test_delete_events_older_than_all_newer_are_skipped(tmp_path: Path): service = BashEventService(bash_events_dir=tmp_path / "bash_events") new_cmd = BashCommand(command="echo new", timestamp=_NEW) service._save_event_to_file(new_cmd) - count = await service.delete_events_older_than(_CUTOFF) + count = service.delete_events_older_than(_CUTOFF) assert count == 0 assert len(service._get_event_files_by_pattern("*")) == 1 -async def test_delete_events_older_than_returns_correct_count(tmp_path: Path): +def test_delete_events_older_than_returns_correct_count(tmp_path: Path): service = BashEventService(bash_events_dir=tmp_path / "bash_events") for i in range(3): service._save_event_to_file(BashCommand(command=f"echo {i}", timestamp=_OLD)) service._save_event_to_file(BashCommand(command="echo new", timestamp=_NEW)) - count = await service.delete_events_older_than(_CUTOFF) + count = service.delete_events_older_than(_CUTOFF) assert count == 3 assert len(service._get_event_files_by_pattern("*")) == 1 From be4dada705a09b4b0e5a279e9d83c0c8dd1fda20 Mon Sep 17 00:00:00 2001 From: openhands Date: Sat, 23 May 2026 15:33:20 -0600 Subject: [PATCH 3/5] fix: address second round of review feedback on bash events retention - config.py: extend bash_events_retention_seconds description with the operator warning about setting retention higher than the longest command timeout; operators reading the env var docs won't see the PR body - bash_service.py: add brief asyncio.sleep(min(interval, 60)) after a cleanup error to prevent continuous warning log flooding on persistent failures such as permission errors or a full disk - api.py: read config from api.state.config instead of get_default_config() so that create_app(Config(bash_events_retention_seconds=N)) correctly starts the retention task rather than silently falling back to the process-global environment config Co-authored-by: openhands --- openhands-agent-server/openhands/agent_server/api.py | 2 +- .../openhands/agent_server/bash_service.py | 4 ++++ openhands-agent-server/openhands/agent_server/config.py | 6 +++++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/openhands-agent-server/openhands/agent_server/api.py b/openhands-agent-server/openhands/agent_server/api.py index 80a3031b82..2040ae84c0 100644 --- a/openhands-agent-server/openhands/agent_server/api.py +++ b/openhands-agent-server/openhands/agent_server/api.py @@ -190,7 +190,7 @@ async def start_tool_preload_service(): # Store the initialized service in app state for dependency injection api.state.conversation_service = service - config = get_default_config() + config = api.state.config retention_task: asyncio.Task | None = None if config.bash_events_retention_seconds is not None: retention_task = asyncio.create_task( diff --git a/openhands-agent-server/openhands/agent_server/bash_service.py b/openhands-agent-server/openhands/agent_server/bash_service.py index 209b4e4a6e..6853a1056c 100644 --- a/openhands-agent-server/openhands/agent_server/bash_service.py +++ b/openhands-agent-server/openhands/agent_server/bash_service.py @@ -406,6 +406,10 @@ async def run_retention_cleanup_loop( await asyncio.to_thread(self.delete_events_older_than, cutoff) except Exception as e: logger.warning("Bash events retention cleanup error: %s", e) + # Brief back-off to prevent log flooding if the failure is persistent + # (e.g. permission error, full disk). Cap at the normal interval so + # we don't over-delay in low-retention configurations. + await asyncio.sleep(min(interval, 60.0)) await asyncio.sleep(interval) async def subscribe_to_events(self, subscriber: Subscriber[BashEventBase]) -> UUID: diff --git a/openhands-agent-server/openhands/agent_server/config.py b/openhands-agent-server/openhands/agent_server/config.py index 151ca66141..b76a18f460 100644 --- a/openhands-agent-server/openhands/agent_server/config.py +++ b/openhands-agent-server/openhands/agent_server/config.py @@ -145,7 +145,11 @@ class Config(BaseModel): description=( "How long bash event files are retained on disk, in seconds. " "A background task purges events older than this window on a " - "rolling basis. None (default) retains events indefinitely." + "rolling basis. None (default) retains events indefinitely. " + "Should be set higher than the longest expected command timeout: " + "a command whose BashCommand file is purged mid-execution will " + "complete normally, but its on-disk event history will be " + "incomplete. A value >= 2x max command timeout avoids this." ), ) static_files_path: Path | None = Field( From 6cbb5619e59ec4978b1df0472891a51473d77eca Mon Sep 17 00:00:00 2001 From: Tim O'Farrell Date: Sat, 23 May 2026 18:32:13 -0600 Subject: [PATCH 4/5] Update openhands-agent-server/openhands/agent_server/bash_service.py Co-authored-by: OpenHands Bot --- openhands-agent-server/openhands/agent_server/bash_service.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/openhands-agent-server/openhands/agent_server/bash_service.py b/openhands-agent-server/openhands/agent_server/bash_service.py index 6853a1056c..da3e0d6e37 100644 --- a/openhands-agent-server/openhands/agent_server/bash_service.py +++ b/openhands-agent-server/openhands/agent_server/bash_service.py @@ -410,6 +410,8 @@ async def run_retention_cleanup_loop( # (e.g. permission error, full disk). Cap at the normal interval so # we don't over-delay in low-retention configurations. await asyncio.sleep(min(interval, 60.0)) + # Always sleep the full interval after the error back-off, so total + # wait on error = min(interval, 60) + interval ≈ 2× normal cadence. await asyncio.sleep(interval) async def subscribe_to_events(self, subscriber: Subscriber[BashEventBase]) -> UUID: From 5489e195744e76df31ec3293ae582e3da896a94c Mon Sep 17 00:00:00 2001 From: Tim O'Farrell Date: Sat, 23 May 2026 18:50:58 -0600 Subject: [PATCH 5/5] Update openhands-agent-server/openhands/agent_server/bash_service.py Co-authored-by: OpenHands Bot --- openhands-agent-server/openhands/agent_server/bash_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openhands-agent-server/openhands/agent_server/bash_service.py b/openhands-agent-server/openhands/agent_server/bash_service.py index da3e0d6e37..5c10f64d00 100644 --- a/openhands-agent-server/openhands/agent_server/bash_service.py +++ b/openhands-agent-server/openhands/agent_server/bash_service.py @@ -365,7 +365,7 @@ def delete_events_older_than(self, cutoff: datetime) -> int: if path.name >= cutoff_str: break # remaining files are at or newer than cutoff try: - path.unlink() + path.unlink(missing_ok=True) count += 1 except Exception as e: logger.warning("Failed to delete bash event file %s: %s", path, e)