From 81fba9d5dbfd22a85f273e521640f62f318b5f64 Mon Sep 17 00:00:00 2001 From: "burak.keles" Date: Thu, 30 Apr 2026 16:01:51 +0300 Subject: [PATCH 01/12] Add agent-based hook execution with sub-conversation support --- .../conversation/impl/local_conversation.py | 1 + openhands-sdk/openhands/sdk/hooks/config.py | 30 +-- .../openhands/sdk/hooks/conversation_hooks.py | 16 +- openhands-sdk/openhands/sdk/hooks/executor.py | 205 ++++++++++++++++- openhands-sdk/openhands/sdk/hooks/manager.py | 9 +- tests/sdk/hooks/test_config.py | 62 +++++ tests/sdk/hooks/test_executor.py | 216 ++++++++++++++++++ 7 files changed, 515 insertions(+), 24 deletions(-) diff --git a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py index a6bb1ec60e..279794f223 100644 --- a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py +++ b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py @@ -565,6 +565,7 @@ def _ensure_plugins_loaded(self) -> None: working_dir=str(self.workspace.working_dir), session_id=str(self._state.id), original_callback=self._base_callback, + llm=self.agent.llm, ) self._hook_processor.set_conversation_state(self._state) self._hook_processor.run_session_start() diff --git a/openhands-sdk/openhands/sdk/hooks/config.py b/openhands-sdk/openhands/sdk/hooks/config.py index 594597cf8b..de0f1105ce 100644 --- a/openhands-sdk/openhands/sdk/hooks/config.py +++ b/openhands-sdk/openhands/sdk/hooks/config.py @@ -41,13 +41,14 @@ class HookType(StrEnum): COMMAND = "command" # Shell command executed via subprocess PROMPT = "prompt" # LLM-based evaluation (future) + AGENT = "agent" # Agent-based evaluation with tool access class HookDefinition(BaseModel): """A single hook definition.""" type: HookType = HookType.COMMAND - command: str + command: str | None = None prompt: str | None = None timeout: int = 60 async_: bool = Field(default=False, alias="async") # 'async' is a reserved keyword @@ -56,25 +57,28 @@ class HookDefinition(BaseModel): "populate_by_name": True, # Allow both 'async' and 'async_' in input } - @model_validator(mode="before") - @classmethod - def _set_command_for_prompt_hooks(cls, data: Any) -> Any: - if ( - isinstance(data, dict) - and data.get("type") == "prompt" - and "command" not in data - ): - data["command"] = "" - return data - @model_validator(mode="after") - def _check_required_fields(self) -> "HookDefinition": + def _validate_type_fields(self) -> "HookDefinition": if self.type == HookType.COMMAND and not self.command: raise ValueError("'command' is required when type is 'command'") if self.type == HookType.PROMPT and not self.prompt: raise ValueError("'prompt' is required when type is 'prompt'") + if self.type == HookType.AGENT and self.command is not None: + raise ValueError( + "'command' must not be set when type is 'agent'; use 'prompt' instead" + ) + if self.type == HookType.AGENT and self.async_: + raise ValueError("'async' is not supported for agent hooks") return self + @property + def display_command(self) -> str: + """Human-readable label for this hook used in logs and events.""" + return ( + self.command + if self.command is not None + else f"agent-hook:{self.type.value}" + ) class HookMatcher(BaseModel): """Matches events to hooks based on patterns. diff --git a/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py b/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py index 4eaac1c1ac..a0438075ac 100644 --- a/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py +++ b/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py @@ -20,6 +20,7 @@ if TYPE_CHECKING: from openhands.sdk.conversation.state import ConversationState + from openhands.sdk.llm import LLM logger = get_logger(__name__) @@ -149,7 +150,7 @@ def _handle_pre_tool_use(self, event: ActionEvent) -> None: for hook, result in zip(hooks, results, strict=False): self._emit_hook_execution_event( hook_event_type=HookEventType.PRE_TOOL_USE, - hook_command=hook.command, + hook_command=hook.display_command, result=result, tool_name=tool_name, action_id=event.id, @@ -224,7 +225,7 @@ def _handle_post_tool_use(self, event: ObservationEvent) -> None: for hook, result in zip(hooks, results, strict=False): self._emit_hook_execution_event( hook_event_type=HookEventType.POST_TOOL_USE, - hook_command=hook.command, + hook_command=hook.display_command, result=result, tool_name=tool_name, action_id=action_event.id, @@ -266,7 +267,7 @@ def _handle_user_prompt_submit(self, event: MessageEvent) -> MessageEvent: for hook, result in zip(hooks, results, strict=False): self._emit_hook_execution_event( hook_event_type=HookEventType.USER_PROMPT_SUBMIT, - hook_command=hook.command, + hook_command=hook.display_command, result=result, message_id=event.id, hook_input={"message": message}, @@ -328,7 +329,7 @@ def run_session_start(self) -> None: for hook, result in zip(hooks, results, strict=False): self._emit_hook_execution_event( hook_event_type=HookEventType.SESSION_START, - hook_command=hook.command, + hook_command=hook.display_command, result=result, ) if result.error: @@ -342,7 +343,7 @@ def run_session_end(self) -> None: for hook, result in zip(hooks, results, strict=False): self._emit_hook_execution_event( hook_event_type=HookEventType.SESSION_END, - hook_command=hook.command, + hook_command=hook.display_command, result=result, ) if result.error: @@ -360,7 +361,7 @@ def run_stop(self, reason: str | None = None) -> tuple[bool, str | None]: for hook, result in zip(hooks, results, strict=False): self._emit_hook_execution_event( hook_event_type=HookEventType.STOP, - hook_command=hook.command, + hook_command=hook.display_command, result=result, hook_input={"reason": reason} if reason else None, ) @@ -389,6 +390,7 @@ def create_hook_callback( session_id: str | None = None, original_callback: Any = None, emit_hook_events: bool = True, + llm: "LLM | None" = None, ) -> tuple[HookEventProcessor, Any]: """Create a hook-enabled event callback. Returns (processor, callback). @@ -399,6 +401,7 @@ def create_hook_callback( original_callback: Callback to chain after hook processing. emit_hook_events: If True, emit HookExecutionEvent for each hook execution. Defaults to True for full observability. + llm: LLM instance inherited from the parent conversation, used by agent hooks. Returns: Tuple of (HookEventProcessor, callback function). @@ -407,6 +410,7 @@ def create_hook_callback( config=hook_config, working_dir=working_dir, session_id=session_id, + llm=llm, ) processor = HookEventProcessor( diff --git a/openhands-sdk/openhands/sdk/hooks/executor.py b/openhands-sdk/openhands/sdk/hooks/executor.py index 6f3ce897b1..978f6354b6 100644 --- a/openhands-sdk/openhands/sdk/hooks/executor.py +++ b/openhands-sdk/openhands/sdk/hooks/executor.py @@ -1,19 +1,27 @@ -"""Hook executor - runs shell commands with JSON I/O.""" +"""Hook executor - runs shell commands and agent evaluations with JSON I/O.""" +import concurrent.futures import json import logging import os import signal import subprocess import time +from typing import TYPE_CHECKING +import opentelemetry.context as otel_context from pydantic import BaseModel -from openhands.sdk.hooks.config import HookDefinition +from openhands.sdk.hooks.config import HookDefinition, HookType from openhands.sdk.hooks.types import HookDecision, HookEvent +from openhands.sdk.observability.laminar import observe from openhands.sdk.utils import sanitized_env +if TYPE_CHECKING: + from openhands.sdk.llm import LLM + + class HookResult(BaseModel): """Result from executing a hook. @@ -138,15 +146,188 @@ def cleanup_all(self) -> None: class HookExecutor: - """Executes hook commands with JSON I/O.""" + """Executes hook commands and agent evaluations with JSON I/O.""" def __init__( self, working_dir: str | None = None, async_process_manager: AsyncProcessManager | None = None, + llm: "LLM | None" = None, ): self.working_dir = working_dir or os.getcwd() self.async_process_manager = async_process_manager or AsyncProcessManager() + self.llm = llm + + @observe( + name="hook.execute.agent", + ignore_inputs=["self", "hook", "event"], + ignore_output=True, + ) + def _execute_agent_hook( + self, + hook: HookDefinition, + event: HookEvent, + ) -> HookResult: + """Execute an agent-based hook by spawning a short-lived sub-conversation. + + The sub-conversation inherits the parent LLM, runs with read-only tools + and no hooks (preventing recursion), and must return a JSON decision. + """ + # Lazy imports to avoid circular dependency: + # executor <- manager <- conversation_hooks <- local_conversation -> executor + from openhands.sdk.agent import Agent # type: ignore[attr-defined] + from openhands.sdk.conversation.impl.local_conversation import LocalConversation + from openhands.sdk.conversation.response_utils import get_agent_final_response + from openhands.sdk.tool.spec import Tool + + event_type = ( + event.event_type + if isinstance(event.event_type, str) + else event.event_type.value + ) + + if self.llm is None: + logger.warning( + f"Agent hook has no LLM configured for event '{event_type}'" + " — defaulting to allow" + ) + return HookResult( + success=True, + decision=HookDecision.ALLOW, + reason="No LLM configured for agent hook", + ) + + no_policy = "No additional policy specified. Use your best judgment." + system_prompt = ( + "You are a security policy evaluator for an AI coding agent. " + "Your only job is to decide whether an agent operation" + " should be allowed or denied.\n\n" + f"## Hook Event\n```json\n{event.model_dump_json(indent=2)}\n```\n\n" + "## Instructions\n" + "Use the available tools to investigate if needed (e.g. read files" + " in the working directory). Then call `finish` with a JSON string" + " as the message:\n" + ' {"decision": "allow" | "deny", "reason": ""}\n\n' + "When in doubt, prefer `allow`. Only deny when the operation clearly" + " violates the policy below.\n\n" + f"## Policy\n{hook.prompt or no_policy}" + ) + + # Isolated LLM copy — hook metrics not attributed to the parent conversation. + hook_llm = self.llm.model_copy(update={"usage_id": "hook-agent-evaluator"}) + hook_llm.reset_metrics() + + # Capture the current OTel span context so the sub-conversation thread + # becomes a child of this span rather than an orphaned trace. + parent_ctx = otel_context.get_current() + + conversation = None + try: + agent = Agent( + llm=hook_llm, + tools=[ + Tool(name="GrepTool"), + Tool(name="GlobTool"), + ], + include_default_tools=["FinishTool"], + system_prompt=system_prompt, + ) + # hook_config=None disables all hooks in the sub-conversation (no recursion) + conversation = LocalConversation( + agent=agent, + workspace=self.working_dir, + plugins=None, + hook_config=None, + persistence_dir=None, + visualizer=None, + max_iteration_per_run=10, + ) + conversation.send_message( + f"Evaluate this {event_type} hook event and make your decision." + ) + + def _run_with_context() -> None: + token = otel_context.attach(parent_ctx) + try: + conversation.run() + finally: + otel_context.detach(token) + + pool = concurrent.futures.ThreadPoolExecutor(max_workers=1) + try: + future = pool.submit(_run_with_context) + try: + future.result(timeout=hook.timeout) + except concurrent.futures.TimeoutError: + logger.warning( + f"Agent hook timed out after {hook.timeout}s" + f" for event '{event_type}' — defaulting to allow" + ) + return HookResult( + success=False, + decision=HookDecision.ALLOW, + reason=( + f"Agent hook timed out after {hook.timeout} seconds" + " — defaulting to allow" + ), + ) + finally: + # Don't block waiting for the thread — the conversation may still be + # running but we've already returned or moved on. + pool.shutdown(wait=False) + raw = get_agent_final_response(conversation.state.events) + except Exception as e: + logger.warning( + f"Agent hook sub-conversation failed for event '{event_type}'" + f" — defaulting to allow: {e}" + ) + return HookResult( + success=False, + decision=HookDecision.ALLOW, + reason="Agent hook execution failed — defaulting to allow", + error=str(e), + ) + finally: + if conversation is not None: + conversation.close() + + if not raw: + logger.warning( + f"Agent hook produced no final response for event '{event_type}'" + " — defaulting to allow" + ) + return HookResult( + success=False, + decision=HookDecision.ALLOW, + reason="Agent hook produced no final response — defaulting to allow", + ) + + try: + data = json.loads(raw) + decision_str = str(data.get("decision", "allow")).lower() + reason = str(data.get("reason", "")) + if decision_str == "deny": + return HookResult( + success=True, + blocked=True, + decision=HookDecision.DENY, + reason=reason, + ) + return HookResult( + success=True, + decision=HookDecision.ALLOW, + reason=reason, + ) + except (json.JSONDecodeError, AttributeError): + logger.warning( + f"Agent hook returned non-JSON response for event '{event_type}'" + f" — defaulting to allow: {repr(raw)[:200]}" + ) + return HookResult( + success=True, + decision=HookDecision.ALLOW, + reason="Agent hook returned non-JSON response — defaulting to allow", + ) def execute( self, @@ -155,6 +336,24 @@ def execute( env: dict[str, str] | None = None, ) -> HookResult: """Execute a single hook.""" + if hook.type == HookType.AGENT: + return self._execute_agent_hook(hook, event) + if hook.type == HookType.PROMPT: + event_type = ( + event.event_type + if isinstance(event.event_type, str) + else event.event_type.value + ) + logger.warning( + f"PROMPT hooks are not yet implemented — defaulting to allow" + f" (event_type={event_type})" + ) + return HookResult( + success=False, + decision=HookDecision.ALLOW, + reason="PROMPT hooks are not yet implemented — defaulting to allow", + ) + # Prepare environment hook_env = sanitized_env() hook_env["OPENHANDS_PROJECT_DIR"] = self.working_dir diff --git a/openhands-sdk/openhands/sdk/hooks/manager.py b/openhands-sdk/openhands/sdk/hooks/manager.py index af57be2734..7f7b50945f 100644 --- a/openhands-sdk/openhands/sdk/hooks/manager.py +++ b/openhands-sdk/openhands/sdk/hooks/manager.py @@ -1,13 +1,17 @@ """Hook manager - orchestrates hook execution within conversations.""" import logging -from typing import Any +from typing import TYPE_CHECKING, Any from openhands.sdk.hooks.config import HookConfig from openhands.sdk.hooks.executor import HookExecutor, HookResult from openhands.sdk.hooks.types import HookEvent, HookEventType +if TYPE_CHECKING: + from openhands.sdk.llm import LLM + + logger = logging.getLogger(__name__) @@ -19,9 +23,10 @@ def __init__( config: HookConfig | None = None, working_dir: str | None = None, session_id: str | None = None, + llm: "LLM | None" = None, ): self.config = config or HookConfig.load(working_dir=working_dir) - self.executor = HookExecutor(working_dir=working_dir) + self.executor = HookExecutor(working_dir=working_dir, llm=llm) self.session_id = session_id self.working_dir = working_dir diff --git a/tests/sdk/hooks/test_config.py b/tests/sdk/hooks/test_config.py index f5a1c832e4..d5059fbd82 100644 --- a/tests/sdk/hooks/test_config.py +++ b/tests/sdk/hooks/test_config.py @@ -10,6 +10,68 @@ from openhands.sdk.hooks.types import HookEventType +class TestHookDefinitionValidation: + """Tests for HookDefinition type-specific field validation.""" + + def test_command_hook_requires_command(self): + """COMMAND type must have a command field.""" + with pytest.raises(Exception, match="'command' is required"): + HookDefinition(type="command") + + def test_command_hook_valid(self): + """COMMAND type with command is valid.""" + h = HookDefinition(command="echo hi") + assert h.type == HookType.COMMAND + assert h.command == "echo hi" + + def test_agent_hook_valid_with_prompt(self): + """AGENT type with prompt (no command) is valid.""" + h = HookDefinition(type="agent", prompt="Block writes to /etc") + assert h.type == HookType.AGENT + assert h.prompt == "Block writes to /etc" + assert h.command is None + + def test_agent_hook_valid_without_prompt(self): + """AGENT type without prompt is valid (prompt is optional).""" + h = HookDefinition(type="agent") + assert h.type == HookType.AGENT + assert h.prompt is None + + def test_agent_hook_rejects_command_field(self): + """AGENT type must not have command set alongside it.""" + with pytest.raises( + Exception, match="'command' must not be set when type is 'agent'" + ): + HookDefinition(type="agent", command="echo hi") + + def test_agent_hook_rejects_async(self): + """AGENT type does not support async_=True.""" + with pytest.raises(Exception, match="not supported for agent hooks"): + HookDefinition.model_validate({"type": "agent", "async": True}) + + def test_agent_hook_from_json(self): + """AGENT hook can be loaded from JSON config.""" + data = { + "stop": [ + { + "hooks": [ + { + "type": "agent", + "prompt": "Verify all tasks are done", + "timeout": 30, + } + ] + } + ] + } + config = HookConfig.from_dict(data) + hooks = config.get_hooks_for_event(HookEventType.STOP) + assert len(hooks) == 1 + assert hooks[0].type == HookType.AGENT + assert hooks[0].prompt == "Verify all tasks are done" + assert hooks[0].timeout == 30 + + class TestHookMatcher: """Tests for HookMatcher pattern matching.""" diff --git a/tests/sdk/hooks/test_executor.py b/tests/sdk/hooks/test_executor.py index 26d31d0565..69e6323b68 100644 --- a/tests/sdk/hooks/test_executor.py +++ b/tests/sdk/hooks/test_executor.py @@ -3,12 +3,15 @@ import json import subprocess from unittest import mock +from unittest.mock import MagicMock, patch import pytest +from pydantic import SecretStr from openhands.sdk.hooks.config import HookDefinition from openhands.sdk.hooks.executor import HookExecutor from openhands.sdk.hooks.types import HookDecision, HookEvent, HookEventType +from openhands.sdk.llm import LLM from tests.command_utils import python_command @@ -421,3 +424,216 @@ def test_cleanup_expired_keeps_active_processes(self, tmp_path): # Clean up for test teardown process.terminate() + + +class TestAgentHookExecution: + """Tests for HookType.AGENT execution path.""" + + # Patch at the package/module level so lazy `from X import Y` picks up the mock. + _AGENT_PATH = "openhands.sdk.agent.Agent" + _CONV_PATH = "openhands.sdk.conversation.impl.local_conversation.LocalConversation" + _RESPONSE_PATH = ( + "openhands.sdk.conversation.response_utils.get_agent_final_response" + ) + + @pytest.fixture + def mock_llm(self): + return LLM(model="gpt-4o", api_key=SecretStr("test-key"), usage_id="test") + + @pytest.fixture + def executor(self, tmp_path, mock_llm): + return HookExecutor(working_dir=str(tmp_path), llm=mock_llm) + + @pytest.fixture + def executor_no_llm(self, tmp_path): + return HookExecutor(working_dir=str(tmp_path), llm=None) + + @pytest.fixture + def sample_event(self): + return HookEvent( + event_type=HookEventType.STOP, + session_id="test-session", + ) + + def test_execute_dispatches_to_agent_hook(self, executor, sample_event): + """execute() routes AGENT type to _execute_agent_hook, not subprocess.""" + from openhands.sdk.hooks.executor import HookResult + + hook = HookDefinition(type="agent", prompt="Verify task completion") + + with patch.object( + executor, + "_execute_agent_hook", + return_value=HookResult( + success=True, decision=HookDecision.ALLOW, reason="ok" + ), + ) as mock_agent: + result = executor.execute(hook, sample_event) + mock_agent.assert_called_once_with(hook, sample_event) + + assert result.decision == HookDecision.ALLOW + assert result.should_continue + + def test_no_llm_defaults_to_allow(self, executor_no_llm, sample_event): + """Agent hook with no LLM configured defaults to allow without error.""" + hook = HookDefinition(type="agent", prompt="Check something") + result = executor_no_llm.execute(hook, sample_event) + + assert result.success + assert result.decision == HookDecision.ALLOW + assert not result.blocked + + def test_deny_decision_blocks(self, executor, sample_event): + """_execute_agent_hook parsing deny JSON produces a blocked HookResult.""" + deny_json = '{"decision": "deny", "reason": "Tasks not complete"}' + + with ( + patch(self._AGENT_PATH), + patch(self._CONV_PATH) as mock_conv_cls, + patch(self._RESPONSE_PATH, return_value=deny_json), + ): + mock_conv_cls.return_value = MagicMock() + hook = HookDefinition(type="agent", prompt="Check tasks") + result = executor._execute_agent_hook(hook, sample_event) + + assert result.blocked + assert result.decision == HookDecision.DENY + assert result.reason == "Tasks not complete" + assert not result.should_continue + + def test_allow_decision_passes(self, executor, sample_event): + """_execute_agent_hook parsing allow JSON produces a non-blocking HookResult.""" + allow_json = '{"decision": "allow", "reason": "Looks safe"}' + + with ( + patch(self._AGENT_PATH), + patch(self._CONV_PATH) as mock_conv_cls, + patch(self._RESPONSE_PATH, return_value=allow_json), + ): + mock_conv_cls.return_value = MagicMock() + hook = HookDefinition(type="agent") + result = executor._execute_agent_hook(hook, sample_event) + + assert not result.blocked + assert result.decision == HookDecision.ALLOW + assert result.reason == "Looks safe" + + def test_invalid_json_defaults_to_allow(self, executor, sample_event): + """_execute_agent_hook with non-JSON LLM response defaults to allow.""" + with ( + patch(self._AGENT_PATH), + patch(self._CONV_PATH) as mock_conv_cls, + patch(self._RESPONSE_PATH, return_value="I think you should allow this."), + ): + mock_conv_cls.return_value = MagicMock() + hook = HookDefinition(type="agent") + result = executor._execute_agent_hook(hook, sample_event) + + assert not result.blocked + assert result.decision == HookDecision.ALLOW + assert result.should_continue + + def test_sub_conversation_failure_defaults_to_allow(self, executor, sample_event): + """_execute_agent_hook when sub-conversation raises defaults to allow.""" + with ( + patch(self._AGENT_PATH), + patch(self._CONV_PATH, side_effect=RuntimeError("workspace error")), + ): + hook = HookDefinition(type="agent") + result = executor._execute_agent_hook(hook, sample_event) + + assert not result.blocked + assert result.decision == HookDecision.ALLOW + assert result.error is not None + + def test_empty_response_defaults_to_allow(self, executor, sample_event): + """_execute_agent_hook when agent produces no response defaults to allow.""" + with ( + patch(self._AGENT_PATH), + patch(self._CONV_PATH) as mock_conv_cls, + patch(self._RESPONSE_PATH, return_value=""), + ): + mock_conv_cls.return_value = MagicMock() + hook = HookDefinition(type="agent") + result = executor._execute_agent_hook(hook, sample_event) + + assert not result.blocked + assert result.decision == HookDecision.ALLOW + assert result.success is False + + def test_timeout_enforced(self, executor, sample_event): + """Agent hook respects hook.timeout via ThreadPoolExecutor.""" + import time + + def slow_run(): + time.sleep(10) + + with ( + patch(self._AGENT_PATH), + patch(self._CONV_PATH) as mock_conv_cls, + ): + mock_conv = MagicMock() + mock_conv.run = slow_run + mock_conv_cls.return_value = mock_conv + + hook = HookDefinition(type="agent", timeout=1) + start = time.time() + result = executor._execute_agent_hook(hook, sample_event) + elapsed = time.time() - start + + assert elapsed < 5 + assert not result.blocked + assert result.decision == HookDecision.ALLOW + assert "timed out" in (result.reason or "") + + def test_metrics_are_isolated_from_parent_llm(self, executor, sample_event): + """model_copy + reset_metrics ensures hook tokens don't pollute parent LLM.""" + captured_llm = {} + + def capture_agent_init(**kwargs): + captured_llm["llm"] = kwargs.get("llm") + return MagicMock() + + with ( + patch(self._AGENT_PATH, side_effect=capture_agent_init), + patch(self._CONV_PATH, side_effect=RuntimeError("stop early")), + ): + hook = HookDefinition(type="agent") + executor._execute_agent_hook(hook, sample_event) + + hook_llm = captured_llm.get("llm") + assert hook_llm is not None + assert hook_llm is not executor.llm + assert hook_llm.usage_id == "hook-agent-evaluator" + + +class TestPromptHookNotImplemented: + """HookType.PROMPT is a future stub — execution defaults to allow, never crashes.""" + + @pytest.fixture + def executor(self, tmp_path): + return HookExecutor(working_dir=str(tmp_path)) + + @pytest.fixture + def sample_event(self): + return HookEvent(event_type=HookEventType.PRE_TOOL_USE, tool_name="BashTool") + + def test_prompt_hook_defaults_to_allow(self, executor, sample_event): + """Executing a PROMPT hook returns allow instead of crashing.""" + hook = HookDefinition(type="prompt") + result = executor.execute(hook, sample_event) + assert result.decision == HookDecision.ALLOW + assert result.success is False + assert "not yet implemented" in (result.reason or "") + + def test_prompt_hook_does_not_block(self, executor, sample_event): + """PROMPT hook must not block the operation while unimplemented.""" + hook = HookDefinition(type="prompt") + result = executor.execute(hook, sample_event) + assert result.blocked is False + assert result.should_continue is True + + def test_prompt_hook_without_command_validates(self): + """PROMPT hook with no command is valid at config time (future use).""" + hook = HookDefinition(type="prompt") + assert hook.command is None From 6e6406c0fa8d86509e1b4e6789cd5728f4ad0bfe Mon Sep 17 00:00:00 2001 From: "burak.keles" Date: Thu, 30 Apr 2026 16:48:02 +0300 Subject: [PATCH 02/12] Add display_command property to HookDefinition for improved hook labeling --- openhands-sdk/openhands/sdk/hooks/config.py | 13 +++++---- tests/sdk/hooks/test_config.py | 31 +++++++++++++++++++++ 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/openhands-sdk/openhands/sdk/hooks/config.py b/openhands-sdk/openhands/sdk/hooks/config.py index de0f1105ce..c13dfd5e26 100644 --- a/openhands-sdk/openhands/sdk/hooks/config.py +++ b/openhands-sdk/openhands/sdk/hooks/config.py @@ -48,6 +48,7 @@ class HookDefinition(BaseModel): """A single hook definition.""" type: HookType = HookType.COMMAND + name: str | None = None command: str | None = None prompt: str | None = None timeout: int = 60 @@ -74,11 +75,13 @@ def _validate_type_fields(self) -> "HookDefinition": @property def display_command(self) -> str: """Human-readable label for this hook used in logs and events.""" - return ( - self.command - if self.command is not None - else f"agent-hook:{self.type.value}" - ) + if self.command is not None: + return self.command + if self.name is not None: + return f"agent-hook:{self.name}" + if self.prompt: + return f"agent-hook:{self.prompt[:20]}" + return "agent-hook:agent" class HookMatcher(BaseModel): """Matches events to hooks based on patterns. diff --git a/tests/sdk/hooks/test_config.py b/tests/sdk/hooks/test_config.py index d5059fbd82..568e8ccfab 100644 --- a/tests/sdk/hooks/test_config.py +++ b/tests/sdk/hooks/test_config.py @@ -37,6 +37,37 @@ def test_agent_hook_valid_without_prompt(self): assert h.type == HookType.AGENT assert h.prompt is None + def test_display_command_uses_command_for_command_hooks(self): + h = HookDefinition(command="block.sh") + assert h.display_command == "block.sh" + + def test_display_command_uses_name_when_set(self): + h = HookDefinition(type="agent", name="block-deletions", prompt="Block rm -rf") + assert h.display_command == "agent-hook:block-deletions" + + def test_display_command_falls_back_to_prompt_prefix(self): + h = HookDefinition(type="agent", prompt="Block network calls to external IPs") + assert h.display_command == "agent-hook:Block network calls " + + def test_display_command_truncates_long_prompt(self): + long_prompt = "A" * 100 + h = HookDefinition(type="agent", prompt=long_prompt) + assert h.display_command == f"agent-hook:{'A' * 20}" + + def test_display_command_fallback_when_no_name_no_prompt(self): + h = HookDefinition(type="agent") + assert h.display_command == "agent-hook:agent" + + def test_three_hooks_are_distinguishable(self): + """Multiple agent hooks without names get unique labels via prompt prefix.""" + hooks = [ + HookDefinition(type="agent", name="block-deletions", prompt="Block rm -rf"), + HookDefinition(type="agent", prompt="Block network calls to external IPs"), + HookDefinition(type="agent", prompt="Verify all tasks are complete"), + ] + labels = [h.display_command for h in hooks] + assert len(set(labels)) == 3 # all distinct + def test_agent_hook_rejects_command_field(self): """AGENT type must not have command set alongside it.""" with pytest.raises( From 7eb38715f0d7efc0d39c0a0f62b0b7b71826640e Mon Sep 17 00:00:00 2001 From: "burak.keles" Date: Sat, 9 May 2026 18:23:26 +0300 Subject: [PATCH 03/12] Address PR review feedback on agent hook implementation - Replace hardcoded system prompt with user-configurable system_prompt field - Replace prompt field with system_prompt on HookDefinition - Make tools configurable via hook.tools (default empty list) - Thread persistence_dir and visualizer through HookManager and create_hook_callback - Add max_iterations field to HookDefinition; remove hardcoded 10 - Inject event payload into user message so agent always has context to evaluate - Call conversation.pause() before pool.shutdown on timeout for safe cleanup - Change HookType to StrEnum - Move test_config.py tests from classes to module-level functions with parametrize --- .../conversation/impl/local_conversation.py | 2 + openhands-sdk/openhands/sdk/hooks/config.py | 10 +- .../openhands/sdk/hooks/conversation_hooks.py | 8 + openhands-sdk/openhands/sdk/hooks/executor.py | 45 ++--- openhands-sdk/openhands/sdk/hooks/manager.py | 10 +- tests/sdk/hooks/test_config.py | 168 ++++++++---------- tests/sdk/hooks/test_executor.py | 6 +- 7 files changed, 122 insertions(+), 127 deletions(-) diff --git a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py index 279794f223..047acc8e16 100644 --- a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py +++ b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py @@ -566,6 +566,8 @@ def _ensure_plugins_loaded(self) -> None: session_id=str(self._state.id), original_callback=self._base_callback, llm=self.agent.llm, + persistence_dir=self._state.persistence_dir, + visualizer=self._visualizer, ) self._hook_processor.set_conversation_state(self._state) self._hook_processor.run_session_start() diff --git a/openhands-sdk/openhands/sdk/hooks/config.py b/openhands-sdk/openhands/sdk/hooks/config.py index c13dfd5e26..673f8ca7cc 100644 --- a/openhands-sdk/openhands/sdk/hooks/config.py +++ b/openhands-sdk/openhands/sdk/hooks/config.py @@ -50,8 +50,10 @@ class HookDefinition(BaseModel): type: HookType = HookType.COMMAND name: str | None = None command: str | None = None - prompt: str | None = None + system_prompt: str | None = None + tools: list[str] = Field(default_factory=list) timeout: int = 60 + max_iterations: int = 10 async_: bool = Field(default=False, alias="async") # 'async' is a reserved keyword model_config = { @@ -66,7 +68,7 @@ def _validate_type_fields(self) -> "HookDefinition": raise ValueError("'prompt' is required when type is 'prompt'") if self.type == HookType.AGENT and self.command is not None: raise ValueError( - "'command' must not be set when type is 'agent'; use 'prompt' instead" + "'command' must not be set when type is 'agent'; use 'system_prompt' instead" ) if self.type == HookType.AGENT and self.async_: raise ValueError("'async' is not supported for agent hooks") @@ -79,8 +81,8 @@ def display_command(self) -> str: return self.command if self.name is not None: return f"agent-hook:{self.name}" - if self.prompt: - return f"agent-hook:{self.prompt[:20]}" + if self.system_prompt: + return f"agent-hook:{self.system_prompt[:20]}" return "agent-hook:agent" class HookMatcher(BaseModel): diff --git a/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py b/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py index a0438075ac..fa359e4afd 100644 --- a/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py +++ b/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py @@ -18,6 +18,8 @@ from openhands.sdk.logger import get_logger +from openhands.sdk.conversation.visualizer import ConversationVisualizerBase + if TYPE_CHECKING: from openhands.sdk.conversation.state import ConversationState from openhands.sdk.llm import LLM @@ -391,6 +393,8 @@ def create_hook_callback( original_callback: Any = None, emit_hook_events: bool = True, llm: "LLM | None" = None, + persistence_dir: str | None = None, + visualizer: type[ConversationVisualizerBase] | ConversationVisualizerBase | None = None, ) -> tuple[HookEventProcessor, Any]: """Create a hook-enabled event callback. Returns (processor, callback). @@ -402,6 +406,8 @@ def create_hook_callback( emit_hook_events: If True, emit HookExecutionEvent for each hook execution. Defaults to True for full observability. llm: LLM instance inherited from the parent conversation, used by agent hooks. + persistence_dir: Directory used to persist agent hook sub-conversation events. + visualizer: Visualizer instance passed to agent hook sub-conversations. Returns: Tuple of (HookEventProcessor, callback function). @@ -411,6 +417,8 @@ def create_hook_callback( working_dir=working_dir, session_id=session_id, llm=llm, + persistence_dir=persistence_dir, + visualizer=visualizer, ) processor = HookEventProcessor( diff --git a/openhands-sdk/openhands/sdk/hooks/executor.py b/openhands-sdk/openhands/sdk/hooks/executor.py index 978f6354b6..71afbe513e 100644 --- a/openhands-sdk/openhands/sdk/hooks/executor.py +++ b/openhands-sdk/openhands/sdk/hooks/executor.py @@ -12,6 +12,7 @@ import opentelemetry.context as otel_context from pydantic import BaseModel +from openhands.sdk.conversation.visualizer import ConversationVisualizerBase from openhands.sdk.hooks.config import HookDefinition, HookType from openhands.sdk.hooks.types import HookDecision, HookEvent from openhands.sdk.observability.laminar import observe @@ -153,10 +154,14 @@ def __init__( working_dir: str | None = None, async_process_manager: AsyncProcessManager | None = None, llm: "LLM | None" = None, + persistence_dir: str | None = None, + visualizer: type[ConversationVisualizerBase] | ConversationVisualizerBase | None = None, ): self.working_dir = working_dir or os.getcwd() self.async_process_manager = async_process_manager or AsyncProcessManager() self.llm = llm + self.persistence_dir = persistence_dir + self.visualizer = visualizer @observe( name="hook.execute.agent", @@ -170,8 +175,8 @@ def _execute_agent_hook( ) -> HookResult: """Execute an agent-based hook by spawning a short-lived sub-conversation. - The sub-conversation inherits the parent LLM, runs with read-only tools - and no hooks (preventing recursion), and must return a JSON decision. + The sub-conversation inherits the parent LLM, runs with user-configured + tools and no hooks (preventing recursion), and must return a JSON decision. """ # Lazy imports to avoid circular dependency: # executor <- manager <- conversation_hooks <- local_conversation -> executor @@ -197,22 +202,6 @@ def _execute_agent_hook( reason="No LLM configured for agent hook", ) - no_policy = "No additional policy specified. Use your best judgment." - system_prompt = ( - "You are a security policy evaluator for an AI coding agent. " - "Your only job is to decide whether an agent operation" - " should be allowed or denied.\n\n" - f"## Hook Event\n```json\n{event.model_dump_json(indent=2)}\n```\n\n" - "## Instructions\n" - "Use the available tools to investigate if needed (e.g. read files" - " in the working directory). Then call `finish` with a JSON string" - " as the message:\n" - ' {"decision": "allow" | "deny", "reason": ""}\n\n' - "When in doubt, prefer `allow`. Only deny when the operation clearly" - " violates the policy below.\n\n" - f"## Policy\n{hook.prompt or no_policy}" - ) - # Isolated LLM copy — hook metrics not attributed to the parent conversation. hook_llm = self.llm.model_copy(update={"usage_id": "hook-agent-evaluator"}) hook_llm.reset_metrics() @@ -225,12 +214,9 @@ def _execute_agent_hook( try: agent = Agent( llm=hook_llm, - tools=[ - Tool(name="GrepTool"), - Tool(name="GlobTool"), - ], + tools=[Tool(name=t) for t in hook.tools], include_default_tools=["FinishTool"], - system_prompt=system_prompt, + system_prompt=hook.system_prompt, ) # hook_config=None disables all hooks in the sub-conversation (no recursion) conversation = LocalConversation( @@ -238,12 +224,14 @@ def _execute_agent_hook( workspace=self.working_dir, plugins=None, hook_config=None, - persistence_dir=None, - visualizer=None, - max_iteration_per_run=10, + persistence_dir=self.persistence_dir, + visualizer=self.visualizer, + max_iteration_per_run=hook.max_iterations, ) + # Event payload is always injected — the agent needs it to evaluate. conversation.send_message( - f"Evaluate this {event_type} hook event and make your decision." + f"Evaluate this {event_type} hook event and make your decision.\n\n" + f"## Hook Event\n```json\n{event.model_dump_json(indent=2)}\n```" ) def _run_with_context() -> None: @@ -263,6 +251,7 @@ def _run_with_context() -> None: f"Agent hook timed out after {hook.timeout}s" f" for event '{event_type}' — defaulting to allow" ) + conversation.pause() return HookResult( success=False, decision=HookDecision.ALLOW, @@ -272,8 +261,6 @@ def _run_with_context() -> None: ), ) finally: - # Don't block waiting for the thread — the conversation may still be - # running but we've already returned or moved on. pool.shutdown(wait=False) raw = get_agent_final_response(conversation.state.events) except Exception as e: diff --git a/openhands-sdk/openhands/sdk/hooks/manager.py b/openhands-sdk/openhands/sdk/hooks/manager.py index 7f7b50945f..a3c725e9a1 100644 --- a/openhands-sdk/openhands/sdk/hooks/manager.py +++ b/openhands-sdk/openhands/sdk/hooks/manager.py @@ -3,6 +3,7 @@ import logging from typing import TYPE_CHECKING, Any +from openhands.sdk.conversation.visualizer import ConversationVisualizerBase from openhands.sdk.hooks.config import HookConfig from openhands.sdk.hooks.executor import HookExecutor, HookResult from openhands.sdk.hooks.types import HookEvent, HookEventType @@ -24,9 +25,16 @@ def __init__( working_dir: str | None = None, session_id: str | None = None, llm: "LLM | None" = None, + persistence_dir: str | None = None, + visualizer: type[ConversationVisualizerBase] | ConversationVisualizerBase | None = None, ): self.config = config or HookConfig.load(working_dir=working_dir) - self.executor = HookExecutor(working_dir=working_dir, llm=llm) + self.executor = HookExecutor( + working_dir=working_dir, + llm=llm, + persistence_dir=persistence_dir, + visualizer=visualizer, + ) self.session_id = session_id self.working_dir = working_dir diff --git a/tests/sdk/hooks/test_config.py b/tests/sdk/hooks/test_config.py index 568e8ccfab..255283e010 100644 --- a/tests/sdk/hooks/test_config.py +++ b/tests/sdk/hooks/test_config.py @@ -10,97 +10,85 @@ from openhands.sdk.hooks.types import HookEventType -class TestHookDefinitionValidation: - """Tests for HookDefinition type-specific field validation.""" - - def test_command_hook_requires_command(self): - """COMMAND type must have a command field.""" - with pytest.raises(Exception, match="'command' is required"): - HookDefinition(type="command") - - def test_command_hook_valid(self): - """COMMAND type with command is valid.""" - h = HookDefinition(command="echo hi") - assert h.type == HookType.COMMAND - assert h.command == "echo hi" - - def test_agent_hook_valid_with_prompt(self): - """AGENT type with prompt (no command) is valid.""" - h = HookDefinition(type="agent", prompt="Block writes to /etc") - assert h.type == HookType.AGENT - assert h.prompt == "Block writes to /etc" - assert h.command is None - - def test_agent_hook_valid_without_prompt(self): - """AGENT type without prompt is valid (prompt is optional).""" - h = HookDefinition(type="agent") - assert h.type == HookType.AGENT - assert h.prompt is None - - def test_display_command_uses_command_for_command_hooks(self): - h = HookDefinition(command="block.sh") - assert h.display_command == "block.sh" - - def test_display_command_uses_name_when_set(self): - h = HookDefinition(type="agent", name="block-deletions", prompt="Block rm -rf") - assert h.display_command == "agent-hook:block-deletions" - - def test_display_command_falls_back_to_prompt_prefix(self): - h = HookDefinition(type="agent", prompt="Block network calls to external IPs") - assert h.display_command == "agent-hook:Block network calls " - - def test_display_command_truncates_long_prompt(self): - long_prompt = "A" * 100 - h = HookDefinition(type="agent", prompt=long_prompt) - assert h.display_command == f"agent-hook:{'A' * 20}" - - def test_display_command_fallback_when_no_name_no_prompt(self): - h = HookDefinition(type="agent") - assert h.display_command == "agent-hook:agent" - - def test_three_hooks_are_distinguishable(self): - """Multiple agent hooks without names get unique labels via prompt prefix.""" - hooks = [ - HookDefinition(type="agent", name="block-deletions", prompt="Block rm -rf"), - HookDefinition(type="agent", prompt="Block network calls to external IPs"), - HookDefinition(type="agent", prompt="Verify all tasks are complete"), +def test_command_hook_requires_command(): + with pytest.raises(Exception, match="'command' is required"): + HookDefinition(type="command") + + +def test_command_hook_valid(): + h = HookDefinition(command="echo hi") + assert h.type == HookType.COMMAND + assert h.command == "echo hi" + + +@pytest.mark.parametrize( + "kwargs", + [ + {"type": "agent", "system_prompt": "Block writes to /etc"}, + {"type": "agent"}, + ], + ids=["with-system-prompt", "without-system-prompt"], +) +def test_agent_hook_valid(kwargs): + h = HookDefinition(**kwargs) + assert h.type == HookType.AGENT + + +@pytest.mark.parametrize( + "kwargs,expected", + [ + ({"command": "block.sh"}, "block.sh"), + ({"type": "agent", "name": "block-deletions", "system_prompt": "Block rm -rf"}, "agent-hook:block-deletions"), + ({"type": "agent", "system_prompt": "Block network calls to external IPs"}, "agent-hook:Block network calls "), + ({"type": "agent", "system_prompt": "A" * 100}, f"agent-hook:{'A' * 20}"), + ({"type": "agent"}, "agent-hook:agent"), + ], + ids=["command", "agent-named", "agent-prompt-prefix", "agent-prompt-truncated", "agent-fallback"], +) +def test_display_command(kwargs, expected): + h = HookDefinition(**kwargs) + assert h.display_command == expected + + +def test_multiple_agent_hooks_are_distinguishable(): + hooks = [ + HookDefinition(type="agent", name="block-deletions", system_prompt="Block rm -rf"), + HookDefinition(type="agent", system_prompt="Block network calls to external IPs"), + HookDefinition(type="agent", system_prompt="Verify all tasks are complete"), + ] + assert len({h.display_command for h in hooks}) == 3 + + +@pytest.mark.parametrize( + "kwargs,match", + [ + ({"type": "agent", "command": "echo hi"}, "'command' must not be set when type is 'agent'"), + ({"type": "command"}, "'command' is required"), + ], + ids=["agent-rejects-command", "command-requires-command"], +) +def test_hook_definition_validation_errors(kwargs, match): + with pytest.raises(Exception, match=match): + HookDefinition(**kwargs) + + +def test_agent_hook_rejects_async(): + with pytest.raises(Exception, match="not supported for agent hooks"): + HookDefinition.model_validate({"type": "agent", "async": True}) + + +def test_agent_hook_from_json(): + data = { + "stop": [ + {"hooks": [{"type": "agent", "system_prompt": "Verify all tasks are done", "timeout": 30}]} ] - labels = [h.display_command for h in hooks] - assert len(set(labels)) == 3 # all distinct - - def test_agent_hook_rejects_command_field(self): - """AGENT type must not have command set alongside it.""" - with pytest.raises( - Exception, match="'command' must not be set when type is 'agent'" - ): - HookDefinition(type="agent", command="echo hi") - - def test_agent_hook_rejects_async(self): - """AGENT type does not support async_=True.""" - with pytest.raises(Exception, match="not supported for agent hooks"): - HookDefinition.model_validate({"type": "agent", "async": True}) - - def test_agent_hook_from_json(self): - """AGENT hook can be loaded from JSON config.""" - data = { - "stop": [ - { - "hooks": [ - { - "type": "agent", - "prompt": "Verify all tasks are done", - "timeout": 30, - } - ] - } - ] - } - config = HookConfig.from_dict(data) - hooks = config.get_hooks_for_event(HookEventType.STOP) - assert len(hooks) == 1 - assert hooks[0].type == HookType.AGENT - assert hooks[0].prompt == "Verify all tasks are done" - assert hooks[0].timeout == 30 + } + config = HookConfig.from_dict(data) + hooks = config.get_hooks_for_event(HookEventType.STOP) + assert len(hooks) == 1 + assert hooks[0].type == HookType.AGENT + assert hooks[0].system_prompt == "Verify all tasks are done" + assert hooks[0].timeout == 30 class TestHookMatcher: diff --git a/tests/sdk/hooks/test_executor.py b/tests/sdk/hooks/test_executor.py index 69e6323b68..564050f12d 100644 --- a/tests/sdk/hooks/test_executor.py +++ b/tests/sdk/hooks/test_executor.py @@ -459,7 +459,7 @@ def test_execute_dispatches_to_agent_hook(self, executor, sample_event): """execute() routes AGENT type to _execute_agent_hook, not subprocess.""" from openhands.sdk.hooks.executor import HookResult - hook = HookDefinition(type="agent", prompt="Verify task completion") + hook = HookDefinition(type="agent", system_prompt="Verify task completion") with patch.object( executor, @@ -476,7 +476,7 @@ def test_execute_dispatches_to_agent_hook(self, executor, sample_event): def test_no_llm_defaults_to_allow(self, executor_no_llm, sample_event): """Agent hook with no LLM configured defaults to allow without error.""" - hook = HookDefinition(type="agent", prompt="Check something") + hook = HookDefinition(type="agent", system_prompt="Check something") result = executor_no_llm.execute(hook, sample_event) assert result.success @@ -493,7 +493,7 @@ def test_deny_decision_blocks(self, executor, sample_event): patch(self._RESPONSE_PATH, return_value=deny_json), ): mock_conv_cls.return_value = MagicMock() - hook = HookDefinition(type="agent", prompt="Check tasks") + hook = HookDefinition(type="agent", system_prompt="Check tasks") result = executor._execute_agent_hook(hook, sample_event) assert result.blocked From c166503bf995ba39a3ce85e59f9fa98c1028d88e Mon Sep 17 00:00:00 2001 From: "burak.keles" Date: Thu, 21 May 2026 13:01:22 +0300 Subject: [PATCH 04/12] Introduce agent-based hooks example and improve agent hook execution - Add `49_agent_hooks` example demoing `type="agent"` hooks for lifecycle events. - Includes security and quality review use cases with detailed README. - Refactor hook executor for enhanced JSON parsing and timeout handling. - Simplify sub-conversation logic with `_allow` and `_parse_decision` helpers. - Improve robustness against non-standard LLM responses like fenced JSON or chatter. - Make hook LLM copy respect `hook.timeout` and ensure usage metrics isolation. - Extend test coverage for various hook scenarios, including agent LLM configuration. --- .../49_agent_hooks/49_agent_hooks.py | 166 ++++++++++++++++++ .../49_agent_hooks/README.md | 60 +++++++ openhands-sdk/openhands/sdk/hooks/config.py | 2 +- openhands-sdk/openhands/sdk/hooks/executor.py | 146 +++++++-------- tests/sdk/hooks/test_executor.py | 144 ++++++++++++--- 5 files changed, 416 insertions(+), 102 deletions(-) create mode 100644 examples/01_standalone_sdk/49_agent_hooks/49_agent_hooks.py create mode 100644 examples/01_standalone_sdk/49_agent_hooks/README.md diff --git a/examples/01_standalone_sdk/49_agent_hooks/49_agent_hooks.py b/examples/01_standalone_sdk/49_agent_hooks/49_agent_hooks.py new file mode 100644 index 0000000000..7dbb5e6ef4 --- /dev/null +++ b/examples/01_standalone_sdk/49_agent_hooks/49_agent_hooks.py @@ -0,0 +1,166 @@ +"""OpenHands Agent SDK — Agent-based Hooks Example + +Demonstrates the `type="agent"` hook, which evaluates lifecycle events with an +LLM-driven sub-agent instead of a shell script. The hook agent receives the +event JSON, reasons about it semantically, and replies with a decision payload: + + {"decision": "allow" | "deny", "reason": "..."} + +Two demos: + +- PreToolUse (security reviewer): inspects the INTENT of a terminal command, + not just its syntax. A command like `awk '{print}' /etc/passwd` would slip + past a blacklist of command names, but the agent hook recognises it as + reading a sensitive system file and denies it. +- Stop (quality reviewer): refuses to let the main agent finish until the + required deliverable exists in the workspace. + +For shell-command hooks see ../33_hooks/33_hooks.py. +""" + +import os +import tempfile +from pathlib import Path + +from pydantic import SecretStr + +from openhands.sdk import LLM, Conversation +from openhands.sdk.event.hook_execution import HookExecutionEvent +from openhands.sdk.hooks import HookConfig, HookDefinition, HookMatcher, HookType +from openhands.tools.preset.default import get_default_agent + + +# Configure LLM +api_key = os.getenv("LLM_API_KEY") +assert api_key is not None, "LLM_API_KEY environment variable is not set." +model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929") +base_url = os.getenv("LLM_BASE_URL") + +llm = LLM( + usage_id="agent", + model=model, + base_url=base_url, + api_key=SecretStr(api_key), +) + +SECURITY_REVIEWER_PROMPT = """You are a security reviewer for a coding agent. +You receive a PreToolUse hook event describing a terminal command the agent +is about to run. + +Reason about the SEMANTIC INTENT of the command, not just its syntax. Block: +- reading sensitive system files (/etc/passwd, /etc/shadow, ~/.ssh/*, credentials) +- broad filesystem discovery outside the workspace (rg --files /etc, find /, ls /root) +- exfiltrating data over the network +- modifying system state (sudo, apt install, systemctl) + +Allow ordinary inspection, builds, tests, and edits inside the workspace. +When unsure, prefer allow. + +Reply with a single JSON object and nothing else: +{"decision": "allow" | "deny", "reason": ""} +""" + +QUALITY_REVIEWER_PROMPT = """You are a quality reviewer enforcing task completion. +You receive a Stop hook event when the main agent tries to finish. + +The task requires the file REPORT.md to exist in the workspace and contain at +least one bullet point describing the repository. Use the file_editor tool to +check whether the file exists and inspect its contents. + +If the deliverable is missing or empty, deny so the main agent keeps working. +Otherwise allow. + +Reply with a single JSON object and nothing else: +{"decision": "allow" | "deny", "reason": ""} +""" + + +def hook_logger(event) -> None: + """Surface each hook decision so the demo output is self-explanatory.""" + if not isinstance(event, HookExecutionEvent): + return + status = "DENY " if event.blocked else ("ALLOW" if event.success else "FAIL ") + line = f" [hook] {event.hook_event_type} {status} -> {event.hook_command}" + if event.reason: + line += f"\n reason: {event.reason}" + print(line) + + +with tempfile.TemporaryDirectory() as tmpdir: + workspace = Path(tmpdir) + + hook_config = HookConfig( + pre_tool_use=[ + HookMatcher( + matcher="terminal", + hooks=[ + HookDefinition( + type=HookType.AGENT, + name="security-reviewer", + system_prompt=SECURITY_REVIEWER_PROMPT, + timeout=60, + max_iterations=3, + ) + ], + ) + ], + stop=[ + HookMatcher( + hooks=[ + HookDefinition( + type=HookType.AGENT, + name="quality-reviewer", + system_prompt=QUALITY_REVIEWER_PROMPT, + tools=["file_editor"], + timeout=90, + max_iterations=5, + ) + ], + ) + ], + ) + + agent = get_default_agent(llm=llm) + conversation = Conversation( + agent=agent, + workspace=str(workspace), + hook_config=hook_config, + callbacks=[hook_logger], + ) + + print("=" * 60) + print("Demo 1: PreToolUse — semantic deny") + print("=" * 60) + print( + "Asking the agent to read /etc/passwd via awk. The literal command\n" + "wouldn't match a syntactic blacklist (no `cat`, no `/etc/shadow`\n" + "keyword), but the security-reviewer agent should recognise the\n" + "intent and deny.\n" + ) + conversation.send_message( + "Show me the contents of /etc/passwd using awk '{print}'." + ) + conversation.run() + + print("\n" + "=" * 60) + print("Demo 2: Stop — deny until deliverable exists") + print("=" * 60) + print( + "Quality reviewer denies until REPORT.md exists with a bullet point.\n" + ) + conversation.send_message( + "Write REPORT.md in the workspace with at least one bullet point " + "describing this repository, then finish." + ) + conversation.run() + + report = workspace / "REPORT.md" + if report.exists(): + print(f"\n[REPORT.md preview: {report.read_text()[:120]!r}...]") + + print("\n" + "=" * 60) + print("Example Complete!") + print("=" * 60) + + cost = conversation.conversation_stats.get_combined_metrics().accumulated_cost + print(f"\nEXAMPLE_COST: {cost}") \ No newline at end of file diff --git a/examples/01_standalone_sdk/49_agent_hooks/README.md b/examples/01_standalone_sdk/49_agent_hooks/README.md new file mode 100644 index 0000000000..4a354c1b26 --- /dev/null +++ b/examples/01_standalone_sdk/49_agent_hooks/README.md @@ -0,0 +1,60 @@ +# Agent-based Hooks Example + +This folder demonstrates the `type="agent"` hook — a lifecycle hook whose +decision is produced by an LLM-driven sub-agent rather than a shell script. + +For shell-command hooks see [`../33_hooks/`](../33_hooks). + +## Why an agent hook + +A shell-based PreToolUse hook can only block what its blacklist literally +matches. The agent rewrites `cat /etc/passwd` as `awk '{print}' /etc/passwd` +and slips through. An agent hook reasons about the **semantic intent** of the +command — "reading a sensitive system file" — and denies regardless of the +exact tool name used. + +## Example + +- **49_agent_hooks.py** — Two agent hooks wired onto a single conversation: + - **PreToolUse** "security reviewer" denies a command whose intent is to + read `/etc/passwd`, even though no obvious keyword appears in a blacklist. + - **Stop** "quality reviewer" refuses to let the main agent finish until + the required deliverable (`REPORT.md`) is present in the workspace. + +Each hook decision is printed to the console via a `HookExecutionEvent` +callback, so you can watch the allow/deny outcomes as the demo runs. + +## Running + +```bash +export LLM_API_KEY="your-key" +export LLM_MODEL="anthropic/claude-sonnet-4-5-20250929" # optional +export LLM_BASE_URL="https://your-endpoint" # optional + +python 49_agent_hooks.py +``` + +## How an agent hook is configured + +```python +HookDefinition( + type=HookType.AGENT, + name="security-reviewer", # bucket for cost metrics (hook-agent:) + system_prompt="...", # instructs the hook agent; must request JSON + tools=["file_editor"], # optional tools the hook agent may use + # (use registered names, e.g. "file_editor", + # "terminal" — not class names like + # "FileEditorTool") + timeout=60, # forwarded to the per-hook LLM copy + max_iterations=3, # cap on hook sub-conversation steps +) +``` + +The hook agent receives the event JSON and must reply with: + +```json +{"decision": "allow" | "deny", "reason": ""} +``` + +Anything else (non-JSON, missing field, sub-conversation error) defaults to +`allow` so a broken hook cannot wedge the main agent. diff --git a/openhands-sdk/openhands/sdk/hooks/config.py b/openhands-sdk/openhands/sdk/hooks/config.py index 673f8ca7cc..7efe68501d 100644 --- a/openhands-sdk/openhands/sdk/hooks/config.py +++ b/openhands-sdk/openhands/sdk/hooks/config.py @@ -53,7 +53,7 @@ class HookDefinition(BaseModel): system_prompt: str | None = None tools: list[str] = Field(default_factory=list) timeout: int = 60 - max_iterations: int = 10 + max_iterations: int = 3 async_: bool = Field(default=False, alias="async") # 'async' is a reserved keyword model_config = { diff --git a/openhands-sdk/openhands/sdk/hooks/executor.py b/openhands-sdk/openhands/sdk/hooks/executor.py index 71afbe513e..1178065272 100644 --- a/openhands-sdk/openhands/sdk/hooks/executor.py +++ b/openhands-sdk/openhands/sdk/hooks/executor.py @@ -1,6 +1,5 @@ """Hook executor - runs shell commands and agent evaluations with JSON I/O.""" -import concurrent.futures import json import logging import os @@ -9,7 +8,6 @@ import time from typing import TYPE_CHECKING -import opentelemetry.context as otel_context from pydantic import BaseModel from openhands.sdk.conversation.visualizer import ConversationVisualizerBase @@ -163,6 +161,20 @@ def __init__( self.persistence_dir = persistence_dir self.visualizer = visualizer + def _allow( + self, + reason: str, + *, + success: bool = False, + error: str | None = None, + ) -> HookResult: + return HookResult( + success=success, + decision=HookDecision.ALLOW, + reason=reason, + error=error, + ) + @observe( name="hook.execute.agent", ignore_inputs=["self", "hook", "event"], @@ -173,11 +185,6 @@ def _execute_agent_hook( hook: HookDefinition, event: HookEvent, ) -> HookResult: - """Execute an agent-based hook by spawning a short-lived sub-conversation. - - The sub-conversation inherits the parent LLM, runs with user-configured - tools and no hooks (preventing recursion), and must return a JSON decision. - """ # Lazy imports to avoid circular dependency: # executor <- manager <- conversation_hooks <- local_conversation -> executor from openhands.sdk.agent import Agent # type: ignore[attr-defined] @@ -196,19 +203,14 @@ def _execute_agent_hook( f"Agent hook has no LLM configured for event '{event_type}'" " — defaulting to allow" ) - return HookResult( - success=True, - decision=HookDecision.ALLOW, - reason="No LLM configured for agent hook", - ) - - # Isolated LLM copy — hook metrics not attributed to the parent conversation. - hook_llm = self.llm.model_copy(update={"usage_id": "hook-agent-evaluator"}) - hook_llm.reset_metrics() + return self._allow("No LLM configured for agent hook") - # Capture the current OTel span context so the sub-conversation thread - # becomes a child of this span rather than an orphaned trace. - parent_ctx = otel_context.get_current() + hook_llm = self.llm.model_copy( + update={ + "usage_id": f"hook-agent:{hook.name or 'default'}", + "timeout": hook.timeout, + } + ) conversation = None try: @@ -218,7 +220,7 @@ def _execute_agent_hook( include_default_tools=["FinishTool"], system_prompt=hook.system_prompt, ) - # hook_config=None disables all hooks in the sub-conversation (no recursion) + # hook_config=None disables hooks in the sub-conversation (no recursion) conversation = LocalConversation( agent=agent, workspace=self.working_dir, @@ -228,93 +230,79 @@ def _execute_agent_hook( visualizer=self.visualizer, max_iteration_per_run=hook.max_iterations, ) - # Event payload is always injected — the agent needs it to evaluate. conversation.send_message( f"Evaluate this {event_type} hook event and make your decision.\n\n" f"## Hook Event\n```json\n{event.model_dump_json(indent=2)}\n```" ) - - def _run_with_context() -> None: - token = otel_context.attach(parent_ctx) - try: - conversation.run() - finally: - otel_context.detach(token) - - pool = concurrent.futures.ThreadPoolExecutor(max_workers=1) - try: - future = pool.submit(_run_with_context) - try: - future.result(timeout=hook.timeout) - except concurrent.futures.TimeoutError: - logger.warning( - f"Agent hook timed out after {hook.timeout}s" - f" for event '{event_type}' — defaulting to allow" - ) - conversation.pause() - return HookResult( - success=False, - decision=HookDecision.ALLOW, - reason=( - f"Agent hook timed out after {hook.timeout} seconds" - " — defaulting to allow" - ), - ) - finally: - pool.shutdown(wait=False) + conversation.run() raw = get_agent_final_response(conversation.state.events) except Exception as e: logger.warning( f"Agent hook sub-conversation failed for event '{event_type}'" f" — defaulting to allow: {e}" ) - return HookResult( - success=False, - decision=HookDecision.ALLOW, - reason="Agent hook execution failed — defaulting to allow", + return self._allow( + "Agent hook execution failed — defaulting to allow", error=str(e), ) finally: if conversation is not None: conversation.close() + return self._parse_decision(raw, event_type) + + _JSON_DECODER = json.JSONDecoder() + + def _extract_first_json_object(self, text: str) -> dict | None: + # Walks the string and asks the JSON decoder to parse from each '{'. + # Handles bare JSON, ```json ... ``` fences, prose prefixes/suffixes, + # and mixed combinations — anything LLMs add around the actual object. + for i, ch in enumerate(text): + if ch != "{": + continue + try: + obj, _ = self._JSON_DECODER.raw_decode(text[i:]) + except json.JSONDecodeError: + continue + if isinstance(obj, dict): + return obj + return None + + def _parse_decision(self, raw: str, event_type: str) -> HookResult: if not raw: logger.warning( f"Agent hook produced no final response for event '{event_type}'" " — defaulting to allow" ) - return HookResult( - success=False, - decision=HookDecision.ALLOW, - reason="Agent hook produced no final response — defaulting to allow", + return self._allow( + "Agent hook produced no final response — defaulting to allow" ) - try: - data = json.loads(raw) - decision_str = str(data.get("decision", "allow")).lower() - reason = str(data.get("reason", "")) - if decision_str == "deny": - return HookResult( - success=True, - blocked=True, - decision=HookDecision.DENY, - reason=reason, - ) - return HookResult( - success=True, - decision=HookDecision.ALLOW, - reason=reason, - ) - except (json.JSONDecodeError, AttributeError): + data = self._extract_first_json_object(raw) + if data is None: logger.warning( - f"Agent hook returned non-JSON response for event '{event_type}'" - f" — defaulting to allow: {repr(raw)[:200]}" + f"Agent hook returned no parseable JSON object for event" + f" '{event_type}' — defaulting to allow: {repr(raw)[:200]}" + ) + return self._allow( + "Agent hook returned no parseable JSON — defaulting to allow", + success=True, ) + + decision_str = str(data.get("decision", "allow")).lower() + reason = str(data.get("reason", "")) + if decision_str == "deny": return HookResult( success=True, - decision=HookDecision.ALLOW, - reason="Agent hook returned non-JSON response — defaulting to allow", + blocked=True, + decision=HookDecision.DENY, + reason=reason, ) + return HookResult( + success=True, + decision=HookDecision.ALLOW, + reason=reason, + ) def execute( self, diff --git a/tests/sdk/hooks/test_executor.py b/tests/sdk/hooks/test_executor.py index 564050f12d..530a8278a7 100644 --- a/tests/sdk/hooks/test_executor.py +++ b/tests/sdk/hooks/test_executor.py @@ -479,7 +479,7 @@ def test_no_llm_defaults_to_allow(self, executor_no_llm, sample_event): hook = HookDefinition(type="agent", system_prompt="Check something") result = executor_no_llm.execute(hook, sample_event) - assert result.success + assert not result.success assert result.decision == HookDecision.ALLOW assert not result.blocked @@ -518,6 +518,88 @@ def test_allow_decision_passes(self, executor, sample_event): assert result.decision == HookDecision.ALLOW assert result.reason == "Looks safe" + def test_markdown_wrapped_deny_is_parsed(self, executor, sample_event): + """LLMs commonly wrap JSON in ```json fences; the executor must still + honour the decision instead of falling open.""" + fenced = ( + '```json\n' + '{"decision": "deny", "reason": "Sensitive file read"}\n' + '```' + ) + + with ( + patch(self._AGENT_PATH), + patch(self._CONV_PATH) as mock_conv_cls, + patch(self._RESPONSE_PATH, return_value=fenced), + ): + mock_conv_cls.return_value = MagicMock() + hook = HookDefinition(type="agent") + result = executor._execute_agent_hook(hook, sample_event) + + assert result.blocked + assert result.decision == HookDecision.DENY + assert result.reason == "Sensitive file read" + + def test_plain_fence_without_language_tag_is_parsed( + self, executor, sample_event + ): + """Some LLMs use ``` without a language tag — still extract the JSON.""" + fenced = '```\n{"decision": "allow", "reason": "ok"}\n```' + + with ( + patch(self._AGENT_PATH), + patch(self._CONV_PATH) as mock_conv_cls, + patch(self._RESPONSE_PATH, return_value=fenced), + ): + mock_conv_cls.return_value = MagicMock() + hook = HookDefinition(type="agent") + result = executor._execute_agent_hook(hook, sample_event) + + assert not result.blocked + assert result.decision == HookDecision.ALLOW + assert result.reason == "ok" + + def test_prose_prefix_before_json_is_parsed(self, executor, sample_event): + """LLMs often explain themselves before emitting JSON; we must still + honour a real deny instead of silently falling open to allow.""" + prose_then_json = ( + "After reviewing the workspace I found REPORT.md is missing.\n\n" + '{"decision": "deny", "reason": "missing deliverable"}' + ) + + with ( + patch(self._AGENT_PATH), + patch(self._CONV_PATH) as mock_conv_cls, + patch(self._RESPONSE_PATH, return_value=prose_then_json), + ): + mock_conv_cls.return_value = MagicMock() + hook = HookDefinition(type="agent") + result = executor._execute_agent_hook(hook, sample_event) + + assert result.blocked + assert result.decision == HookDecision.DENY + assert result.reason == "missing deliverable" + + def test_prose_suffix_after_json_is_parsed(self, executor, sample_event): + """Trailing chatter after the JSON object must not defeat the parser.""" + json_then_prose = ( + '{"decision": "deny", "reason": "sensitive file"}\n\n' + "Let me know if you need more details." + ) + + with ( + patch(self._AGENT_PATH), + patch(self._CONV_PATH) as mock_conv_cls, + patch(self._RESPONSE_PATH, return_value=json_then_prose), + ): + mock_conv_cls.return_value = MagicMock() + hook = HookDefinition(type="agent") + result = executor._execute_agent_hook(hook, sample_event) + + assert result.blocked + assert result.decision == HookDecision.DENY + assert result.reason == "sensitive file" + def test_invalid_json_defaults_to_allow(self, executor, sample_event): """_execute_agent_hook with non-JSON LLM response defaults to allow.""" with ( @@ -561,33 +643,31 @@ def test_empty_response_defaults_to_allow(self, executor, sample_event): assert result.decision == HookDecision.ALLOW assert result.success is False - def test_timeout_enforced(self, executor, sample_event): - """Agent hook respects hook.timeout via ThreadPoolExecutor.""" - import time + def test_timeout_propagated_to_hook_llm(self, executor, sample_event): + """hook.timeout is forwarded to the copied LLM, parent's timeout untouched.""" + parent_timeout = executor.llm.timeout + captured_llm = {} - def slow_run(): - time.sleep(10) + def capture_agent_init(**kwargs): + captured_llm["llm"] = kwargs.get("llm") + return MagicMock() with ( - patch(self._AGENT_PATH), - patch(self._CONV_PATH) as mock_conv_cls, + patch(self._AGENT_PATH, side_effect=capture_agent_init), + patch(self._CONV_PATH, side_effect=RuntimeError("stop early")), ): - mock_conv = MagicMock() - mock_conv.run = slow_run - mock_conv_cls.return_value = mock_conv + hook = HookDefinition(type="agent", timeout=7) + executor._execute_agent_hook(hook, sample_event) - hook = HookDefinition(type="agent", timeout=1) - start = time.time() - result = executor._execute_agent_hook(hook, sample_event) - elapsed = time.time() - start + hook_llm = captured_llm.get("llm") + assert hook_llm is not None + assert hook_llm.timeout == 7 + assert executor.llm.timeout == parent_timeout - assert elapsed < 5 - assert not result.blocked - assert result.decision == HookDecision.ALLOW - assert "timed out" in (result.reason or "") + def test_hook_metrics_under_usage_id(self, executor, sample_event): + """Hook LLM uses per-hook usage_id and shares parent's Metrics object.""" + parent_metrics = executor.llm.metrics - def test_metrics_are_isolated_from_parent_llm(self, executor, sample_event): - """model_copy + reset_metrics ensures hook tokens don't pollute parent LLM.""" captured_llm = {} def capture_agent_init(**kwargs): @@ -604,7 +684,27 @@ def capture_agent_init(**kwargs): hook_llm = captured_llm.get("llm") assert hook_llm is not None assert hook_llm is not executor.llm - assert hook_llm.usage_id == "hook-agent-evaluator" + assert hook_llm.usage_id == "hook-agent:default" + assert hook_llm._metrics is parent_metrics + + def test_hook_usage_id_uses_hook_name(self, executor, sample_event): + """A named hook gets its own usage_id bucket: hook-agent:.""" + captured_llm = {} + + def capture_agent_init(**kwargs): + captured_llm["llm"] = kwargs.get("llm") + return MagicMock() + + with ( + patch(self._AGENT_PATH, side_effect=capture_agent_init), + patch(self._CONV_PATH, side_effect=RuntimeError("stop early")), + ): + hook = HookDefinition(type="agent", name="security-check") + executor._execute_agent_hook(hook, sample_event) + + hook_llm = captured_llm.get("llm") + assert hook_llm is not None + assert hook_llm.usage_id == "hook-agent:security-check" class TestPromptHookNotImplemented: From ba8e3fc5d906f62d1787c529b782d03026bb2ca5 Mon Sep 17 00:00:00 2001 From: "burak.keles" Date: Thu, 21 May 2026 13:32:16 +0300 Subject: [PATCH 05/12] resolve conflict --- openhands-sdk/openhands/sdk/hooks/config.py | 2 ++ tests/sdk/hooks/test_executor.py | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/openhands-sdk/openhands/sdk/hooks/config.py b/openhands-sdk/openhands/sdk/hooks/config.py index 7efe68501d..1a336b6475 100644 --- a/openhands-sdk/openhands/sdk/hooks/config.py +++ b/openhands-sdk/openhands/sdk/hooks/config.py @@ -50,6 +50,7 @@ class HookDefinition(BaseModel): type: HookType = HookType.COMMAND name: str | None = None command: str | None = None + prompt: str | None = None system_prompt: str | None = None tools: list[str] = Field(default_factory=list) timeout: int = 60 @@ -85,6 +86,7 @@ def display_command(self) -> str: return f"agent-hook:{self.system_prompt[:20]}" return "agent-hook:agent" + class HookMatcher(BaseModel): """Matches events to hooks based on patterns. diff --git a/tests/sdk/hooks/test_executor.py b/tests/sdk/hooks/test_executor.py index 530a8278a7..e40ad08052 100644 --- a/tests/sdk/hooks/test_executor.py +++ b/tests/sdk/hooks/test_executor.py @@ -720,7 +720,7 @@ def sample_event(self): def test_prompt_hook_defaults_to_allow(self, executor, sample_event): """Executing a PROMPT hook returns allow instead of crashing.""" - hook = HookDefinition(type="prompt") + hook = HookDefinition(type="prompt", prompt="evaluate this event") result = executor.execute(hook, sample_event) assert result.decision == HookDecision.ALLOW assert result.success is False @@ -728,12 +728,12 @@ def test_prompt_hook_defaults_to_allow(self, executor, sample_event): def test_prompt_hook_does_not_block(self, executor, sample_event): """PROMPT hook must not block the operation while unimplemented.""" - hook = HookDefinition(type="prompt") + hook = HookDefinition(type="prompt", prompt="evaluate this event") result = executor.execute(hook, sample_event) assert result.blocked is False assert result.should_continue is True def test_prompt_hook_without_command_validates(self): """PROMPT hook with no command is valid at config time (future use).""" - hook = HookDefinition(type="prompt") + hook = HookDefinition(type="prompt", prompt="evaluate this event") assert hook.command is None From 67d37356d6b355621ee59314d39c0a10ca22a58c Mon Sep 17 00:00:00 2001 From: "burak.keles" Date: Thu, 21 May 2026 18:01:02 +0300 Subject: [PATCH 06/12] Refactor agent hook execution for improved JSON parsing, error handling, and metrics isolation. Update comments and tests for clarity. --- .../49_agent_hooks/README.md | 2 +- openhands-sdk/openhands/sdk/hooks/executor.py | 51 ++++++++++--------- tests/sdk/hooks/test_executor.py | 20 ++++---- 3 files changed, 37 insertions(+), 36 deletions(-) diff --git a/examples/01_standalone_sdk/49_agent_hooks/README.md b/examples/01_standalone_sdk/49_agent_hooks/README.md index 4a354c1b26..dd3a8e534b 100644 --- a/examples/01_standalone_sdk/49_agent_hooks/README.md +++ b/examples/01_standalone_sdk/49_agent_hooks/README.md @@ -39,7 +39,7 @@ python 49_agent_hooks.py ```python HookDefinition( type=HookType.AGENT, - name="security-reviewer", # bucket for cost metrics (hook-agent:) + name="security-reviewer", # bucket for cost metrics (agent-hook:) system_prompt="...", # instructs the hook agent; must request JSON tools=["file_editor"], # optional tools the hook agent may use # (use registered names, e.g. "file_editor", diff --git a/openhands-sdk/openhands/sdk/hooks/executor.py b/openhands-sdk/openhands/sdk/hooks/executor.py index 1178065272..0d55a07d6f 100644 --- a/openhands-sdk/openhands/sdk/hooks/executor.py +++ b/openhands-sdk/openhands/sdk/hooks/executor.py @@ -1,5 +1,6 @@ """Hook executor - runs shell commands and agent evaluations with JSON I/O.""" +import contextlib import json import logging import os @@ -34,6 +35,11 @@ class HookResult(BaseModel): to ``False`` and the error is logged, but the operation still proceeds. In particular, exit code ``1`` does **not** block — only ``2`` does. Hooks intended to enforce a policy must exit with ``2``. + + For agent / prompt hooks, ``success=True`` means the hook produced a + deliberate verdict (parsed ``allow`` or ``deny``). Fall-open paths set + ``success=False`` with ``error`` populated, so a "we couldn't decide" allow + is detectable as ``decision == ALLOW and not success``. """ success: bool = True @@ -147,6 +153,8 @@ def cleanup_all(self) -> None: class HookExecutor: """Executes hook commands and agent evaluations with JSON I/O.""" + _JSON_DECODER = json.JSONDecoder() + def __init__( self, working_dir: str | None = None, @@ -161,18 +169,17 @@ def __init__( self.persistence_dir = persistence_dir self.visualizer = visualizer - def _allow( + def _fall_open( self, reason: str, *, - success: bool = False, error: str | None = None, ) -> HookResult: return HookResult( - success=success, + success=False, decision=HookDecision.ALLOW, reason=reason, - error=error, + error=error or reason, ) @observe( @@ -203,14 +210,16 @@ def _execute_agent_hook( f"Agent hook has no LLM configured for event '{event_type}'" " — defaulting to allow" ) - return self._allow("No LLM configured for agent hook") + return self._fall_open("No LLM configured for agent hook") hook_llm = self.llm.model_copy( update={ - "usage_id": f"hook-agent:{hook.name or 'default'}", + "usage_id": f"agent-hook:{hook.name or 'default'}", "timeout": hook.timeout, } ) + # Isolate Metrics so hook spend doesn't accrue to the parent's bucket. + hook_llm.reset_metrics() conversation = None try: @@ -241,7 +250,7 @@ def _execute_agent_hook( f"Agent hook sub-conversation failed for event '{event_type}'" f" — defaulting to allow: {e}" ) - return self._allow( + return self._fall_open( "Agent hook execution failed — defaulting to allow", error=str(e), ) @@ -251,21 +260,16 @@ def _execute_agent_hook( return self._parse_decision(raw, event_type) - _JSON_DECODER = json.JSONDecoder() - def _extract_first_json_object(self, text: str) -> dict | None: - # Walks the string and asks the JSON decoder to parse from each '{'. - # Handles bare JSON, ```json ... ``` fences, prose prefixes/suffixes, - # and mixed combinations — anything LLMs add around the actual object. + # Scan for the first decodable JSON object so prose / ```json fences + # around the payload don't defeat parsing. for i, ch in enumerate(text): if ch != "{": continue - try: + with contextlib.suppress(json.JSONDecodeError): obj, _ = self._JSON_DECODER.raw_decode(text[i:]) - except json.JSONDecodeError: - continue - if isinstance(obj, dict): - return obj + if isinstance(obj, dict): + return obj return None def _parse_decision(self, raw: str, event_type: str) -> HookResult: @@ -274,7 +278,7 @@ def _parse_decision(self, raw: str, event_type: str) -> HookResult: f"Agent hook produced no final response for event '{event_type}'" " — defaulting to allow" ) - return self._allow( + return self._fall_open( "Agent hook produced no final response — defaulting to allow" ) @@ -284,9 +288,8 @@ def _parse_decision(self, raw: str, event_type: str) -> HookResult: f"Agent hook returned no parseable JSON object for event" f" '{event_type}' — defaulting to allow: {repr(raw)[:200]}" ) - return self._allow( - "Agent hook returned no parseable JSON — defaulting to allow", - success=True, + return self._fall_open( + "Agent hook returned no parseable JSON — defaulting to allow" ) decision_str = str(data.get("decision", "allow")).lower() @@ -323,10 +326,8 @@ def execute( f"PROMPT hooks are not yet implemented — defaulting to allow" f" (event_type={event_type})" ) - return HookResult( - success=False, - decision=HookDecision.ALLOW, - reason="PROMPT hooks are not yet implemented — defaulting to allow", + return self._fall_open( + "PROMPT hooks are not yet implemented — defaulting to allow" ) # Prepare environment diff --git a/tests/sdk/hooks/test_executor.py b/tests/sdk/hooks/test_executor.py index e40ad08052..4dbd029d76 100644 --- a/tests/sdk/hooks/test_executor.py +++ b/tests/sdk/hooks/test_executor.py @@ -519,8 +519,7 @@ def test_allow_decision_passes(self, executor, sample_event): assert result.reason == "Looks safe" def test_markdown_wrapped_deny_is_parsed(self, executor, sample_event): - """LLMs commonly wrap JSON in ```json fences; the executor must still - honour the decision instead of falling open.""" + """```json fenced JSON is honoured, not treated as non-JSON.""" fenced = ( '```json\n' '{"decision": "deny", "reason": "Sensitive file read"}\n' @@ -560,8 +559,7 @@ def test_plain_fence_without_language_tag_is_parsed( assert result.reason == "ok" def test_prose_prefix_before_json_is_parsed(self, executor, sample_event): - """LLMs often explain themselves before emitting JSON; we must still - honour a real deny instead of silently falling open to allow.""" + """Prose before the JSON object must not defeat the parser.""" prose_then_json = ( "After reviewing the workspace I found REPORT.md is missing.\n\n" '{"decision": "deny", "reason": "missing deliverable"}' @@ -601,7 +599,7 @@ def test_prose_suffix_after_json_is_parsed(self, executor, sample_event): assert result.reason == "sensitive file" def test_invalid_json_defaults_to_allow(self, executor, sample_event): - """_execute_agent_hook with non-JSON LLM response defaults to allow.""" + """Non-JSON response falls open: ALLOW + success=False + error set.""" with ( patch(self._AGENT_PATH), patch(self._CONV_PATH) as mock_conv_cls, @@ -614,6 +612,8 @@ def test_invalid_json_defaults_to_allow(self, executor, sample_event): assert not result.blocked assert result.decision == HookDecision.ALLOW assert result.should_continue + assert result.success is False + assert result.error is not None def test_sub_conversation_failure_defaults_to_allow(self, executor, sample_event): """_execute_agent_hook when sub-conversation raises defaults to allow.""" @@ -665,7 +665,7 @@ def capture_agent_init(**kwargs): assert executor.llm.timeout == parent_timeout def test_hook_metrics_under_usage_id(self, executor, sample_event): - """Hook LLM uses per-hook usage_id and shares parent's Metrics object.""" + """Hook LLM uses per-hook usage_id and an isolated Metrics object.""" parent_metrics = executor.llm.metrics captured_llm = {} @@ -684,11 +684,11 @@ def capture_agent_init(**kwargs): hook_llm = captured_llm.get("llm") assert hook_llm is not None assert hook_llm is not executor.llm - assert hook_llm.usage_id == "hook-agent:default" - assert hook_llm._metrics is parent_metrics + assert hook_llm.usage_id == "agent-hook:default" + assert hook_llm.metrics is not parent_metrics def test_hook_usage_id_uses_hook_name(self, executor, sample_event): - """A named hook gets its own usage_id bucket: hook-agent:.""" + """A named hook gets its own usage_id bucket: agent-hook:.""" captured_llm = {} def capture_agent_init(**kwargs): @@ -704,7 +704,7 @@ def capture_agent_init(**kwargs): hook_llm = captured_llm.get("llm") assert hook_llm is not None - assert hook_llm.usage_id == "hook-agent:security-check" + assert hook_llm.usage_id == "agent-hook:security-check" class TestPromptHookNotImplemented: From 60e9fa11355fa4853607939cd631b58465b6bdac Mon Sep 17 00:00:00 2001 From: VascoSch92 Date: Fri, 22 May 2026 17:16:00 +0200 Subject: [PATCH 07/12] fix: address hook pre-commit failures --- .../51_agent_hooks.py} | 6 +- .../README.md | 4 +- openhands-sdk/openhands/sdk/hooks/config.py | 3 +- .../openhands/sdk/hooks/conversation_hooks.py | 7 ++- openhands-sdk/openhands/sdk/hooks/executor.py | 18 ++++-- openhands-sdk/openhands/sdk/hooks/manager.py | 4 +- tests/sdk/hooks/test_config.py | 55 ++++++++++++++++--- tests/sdk/hooks/test_executor.py | 55 ++++++++++--------- 8 files changed, 102 insertions(+), 50 deletions(-) rename examples/01_standalone_sdk/{49_agent_hooks/49_agent_hooks.py => 51_agent_hooks/51_agent_hooks.py} (97%) rename examples/01_standalone_sdk/{49_agent_hooks => 51_agent_hooks}/README.md (96%) diff --git a/examples/01_standalone_sdk/49_agent_hooks/49_agent_hooks.py b/examples/01_standalone_sdk/51_agent_hooks/51_agent_hooks.py similarity index 97% rename from examples/01_standalone_sdk/49_agent_hooks/49_agent_hooks.py rename to examples/01_standalone_sdk/51_agent_hooks/51_agent_hooks.py index 7dbb5e6ef4..754aac11b2 100644 --- a/examples/01_standalone_sdk/49_agent_hooks/49_agent_hooks.py +++ b/examples/01_standalone_sdk/51_agent_hooks/51_agent_hooks.py @@ -145,9 +145,7 @@ def hook_logger(event) -> None: print("\n" + "=" * 60) print("Demo 2: Stop — deny until deliverable exists") print("=" * 60) - print( - "Quality reviewer denies until REPORT.md exists with a bullet point.\n" - ) + print("Quality reviewer denies until REPORT.md exists with a bullet point.\n") conversation.send_message( "Write REPORT.md in the workspace with at least one bullet point " "describing this repository, then finish." @@ -163,4 +161,4 @@ def hook_logger(event) -> None: print("=" * 60) cost = conversation.conversation_stats.get_combined_metrics().accumulated_cost - print(f"\nEXAMPLE_COST: {cost}") \ No newline at end of file + print(f"\nEXAMPLE_COST: {cost}") diff --git a/examples/01_standalone_sdk/49_agent_hooks/README.md b/examples/01_standalone_sdk/51_agent_hooks/README.md similarity index 96% rename from examples/01_standalone_sdk/49_agent_hooks/README.md rename to examples/01_standalone_sdk/51_agent_hooks/README.md index dd3a8e534b..efdc245114 100644 --- a/examples/01_standalone_sdk/49_agent_hooks/README.md +++ b/examples/01_standalone_sdk/51_agent_hooks/README.md @@ -15,7 +15,7 @@ exact tool name used. ## Example -- **49_agent_hooks.py** — Two agent hooks wired onto a single conversation: +- **51_agent_hooks.py** — Two agent hooks wired onto a single conversation: - **PreToolUse** "security reviewer" denies a command whose intent is to read `/etc/passwd`, even though no obvious keyword appears in a blacklist. - **Stop** "quality reviewer" refuses to let the main agent finish until @@ -31,7 +31,7 @@ export LLM_API_KEY="your-key" export LLM_MODEL="anthropic/claude-sonnet-4-5-20250929" # optional export LLM_BASE_URL="https://your-endpoint" # optional -python 49_agent_hooks.py +python 51_agent_hooks.py ``` ## How an agent hook is configured diff --git a/openhands-sdk/openhands/sdk/hooks/config.py b/openhands-sdk/openhands/sdk/hooks/config.py index 1a336b6475..8c48fb6ef8 100644 --- a/openhands-sdk/openhands/sdk/hooks/config.py +++ b/openhands-sdk/openhands/sdk/hooks/config.py @@ -69,7 +69,8 @@ def _validate_type_fields(self) -> "HookDefinition": raise ValueError("'prompt' is required when type is 'prompt'") if self.type == HookType.AGENT and self.command is not None: raise ValueError( - "'command' must not be set when type is 'agent'; use 'system_prompt' instead" + "'command' must not be set when type is 'agent'; " + "use 'system_prompt' instead" ) if self.type == HookType.AGENT and self.async_: raise ValueError("'async' is not supported for agent hooks") diff --git a/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py b/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py index fa359e4afd..3664c22e43 100644 --- a/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py +++ b/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py @@ -3,6 +3,7 @@ from collections.abc import Callable from typing import TYPE_CHECKING, Any +from openhands.sdk.conversation.visualizer import ConversationVisualizerBase from openhands.sdk.event import ( ActionEvent, Event, @@ -18,8 +19,6 @@ from openhands.sdk.logger import get_logger -from openhands.sdk.conversation.visualizer import ConversationVisualizerBase - if TYPE_CHECKING: from openhands.sdk.conversation.state import ConversationState from openhands.sdk.llm import LLM @@ -394,7 +393,9 @@ def create_hook_callback( emit_hook_events: bool = True, llm: "LLM | None" = None, persistence_dir: str | None = None, - visualizer: type[ConversationVisualizerBase] | ConversationVisualizerBase | None = None, + visualizer: type[ConversationVisualizerBase] + | ConversationVisualizerBase + | None = None, ) -> tuple[HookEventProcessor, Any]: """Create a hook-enabled event callback. Returns (processor, callback). diff --git a/openhands-sdk/openhands/sdk/hooks/executor.py b/openhands-sdk/openhands/sdk/hooks/executor.py index 0d55a07d6f..6ee33844d3 100644 --- a/openhands-sdk/openhands/sdk/hooks/executor.py +++ b/openhands-sdk/openhands/sdk/hooks/executor.py @@ -161,7 +161,9 @@ def __init__( async_process_manager: AsyncProcessManager | None = None, llm: "LLM | None" = None, persistence_dir: str | None = None, - visualizer: type[ConversationVisualizerBase] | ConversationVisualizerBase | None = None, + visualizer: type[ConversationVisualizerBase] + | ConversationVisualizerBase + | None = None, ): self.working_dir = working_dir or os.getcwd() self.async_process_manager = async_process_manager or AsyncProcessManager() @@ -347,6 +349,14 @@ def execute( # Cleanup expired async processes before starting new ones self.async_process_manager.cleanup_expired() + command = hook.command + if command is None: + return HookResult( + success=False, + exit_code=-1, + error="'command' is required when type is 'command'", + ) + # Handle async hooks: fire and forget if hook.async_: try: @@ -357,7 +367,7 @@ def execute( start_new_session = False process = subprocess.Popen( - hook.command, + command, shell=True, cwd=self.working_dir, env=hook_env, @@ -378,7 +388,7 @@ def execute( # Track for cleanup self.async_process_manager.add_process(process, hook.timeout) - logger.debug(f"Started async hook (PID {process.pid}): {hook.command}") + logger.debug(f"Started async hook (PID {process.pid}): {command}") # Return placeholder success result return HookResult( @@ -396,7 +406,7 @@ def execute( try: # Execute the hook command synchronously result = subprocess.run( - hook.command, + command, shell=True, cwd=self.working_dir, env=hook_env, diff --git a/openhands-sdk/openhands/sdk/hooks/manager.py b/openhands-sdk/openhands/sdk/hooks/manager.py index a3c725e9a1..788f20ad16 100644 --- a/openhands-sdk/openhands/sdk/hooks/manager.py +++ b/openhands-sdk/openhands/sdk/hooks/manager.py @@ -26,7 +26,9 @@ def __init__( session_id: str | None = None, llm: "LLM | None" = None, persistence_dir: str | None = None, - visualizer: type[ConversationVisualizerBase] | ConversationVisualizerBase | None = None, + visualizer: type[ConversationVisualizerBase] + | ConversationVisualizerBase + | None = None, ): self.config = config or HookConfig.load(working_dir=working_dir) self.executor = HookExecutor( diff --git a/tests/sdk/hooks/test_config.py b/tests/sdk/hooks/test_config.py index 255283e010..4e1e730346 100644 --- a/tests/sdk/hooks/test_config.py +++ b/tests/sdk/hooks/test_config.py @@ -12,7 +12,7 @@ def test_command_hook_requires_command(): with pytest.raises(Exception, match="'command' is required"): - HookDefinition(type="command") + HookDefinition(type=HookType.COMMAND) def test_command_hook_valid(): @@ -38,12 +38,28 @@ def test_agent_hook_valid(kwargs): "kwargs,expected", [ ({"command": "block.sh"}, "block.sh"), - ({"type": "agent", "name": "block-deletions", "system_prompt": "Block rm -rf"}, "agent-hook:block-deletions"), - ({"type": "agent", "system_prompt": "Block network calls to external IPs"}, "agent-hook:Block network calls "), + ( + { + "type": "agent", + "name": "block-deletions", + "system_prompt": "Block rm -rf", + }, + "agent-hook:block-deletions", + ), + ( + {"type": "agent", "system_prompt": "Block network calls to external IPs"}, + "agent-hook:Block network calls ", + ), ({"type": "agent", "system_prompt": "A" * 100}, f"agent-hook:{'A' * 20}"), ({"type": "agent"}, "agent-hook:agent"), ], - ids=["command", "agent-named", "agent-prompt-prefix", "agent-prompt-truncated", "agent-fallback"], + ids=[ + "command", + "agent-named", + "agent-prompt-prefix", + "agent-prompt-truncated", + "agent-fallback", + ], ) def test_display_command(kwargs, expected): h = HookDefinition(**kwargs) @@ -52,9 +68,19 @@ def test_display_command(kwargs, expected): def test_multiple_agent_hooks_are_distinguishable(): hooks = [ - HookDefinition(type="agent", name="block-deletions", system_prompt="Block rm -rf"), - HookDefinition(type="agent", system_prompt="Block network calls to external IPs"), - HookDefinition(type="agent", system_prompt="Verify all tasks are complete"), + HookDefinition( + type=HookType.AGENT, + name="block-deletions", + system_prompt="Block rm -rf", + ), + HookDefinition( + type=HookType.AGENT, + system_prompt="Block network calls to external IPs", + ), + HookDefinition( + type=HookType.AGENT, + system_prompt="Verify all tasks are complete", + ), ] assert len({h.display_command for h in hooks}) == 3 @@ -62,7 +88,10 @@ def test_multiple_agent_hooks_are_distinguishable(): @pytest.mark.parametrize( "kwargs,match", [ - ({"type": "agent", "command": "echo hi"}, "'command' must not be set when type is 'agent'"), + ( + {"type": "agent", "command": "echo hi"}, + "'command' must not be set when type is 'agent'", + ), ({"type": "command"}, "'command' is required"), ], ids=["agent-rejects-command", "command-requires-command"], @@ -80,7 +109,15 @@ def test_agent_hook_rejects_async(): def test_agent_hook_from_json(): data = { "stop": [ - {"hooks": [{"type": "agent", "system_prompt": "Verify all tasks are done", "timeout": 30}]} + { + "hooks": [ + { + "type": "agent", + "system_prompt": "Verify all tasks are done", + "timeout": 30, + } + ] + } ] } config = HookConfig.from_dict(data) diff --git a/tests/sdk/hooks/test_executor.py b/tests/sdk/hooks/test_executor.py index 4dbd029d76..956c2dd148 100644 --- a/tests/sdk/hooks/test_executor.py +++ b/tests/sdk/hooks/test_executor.py @@ -8,7 +8,7 @@ import pytest from pydantic import SecretStr -from openhands.sdk.hooks.config import HookDefinition +from openhands.sdk.hooks.config import HookDefinition, HookType from openhands.sdk.hooks.executor import HookExecutor from openhands.sdk.hooks.types import HookDecision, HookEvent, HookEventType from openhands.sdk.llm import LLM @@ -459,7 +459,10 @@ def test_execute_dispatches_to_agent_hook(self, executor, sample_event): """execute() routes AGENT type to _execute_agent_hook, not subprocess.""" from openhands.sdk.hooks.executor import HookResult - hook = HookDefinition(type="agent", system_prompt="Verify task completion") + hook = HookDefinition( + type=HookType.AGENT, + system_prompt="Verify task completion", + ) with patch.object( executor, @@ -476,7 +479,10 @@ def test_execute_dispatches_to_agent_hook(self, executor, sample_event): def test_no_llm_defaults_to_allow(self, executor_no_llm, sample_event): """Agent hook with no LLM configured defaults to allow without error.""" - hook = HookDefinition(type="agent", system_prompt="Check something") + hook = HookDefinition( + type=HookType.AGENT, + system_prompt="Check something", + ) result = executor_no_llm.execute(hook, sample_event) assert not result.success @@ -493,7 +499,10 @@ def test_deny_decision_blocks(self, executor, sample_event): patch(self._RESPONSE_PATH, return_value=deny_json), ): mock_conv_cls.return_value = MagicMock() - hook = HookDefinition(type="agent", system_prompt="Check tasks") + hook = HookDefinition( + type=HookType.AGENT, + system_prompt="Check tasks", + ) result = executor._execute_agent_hook(hook, sample_event) assert result.blocked @@ -511,7 +520,7 @@ def test_allow_decision_passes(self, executor, sample_event): patch(self._RESPONSE_PATH, return_value=allow_json), ): mock_conv_cls.return_value = MagicMock() - hook = HookDefinition(type="agent") + hook = HookDefinition(type=HookType.AGENT) result = executor._execute_agent_hook(hook, sample_event) assert not result.blocked @@ -520,11 +529,7 @@ def test_allow_decision_passes(self, executor, sample_event): def test_markdown_wrapped_deny_is_parsed(self, executor, sample_event): """```json fenced JSON is honoured, not treated as non-JSON.""" - fenced = ( - '```json\n' - '{"decision": "deny", "reason": "Sensitive file read"}\n' - '```' - ) + fenced = '```json\n{"decision": "deny", "reason": "Sensitive file read"}\n```' with ( patch(self._AGENT_PATH), @@ -532,16 +537,14 @@ def test_markdown_wrapped_deny_is_parsed(self, executor, sample_event): patch(self._RESPONSE_PATH, return_value=fenced), ): mock_conv_cls.return_value = MagicMock() - hook = HookDefinition(type="agent") + hook = HookDefinition(type=HookType.AGENT) result = executor._execute_agent_hook(hook, sample_event) assert result.blocked assert result.decision == HookDecision.DENY assert result.reason == "Sensitive file read" - def test_plain_fence_without_language_tag_is_parsed( - self, executor, sample_event - ): + def test_plain_fence_without_language_tag_is_parsed(self, executor, sample_event): """Some LLMs use ``` without a language tag — still extract the JSON.""" fenced = '```\n{"decision": "allow", "reason": "ok"}\n```' @@ -551,7 +554,7 @@ def test_plain_fence_without_language_tag_is_parsed( patch(self._RESPONSE_PATH, return_value=fenced), ): mock_conv_cls.return_value = MagicMock() - hook = HookDefinition(type="agent") + hook = HookDefinition(type=HookType.AGENT) result = executor._execute_agent_hook(hook, sample_event) assert not result.blocked @@ -571,7 +574,7 @@ def test_prose_prefix_before_json_is_parsed(self, executor, sample_event): patch(self._RESPONSE_PATH, return_value=prose_then_json), ): mock_conv_cls.return_value = MagicMock() - hook = HookDefinition(type="agent") + hook = HookDefinition(type=HookType.AGENT) result = executor._execute_agent_hook(hook, sample_event) assert result.blocked @@ -591,7 +594,7 @@ def test_prose_suffix_after_json_is_parsed(self, executor, sample_event): patch(self._RESPONSE_PATH, return_value=json_then_prose), ): mock_conv_cls.return_value = MagicMock() - hook = HookDefinition(type="agent") + hook = HookDefinition(type=HookType.AGENT) result = executor._execute_agent_hook(hook, sample_event) assert result.blocked @@ -606,7 +609,7 @@ def test_invalid_json_defaults_to_allow(self, executor, sample_event): patch(self._RESPONSE_PATH, return_value="I think you should allow this."), ): mock_conv_cls.return_value = MagicMock() - hook = HookDefinition(type="agent") + hook = HookDefinition(type=HookType.AGENT) result = executor._execute_agent_hook(hook, sample_event) assert not result.blocked @@ -621,7 +624,7 @@ def test_sub_conversation_failure_defaults_to_allow(self, executor, sample_event patch(self._AGENT_PATH), patch(self._CONV_PATH, side_effect=RuntimeError("workspace error")), ): - hook = HookDefinition(type="agent") + hook = HookDefinition(type=HookType.AGENT) result = executor._execute_agent_hook(hook, sample_event) assert not result.blocked @@ -636,7 +639,7 @@ def test_empty_response_defaults_to_allow(self, executor, sample_event): patch(self._RESPONSE_PATH, return_value=""), ): mock_conv_cls.return_value = MagicMock() - hook = HookDefinition(type="agent") + hook = HookDefinition(type=HookType.AGENT) result = executor._execute_agent_hook(hook, sample_event) assert not result.blocked @@ -656,7 +659,7 @@ def capture_agent_init(**kwargs): patch(self._AGENT_PATH, side_effect=capture_agent_init), patch(self._CONV_PATH, side_effect=RuntimeError("stop early")), ): - hook = HookDefinition(type="agent", timeout=7) + hook = HookDefinition(type=HookType.AGENT, timeout=7) executor._execute_agent_hook(hook, sample_event) hook_llm = captured_llm.get("llm") @@ -678,7 +681,7 @@ def capture_agent_init(**kwargs): patch(self._AGENT_PATH, side_effect=capture_agent_init), patch(self._CONV_PATH, side_effect=RuntimeError("stop early")), ): - hook = HookDefinition(type="agent") + hook = HookDefinition(type=HookType.AGENT) executor._execute_agent_hook(hook, sample_event) hook_llm = captured_llm.get("llm") @@ -699,7 +702,7 @@ def capture_agent_init(**kwargs): patch(self._AGENT_PATH, side_effect=capture_agent_init), patch(self._CONV_PATH, side_effect=RuntimeError("stop early")), ): - hook = HookDefinition(type="agent", name="security-check") + hook = HookDefinition(type=HookType.AGENT, name="security-check") executor._execute_agent_hook(hook, sample_event) hook_llm = captured_llm.get("llm") @@ -720,7 +723,7 @@ def sample_event(self): def test_prompt_hook_defaults_to_allow(self, executor, sample_event): """Executing a PROMPT hook returns allow instead of crashing.""" - hook = HookDefinition(type="prompt", prompt="evaluate this event") + hook = HookDefinition(type=HookType.PROMPT, prompt="evaluate this event") result = executor.execute(hook, sample_event) assert result.decision == HookDecision.ALLOW assert result.success is False @@ -728,12 +731,12 @@ def test_prompt_hook_defaults_to_allow(self, executor, sample_event): def test_prompt_hook_does_not_block(self, executor, sample_event): """PROMPT hook must not block the operation while unimplemented.""" - hook = HookDefinition(type="prompt", prompt="evaluate this event") + hook = HookDefinition(type=HookType.PROMPT, prompt="evaluate this event") result = executor.execute(hook, sample_event) assert result.blocked is False assert result.should_continue is True def test_prompt_hook_without_command_validates(self): """PROMPT hook with no command is valid at config time (future use).""" - hook = HookDefinition(type="prompt", prompt="evaluate this event") + hook = HookDefinition(type=HookType.PROMPT, prompt="evaluate this event") assert hook.command is None From 823b460b1ea0873548f5e1a3a3bb8be5dbd420ba Mon Sep 17 00:00:00 2001 From: VascoSch92 Date: Fri, 22 May 2026 17:40:36 +0200 Subject: [PATCH 08/12] fix: include agent hook metrics in parent stats --- .../conversation/impl/local_conversation.py | 1 + .../openhands/sdk/hooks/conversation_hooks.py | 4 ++ openhands-sdk/openhands/sdk/hooks/executor.py | 18 +++++++ openhands-sdk/openhands/sdk/hooks/manager.py | 3 ++ tests/sdk/hooks/test_executor.py | 47 +++++++++++++++++++ 5 files changed, 73 insertions(+) diff --git a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py index 047acc8e16..4b07188f41 100644 --- a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py +++ b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py @@ -568,6 +568,7 @@ def _ensure_plugins_loaded(self) -> None: llm=self.agent.llm, persistence_dir=self._state.persistence_dir, visualizer=self._visualizer, + conversation_stats=self._state.stats, ) self._hook_processor.set_conversation_state(self._state) self._hook_processor.run_session_start() diff --git a/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py b/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py index 3664c22e43..91f5197ec3 100644 --- a/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py +++ b/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py @@ -20,6 +20,7 @@ if TYPE_CHECKING: + from openhands.sdk.conversation.conversation_stats import ConversationStats from openhands.sdk.conversation.state import ConversationState from openhands.sdk.llm import LLM @@ -396,6 +397,7 @@ def create_hook_callback( visualizer: type[ConversationVisualizerBase] | ConversationVisualizerBase | None = None, + conversation_stats: "ConversationStats | None" = None, ) -> tuple[HookEventProcessor, Any]: """Create a hook-enabled event callback. Returns (processor, callback). @@ -409,6 +411,7 @@ def create_hook_callback( llm: LLM instance inherited from the parent conversation, used by agent hooks. persistence_dir: Directory used to persist agent hook sub-conversation events. visualizer: Visualizer instance passed to agent hook sub-conversations. + conversation_stats: Parent conversation stats that should include hook spend. Returns: Tuple of (HookEventProcessor, callback function). @@ -420,6 +423,7 @@ def create_hook_callback( llm=llm, persistence_dir=persistence_dir, visualizer=visualizer, + conversation_stats=conversation_stats, ) processor = HookEventProcessor( diff --git a/openhands-sdk/openhands/sdk/hooks/executor.py b/openhands-sdk/openhands/sdk/hooks/executor.py index 6ee33844d3..13ab2e9aa9 100644 --- a/openhands-sdk/openhands/sdk/hooks/executor.py +++ b/openhands-sdk/openhands/sdk/hooks/executor.py @@ -19,6 +19,8 @@ if TYPE_CHECKING: + from openhands.sdk.conversation.base import BaseConversation + from openhands.sdk.conversation.conversation_stats import ConversationStats from openhands.sdk.llm import LLM @@ -164,12 +166,14 @@ def __init__( visualizer: type[ConversationVisualizerBase] | ConversationVisualizerBase | None = None, + conversation_stats: "ConversationStats | None" = None, ): self.working_dir = working_dir or os.getcwd() self.async_process_manager = async_process_manager or AsyncProcessManager() self.llm = llm self.persistence_dir = persistence_dir self.visualizer = visualizer + self.conversation_stats = conversation_stats def _fall_open( self, @@ -258,6 +262,7 @@ def _execute_agent_hook( ) finally: if conversation is not None: + self._merge_hook_conversation_stats(conversation) conversation.close() return self._parse_decision(raw, event_type) @@ -309,6 +314,19 @@ def _parse_decision(self, raw: str, event_type: str) -> HookResult: reason=reason, ) + def _merge_hook_conversation_stats(self, conversation: "BaseConversation") -> None: + if self.conversation_stats is None: + return + + child_stats = conversation.conversation_stats + for usage_id, metrics in child_stats.usage_to_metrics.items(): + if usage_id in self.conversation_stats.usage_to_metrics: + existing = self.conversation_stats.usage_to_metrics[usage_id] + if existing is not metrics: + existing.merge(metrics) + else: + self.conversation_stats.usage_to_metrics[usage_id] = metrics.deep_copy() + def execute( self, hook: HookDefinition, diff --git a/openhands-sdk/openhands/sdk/hooks/manager.py b/openhands-sdk/openhands/sdk/hooks/manager.py index 788f20ad16..aac309a487 100644 --- a/openhands-sdk/openhands/sdk/hooks/manager.py +++ b/openhands-sdk/openhands/sdk/hooks/manager.py @@ -10,6 +10,7 @@ if TYPE_CHECKING: + from openhands.sdk.conversation.conversation_stats import ConversationStats from openhands.sdk.llm import LLM @@ -29,6 +30,7 @@ def __init__( visualizer: type[ConversationVisualizerBase] | ConversationVisualizerBase | None = None, + conversation_stats: "ConversationStats | None" = None, ): self.config = config or HookConfig.load(working_dir=working_dir) self.executor = HookExecutor( @@ -36,6 +38,7 @@ def __init__( llm=llm, persistence_dir=persistence_dir, visualizer=visualizer, + conversation_stats=conversation_stats, ) self.session_id = session_id self.working_dir = working_dir diff --git a/tests/sdk/hooks/test_executor.py b/tests/sdk/hooks/test_executor.py index 956c2dd148..86e343c557 100644 --- a/tests/sdk/hooks/test_executor.py +++ b/tests/sdk/hooks/test_executor.py @@ -8,10 +8,12 @@ import pytest from pydantic import SecretStr +from openhands.sdk.conversation.conversation_stats import ConversationStats from openhands.sdk.hooks.config import HookDefinition, HookType from openhands.sdk.hooks.executor import HookExecutor from openhands.sdk.hooks.types import HookDecision, HookEvent, HookEventType from openhands.sdk.llm import LLM +from openhands.sdk.llm.utils.metrics import Metrics from tests.command_utils import python_command @@ -690,6 +692,51 @@ def capture_agent_init(**kwargs): assert hook_llm.usage_id == "agent-hook:default" assert hook_llm.metrics is not parent_metrics + def test_hook_metrics_are_merged_into_parent_stats( + self, tmp_path, mock_llm, sample_event + ): + """Child agent-hook spend is included in parent conversation stats.""" + parent_stats = ConversationStats() + existing_hook_metrics = Metrics(model_name="gpt-4o") + existing_hook_metrics.add_cost(0.25) + parent_stats.usage_to_metrics["agent-hook:security-check"] = ( + existing_hook_metrics + ) + + child_hook_metrics = Metrics(model_name="gpt-4o") + child_hook_metrics.add_cost(0.75) + child_stats = ConversationStats( + usage_to_metrics={"agent-hook:security-check": child_hook_metrics} + ) + mock_conversation = MagicMock() + mock_conversation.conversation_stats = child_stats + mock_conversation.state.events = [] + + executor = HookExecutor( + working_dir=str(tmp_path), + llm=mock_llm, + conversation_stats=parent_stats, + ) + + with ( + patch(self._AGENT_PATH), + patch(self._CONV_PATH, return_value=mock_conversation), + patch( + self._RESPONSE_PATH, + return_value='{"decision": "allow", "reason": "ok"}', + ), + ): + hook = HookDefinition(type=HookType.AGENT, name="security-check") + result = executor._execute_agent_hook(hook, sample_event) + + assert result.decision == HookDecision.ALLOW + assert parent_stats.usage_to_metrics[ + "agent-hook:security-check" + ].accumulated_cost == pytest.approx(1.0) + assert parent_stats.get_combined_metrics().accumulated_cost == pytest.approx( + 1.0 + ) + def test_hook_usage_id_uses_hook_name(self, executor, sample_event): """A named hook gets its own usage_id bucket: agent-hook:.""" captured_llm = {} From 95895c8c94da8964d2719205ec8918b39f3948bf Mon Sep 17 00:00:00 2001 From: VascoSch92 Date: Fri, 22 May 2026 17:52:27 +0200 Subject: [PATCH 09/12] fix: persist agent hooks under conversation base --- .../conversation/impl/local_conversation.py | 7 +++- .../test_local_conversation_plugins.py | 40 +++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py index 4b07188f41..b798a8b51c 100644 --- a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py +++ b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py @@ -559,6 +559,11 @@ def _ensure_plugins_loaded(self) -> None: if final_hook_config is not None: # Store final hook_config in state for observability self._state.hook_config = final_hook_config + hook_persistence_dir = ( + str(Path(self._state.persistence_dir).parent) + if self._state.persistence_dir is not None + else None + ) self._hook_processor, self._on_event = create_hook_callback( hook_config=final_hook_config, @@ -566,7 +571,7 @@ def _ensure_plugins_loaded(self) -> None: session_id=str(self._state.id), original_callback=self._base_callback, llm=self.agent.llm, - persistence_dir=self._state.persistence_dir, + persistence_dir=hook_persistence_dir, visualizer=self._visualizer, conversation_stats=self._state.stats, ) diff --git a/tests/sdk/conversation/test_local_conversation_plugins.py b/tests/sdk/conversation/test_local_conversation_plugins.py index c58a74c6b9..cfa7b066de 100644 --- a/tests/sdk/conversation/test_local_conversation_plugins.py +++ b/tests/sdk/conversation/test_local_conversation_plugins.py @@ -2,6 +2,7 @@ import json from pathlib import Path +from unittest.mock import MagicMock, patch import pytest from pydantic import SecretStr @@ -188,6 +189,45 @@ def test_plugin_hooks_combined_with_explicit_hooks( # (The actual hook_processor is internal, but we trust the merging works) conversation.close() + def test_hook_sub_conversations_receive_persistence_base_dir( + self, tmp_path: Path, basic_agent + ): + """Agent hook persistence should not nest under the parent conversation id.""" + workspace = tmp_path / "workspace" + workspace.mkdir() + persistence_base = tmp_path / "state" + hook_config = HookConfig( + pre_tool_use=[ + HookMatcher(matcher="*", hooks=[HookDefinition(command="echo test")]) + ] + ) + + processor = MagicMock() + processor.on_event = MagicMock() + processor.set_conversation_state = MagicMock() + processor.run_session_start = MagicMock() + + conversation = LocalConversation( + agent=basic_agent, + workspace=workspace, + persistence_dir=persistence_base, + hook_config=hook_config, + visualizer=None, + ) + + with patch( + "openhands.sdk.conversation.impl.local_conversation.create_hook_callback", + return_value=(processor, processor.on_event), + ) as mock_create_hook_callback: + conversation._ensure_plugins_loaded() + + assert conversation.state.persistence_dir is not None + assert Path(conversation.state.persistence_dir).parent == persistence_base + assert mock_create_hook_callback.call_args.kwargs["persistence_dir"] == str( + persistence_base + ) + conversation.close() + def test_plugins_not_loaded_until_needed(self, tmp_path: Path, basic_agent): """Test that plugins are not loaded in constructor (lazy loading).""" plugin_dir = create_test_plugin( From f86fad42c676ba10441d50f97908c54f6cf1bf27 Mon Sep 17 00:00:00 2001 From: VascoSch92 Date: Sun, 24 May 2026 23:45:43 +0200 Subject: [PATCH 10/12] fix(hooks): address review on agent hook visualizer, decisions, example - Fall open (success=False) on missing/unknown agent decision instead of silently allowing as if it were a deliberate verdict - Request a fresh sub-visualizer for hook sub-conversations rather than handing over the parent's initialized visualizer instance, which LocalConversation.initialize() would rebind to the hook's child state - Split the example into one conversation per demo so the Stop quality gate isn't active during the PreToolUse demo - Rename the example to main.py and register its directory in the example discovery so the example workflow exercises it --- .../51_agent_hooks/README.md | 4 +- .../{51_agent_hooks.py => main.py} | 108 ++++++++++-------- openhands-sdk/openhands/sdk/hooks/executor.py | 34 +++++- tests/examples/test_examples.py | 4 + tests/sdk/hooks/test_executor.py | 70 ++++++++++++ 5 files changed, 167 insertions(+), 53 deletions(-) rename examples/01_standalone_sdk/51_agent_hooks/{51_agent_hooks.py => main.py} (69%) diff --git a/examples/01_standalone_sdk/51_agent_hooks/README.md b/examples/01_standalone_sdk/51_agent_hooks/README.md index efdc245114..a3d6cd6d97 100644 --- a/examples/01_standalone_sdk/51_agent_hooks/README.md +++ b/examples/01_standalone_sdk/51_agent_hooks/README.md @@ -15,7 +15,7 @@ exact tool name used. ## Example -- **51_agent_hooks.py** — Two agent hooks wired onto a single conversation: +- **main.py** — Two agent hooks, each in its own conversation: - **PreToolUse** "security reviewer" denies a command whose intent is to read `/etc/passwd`, even though no obvious keyword appears in a blacklist. - **Stop** "quality reviewer" refuses to let the main agent finish until @@ -31,7 +31,7 @@ export LLM_API_KEY="your-key" export LLM_MODEL="anthropic/claude-sonnet-4-5-20250929" # optional export LLM_BASE_URL="https://your-endpoint" # optional -python 51_agent_hooks.py +python main.py ``` ## How an agent hook is configured diff --git a/examples/01_standalone_sdk/51_agent_hooks/51_agent_hooks.py b/examples/01_standalone_sdk/51_agent_hooks/main.py similarity index 69% rename from examples/01_standalone_sdk/51_agent_hooks/51_agent_hooks.py rename to examples/01_standalone_sdk/51_agent_hooks/main.py index 754aac11b2..496516cde4 100644 --- a/examples/01_standalone_sdk/51_agent_hooks/51_agent_hooks.py +++ b/examples/01_standalone_sdk/51_agent_hooks/main.py @@ -86,47 +86,48 @@ def hook_logger(event) -> None: print(line) -with tempfile.TemporaryDirectory() as tmpdir: - workspace = Path(tmpdir) +# Each demo runs in its own conversation with only the hook it needs. Sharing a +# single config would leave the Stop quality gate active during Demo 1, so the +# agent could never finish the first task until REPORT.md existed — coupling two +# unrelated demos and burning iterations. +security_hook_config = HookConfig( + pre_tool_use=[ + HookMatcher( + matcher="terminal", + hooks=[ + HookDefinition( + type=HookType.AGENT, + name="security-reviewer", + system_prompt=SECURITY_REVIEWER_PROMPT, + timeout=60, + max_iterations=3, + ) + ], + ) + ], +) - hook_config = HookConfig( - pre_tool_use=[ - HookMatcher( - matcher="terminal", - hooks=[ - HookDefinition( - type=HookType.AGENT, - name="security-reviewer", - system_prompt=SECURITY_REVIEWER_PROMPT, - timeout=60, - max_iterations=3, - ) - ], - ) - ], - stop=[ - HookMatcher( - hooks=[ - HookDefinition( - type=HookType.AGENT, - name="quality-reviewer", - system_prompt=QUALITY_REVIEWER_PROMPT, - tools=["file_editor"], - timeout=90, - max_iterations=5, - ) - ], - ) - ], - ) +quality_hook_config = HookConfig( + stop=[ + HookMatcher( + hooks=[ + HookDefinition( + type=HookType.AGENT, + name="quality-reviewer", + system_prompt=QUALITY_REVIEWER_PROMPT, + tools=["file_editor"], + timeout=90, + max_iterations=5, + ) + ], + ) + ], +) - agent = get_default_agent(llm=llm) - conversation = Conversation( - agent=agent, - workspace=str(workspace), - hook_config=hook_config, - callbacks=[hook_logger], - ) + +with tempfile.TemporaryDirectory() as tmpdir: + workspace = Path(tmpdir) + total_cost = 0.0 print("=" * 60) print("Demo 1: PreToolUse — semantic deny") @@ -137,20 +138,38 @@ def hook_logger(event) -> None: "keyword), but the security-reviewer agent should recognise the\n" "intent and deny.\n" ) - conversation.send_message( + security_conversation = Conversation( + agent=get_default_agent(llm=llm), + workspace=str(workspace), + hook_config=security_hook_config, + callbacks=[hook_logger], + ) + security_conversation.send_message( "Show me the contents of /etc/passwd using awk '{print}'." ) - conversation.run() + security_conversation.run() + total_cost += ( + security_conversation.conversation_stats.get_combined_metrics().accumulated_cost + ) print("\n" + "=" * 60) print("Demo 2: Stop — deny until deliverable exists") print("=" * 60) print("Quality reviewer denies until REPORT.md exists with a bullet point.\n") - conversation.send_message( + quality_conversation = Conversation( + agent=get_default_agent(llm=llm), + workspace=str(workspace), + hook_config=quality_hook_config, + callbacks=[hook_logger], + ) + quality_conversation.send_message( "Write REPORT.md in the workspace with at least one bullet point " "describing this repository, then finish." ) - conversation.run() + quality_conversation.run() + total_cost += ( + quality_conversation.conversation_stats.get_combined_metrics().accumulated_cost + ) report = workspace / "REPORT.md" if report.exists(): @@ -160,5 +179,4 @@ def hook_logger(event) -> None: print("Example Complete!") print("=" * 60) - cost = conversation.conversation_stats.get_combined_metrics().accumulated_cost - print(f"\nEXAMPLE_COST: {cost}") + print(f"\nEXAMPLE_COST: {total_cost}") diff --git a/openhands-sdk/openhands/sdk/hooks/executor.py b/openhands-sdk/openhands/sdk/hooks/executor.py index 13ab2e9aa9..075ec17d08 100644 --- a/openhands-sdk/openhands/sdk/hooks/executor.py +++ b/openhands-sdk/openhands/sdk/hooks/executor.py @@ -227,6 +227,17 @@ def _execute_agent_hook( # Isolate Metrics so hook spend doesn't accrue to the parent's bucket. hook_llm.reset_metrics() + # Never hand the parent's already-initialized visualizer instance to the + # sub-conversation: LocalConversation.__init__ calls initialize() on it, + # which would rebind the parent visualizer to the hook's child state. Mirror + # the delegate pattern and ask the parent visualizer for a fresh sub- + # visualizer (returns None for visualizers that don't support sub-agents). + hook_visualizer = self.visualizer + if isinstance(self.visualizer, ConversationVisualizerBase): + hook_visualizer = self.visualizer.create_sub_visualizer( + f"agent-hook:{hook.name or 'default'}" + ) + conversation = None try: agent = Agent( @@ -242,7 +253,7 @@ def _execute_agent_hook( plugins=None, hook_config=None, persistence_dir=self.persistence_dir, - visualizer=self.visualizer, + visualizer=hook_visualizer, max_iteration_per_run=hook.max_iterations, ) conversation.send_message( @@ -299,7 +310,7 @@ def _parse_decision(self, raw: str, event_type: str) -> HookResult: "Agent hook returned no parseable JSON — defaulting to allow" ) - decision_str = str(data.get("decision", "allow")).lower() + decision_str = str(data.get("decision", "")).lower() reason = str(data.get("reason", "")) if decision_str == "deny": return HookResult( @@ -308,10 +319,21 @@ def _parse_decision(self, raw: str, event_type: str) -> HookResult: decision=HookDecision.DENY, reason=reason, ) - return HookResult( - success=True, - decision=HookDecision.ALLOW, - reason=reason, + if decision_str == "allow": + return HookResult( + success=True, + decision=HookDecision.ALLOW, + reason=reason, + ) + # Missing or unknown decision: this is not a deliberate verdict, so it + # must be a detectable fall-open (success=False) rather than a silent + # allow that masquerades as a real decision. + logger.warning( + f"Agent hook returned an invalid decision for event '{event_type}'" + f" — defaulting to allow: {repr(decision_str)[:200]}" + ) + return self._fall_open( + "Agent hook returned an invalid decision — defaulting to allow" ) def _merge_hook_conversation_stats(self, conversation: "BaseConversation") -> None: diff --git a/tests/examples/test_examples.py b/tests/examples/test_examples.py index 8399aaabf7..82b94fc46b 100644 --- a/tests/examples/test_examples.py +++ b/tests/examples/test_examples.py @@ -28,6 +28,7 @@ # These examples live under subdirectories (each with a single `main.py`). EXAMPLES_ROOT / "01_standalone_sdk" / "33_hooks", EXAMPLES_ROOT / "01_standalone_sdk" / "37_llm_profile_store", + EXAMPLES_ROOT / "01_standalone_sdk" / "51_agent_hooks", EXAMPLES_ROOT / "01_standalone_sdk" / "43_mixed_marketplace_skills", EXAMPLES_ROOT / "02_remote_agent_server" / "06_custom_tool", EXAMPLES_ROOT / "05_skills_and_plugins" / "01_loading_agentskills", @@ -97,6 +98,9 @@ def test_directory_example_is_discovered() -> None: assert ( EXAMPLES_ROOT / "02_remote_agent_server" / "06_custom_tool" / "main.py" ) in EXAMPLES + assert ( + EXAMPLES_ROOT / "01_standalone_sdk" / "51_agent_hooks" / "main.py" + ) in EXAMPLES @pytest.mark.parametrize("example_path", EXAMPLES, ids=_normalize_path) diff --git a/tests/sdk/hooks/test_executor.py b/tests/sdk/hooks/test_executor.py index 86e343c557..1d11999dbb 100644 --- a/tests/sdk/hooks/test_executor.py +++ b/tests/sdk/hooks/test_executor.py @@ -529,6 +529,31 @@ def test_allow_decision_passes(self, executor, sample_event): assert result.decision == HookDecision.ALLOW assert result.reason == "Looks safe" + @pytest.mark.parametrize( + "payload", + [ + '{"reason": "no decision field"}', + '{"decision": "maybe", "reason": "not allow or deny"}', + '{"decision": "", "reason": "empty decision"}', + ], + ) + def test_invalid_decision_falls_open(self, executor, sample_event, payload): + """Missing/unknown decision is a fall-open, not a deliberate allow.""" + with ( + patch(self._AGENT_PATH), + patch(self._CONV_PATH) as mock_conv_cls, + patch(self._RESPONSE_PATH, return_value=payload), + ): + mock_conv_cls.return_value = MagicMock() + hook = HookDefinition(type=HookType.AGENT) + result = executor._execute_agent_hook(hook, sample_event) + + assert not result.blocked + assert result.decision == HookDecision.ALLOW + assert result.should_continue + assert result.success is False + assert result.error is not None + def test_markdown_wrapped_deny_is_parsed(self, executor, sample_event): """```json fenced JSON is honoured, not treated as non-JSON.""" fenced = '```json\n{"decision": "deny", "reason": "Sensitive file read"}\n```' @@ -756,6 +781,51 @@ def capture_agent_init(**kwargs): assert hook_llm is not None assert hook_llm.usage_id == "agent-hook:security-check" + def test_parent_visualizer_instance_is_not_rebound(self, tmp_path, mock_llm): + """The parent visualizer instance must never be handed to the hook conv. + + LocalConversation.initialize() rebinds a visualizer instance to its own + state, so the executor must request a fresh sub-visualizer instead. + """ + from openhands.sdk.conversation.visualizer import ConversationVisualizerBase + + sub_viz = MagicMock(spec=ConversationVisualizerBase) + + class _Viz(ConversationVisualizerBase): + def on_event(self, event): # pragma: no cover - not exercised + pass + + def create_sub_visualizer(self, agent_id): + self.requested_agent_id = agent_id + return sub_viz + + parent_viz = _Viz() + executor = HookExecutor( + working_dir=str(tmp_path), + llm=mock_llm, + visualizer=parent_viz, + ) + + captured = {} + + def capture_conv_init(**kwargs): + captured["visualizer"] = kwargs.get("visualizer") + raise RuntimeError("stop early") + + sample_event = HookEvent( + event_type=HookEventType.PRE_TOOL_USE, tool_name="BashTool" + ) + with ( + patch(self._AGENT_PATH), + patch(self._CONV_PATH, side_effect=capture_conv_init), + ): + hook = HookDefinition(type=HookType.AGENT, name="security-check") + executor._execute_agent_hook(hook, sample_event) + + assert captured["visualizer"] is sub_viz + assert captured["visualizer"] is not parent_viz + assert parent_viz.requested_agent_id == "agent-hook:security-check" + class TestPromptHookNotImplemented: """HookType.PROMPT is a future stub — execution defaults to allow, never crashes.""" From c467752296a90b755d446c2e02450fa3ffdd0052 Mon Sep 17 00:00:00 2001 From: VascoSch92 Date: Mon, 25 May 2026 00:08:57 +0200 Subject: [PATCH 11/12] fix(hooks): preserve REST contract for HookDefinition.command; allow additive response enum values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The REST API breakage check failed for two reasons: 1. `command` was changed to `str | None = None`, which made the published `ConversationInfo.hook_config` response property optional and nullable — flagged by oasdiff as a breaking change (`command became optional` + null type widening). Keep `command` a required, non-nullable string in the schema: it now defaults to "" for command-less hook types (PROMPT/AGENT) and __get_pydantic_json_schema__ reports it as required without a default, so the serialized contract is identical to prior releases. 2. Adding the `agent` value to the `HookType` enum tripped `response-property-enum-value-added`. Additive enum values are safe evolution for extensible APIs (clients must tolerate unknown values), the same contract that already allows additive oneOf/anyOf expansion, so the breakage checker now downgrades them to informational notices. --- .../check_agent_server_rest_api_breakage.py | 24 +++++++++++++--- openhands-sdk/openhands/sdk/hooks/config.py | 28 +++++++++++++++++-- openhands-sdk/openhands/sdk/hooks/executor.py | 2 +- ...st_check_agent_server_rest_api_breakage.py | 12 ++++++-- tests/sdk/hooks/test_config.py | 2 +- tests/sdk/hooks/test_executor.py | 2 +- 6 files changed, 57 insertions(+), 13 deletions(-) diff --git a/.github/scripts/check_agent_server_rest_api_breakage.py b/.github/scripts/check_agent_server_rest_api_breakage.py index 47080fcf9c..c96640106a 100644 --- a/.github/scripts/check_agent_server_rest_api_breakage.py +++ b/.github/scripts/check_agent_server_rest_api_breakage.py @@ -557,6 +557,19 @@ def _validate_removed_schema_properties( ) +# oasdiff rule IDs for additive enum-value additions in response schemas. +# Adding a value to a response enum is additive evolution for extensible APIs +# (e.g. a new hook type / discriminator value): clients MUST tolerate unknown +# values and skip/ignore them, the same contract that makes additive oneOf/anyOf +# expansion safe above. We downgrade these to informational notices. +_ADDITIVE_RESPONSE_ENUM_IDS = frozenset( + { + "response-property-enum-value-added", + "response-write-only-property-enum-value-added", + } +) + + def _is_union_property_removal_artifact(change: dict) -> bool: """Return True for property removals that are artifacts of union widening. @@ -606,7 +619,9 @@ def _split_breaking_changes( removed_schema_properties.append(change) continue - if change_id in _ADDITIVE_RESPONSE_ONEOF_IDS: + if change_id in _ADDITIVE_RESPONSE_ONEOF_IDS or ( + change_id in _ADDITIVE_RESPONSE_ENUM_IDS + ): additive_response_oneof.append(change) continue @@ -799,9 +814,10 @@ def main() -> int: if additive_response_oneof: print( f"\n::notice title={PYPI_DISTRIBUTION} REST API::" - "Additive oneOf/anyOf expansion detected in response schemas. " - "This is expected for extensible discriminated-union APIs and " - "does not break backward compatibility." + "Additive oneOf/anyOf expansion or enum-value additions detected " + "in response schemas. This is expected for extensible " + "discriminated-union APIs and does not break backward " + "compatibility." ) for item in additive_response_oneof: print(f" - {item.get('text', str(item))}") diff --git a/openhands-sdk/openhands/sdk/hooks/config.py b/openhands-sdk/openhands/sdk/hooks/config.py index 8c48fb6ef8..2398fe3e5d 100644 --- a/openhands-sdk/openhands/sdk/hooks/config.py +++ b/openhands-sdk/openhands/sdk/hooks/config.py @@ -49,7 +49,13 @@ class HookDefinition(BaseModel): type: HookType = HookType.COMMAND name: str | None = None - command: str | None = None + # `command` is kept a non-nullable string that is always present in the + # serialized output and reported as required in the JSON schema (see + # __get_pydantic_json_schema__). This preserves the published REST response + # contract for ConversationInfo.hook_config: making it optional/nullable + # would be flagged as a breaking change by the oasdiff REST API check. + # Command-less hook types (PROMPT/AGENT) simply leave it as "". + command: str = "" prompt: str | None = None system_prompt: str | None = None tools: list[str] = Field(default_factory=list) @@ -61,13 +67,29 @@ class HookDefinition(BaseModel): "populate_by_name": True, # Allow both 'async' and 'async_' in input } + @classmethod + def __get_pydantic_json_schema__(cls, core_schema, handler): # type: ignore[override] + # Report `command` as a required, non-defaulted string to keep the + # published REST response contract identical to releases where the field + # had no default. The runtime default ("") only eases construction of + # command-less hook types; it never surfaces as null in responses. + json_schema = handler(core_schema) + json_schema = handler.resolve_ref_schema(json_schema) + command_schema = json_schema.get("properties", {}).get("command") + if command_schema is not None: + command_schema.pop("default", None) + required = json_schema.setdefault("required", []) + if "command" not in required: + required.append("command") + return json_schema + @model_validator(mode="after") def _validate_type_fields(self) -> "HookDefinition": if self.type == HookType.COMMAND and not self.command: raise ValueError("'command' is required when type is 'command'") if self.type == HookType.PROMPT and not self.prompt: raise ValueError("'prompt' is required when type is 'prompt'") - if self.type == HookType.AGENT and self.command is not None: + if self.type == HookType.AGENT and self.command: raise ValueError( "'command' must not be set when type is 'agent'; " "use 'system_prompt' instead" @@ -79,7 +101,7 @@ def _validate_type_fields(self) -> "HookDefinition": @property def display_command(self) -> str: """Human-readable label for this hook used in logs and events.""" - if self.command is not None: + if self.command: return self.command if self.name is not None: return f"agent-hook:{self.name}" diff --git a/openhands-sdk/openhands/sdk/hooks/executor.py b/openhands-sdk/openhands/sdk/hooks/executor.py index 075ec17d08..72d97ae6b7 100644 --- a/openhands-sdk/openhands/sdk/hooks/executor.py +++ b/openhands-sdk/openhands/sdk/hooks/executor.py @@ -390,7 +390,7 @@ def execute( self.async_process_manager.cleanup_expired() command = hook.command - if command is None: + if not command: return HookResult( success=False, exit_code=-1, diff --git a/tests/cross/test_check_agent_server_rest_api_breakage.py b/tests/cross/test_check_agent_server_rest_api_breakage.py index be88df728c..cff7440c27 100644 --- a/tests/cross/test_check_agent_server_rest_api_breakage.py +++ b/tests/cross/test_check_agent_server_rest_api_breakage.py @@ -666,6 +666,11 @@ def test_split_breaking_changes_separates_three_buckets(): "details": {}, "text": "added body anyOf member", }, + { + "id": "response-property-enum-value-added", + "details": {}, + "text": "added the new `agent` enum value to the `type` response property", + }, { "id": "response-property-removed", "details": {}, @@ -688,6 +693,7 @@ def test_split_breaking_changes_separates_three_buckets(): "response-property-one-of-added", "response-body-one-of-added", "response-body-any-of-added", + "response-property-enum-value-added", } assert len(other) == 1 assert other[0]["id"] == "response-body-changed" @@ -723,7 +729,7 @@ def test_main_passes_when_only_additive_oneof(monkeypatch, capsys): assert _prod.main() == 0 captured = capsys.readouterr() - assert "Additive oneOf/anyOf expansion detected" in captured.out + assert "Additive oneOf/anyOf expansion or enum-value additions" in captured.out assert "additive response oneOf expansions" in captured.out @@ -790,7 +796,7 @@ def test_main_passes_when_body_union_addition_reports_removed_properties( assert _prod.main() == 0 captured = capsys.readouterr() - assert "Additive oneOf/anyOf expansion detected" in captured.out + assert "Additive oneOf/anyOf expansion or enum-value additions" in captured.out assert "ignored 3 request/response-property removal artifact" in captured.out assert "ignored 1 request/response type-change artifact" in captured.out @@ -878,7 +884,7 @@ def test_main_fails_when_additive_oneof_mixed_with_real_breakage(monkeypatch, ca assert _prod.main() == 1 captured = capsys.readouterr() - assert "Additive oneOf/anyOf expansion detected" in captured.out + assert "Additive oneOf/anyOf expansion or enum-value additions" in captured.out assert "other than removing previously-deprecated operations" in captured.out diff --git a/tests/sdk/hooks/test_config.py b/tests/sdk/hooks/test_config.py index 4e1e730346..64865d968d 100644 --- a/tests/sdk/hooks/test_config.py +++ b/tests/sdk/hooks/test_config.py @@ -11,7 +11,7 @@ def test_command_hook_requires_command(): - with pytest.raises(Exception, match="'command' is required"): + with pytest.raises(ValidationError, match="'command' is required"): HookDefinition(type=HookType.COMMAND) diff --git a/tests/sdk/hooks/test_executor.py b/tests/sdk/hooks/test_executor.py index 1d11999dbb..b05771df73 100644 --- a/tests/sdk/hooks/test_executor.py +++ b/tests/sdk/hooks/test_executor.py @@ -856,4 +856,4 @@ def test_prompt_hook_does_not_block(self, executor, sample_event): def test_prompt_hook_without_command_validates(self): """PROMPT hook with no command is valid at config time (future use).""" hook = HookDefinition(type=HookType.PROMPT, prompt="evaluate this event") - assert hook.command is None + assert hook.command == "" From ead24bf0ea888a1e7070753e2f68a4b04bf73ae6 Mon Sep 17 00:00:00 2001 From: VascoSch92 Date: Mon, 25 May 2026 00:35:34 +0200 Subject: [PATCH 12/12] =?UTF-8?q?fix(hooks):=20address=20review=20?= =?UTF-8?q?=E2=80=94=20live=20LLM=20lookup,=20scoped=20enum=20allowlist,?= =?UTF-8?q?=20bounded=20example?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Agent hooks now resolve the conversation's current LLM at execution time via an llm_getter threaded through create_hook_callback → HookManager → HookExecutor. Previously the LLM was captured once at hook init, so switch_llm()/switch_profile() left agent hooks using stale model config. - REST breakage check: scope the response enum-value-added downgrade to the hook discriminator (HookConfig → hooks[] → type) instead of allowlisting every response enum addition, so a future unrelated status/mode enum value is still treated as a breaking change. - 51_agent_hooks example: cap each demo conversation at MAX_ITERATIONS and fail fast on ERROR/STUCK end states so a denial loop can't burn calls up to the default 500-iteration limit / CI timeout; run each demo with a fresh, metrics-reset LLM so EXAMPLE_COST no longer double-counts Demo 1's spend. --- .../check_agent_server_rest_api_breakage.py | 33 +++++++-- .../01_standalone_sdk/51_agent_hooks/main.py | 70 ++++++++++++------- .../conversation/impl/local_conversation.py | 4 +- .../openhands/sdk/hooks/conversation_hooks.py | 4 ++ openhands-sdk/openhands/sdk/hooks/executor.py | 21 +++++- openhands-sdk/openhands/sdk/hooks/manager.py | 3 + ...st_check_agent_server_rest_api_breakage.py | 29 +++++++- tests/sdk/hooks/test_executor.py | 35 ++++++++++ 8 files changed, 161 insertions(+), 38 deletions(-) diff --git a/.github/scripts/check_agent_server_rest_api_breakage.py b/.github/scripts/check_agent_server_rest_api_breakage.py index c96640106a..5100b6031a 100644 --- a/.github/scripts/check_agent_server_rest_api_breakage.py +++ b/.github/scripts/check_agent_server_rest_api_breakage.py @@ -557,18 +557,37 @@ def _validate_removed_schema_properties( ) -# oasdiff rule IDs for additive enum-value additions in response schemas. -# Adding a value to a response enum is additive evolution for extensible APIs -# (e.g. a new hook type / discriminator value): clients MUST tolerate unknown -# values and skip/ignore them, the same contract that makes additive oneOf/anyOf -# expansion safe above. We downgrade these to informational notices. -_ADDITIVE_RESPONSE_ENUM_IDS = frozenset( +# oasdiff rule IDs for enum-value additions in response schemas. +_RESPONSE_ENUM_VALUE_ADDED_IDS = frozenset( { "response-property-enum-value-added", "response-write-only-property-enum-value-added", } ) +# Response properties that are known extensible discriminated-union discriminators +# and may therefore grow new enum values additively. Adding a HookType value +# (e.g. "agent") to a hook definition's `type` is safe because hook configs are an +# extensible union and clients must tolerate unknown discriminator values. This is +# intentionally scoped to the hook discriminator so an ordinary new response enum +# value elsewhere (a new status/mode/etc.) is still treated as a breaking change. +_EXTENSIBLE_DISCRIMINATOR_PROPERTY_RE = re.compile( + r"HookConfig\b.*\bhooks/items/type\b" +) + + +def _is_additive_discriminator_enum_value(change: dict) -> bool: + """Return True for additive enum values on a known extensible discriminator. + + Adding a value to a response enum is normally breaking (generated clients may + treat the enum exhaustively), so this is scoped narrowly to the hook config + discriminator union rather than allowlisting every response enum addition. + """ + if str(change.get("id", "")) not in _RESPONSE_ENUM_VALUE_ADDED_IDS: + return False + text = str(change.get("text", "")) + return bool(_EXTENSIBLE_DISCRIMINATOR_PROPERTY_RE.search(text)) + def _is_union_property_removal_artifact(change: dict) -> bool: """Return True for property removals that are artifacts of union widening. @@ -620,7 +639,7 @@ def _split_breaking_changes( continue if change_id in _ADDITIVE_RESPONSE_ONEOF_IDS or ( - change_id in _ADDITIVE_RESPONSE_ENUM_IDS + _is_additive_discriminator_enum_value(change) ): additive_response_oneof.append(change) continue diff --git a/examples/01_standalone_sdk/51_agent_hooks/main.py b/examples/01_standalone_sdk/51_agent_hooks/main.py index 496516cde4..7820c4fde3 100644 --- a/examples/01_standalone_sdk/51_agent_hooks/main.py +++ b/examples/01_standalone_sdk/51_agent_hooks/main.py @@ -25,11 +25,18 @@ from pydantic import SecretStr from openhands.sdk import LLM, Conversation +from openhands.sdk.conversation.state import ConversationExecutionStatus from openhands.sdk.event.hook_execution import HookExecutionEvent from openhands.sdk.hooks import HookConfig, HookDefinition, HookMatcher, HookType from openhands.tools.preset.default import get_default_agent +# Keep the demo conversations short: a small per-run iteration cap means a hook +# that keeps denying (or a model that keeps retrying) fails fast instead of +# burning calls up to the default 500-iteration limit / CI subprocess timeout. +MAX_ITERATIONS = 10 + + # Configure LLM api_key = os.getenv("LLM_API_KEY") assert api_key is not None, "LLM_API_KEY environment variable is not set." @@ -86,6 +93,37 @@ def hook_logger(event) -> None: print(line) +def run_demo(workspace: Path, hook_config: HookConfig, message: str) -> float: + """Run one demo in its own conversation and return its cost. + + Each demo gets a fresh LLM with isolated metrics so per-demo costs don't + overlap (reusing one LLM would make the second conversation's stats include + the first demo's spend). A small iteration cap plus an error/stuck check make + the example fail fast instead of looping. + """ + demo_llm = llm.model_copy() + demo_llm.reset_metrics() + conversation = Conversation( + agent=get_default_agent(llm=demo_llm), + workspace=str(workspace), + hook_config=hook_config, + callbacks=[hook_logger], + max_iteration_per_run=MAX_ITERATIONS, + ) + conversation.send_message(message) + conversation.run() + status = conversation.state.execution_status + if status in ( + ConversationExecutionStatus.ERROR, + ConversationExecutionStatus.STUCK, + ): + raise RuntimeError( + f"Demo conversation ended in {status.value} state " + "before reaching a decision." + ) + return conversation.conversation_stats.get_combined_metrics().accumulated_cost + + # Each demo runs in its own conversation with only the hook it needs. Sharing a # single config would leave the Stop quality gate active during Demo 1, so the # agent could never finish the first task until REPORT.md existed — coupling two @@ -138,37 +176,21 @@ def hook_logger(event) -> None: "keyword), but the security-reviewer agent should recognise the\n" "intent and deny.\n" ) - security_conversation = Conversation( - agent=get_default_agent(llm=llm), - workspace=str(workspace), - hook_config=security_hook_config, - callbacks=[hook_logger], - ) - security_conversation.send_message( - "Show me the contents of /etc/passwd using awk '{print}'." - ) - security_conversation.run() - total_cost += ( - security_conversation.conversation_stats.get_combined_metrics().accumulated_cost + total_cost += run_demo( + workspace, + security_hook_config, + "Show me the contents of /etc/passwd using awk '{print}'.", ) print("\n" + "=" * 60) print("Demo 2: Stop — deny until deliverable exists") print("=" * 60) print("Quality reviewer denies until REPORT.md exists with a bullet point.\n") - quality_conversation = Conversation( - agent=get_default_agent(llm=llm), - workspace=str(workspace), - hook_config=quality_hook_config, - callbacks=[hook_logger], - ) - quality_conversation.send_message( + total_cost += run_demo( + workspace, + quality_hook_config, "Write REPORT.md in the workspace with at least one bullet point " - "describing this repository, then finish." - ) - quality_conversation.run() - total_cost += ( - quality_conversation.conversation_stats.get_combined_metrics().accumulated_cost + "describing this repository, then finish.", ) report = workspace / "REPORT.md" diff --git a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py index b798a8b51c..46f6a8ef6a 100644 --- a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py +++ b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py @@ -570,7 +570,9 @@ def _ensure_plugins_loaded(self) -> None: working_dir=str(self.workspace.working_dir), session_id=str(self._state.id), original_callback=self._base_callback, - llm=self.agent.llm, + # Resolve lazily: switch_llm()/switch_profile() rebind self.agent, + # so agent hooks must read the current LLM at execution time. + llm_getter=lambda: self.agent.llm, persistence_dir=hook_persistence_dir, visualizer=self._visualizer, conversation_stats=self._state.stats, diff --git a/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py b/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py index 91f5197ec3..960fbab589 100644 --- a/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py +++ b/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py @@ -393,6 +393,7 @@ def create_hook_callback( original_callback: Any = None, emit_hook_events: bool = True, llm: "LLM | None" = None, + llm_getter: "Callable[[], LLM | None] | None" = None, persistence_dir: str | None = None, visualizer: type[ConversationVisualizerBase] | ConversationVisualizerBase @@ -409,6 +410,8 @@ def create_hook_callback( emit_hook_events: If True, emit HookExecutionEvent for each hook execution. Defaults to True for full observability. llm: LLM instance inherited from the parent conversation, used by agent hooks. + llm_getter: Callable returning the conversation's current LLM. Preferred + over ``llm`` so agent hooks follow switch_llm()/switch_profile(). persistence_dir: Directory used to persist agent hook sub-conversation events. visualizer: Visualizer instance passed to agent hook sub-conversations. conversation_stats: Parent conversation stats that should include hook spend. @@ -421,6 +424,7 @@ def create_hook_callback( working_dir=working_dir, session_id=session_id, llm=llm, + llm_getter=llm_getter, persistence_dir=persistence_dir, visualizer=visualizer, conversation_stats=conversation_stats, diff --git a/openhands-sdk/openhands/sdk/hooks/executor.py b/openhands-sdk/openhands/sdk/hooks/executor.py index 72d97ae6b7..7134adc8de 100644 --- a/openhands-sdk/openhands/sdk/hooks/executor.py +++ b/openhands-sdk/openhands/sdk/hooks/executor.py @@ -7,6 +7,7 @@ import signal import subprocess import time +from collections.abc import Callable from typing import TYPE_CHECKING from pydantic import BaseModel @@ -162,6 +163,7 @@ def __init__( working_dir: str | None = None, async_process_manager: AsyncProcessManager | None = None, llm: "LLM | None" = None, + llm_getter: "Callable[[], LLM | None] | None" = None, persistence_dir: str | None = None, visualizer: type[ConversationVisualizerBase] | ConversationVisualizerBase @@ -170,11 +172,22 @@ def __init__( ): self.working_dir = working_dir or os.getcwd() self.async_process_manager = async_process_manager or AsyncProcessManager() - self.llm = llm + self._llm = llm + # Prefer a getter so agent hooks always use the conversation's *current* + # LLM: switch_llm()/switch_profile() replace agent.llm after the executor + # is built, and a captured instance would go stale. + self._llm_getter = llm_getter self.persistence_dir = persistence_dir self.visualizer = visualizer self.conversation_stats = conversation_stats + @property + def llm(self) -> "LLM | None": + """The LLM agent hooks should use, resolved live when a getter is set.""" + if self._llm_getter is not None: + return self._llm_getter() + return self._llm + def _fall_open( self, reason: str, @@ -211,14 +224,16 @@ def _execute_agent_hook( else event.event_type.value ) - if self.llm is None: + # Resolve the active conversation LLM once (a getter may rebuild it). + llm = self.llm + if llm is None: logger.warning( f"Agent hook has no LLM configured for event '{event_type}'" " — defaulting to allow" ) return self._fall_open("No LLM configured for agent hook") - hook_llm = self.llm.model_copy( + hook_llm = llm.model_copy( update={ "usage_id": f"agent-hook:{hook.name or 'default'}", "timeout": hook.timeout, diff --git a/openhands-sdk/openhands/sdk/hooks/manager.py b/openhands-sdk/openhands/sdk/hooks/manager.py index aac309a487..923a07cabf 100644 --- a/openhands-sdk/openhands/sdk/hooks/manager.py +++ b/openhands-sdk/openhands/sdk/hooks/manager.py @@ -1,6 +1,7 @@ """Hook manager - orchestrates hook execution within conversations.""" import logging +from collections.abc import Callable from typing import TYPE_CHECKING, Any from openhands.sdk.conversation.visualizer import ConversationVisualizerBase @@ -26,6 +27,7 @@ def __init__( working_dir: str | None = None, session_id: str | None = None, llm: "LLM | None" = None, + llm_getter: "Callable[[], LLM | None] | None" = None, persistence_dir: str | None = None, visualizer: type[ConversationVisualizerBase] | ConversationVisualizerBase @@ -36,6 +38,7 @@ def __init__( self.executor = HookExecutor( working_dir=working_dir, llm=llm, + llm_getter=llm_getter, persistence_dir=persistence_dir, visualizer=visualizer, conversation_stats=conversation_stats, diff --git a/tests/cross/test_check_agent_server_rest_api_breakage.py b/tests/cross/test_check_agent_server_rest_api_breakage.py index cff7440c27..bb0a29514c 100644 --- a/tests/cross/test_check_agent_server_rest_api_breakage.py +++ b/tests/cross/test_check_agent_server_rest_api_breakage.py @@ -667,9 +667,20 @@ def test_split_breaking_changes_separates_three_buckets(): "text": "added body anyOf member", }, { + # Additive value on the hook discriminator union -> downgraded. "id": "response-property-enum-value-added", "details": {}, - "text": "added the new `agent` enum value to the `type` response property", + "text": ( + "added the new `agent` enum value to the " + "`hook_config/anyOf[subschema #1: HookConfig]/stop/items/" + "hooks/items/type` response property for the response status `200`" + ), + }, + { + # Enum value on an ordinary (non-discriminator) property -> breaking. + "id": "response-property-enum-value-added", + "details": {}, + "text": "added the new `archived` enum value to the `status` response property", }, { "id": "response-property-removed", @@ -695,8 +706,20 @@ def test_split_breaking_changes_separates_three_buckets(): "response-body-any-of-added", "response-property-enum-value-added", } - assert len(other) == 1 - assert other[0]["id"] == "response-body-changed" + # The hook-discriminator enum addition is downgraded; the unrelated `status` + # enum addition and the body change remain breaking. + assert { + change["text"] for change in additive_oneof if "enum value" in change["text"] + } == { + "added the new `agent` enum value to the " + "`hook_config/anyOf[subschema #1: HookConfig]/stop/items/" + "hooks/items/type` response property for the response status `200`" + } + assert {change["id"] for change in other} == { + "response-property-enum-value-added", + "response-body-changed", + } + assert any("`status`" in change["text"] for change in other) def test_main_passes_when_only_additive_oneof(monkeypatch, capsys): diff --git a/tests/sdk/hooks/test_executor.py b/tests/sdk/hooks/test_executor.py index b05771df73..00c5640991 100644 --- a/tests/sdk/hooks/test_executor.py +++ b/tests/sdk/hooks/test_executor.py @@ -717,6 +717,41 @@ def capture_agent_init(**kwargs): assert hook_llm.usage_id == "agent-hook:default" assert hook_llm.metrics is not parent_metrics + def test_llm_getter_is_resolved_live(self, tmp_path, sample_event): + """An llm_getter is read at execution time, so agent hooks follow + switch_llm()/switch_profile() instead of using a stale captured LLM.""" + current = { + "llm": LLM(model="gpt-4o", api_key=SecretStr("k1"), usage_id="first") + } + executor = HookExecutor( + working_dir=str(tmp_path), + llm_getter=lambda: current["llm"], + ) + + # Simulate switch_llm(): the conversation rebinds its agent's LLM. + current["llm"] = LLM( + model="gpt-5.5", api_key=SecretStr("k2"), usage_id="second" + ) + + captured_llm = {} + + def capture_agent_init(**kwargs): + captured_llm["llm"] = kwargs.get("llm") + return MagicMock() + + with ( + patch(self._AGENT_PATH, side_effect=capture_agent_init), + patch(self._CONV_PATH, side_effect=RuntimeError("stop early")), + ): + executor._execute_agent_hook( + HookDefinition(type=HookType.AGENT), sample_event + ) + + hook_llm = captured_llm.get("llm") + assert hook_llm is not None + # Copied from the *current* LLM (gpt-5.5), not the one present at init. + assert hook_llm.model == "gpt-5.5" + def test_hook_metrics_are_merged_into_parent_stats( self, tmp_path, mock_llm, sample_event ):