From 81fba9d5dbfd22a85f273e521640f62f318b5f64 Mon Sep 17 00:00:00 2001
From: "burak.keles" <burak.keles@ode.al>
Date: Thu, 30 Apr 2026 16:01:51 +0300
Subject: [PATCH 01/12] Add agent-based hook execution with sub-conversation
 support

---
 .../conversation/impl/local_conversation.py   |   1 +
 openhands-sdk/openhands/sdk/hooks/config.py   |  30 +--
 .../openhands/sdk/hooks/conversation_hooks.py |  16 +-
 openhands-sdk/openhands/sdk/hooks/executor.py | 205 ++++++++++++++++-
 openhands-sdk/openhands/sdk/hooks/manager.py  |   9 +-
 tests/sdk/hooks/test_config.py                |  62 +++++
 tests/sdk/hooks/test_executor.py              | 216 ++++++++++++++++++
 7 files changed, 515 insertions(+), 24 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py
index a6bb1ec60e..279794f223 100644
--- a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py
+++ b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py
@@ -565,6 +565,7 @@ def _ensure_plugins_loaded(self) -> None:
                 working_dir=str(self.workspace.working_dir),
                 session_id=str(self._state.id),
                 original_callback=self._base_callback,
+                llm=self.agent.llm,
             )
             self._hook_processor.set_conversation_state(self._state)
             self._hook_processor.run_session_start()
diff --git a/openhands-sdk/openhands/sdk/hooks/config.py b/openhands-sdk/openhands/sdk/hooks/config.py
index 594597cf8b..de0f1105ce 100644
--- a/openhands-sdk/openhands/sdk/hooks/config.py
+++ b/openhands-sdk/openhands/sdk/hooks/config.py
@@ -41,13 +41,14 @@ class HookType(StrEnum):
 
     COMMAND = "command"  # Shell command executed via subprocess
     PROMPT = "prompt"  # LLM-based evaluation (future)
+    AGENT = "agent"  # Agent-based evaluation with tool access
 
 
 class HookDefinition(BaseModel):
     """A single hook definition."""
 
     type: HookType = HookType.COMMAND
-    command: str
+    command: str | None = None
     prompt: str | None = None
     timeout: int = 60
     async_: bool = Field(default=False, alias="async")  # 'async' is a reserved keyword
@@ -56,25 +57,28 @@ class HookDefinition(BaseModel):
         "populate_by_name": True,  # Allow both 'async' and 'async_' in input
     }
 
-    @model_validator(mode="before")
-    @classmethod
-    def _set_command_for_prompt_hooks(cls, data: Any) -> Any:
-        if (
-            isinstance(data, dict)
-            and data.get("type") == "prompt"
-            and "command" not in data
-        ):
-            data["command"] = ""
-        return data
-
     @model_validator(mode="after")
-    def _check_required_fields(self) -> "HookDefinition":
+    def _validate_type_fields(self) -> "HookDefinition":
         if self.type == HookType.COMMAND and not self.command:
             raise ValueError("'command' is required when type is 'command'")
         if self.type == HookType.PROMPT and not self.prompt:
             raise ValueError("'prompt' is required when type is 'prompt'")
+        if self.type == HookType.AGENT and self.command is not None:
+            raise ValueError(
+                "'command' must not be set when type is 'agent'; use 'prompt' instead"
+            )
+        if self.type == HookType.AGENT and self.async_:
+            raise ValueError("'async' is not supported for agent hooks")
         return self
 
+    @property
+    def display_command(self) -> str:
+        """Human-readable label for this hook used in logs and events."""
+        return (
+            self.command
+            if self.command is not None
+            else f"agent-hook:{self.type.value}"
+        )
 
 class HookMatcher(BaseModel):
     """Matches events to hooks based on patterns.
diff --git a/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py b/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py
index 4eaac1c1ac..a0438075ac 100644
--- a/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py
+++ b/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py
@@ -20,6 +20,7 @@
 
 if TYPE_CHECKING:
     from openhands.sdk.conversation.state import ConversationState
+    from openhands.sdk.llm import LLM
 
 logger = get_logger(__name__)
 
@@ -149,7 +150,7 @@ def _handle_pre_tool_use(self, event: ActionEvent) -> None:
         for hook, result in zip(hooks, results, strict=False):
             self._emit_hook_execution_event(
                 hook_event_type=HookEventType.PRE_TOOL_USE,
-                hook_command=hook.command,
+                hook_command=hook.display_command,
                 result=result,
                 tool_name=tool_name,
                 action_id=event.id,
@@ -224,7 +225,7 @@ def _handle_post_tool_use(self, event: ObservationEvent) -> None:
         for hook, result in zip(hooks, results, strict=False):
             self._emit_hook_execution_event(
                 hook_event_type=HookEventType.POST_TOOL_USE,
-                hook_command=hook.command,
+                hook_command=hook.display_command,
                 result=result,
                 tool_name=tool_name,
                 action_id=action_event.id,
@@ -266,7 +267,7 @@ def _handle_user_prompt_submit(self, event: MessageEvent) -> MessageEvent:
         for hook, result in zip(hooks, results, strict=False):
             self._emit_hook_execution_event(
                 hook_event_type=HookEventType.USER_PROMPT_SUBMIT,
-                hook_command=hook.command,
+                hook_command=hook.display_command,
                 result=result,
                 message_id=event.id,
                 hook_input={"message": message},
@@ -328,7 +329,7 @@ def run_session_start(self) -> None:
         for hook, result in zip(hooks, results, strict=False):
             self._emit_hook_execution_event(
                 hook_event_type=HookEventType.SESSION_START,
-                hook_command=hook.command,
+                hook_command=hook.display_command,
                 result=result,
             )
             if result.error:
@@ -342,7 +343,7 @@ def run_session_end(self) -> None:
         for hook, result in zip(hooks, results, strict=False):
             self._emit_hook_execution_event(
                 hook_event_type=HookEventType.SESSION_END,
-                hook_command=hook.command,
+                hook_command=hook.display_command,
                 result=result,
             )
             if result.error:
@@ -360,7 +361,7 @@ def run_stop(self, reason: str | None = None) -> tuple[bool, str | None]:
         for hook, result in zip(hooks, results, strict=False):
             self._emit_hook_execution_event(
                 hook_event_type=HookEventType.STOP,
-                hook_command=hook.command,
+                hook_command=hook.display_command,
                 result=result,
                 hook_input={"reason": reason} if reason else None,
             )
@@ -389,6 +390,7 @@ def create_hook_callback(
     session_id: str | None = None,
     original_callback: Any = None,
     emit_hook_events: bool = True,
+    llm: "LLM | None" = None,
 ) -> tuple[HookEventProcessor, Any]:
     """Create a hook-enabled event callback. Returns (processor, callback).
 
@@ -399,6 +401,7 @@ def create_hook_callback(
         original_callback: Callback to chain after hook processing.
         emit_hook_events: If True, emit HookExecutionEvent for each hook execution.
             Defaults to True for full observability.
+        llm: LLM instance inherited from the parent conversation, used by agent hooks.
 
     Returns:
         Tuple of (HookEventProcessor, callback function).
@@ -407,6 +410,7 @@ def create_hook_callback(
         config=hook_config,
         working_dir=working_dir,
         session_id=session_id,
+        llm=llm,
     )
 
     processor = HookEventProcessor(
diff --git a/openhands-sdk/openhands/sdk/hooks/executor.py b/openhands-sdk/openhands/sdk/hooks/executor.py
index 6f3ce897b1..978f6354b6 100644
--- a/openhands-sdk/openhands/sdk/hooks/executor.py
+++ b/openhands-sdk/openhands/sdk/hooks/executor.py
@@ -1,19 +1,27 @@
-"""Hook executor - runs shell commands with JSON I/O."""
+"""Hook executor - runs shell commands and agent evaluations with JSON I/O."""
 
+import concurrent.futures
 import json
 import logging
 import os
 import signal
 import subprocess
 import time
+from typing import TYPE_CHECKING
 
+import opentelemetry.context as otel_context
 from pydantic import BaseModel
 
-from openhands.sdk.hooks.config import HookDefinition
+from openhands.sdk.hooks.config import HookDefinition, HookType
 from openhands.sdk.hooks.types import HookDecision, HookEvent
+from openhands.sdk.observability.laminar import observe
 from openhands.sdk.utils import sanitized_env
 
 
+if TYPE_CHECKING:
+    from openhands.sdk.llm import LLM
+
+
 class HookResult(BaseModel):
     """Result from executing a hook.
 
@@ -138,15 +146,188 @@ def cleanup_all(self) -> None:
 
 
 class HookExecutor:
-    """Executes hook commands with JSON I/O."""
+    """Executes hook commands and agent evaluations with JSON I/O."""
 
     def __init__(
         self,
         working_dir: str | None = None,
         async_process_manager: AsyncProcessManager | None = None,
+        llm: "LLM | None" = None,
     ):
         self.working_dir = working_dir or os.getcwd()
         self.async_process_manager = async_process_manager or AsyncProcessManager()
+        self.llm = llm
+
+    @observe(
+        name="hook.execute.agent",
+        ignore_inputs=["self", "hook", "event"],
+        ignore_output=True,
+    )
+    def _execute_agent_hook(
+        self,
+        hook: HookDefinition,
+        event: HookEvent,
+    ) -> HookResult:
+        """Execute an agent-based hook by spawning a short-lived sub-conversation.
+
+        The sub-conversation inherits the parent LLM, runs with read-only tools
+        and no hooks (preventing recursion), and must return a JSON decision.
+        """
+        # Lazy imports to avoid circular dependency:
+        # executor <- manager <- conversation_hooks <- local_conversation -> executor
+        from openhands.sdk.agent import Agent  # type: ignore[attr-defined]
+        from openhands.sdk.conversation.impl.local_conversation import LocalConversation
+        from openhands.sdk.conversation.response_utils import get_agent_final_response
+        from openhands.sdk.tool.spec import Tool
+
+        event_type = (
+            event.event_type
+            if isinstance(event.event_type, str)
+            else event.event_type.value
+        )
+
+        if self.llm is None:
+            logger.warning(
+                f"Agent hook has no LLM configured for event '{event_type}'"
+                " — defaulting to allow"
+            )
+            return HookResult(
+                success=True,
+                decision=HookDecision.ALLOW,
+                reason="No LLM configured for agent hook",
+            )
+
+        no_policy = "No additional policy specified. Use your best judgment."
+        system_prompt = (
+            "You are a security policy evaluator for an AI coding agent. "
+            "Your only job is to decide whether an agent operation"
+            " should be allowed or denied.\n\n"
+            f"## Hook Event\n```json\n{event.model_dump_json(indent=2)}\n```\n\n"
+            "## Instructions\n"
+            "Use the available tools to investigate if needed (e.g. read files"
+            " in the working directory). Then call `finish` with a JSON string"
+            " as the message:\n"
+            '  {"decision": "allow" | "deny", "reason": "<brief explanation>"}\n\n'
+            "When in doubt, prefer `allow`. Only deny when the operation clearly"
+            " violates the policy below.\n\n"
+            f"## Policy\n{hook.prompt or no_policy}"
+        )
+
+        # Isolated LLM copy — hook metrics not attributed to the parent conversation.
+        hook_llm = self.llm.model_copy(update={"usage_id": "hook-agent-evaluator"})
+        hook_llm.reset_metrics()
+
+        # Capture the current OTel span context so the sub-conversation thread
+        # becomes a child of this span rather than an orphaned trace.
+        parent_ctx = otel_context.get_current()
+
+        conversation = None
+        try:
+            agent = Agent(
+                llm=hook_llm,
+                tools=[
+                    Tool(name="GrepTool"),
+                    Tool(name="GlobTool"),
+                ],
+                include_default_tools=["FinishTool"],
+                system_prompt=system_prompt,
+            )
+            # hook_config=None disables all hooks in the sub-conversation (no recursion)
+            conversation = LocalConversation(
+                agent=agent,
+                workspace=self.working_dir,
+                plugins=None,
+                hook_config=None,
+                persistence_dir=None,
+                visualizer=None,
+                max_iteration_per_run=10,
+            )
+            conversation.send_message(
+                f"Evaluate this {event_type} hook event and make your decision."
+            )
+
+            def _run_with_context() -> None:
+                token = otel_context.attach(parent_ctx)
+                try:
+                    conversation.run()
+                finally:
+                    otel_context.detach(token)
+
+            pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
+            try:
+                future = pool.submit(_run_with_context)
+                try:
+                    future.result(timeout=hook.timeout)
+                except concurrent.futures.TimeoutError:
+                    logger.warning(
+                        f"Agent hook timed out after {hook.timeout}s"
+                        f" for event '{event_type}' — defaulting to allow"
+                    )
+                    return HookResult(
+                        success=False,
+                        decision=HookDecision.ALLOW,
+                        reason=(
+                            f"Agent hook timed out after {hook.timeout} seconds"
+                            " — defaulting to allow"
+                        ),
+                    )
+            finally:
+                # Don't block waiting for the thread — the conversation may still be
+                # running but we've already returned or moved on.
+                pool.shutdown(wait=False)
+            raw = get_agent_final_response(conversation.state.events)
+        except Exception as e:
+            logger.warning(
+                f"Agent hook sub-conversation failed for event '{event_type}'"
+                f" — defaulting to allow: {e}"
+            )
+            return HookResult(
+                success=False,
+                decision=HookDecision.ALLOW,
+                reason="Agent hook execution failed — defaulting to allow",
+                error=str(e),
+            )
+        finally:
+            if conversation is not None:
+                conversation.close()
+
+        if not raw:
+            logger.warning(
+                f"Agent hook produced no final response for event '{event_type}'"
+                " — defaulting to allow"
+            )
+            return HookResult(
+                success=False,
+                decision=HookDecision.ALLOW,
+                reason="Agent hook produced no final response — defaulting to allow",
+            )
+
+        try:
+            data = json.loads(raw)
+            decision_str = str(data.get("decision", "allow")).lower()
+            reason = str(data.get("reason", ""))
+            if decision_str == "deny":
+                return HookResult(
+                    success=True,
+                    blocked=True,
+                    decision=HookDecision.DENY,
+                    reason=reason,
+                )
+            return HookResult(
+                success=True,
+                decision=HookDecision.ALLOW,
+                reason=reason,
+            )
+        except (json.JSONDecodeError, AttributeError):
+            logger.warning(
+                f"Agent hook returned non-JSON response for event '{event_type}'"
+                f" — defaulting to allow: {repr(raw)[:200]}"
+            )
+            return HookResult(
+                success=True,
+                decision=HookDecision.ALLOW,
+                reason="Agent hook returned non-JSON response — defaulting to allow",
+            )
 
     def execute(
         self,
@@ -155,6 +336,24 @@ def execute(
         env: dict[str, str] | None = None,
     ) -> HookResult:
         """Execute a single hook."""
+        if hook.type == HookType.AGENT:
+            return self._execute_agent_hook(hook, event)
+        if hook.type == HookType.PROMPT:
+            event_type = (
+                event.event_type
+                if isinstance(event.event_type, str)
+                else event.event_type.value
+            )
+            logger.warning(
+                f"PROMPT hooks are not yet implemented — defaulting to allow"
+                f" (event_type={event_type})"
+            )
+            return HookResult(
+                success=False,
+                decision=HookDecision.ALLOW,
+                reason="PROMPT hooks are not yet implemented — defaulting to allow",
+            )
+
         # Prepare environment
         hook_env = sanitized_env()
         hook_env["OPENHANDS_PROJECT_DIR"] = self.working_dir
diff --git a/openhands-sdk/openhands/sdk/hooks/manager.py b/openhands-sdk/openhands/sdk/hooks/manager.py
index af57be2734..7f7b50945f 100644
--- a/openhands-sdk/openhands/sdk/hooks/manager.py
+++ b/openhands-sdk/openhands/sdk/hooks/manager.py
@@ -1,13 +1,17 @@
 """Hook manager - orchestrates hook execution within conversations."""
 
 import logging
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 from openhands.sdk.hooks.config import HookConfig
 from openhands.sdk.hooks.executor import HookExecutor, HookResult
 from openhands.sdk.hooks.types import HookEvent, HookEventType
 
 
+if TYPE_CHECKING:
+    from openhands.sdk.llm import LLM
+
+
 logger = logging.getLogger(__name__)
 
 
@@ -19,9 +23,10 @@ def __init__(
         config: HookConfig | None = None,
         working_dir: str | None = None,
         session_id: str | None = None,
+        llm: "LLM | None" = None,
     ):
         self.config = config or HookConfig.load(working_dir=working_dir)
-        self.executor = HookExecutor(working_dir=working_dir)
+        self.executor = HookExecutor(working_dir=working_dir, llm=llm)
         self.session_id = session_id
         self.working_dir = working_dir
 
diff --git a/tests/sdk/hooks/test_config.py b/tests/sdk/hooks/test_config.py
index f5a1c832e4..d5059fbd82 100644
--- a/tests/sdk/hooks/test_config.py
+++ b/tests/sdk/hooks/test_config.py
@@ -10,6 +10,68 @@
 from openhands.sdk.hooks.types import HookEventType
 
 
+class TestHookDefinitionValidation:
+    """Tests for HookDefinition type-specific field validation."""
+
+    def test_command_hook_requires_command(self):
+        """COMMAND type must have a command field."""
+        with pytest.raises(Exception, match="'command' is required"):
+            HookDefinition(type="command")
+
+    def test_command_hook_valid(self):
+        """COMMAND type with command is valid."""
+        h = HookDefinition(command="echo hi")
+        assert h.type == HookType.COMMAND
+        assert h.command == "echo hi"
+
+    def test_agent_hook_valid_with_prompt(self):
+        """AGENT type with prompt (no command) is valid."""
+        h = HookDefinition(type="agent", prompt="Block writes to /etc")
+        assert h.type == HookType.AGENT
+        assert h.prompt == "Block writes to /etc"
+        assert h.command is None
+
+    def test_agent_hook_valid_without_prompt(self):
+        """AGENT type without prompt is valid (prompt is optional)."""
+        h = HookDefinition(type="agent")
+        assert h.type == HookType.AGENT
+        assert h.prompt is None
+
+    def test_agent_hook_rejects_command_field(self):
+        """AGENT type must not have command set alongside it."""
+        with pytest.raises(
+            Exception, match="'command' must not be set when type is 'agent'"
+        ):
+            HookDefinition(type="agent", command="echo hi")
+
+    def test_agent_hook_rejects_async(self):
+        """AGENT type does not support async_=True."""
+        with pytest.raises(Exception, match="not supported for agent hooks"):
+            HookDefinition.model_validate({"type": "agent", "async": True})
+
+    def test_agent_hook_from_json(self):
+        """AGENT hook can be loaded from JSON config."""
+        data = {
+            "stop": [
+                {
+                    "hooks": [
+                        {
+                            "type": "agent",
+                            "prompt": "Verify all tasks are done",
+                            "timeout": 30,
+                        }
+                    ]
+                }
+            ]
+        }
+        config = HookConfig.from_dict(data)
+        hooks = config.get_hooks_for_event(HookEventType.STOP)
+        assert len(hooks) == 1
+        assert hooks[0].type == HookType.AGENT
+        assert hooks[0].prompt == "Verify all tasks are done"
+        assert hooks[0].timeout == 30
+
+
 class TestHookMatcher:
     """Tests for HookMatcher pattern matching."""
 
diff --git a/tests/sdk/hooks/test_executor.py b/tests/sdk/hooks/test_executor.py
index 26d31d0565..69e6323b68 100644
--- a/tests/sdk/hooks/test_executor.py
+++ b/tests/sdk/hooks/test_executor.py
@@ -3,12 +3,15 @@
 import json
 import subprocess
 from unittest import mock
+from unittest.mock import MagicMock, patch
 
 import pytest
+from pydantic import SecretStr
 
 from openhands.sdk.hooks.config import HookDefinition
 from openhands.sdk.hooks.executor import HookExecutor
 from openhands.sdk.hooks.types import HookDecision, HookEvent, HookEventType
+from openhands.sdk.llm import LLM
 from tests.command_utils import python_command
 
 
@@ -421,3 +424,216 @@ def test_cleanup_expired_keeps_active_processes(self, tmp_path):
 
         # Clean up for test teardown
         process.terminate()
+
+
+class TestAgentHookExecution:
+    """Tests for HookType.AGENT execution path."""
+
+    # Patch at the package/module level so lazy `from X import Y` picks up the mock.
+    _AGENT_PATH = "openhands.sdk.agent.Agent"
+    _CONV_PATH = "openhands.sdk.conversation.impl.local_conversation.LocalConversation"
+    _RESPONSE_PATH = (
+        "openhands.sdk.conversation.response_utils.get_agent_final_response"
+    )
+
+    @pytest.fixture
+    def mock_llm(self):
+        return LLM(model="gpt-4o", api_key=SecretStr("test-key"), usage_id="test")
+
+    @pytest.fixture
+    def executor(self, tmp_path, mock_llm):
+        return HookExecutor(working_dir=str(tmp_path), llm=mock_llm)
+
+    @pytest.fixture
+    def executor_no_llm(self, tmp_path):
+        return HookExecutor(working_dir=str(tmp_path), llm=None)
+
+    @pytest.fixture
+    def sample_event(self):
+        return HookEvent(
+            event_type=HookEventType.STOP,
+            session_id="test-session",
+        )
+
+    def test_execute_dispatches_to_agent_hook(self, executor, sample_event):
+        """execute() routes AGENT type to _execute_agent_hook, not subprocess."""
+        from openhands.sdk.hooks.executor import HookResult
+
+        hook = HookDefinition(type="agent", prompt="Verify task completion")
+
+        with patch.object(
+            executor,
+            "_execute_agent_hook",
+            return_value=HookResult(
+                success=True, decision=HookDecision.ALLOW, reason="ok"
+            ),
+        ) as mock_agent:
+            result = executor.execute(hook, sample_event)
+            mock_agent.assert_called_once_with(hook, sample_event)
+
+        assert result.decision == HookDecision.ALLOW
+        assert result.should_continue
+
+    def test_no_llm_defaults_to_allow(self, executor_no_llm, sample_event):
+        """Agent hook with no LLM configured defaults to allow without error."""
+        hook = HookDefinition(type="agent", prompt="Check something")
+        result = executor_no_llm.execute(hook, sample_event)
+
+        assert result.success
+        assert result.decision == HookDecision.ALLOW
+        assert not result.blocked
+
+    def test_deny_decision_blocks(self, executor, sample_event):
+        """_execute_agent_hook parsing deny JSON produces a blocked HookResult."""
+        deny_json = '{"decision": "deny", "reason": "Tasks not complete"}'
+
+        with (
+            patch(self._AGENT_PATH),
+            patch(self._CONV_PATH) as mock_conv_cls,
+            patch(self._RESPONSE_PATH, return_value=deny_json),
+        ):
+            mock_conv_cls.return_value = MagicMock()
+            hook = HookDefinition(type="agent", prompt="Check tasks")
+            result = executor._execute_agent_hook(hook, sample_event)
+
+        assert result.blocked
+        assert result.decision == HookDecision.DENY
+        assert result.reason == "Tasks not complete"
+        assert not result.should_continue
+
+    def test_allow_decision_passes(self, executor, sample_event):
+        """_execute_agent_hook parsing allow JSON produces a non-blocking HookResult."""
+        allow_json = '{"decision": "allow", "reason": "Looks safe"}'
+
+        with (
+            patch(self._AGENT_PATH),
+            patch(self._CONV_PATH) as mock_conv_cls,
+            patch(self._RESPONSE_PATH, return_value=allow_json),
+        ):
+            mock_conv_cls.return_value = MagicMock()
+            hook = HookDefinition(type="agent")
+            result = executor._execute_agent_hook(hook, sample_event)
+
+        assert not result.blocked
+        assert result.decision == HookDecision.ALLOW
+        assert result.reason == "Looks safe"
+
+    def test_invalid_json_defaults_to_allow(self, executor, sample_event):
+        """_execute_agent_hook with non-JSON LLM response defaults to allow."""
+        with (
+            patch(self._AGENT_PATH),
+            patch(self._CONV_PATH) as mock_conv_cls,
+            patch(self._RESPONSE_PATH, return_value="I think you should allow this."),
+        ):
+            mock_conv_cls.return_value = MagicMock()
+            hook = HookDefinition(type="agent")
+            result = executor._execute_agent_hook(hook, sample_event)
+
+        assert not result.blocked
+        assert result.decision == HookDecision.ALLOW
+        assert result.should_continue
+
+    def test_sub_conversation_failure_defaults_to_allow(self, executor, sample_event):
+        """_execute_agent_hook when sub-conversation raises defaults to allow."""
+        with (
+            patch(self._AGENT_PATH),
+            patch(self._CONV_PATH, side_effect=RuntimeError("workspace error")),
+        ):
+            hook = HookDefinition(type="agent")
+            result = executor._execute_agent_hook(hook, sample_event)
+
+        assert not result.blocked
+        assert result.decision == HookDecision.ALLOW
+        assert result.error is not None
+
+    def test_empty_response_defaults_to_allow(self, executor, sample_event):
+        """_execute_agent_hook when agent produces no response defaults to allow."""
+        with (
+            patch(self._AGENT_PATH),
+            patch(self._CONV_PATH) as mock_conv_cls,
+            patch(self._RESPONSE_PATH, return_value=""),
+        ):
+            mock_conv_cls.return_value = MagicMock()
+            hook = HookDefinition(type="agent")
+            result = executor._execute_agent_hook(hook, sample_event)
+
+        assert not result.blocked
+        assert result.decision == HookDecision.ALLOW
+        assert result.success is False
+
+    def test_timeout_enforced(self, executor, sample_event):
+        """Agent hook respects hook.timeout via ThreadPoolExecutor."""
+        import time
+
+        def slow_run():
+            time.sleep(10)
+
+        with (
+            patch(self._AGENT_PATH),
+            patch(self._CONV_PATH) as mock_conv_cls,
+        ):
+            mock_conv = MagicMock()
+            mock_conv.run = slow_run
+            mock_conv_cls.return_value = mock_conv
+
+            hook = HookDefinition(type="agent", timeout=1)
+            start = time.time()
+            result = executor._execute_agent_hook(hook, sample_event)
+            elapsed = time.time() - start
+
+        assert elapsed < 5
+        assert not result.blocked
+        assert result.decision == HookDecision.ALLOW
+        assert "timed out" in (result.reason or "")
+
+    def test_metrics_are_isolated_from_parent_llm(self, executor, sample_event):
+        """model_copy + reset_metrics ensures hook tokens don't pollute parent LLM."""
+        captured_llm = {}
+
+        def capture_agent_init(**kwargs):
+            captured_llm["llm"] = kwargs.get("llm")
+            return MagicMock()
+
+        with (
+            patch(self._AGENT_PATH, side_effect=capture_agent_init),
+            patch(self._CONV_PATH, side_effect=RuntimeError("stop early")),
+        ):
+            hook = HookDefinition(type="agent")
+            executor._execute_agent_hook(hook, sample_event)
+
+        hook_llm = captured_llm.get("llm")
+        assert hook_llm is not None
+        assert hook_llm is not executor.llm
+        assert hook_llm.usage_id == "hook-agent-evaluator"
+
+
+class TestPromptHookNotImplemented:
+    """HookType.PROMPT is a future stub — execution defaults to allow, never crashes."""
+
+    @pytest.fixture
+    def executor(self, tmp_path):
+        return HookExecutor(working_dir=str(tmp_path))
+
+    @pytest.fixture
+    def sample_event(self):
+        return HookEvent(event_type=HookEventType.PRE_TOOL_USE, tool_name="BashTool")
+
+    def test_prompt_hook_defaults_to_allow(self, executor, sample_event):
+        """Executing a PROMPT hook returns allow instead of crashing."""
+        hook = HookDefinition(type="prompt")
+        result = executor.execute(hook, sample_event)
+        assert result.decision == HookDecision.ALLOW
+        assert result.success is False
+        assert "not yet implemented" in (result.reason or "")
+
+    def test_prompt_hook_does_not_block(self, executor, sample_event):
+        """PROMPT hook must not block the operation while unimplemented."""
+        hook = HookDefinition(type="prompt")
+        result = executor.execute(hook, sample_event)
+        assert result.blocked is False
+        assert result.should_continue is True
+
+    def test_prompt_hook_without_command_validates(self):
+        """PROMPT hook with no command is valid at config time (future use)."""
+        hook = HookDefinition(type="prompt")
+        assert hook.command is None

From 6e6406c0fa8d86509e1b4e6789cd5728f4ad0bfe Mon Sep 17 00:00:00 2001
From: "burak.keles" <burak.keles@ode.al>
Date: Thu, 30 Apr 2026 16:48:02 +0300
Subject: [PATCH 02/12] Add display_command property to HookDefinition for
 improved hook labeling

---
 openhands-sdk/openhands/sdk/hooks/config.py | 13 +++++----
 tests/sdk/hooks/test_config.py              | 31 +++++++++++++++++++++
 2 files changed, 39 insertions(+), 5 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/hooks/config.py b/openhands-sdk/openhands/sdk/hooks/config.py
index de0f1105ce..c13dfd5e26 100644
--- a/openhands-sdk/openhands/sdk/hooks/config.py
+++ b/openhands-sdk/openhands/sdk/hooks/config.py
@@ -48,6 +48,7 @@ class HookDefinition(BaseModel):
     """A single hook definition."""
 
     type: HookType = HookType.COMMAND
+    name: str | None = None
     command: str | None = None
     prompt: str | None = None
     timeout: int = 60
@@ -74,11 +75,13 @@ def _validate_type_fields(self) -> "HookDefinition":
     @property
     def display_command(self) -> str:
         """Human-readable label for this hook used in logs and events."""
-        return (
-            self.command
-            if self.command is not None
-            else f"agent-hook:{self.type.value}"
-        )
+        if self.command is not None:
+            return self.command
+        if self.name is not None:
+            return f"agent-hook:{self.name}"
+        if self.prompt:
+            return f"agent-hook:{self.prompt[:20]}"
+        return "agent-hook:agent"
 
 class HookMatcher(BaseModel):
     """Matches events to hooks based on patterns.
diff --git a/tests/sdk/hooks/test_config.py b/tests/sdk/hooks/test_config.py
index d5059fbd82..568e8ccfab 100644
--- a/tests/sdk/hooks/test_config.py
+++ b/tests/sdk/hooks/test_config.py
@@ -37,6 +37,37 @@ def test_agent_hook_valid_without_prompt(self):
         assert h.type == HookType.AGENT
         assert h.prompt is None
 
+    def test_display_command_uses_command_for_command_hooks(self):
+        h = HookDefinition(command="block.sh")
+        assert h.display_command == "block.sh"
+
+    def test_display_command_uses_name_when_set(self):
+        h = HookDefinition(type="agent", name="block-deletions", prompt="Block rm -rf")
+        assert h.display_command == "agent-hook:block-deletions"
+
+    def test_display_command_falls_back_to_prompt_prefix(self):
+        h = HookDefinition(type="agent", prompt="Block network calls to external IPs")
+        assert h.display_command == "agent-hook:Block network calls "
+
+    def test_display_command_truncates_long_prompt(self):
+        long_prompt = "A" * 100
+        h = HookDefinition(type="agent", prompt=long_prompt)
+        assert h.display_command == f"agent-hook:{'A' * 20}"
+
+    def test_display_command_fallback_when_no_name_no_prompt(self):
+        h = HookDefinition(type="agent")
+        assert h.display_command == "agent-hook:agent"
+
+    def test_three_hooks_are_distinguishable(self):
+        """Multiple agent hooks without names get unique labels via prompt prefix."""
+        hooks = [
+            HookDefinition(type="agent", name="block-deletions", prompt="Block rm -rf"),
+            HookDefinition(type="agent", prompt="Block network calls to external IPs"),
+            HookDefinition(type="agent", prompt="Verify all tasks are complete"),
+        ]
+        labels = [h.display_command for h in hooks]
+        assert len(set(labels)) == 3  # all distinct
+
     def test_agent_hook_rejects_command_field(self):
         """AGENT type must not have command set alongside it."""
         with pytest.raises(

From 7eb38715f0d7efc0d39c0a0f62b0b7b71826640e Mon Sep 17 00:00:00 2001
From: "burak.keles" <burak.keles@ode.al>
Date: Sat, 9 May 2026 18:23:26 +0300
Subject: [PATCH 03/12] Address PR review feedback on agent hook implementation

  - Replace hardcoded system prompt with user-configurable system_prompt field
  - Replace prompt field with system_prompt on HookDefinition
  - Make tools configurable via hook.tools (default empty list)
  - Thread persistence_dir and visualizer through HookManager and create_hook_callback
  - Add max_iterations field to HookDefinition; remove hardcoded 10
  - Inject event payload into user message so agent always has context to evaluate
  - Call conversation.pause() before pool.shutdown on timeout for safe cleanup
  - Change HookType to StrEnum
  - Move test_config.py tests from classes to module-level functions with parametrize
---
 .../conversation/impl/local_conversation.py   |   2 +
 openhands-sdk/openhands/sdk/hooks/config.py   |  10 +-
 .../openhands/sdk/hooks/conversation_hooks.py |   8 +
 openhands-sdk/openhands/sdk/hooks/executor.py |  45 ++---
 openhands-sdk/openhands/sdk/hooks/manager.py  |  10 +-
 tests/sdk/hooks/test_config.py                | 168 ++++++++----------
 tests/sdk/hooks/test_executor.py              |   6 +-
 7 files changed, 122 insertions(+), 127 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py
index 279794f223..047acc8e16 100644
--- a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py
+++ b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py
@@ -566,6 +566,8 @@ def _ensure_plugins_loaded(self) -> None:
                 session_id=str(self._state.id),
                 original_callback=self._base_callback,
                 llm=self.agent.llm,
+                persistence_dir=self._state.persistence_dir,
+                visualizer=self._visualizer,
             )
             self._hook_processor.set_conversation_state(self._state)
             self._hook_processor.run_session_start()
diff --git a/openhands-sdk/openhands/sdk/hooks/config.py b/openhands-sdk/openhands/sdk/hooks/config.py
index c13dfd5e26..673f8ca7cc 100644
--- a/openhands-sdk/openhands/sdk/hooks/config.py
+++ b/openhands-sdk/openhands/sdk/hooks/config.py
@@ -50,8 +50,10 @@ class HookDefinition(BaseModel):
     type: HookType = HookType.COMMAND
     name: str | None = None
     command: str | None = None
-    prompt: str | None = None
+    system_prompt: str | None = None
+    tools: list[str] = Field(default_factory=list)
     timeout: int = 60
+    max_iterations: int = 10
     async_: bool = Field(default=False, alias="async")  # 'async' is a reserved keyword
 
     model_config = {
@@ -66,7 +68,7 @@ def _validate_type_fields(self) -> "HookDefinition":
             raise ValueError("'prompt' is required when type is 'prompt'")
         if self.type == HookType.AGENT and self.command is not None:
             raise ValueError(
-                "'command' must not be set when type is 'agent'; use 'prompt' instead"
+                "'command' must not be set when type is 'agent'; use 'system_prompt' instead"
             )
         if self.type == HookType.AGENT and self.async_:
             raise ValueError("'async' is not supported for agent hooks")
@@ -79,8 +81,8 @@ def display_command(self) -> str:
             return self.command
         if self.name is not None:
             return f"agent-hook:{self.name}"
-        if self.prompt:
-            return f"agent-hook:{self.prompt[:20]}"
+        if self.system_prompt:
+            return f"agent-hook:{self.system_prompt[:20]}"
         return "agent-hook:agent"
 
 class HookMatcher(BaseModel):
diff --git a/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py b/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py
index a0438075ac..fa359e4afd 100644
--- a/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py
+++ b/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py
@@ -18,6 +18,8 @@
 from openhands.sdk.logger import get_logger
 
 
+from openhands.sdk.conversation.visualizer import ConversationVisualizerBase
+
 if TYPE_CHECKING:
     from openhands.sdk.conversation.state import ConversationState
     from openhands.sdk.llm import LLM
@@ -391,6 +393,8 @@ def create_hook_callback(
     original_callback: Any = None,
     emit_hook_events: bool = True,
     llm: "LLM | None" = None,
+    persistence_dir: str | None = None,
+    visualizer: type[ConversationVisualizerBase] | ConversationVisualizerBase | None = None,
 ) -> tuple[HookEventProcessor, Any]:
     """Create a hook-enabled event callback. Returns (processor, callback).
 
@@ -402,6 +406,8 @@ def create_hook_callback(
         emit_hook_events: If True, emit HookExecutionEvent for each hook execution.
             Defaults to True for full observability.
         llm: LLM instance inherited from the parent conversation, used by agent hooks.
+        persistence_dir: Directory used to persist agent hook sub-conversation events.
+        visualizer: Visualizer instance passed to agent hook sub-conversations.
 
     Returns:
         Tuple of (HookEventProcessor, callback function).
@@ -411,6 +417,8 @@ def create_hook_callback(
         working_dir=working_dir,
         session_id=session_id,
         llm=llm,
+        persistence_dir=persistence_dir,
+        visualizer=visualizer,
     )
 
     processor = HookEventProcessor(
diff --git a/openhands-sdk/openhands/sdk/hooks/executor.py b/openhands-sdk/openhands/sdk/hooks/executor.py
index 978f6354b6..71afbe513e 100644
--- a/openhands-sdk/openhands/sdk/hooks/executor.py
+++ b/openhands-sdk/openhands/sdk/hooks/executor.py
@@ -12,6 +12,7 @@
 import opentelemetry.context as otel_context
 from pydantic import BaseModel
 
+from openhands.sdk.conversation.visualizer import ConversationVisualizerBase
 from openhands.sdk.hooks.config import HookDefinition, HookType
 from openhands.sdk.hooks.types import HookDecision, HookEvent
 from openhands.sdk.observability.laminar import observe
@@ -153,10 +154,14 @@ def __init__(
         working_dir: str | None = None,
         async_process_manager: AsyncProcessManager | None = None,
         llm: "LLM | None" = None,
+        persistence_dir: str | None = None,
+        visualizer: type[ConversationVisualizerBase] | ConversationVisualizerBase | None = None,
     ):
         self.working_dir = working_dir or os.getcwd()
         self.async_process_manager = async_process_manager or AsyncProcessManager()
         self.llm = llm
+        self.persistence_dir = persistence_dir
+        self.visualizer = visualizer
 
     @observe(
         name="hook.execute.agent",
@@ -170,8 +175,8 @@ def _execute_agent_hook(
     ) -> HookResult:
         """Execute an agent-based hook by spawning a short-lived sub-conversation.
 
-        The sub-conversation inherits the parent LLM, runs with read-only tools
-        and no hooks (preventing recursion), and must return a JSON decision.
+        The sub-conversation inherits the parent LLM, runs with user-configured
+        tools and no hooks (preventing recursion), and must return a JSON decision.
         """
         # Lazy imports to avoid circular dependency:
         # executor <- manager <- conversation_hooks <- local_conversation -> executor
@@ -197,22 +202,6 @@ def _execute_agent_hook(
                 reason="No LLM configured for agent hook",
             )
 
-        no_policy = "No additional policy specified. Use your best judgment."
-        system_prompt = (
-            "You are a security policy evaluator for an AI coding agent. "
-            "Your only job is to decide whether an agent operation"
-            " should be allowed or denied.\n\n"
-            f"## Hook Event\n```json\n{event.model_dump_json(indent=2)}\n```\n\n"
-            "## Instructions\n"
-            "Use the available tools to investigate if needed (e.g. read files"
-            " in the working directory). Then call `finish` with a JSON string"
-            " as the message:\n"
-            '  {"decision": "allow" | "deny", "reason": "<brief explanation>"}\n\n'
-            "When in doubt, prefer `allow`. Only deny when the operation clearly"
-            " violates the policy below.\n\n"
-            f"## Policy\n{hook.prompt or no_policy}"
-        )
-
         # Isolated LLM copy — hook metrics not attributed to the parent conversation.
         hook_llm = self.llm.model_copy(update={"usage_id": "hook-agent-evaluator"})
         hook_llm.reset_metrics()
@@ -225,12 +214,9 @@ def _execute_agent_hook(
         try:
             agent = Agent(
                 llm=hook_llm,
-                tools=[
-                    Tool(name="GrepTool"),
-                    Tool(name="GlobTool"),
-                ],
+                tools=[Tool(name=t) for t in hook.tools],
                 include_default_tools=["FinishTool"],
-                system_prompt=system_prompt,
+                system_prompt=hook.system_prompt,
             )
             # hook_config=None disables all hooks in the sub-conversation (no recursion)
             conversation = LocalConversation(
@@ -238,12 +224,14 @@ def _execute_agent_hook(
                 workspace=self.working_dir,
                 plugins=None,
                 hook_config=None,
-                persistence_dir=None,
-                visualizer=None,
-                max_iteration_per_run=10,
+                persistence_dir=self.persistence_dir,
+                visualizer=self.visualizer,
+                max_iteration_per_run=hook.max_iterations,
             )
+            # Event payload is always injected — the agent needs it to evaluate.
             conversation.send_message(
-                f"Evaluate this {event_type} hook event and make your decision."
+                f"Evaluate this {event_type} hook event and make your decision.\n\n"
+                f"## Hook Event\n```json\n{event.model_dump_json(indent=2)}\n```"
             )
 
             def _run_with_context() -> None:
@@ -263,6 +251,7 @@ def _run_with_context() -> None:
                         f"Agent hook timed out after {hook.timeout}s"
                         f" for event '{event_type}' — defaulting to allow"
                     )
+                    conversation.pause()
                     return HookResult(
                         success=False,
                         decision=HookDecision.ALLOW,
@@ -272,8 +261,6 @@ def _run_with_context() -> None:
                         ),
                     )
             finally:
-                # Don't block waiting for the thread — the conversation may still be
-                # running but we've already returned or moved on.
                 pool.shutdown(wait=False)
             raw = get_agent_final_response(conversation.state.events)
         except Exception as e:
diff --git a/openhands-sdk/openhands/sdk/hooks/manager.py b/openhands-sdk/openhands/sdk/hooks/manager.py
index 7f7b50945f..a3c725e9a1 100644
--- a/openhands-sdk/openhands/sdk/hooks/manager.py
+++ b/openhands-sdk/openhands/sdk/hooks/manager.py
@@ -3,6 +3,7 @@
 import logging
 from typing import TYPE_CHECKING, Any
 
+from openhands.sdk.conversation.visualizer import ConversationVisualizerBase
 from openhands.sdk.hooks.config import HookConfig
 from openhands.sdk.hooks.executor import HookExecutor, HookResult
 from openhands.sdk.hooks.types import HookEvent, HookEventType
@@ -24,9 +25,16 @@ def __init__(
         working_dir: str | None = None,
         session_id: str | None = None,
         llm: "LLM | None" = None,
+        persistence_dir: str | None = None,
+        visualizer: type[ConversationVisualizerBase] | ConversationVisualizerBase | None = None,
     ):
         self.config = config or HookConfig.load(working_dir=working_dir)
-        self.executor = HookExecutor(working_dir=working_dir, llm=llm)
+        self.executor = HookExecutor(
+            working_dir=working_dir,
+            llm=llm,
+            persistence_dir=persistence_dir,
+            visualizer=visualizer,
+        )
         self.session_id = session_id
         self.working_dir = working_dir
 
diff --git a/tests/sdk/hooks/test_config.py b/tests/sdk/hooks/test_config.py
index 568e8ccfab..255283e010 100644
--- a/tests/sdk/hooks/test_config.py
+++ b/tests/sdk/hooks/test_config.py
@@ -10,97 +10,85 @@
 from openhands.sdk.hooks.types import HookEventType
 
 
-class TestHookDefinitionValidation:
-    """Tests for HookDefinition type-specific field validation."""
-
-    def test_command_hook_requires_command(self):
-        """COMMAND type must have a command field."""
-        with pytest.raises(Exception, match="'command' is required"):
-            HookDefinition(type="command")
-
-    def test_command_hook_valid(self):
-        """COMMAND type with command is valid."""
-        h = HookDefinition(command="echo hi")
-        assert h.type == HookType.COMMAND
-        assert h.command == "echo hi"
-
-    def test_agent_hook_valid_with_prompt(self):
-        """AGENT type with prompt (no command) is valid."""
-        h = HookDefinition(type="agent", prompt="Block writes to /etc")
-        assert h.type == HookType.AGENT
-        assert h.prompt == "Block writes to /etc"
-        assert h.command is None
-
-    def test_agent_hook_valid_without_prompt(self):
-        """AGENT type without prompt is valid (prompt is optional)."""
-        h = HookDefinition(type="agent")
-        assert h.type == HookType.AGENT
-        assert h.prompt is None
-
-    def test_display_command_uses_command_for_command_hooks(self):
-        h = HookDefinition(command="block.sh")
-        assert h.display_command == "block.sh"
-
-    def test_display_command_uses_name_when_set(self):
-        h = HookDefinition(type="agent", name="block-deletions", prompt="Block rm -rf")
-        assert h.display_command == "agent-hook:block-deletions"
-
-    def test_display_command_falls_back_to_prompt_prefix(self):
-        h = HookDefinition(type="agent", prompt="Block network calls to external IPs")
-        assert h.display_command == "agent-hook:Block network calls "
-
-    def test_display_command_truncates_long_prompt(self):
-        long_prompt = "A" * 100
-        h = HookDefinition(type="agent", prompt=long_prompt)
-        assert h.display_command == f"agent-hook:{'A' * 20}"
-
-    def test_display_command_fallback_when_no_name_no_prompt(self):
-        h = HookDefinition(type="agent")
-        assert h.display_command == "agent-hook:agent"
-
-    def test_three_hooks_are_distinguishable(self):
-        """Multiple agent hooks without names get unique labels via prompt prefix."""
-        hooks = [
-            HookDefinition(type="agent", name="block-deletions", prompt="Block rm -rf"),
-            HookDefinition(type="agent", prompt="Block network calls to external IPs"),
-            HookDefinition(type="agent", prompt="Verify all tasks are complete"),
+def test_command_hook_requires_command():
+    with pytest.raises(Exception, match="'command' is required"):
+        HookDefinition(type="command")
+
+
+def test_command_hook_valid():
+    h = HookDefinition(command="echo hi")
+    assert h.type == HookType.COMMAND
+    assert h.command == "echo hi"
+
+
+@pytest.mark.parametrize(
+    "kwargs",
+    [
+        {"type": "agent", "system_prompt": "Block writes to /etc"},
+        {"type": "agent"},
+    ],
+    ids=["with-system-prompt", "without-system-prompt"],
+)
+def test_agent_hook_valid(kwargs):
+    h = HookDefinition(**kwargs)
+    assert h.type == HookType.AGENT
+
+
+@pytest.mark.parametrize(
+    "kwargs,expected",
+    [
+        ({"command": "block.sh"}, "block.sh"),
+        ({"type": "agent", "name": "block-deletions", "system_prompt": "Block rm -rf"}, "agent-hook:block-deletions"),
+        ({"type": "agent", "system_prompt": "Block network calls to external IPs"}, "agent-hook:Block network calls "),
+        ({"type": "agent", "system_prompt": "A" * 100}, f"agent-hook:{'A' * 20}"),
+        ({"type": "agent"}, "agent-hook:agent"),
+    ],
+    ids=["command", "agent-named", "agent-prompt-prefix", "agent-prompt-truncated", "agent-fallback"],
+)
+def test_display_command(kwargs, expected):
+    h = HookDefinition(**kwargs)
+    assert h.display_command == expected
+
+
+def test_multiple_agent_hooks_are_distinguishable():
+    hooks = [
+        HookDefinition(type="agent", name="block-deletions", system_prompt="Block rm -rf"),
+        HookDefinition(type="agent", system_prompt="Block network calls to external IPs"),
+        HookDefinition(type="agent", system_prompt="Verify all tasks are complete"),
+    ]
+    assert len({h.display_command for h in hooks}) == 3
+
+
+@pytest.mark.parametrize(
+    "kwargs,match",
+    [
+        ({"type": "agent", "command": "echo hi"}, "'command' must not be set when type is 'agent'"),
+        ({"type": "command"}, "'command' is required"),
+    ],
+    ids=["agent-rejects-command", "command-requires-command"],
+)
+def test_hook_definition_validation_errors(kwargs, match):
+    with pytest.raises(Exception, match=match):
+        HookDefinition(**kwargs)
+
+
+def test_agent_hook_rejects_async():
+    with pytest.raises(Exception, match="not supported for agent hooks"):
+        HookDefinition.model_validate({"type": "agent", "async": True})
+
+
+def test_agent_hook_from_json():
+    data = {
+        "stop": [
+            {"hooks": [{"type": "agent", "system_prompt": "Verify all tasks are done", "timeout": 30}]}
         ]
-        labels = [h.display_command for h in hooks]
-        assert len(set(labels)) == 3  # all distinct
-
-    def test_agent_hook_rejects_command_field(self):
-        """AGENT type must not have command set alongside it."""
-        with pytest.raises(
-            Exception, match="'command' must not be set when type is 'agent'"
-        ):
-            HookDefinition(type="agent", command="echo hi")
-
-    def test_agent_hook_rejects_async(self):
-        """AGENT type does not support async_=True."""
-        with pytest.raises(Exception, match="not supported for agent hooks"):
-            HookDefinition.model_validate({"type": "agent", "async": True})
-
-    def test_agent_hook_from_json(self):
-        """AGENT hook can be loaded from JSON config."""
-        data = {
-            "stop": [
-                {
-                    "hooks": [
-                        {
-                            "type": "agent",
-                            "prompt": "Verify all tasks are done",
-                            "timeout": 30,
-                        }
-                    ]
-                }
-            ]
-        }
-        config = HookConfig.from_dict(data)
-        hooks = config.get_hooks_for_event(HookEventType.STOP)
-        assert len(hooks) == 1
-        assert hooks[0].type == HookType.AGENT
-        assert hooks[0].prompt == "Verify all tasks are done"
-        assert hooks[0].timeout == 30
+    }
+    config = HookConfig.from_dict(data)
+    hooks = config.get_hooks_for_event(HookEventType.STOP)
+    assert len(hooks) == 1
+    assert hooks[0].type == HookType.AGENT
+    assert hooks[0].system_prompt == "Verify all tasks are done"
+    assert hooks[0].timeout == 30
 
 
 class TestHookMatcher:
diff --git a/tests/sdk/hooks/test_executor.py b/tests/sdk/hooks/test_executor.py
index 69e6323b68..564050f12d 100644
--- a/tests/sdk/hooks/test_executor.py
+++ b/tests/sdk/hooks/test_executor.py
@@ -459,7 +459,7 @@ def test_execute_dispatches_to_agent_hook(self, executor, sample_event):
         """execute() routes AGENT type to _execute_agent_hook, not subprocess."""
         from openhands.sdk.hooks.executor import HookResult
 
-        hook = HookDefinition(type="agent", prompt="Verify task completion")
+        hook = HookDefinition(type="agent", system_prompt="Verify task completion")
 
         with patch.object(
             executor,
@@ -476,7 +476,7 @@ def test_execute_dispatches_to_agent_hook(self, executor, sample_event):
 
     def test_no_llm_defaults_to_allow(self, executor_no_llm, sample_event):
         """Agent hook with no LLM configured defaults to allow without error."""
-        hook = HookDefinition(type="agent", prompt="Check something")
+        hook = HookDefinition(type="agent", system_prompt="Check something")
         result = executor_no_llm.execute(hook, sample_event)
 
         assert result.success
@@ -493,7 +493,7 @@ def test_deny_decision_blocks(self, executor, sample_event):
             patch(self._RESPONSE_PATH, return_value=deny_json),
         ):
             mock_conv_cls.return_value = MagicMock()
-            hook = HookDefinition(type="agent", prompt="Check tasks")
+            hook = HookDefinition(type="agent", system_prompt="Check tasks")
             result = executor._execute_agent_hook(hook, sample_event)
 
         assert result.blocked

From c166503bf995ba39a3ce85e59f9fa98c1028d88e Mon Sep 17 00:00:00 2001
From: "burak.keles" <burak.keles@ode.al>
Date: Thu, 21 May 2026 13:01:22 +0300
Subject: [PATCH 04/12] Introduce agent-based hooks example and improve agent
 hook execution

- Add `49_agent_hooks` example demoing `type="agent"` hooks for lifecycle events.
  - Includes security and quality review use cases with detailed README.
- Refactor hook executor for enhanced JSON parsing and timeout handling.
  - Simplify sub-conversation logic with `_allow` and `_parse_decision` helpers.
  - Improve robustness against non-standard LLM responses like fenced JSON or chatter.
- Make hook LLM copy respect `hook.timeout` and ensure usage metrics isolation.
- Extend test coverage for various hook scenarios, including agent LLM configuration.
---
 .../49_agent_hooks/49_agent_hooks.py          | 166 ++++++++++++++++++
 .../49_agent_hooks/README.md                  |  60 +++++++
 openhands-sdk/openhands/sdk/hooks/config.py   |   2 +-
 openhands-sdk/openhands/sdk/hooks/executor.py | 146 +++++++--------
 tests/sdk/hooks/test_executor.py              | 144 ++++++++++++---
 5 files changed, 416 insertions(+), 102 deletions(-)
 create mode 100644 examples/01_standalone_sdk/49_agent_hooks/49_agent_hooks.py
 create mode 100644 examples/01_standalone_sdk/49_agent_hooks/README.md

diff --git a/examples/01_standalone_sdk/49_agent_hooks/49_agent_hooks.py b/examples/01_standalone_sdk/49_agent_hooks/49_agent_hooks.py
new file mode 100644
index 0000000000..7dbb5e6ef4
--- /dev/null
+++ b/examples/01_standalone_sdk/49_agent_hooks/49_agent_hooks.py
@@ -0,0 +1,166 @@
+"""OpenHands Agent SDK — Agent-based Hooks Example
+
+Demonstrates the `type="agent"` hook, which evaluates lifecycle events with an
+LLM-driven sub-agent instead of a shell script. The hook agent receives the
+event JSON, reasons about it semantically, and replies with a decision payload:
+
+    {"decision": "allow" | "deny", "reason": "..."}
+
+Two demos:
+
+- PreToolUse (security reviewer): inspects the INTENT of a terminal command,
+  not just its syntax. A command like `awk '{print}' /etc/passwd` would slip
+  past a blacklist of command names, but the agent hook recognises it as
+  reading a sensitive system file and denies it.
+- Stop (quality reviewer): refuses to let the main agent finish until the
+  required deliverable exists in the workspace.
+
+For shell-command hooks see ../33_hooks/33_hooks.py.
+"""
+
+import os
+import tempfile
+from pathlib import Path
+
+from pydantic import SecretStr
+
+from openhands.sdk import LLM, Conversation
+from openhands.sdk.event.hook_execution import HookExecutionEvent
+from openhands.sdk.hooks import HookConfig, HookDefinition, HookMatcher, HookType
+from openhands.tools.preset.default import get_default_agent
+
+
+# Configure LLM
+api_key = os.getenv("LLM_API_KEY")
+assert api_key is not None, "LLM_API_KEY environment variable is not set."
+model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929")
+base_url = os.getenv("LLM_BASE_URL")
+
+llm = LLM(
+    usage_id="agent",
+    model=model,
+    base_url=base_url,
+    api_key=SecretStr(api_key),
+)
+
+SECURITY_REVIEWER_PROMPT = """You are a security reviewer for a coding agent.
+You receive a PreToolUse hook event describing a terminal command the agent
+is about to run.
+
+Reason about the SEMANTIC INTENT of the command, not just its syntax. Block:
+- reading sensitive system files (/etc/passwd, /etc/shadow, ~/.ssh/*, credentials)
+- broad filesystem discovery outside the workspace (rg --files /etc, find /, ls /root)
+- exfiltrating data over the network
+- modifying system state (sudo, apt install, systemctl)
+
+Allow ordinary inspection, builds, tests, and edits inside the workspace.
+When unsure, prefer allow.
+
+Reply with a single JSON object and nothing else:
+{"decision": "allow" | "deny", "reason": "<short explanation>"}
+"""
+
+QUALITY_REVIEWER_PROMPT = """You are a quality reviewer enforcing task completion.
+You receive a Stop hook event when the main agent tries to finish.
+
+The task requires the file REPORT.md to exist in the workspace and contain at
+least one bullet point describing the repository. Use the file_editor tool to
+check whether the file exists and inspect its contents.
+
+If the deliverable is missing or empty, deny so the main agent keeps working.
+Otherwise allow.
+
+Reply with a single JSON object and nothing else:
+{"decision": "allow" | "deny", "reason": "<short explanation>"}
+"""
+
+
+def hook_logger(event) -> None:
+    """Surface each hook decision so the demo output is self-explanatory."""
+    if not isinstance(event, HookExecutionEvent):
+        return
+    status = "DENY " if event.blocked else ("ALLOW" if event.success else "FAIL ")
+    line = f"  [hook] {event.hook_event_type} {status} -> {event.hook_command}"
+    if event.reason:
+        line += f"\n         reason: {event.reason}"
+    print(line)
+
+
+with tempfile.TemporaryDirectory() as tmpdir:
+    workspace = Path(tmpdir)
+
+    hook_config = HookConfig(
+        pre_tool_use=[
+            HookMatcher(
+                matcher="terminal",
+                hooks=[
+                    HookDefinition(
+                        type=HookType.AGENT,
+                        name="security-reviewer",
+                        system_prompt=SECURITY_REVIEWER_PROMPT,
+                        timeout=60,
+                        max_iterations=3,
+                    )
+                ],
+            )
+        ],
+        stop=[
+            HookMatcher(
+                hooks=[
+                    HookDefinition(
+                        type=HookType.AGENT,
+                        name="quality-reviewer",
+                        system_prompt=QUALITY_REVIEWER_PROMPT,
+                        tools=["file_editor"],
+                        timeout=90,
+                        max_iterations=5,
+                    )
+                ],
+            )
+        ],
+    )
+
+    agent = get_default_agent(llm=llm)
+    conversation = Conversation(
+        agent=agent,
+        workspace=str(workspace),
+        hook_config=hook_config,
+        callbacks=[hook_logger],
+    )
+
+    print("=" * 60)
+    print("Demo 1: PreToolUse — semantic deny")
+    print("=" * 60)
+    print(
+        "Asking the agent to read /etc/passwd via awk. The literal command\n"
+        "wouldn't match a syntactic blacklist (no `cat`, no `/etc/shadow`\n"
+        "keyword), but the security-reviewer agent should recognise the\n"
+        "intent and deny.\n"
+    )
+    conversation.send_message(
+        "Show me the contents of /etc/passwd using awk '{print}'."
+    )
+    conversation.run()
+
+    print("\n" + "=" * 60)
+    print("Demo 2: Stop — deny until deliverable exists")
+    print("=" * 60)
+    print(
+        "Quality reviewer denies until REPORT.md exists with a bullet point.\n"
+    )
+    conversation.send_message(
+        "Write REPORT.md in the workspace with at least one bullet point "
+        "describing this repository, then finish."
+    )
+    conversation.run()
+
+    report = workspace / "REPORT.md"
+    if report.exists():
+        print(f"\n[REPORT.md preview: {report.read_text()[:120]!r}...]")
+
+    print("\n" + "=" * 60)
+    print("Example Complete!")
+    print("=" * 60)
+
+    cost = conversation.conversation_stats.get_combined_metrics().accumulated_cost
+    print(f"\nEXAMPLE_COST: {cost}")
\ No newline at end of file
diff --git a/examples/01_standalone_sdk/49_agent_hooks/README.md b/examples/01_standalone_sdk/49_agent_hooks/README.md
new file mode 100644
index 0000000000..4a354c1b26
--- /dev/null
+++ b/examples/01_standalone_sdk/49_agent_hooks/README.md
@@ -0,0 +1,60 @@
+# Agent-based Hooks Example
+
+This folder demonstrates the `type="agent"` hook — a lifecycle hook whose
+decision is produced by an LLM-driven sub-agent rather than a shell script.
+
+For shell-command hooks see [`../33_hooks/`](../33_hooks).
+
+## Why an agent hook
+
+A shell-based PreToolUse hook can only block what its blacklist literally
+matches. The agent rewrites `cat /etc/passwd` as `awk '{print}' /etc/passwd`
+and slips through. An agent hook reasons about the **semantic intent** of the
+command — "reading a sensitive system file" — and denies regardless of the
+exact tool name used.
+
+## Example
+
+- **49_agent_hooks.py** — Two agent hooks wired onto a single conversation:
+  - **PreToolUse** "security reviewer" denies a command whose intent is to
+    read `/etc/passwd`, even though no obvious keyword appears in a blacklist.
+  - **Stop** "quality reviewer" refuses to let the main agent finish until
+    the required deliverable (`REPORT.md`) is present in the workspace.
+
+Each hook decision is printed to the console via a `HookExecutionEvent`
+callback, so you can watch the allow/deny outcomes as the demo runs.
+
+## Running
+
+```bash
+export LLM_API_KEY="your-key"
+export LLM_MODEL="anthropic/claude-sonnet-4-5-20250929"  # optional
+export LLM_BASE_URL="https://your-endpoint"              # optional
+
+python 49_agent_hooks.py
+```
+
+## How an agent hook is configured
+
+```python
+HookDefinition(
+    type=HookType.AGENT,
+    name="security-reviewer",      # bucket for cost metrics (hook-agent:<name>)
+    system_prompt="...",           # instructs the hook agent; must request JSON
+    tools=["file_editor"],         # optional tools the hook agent may use
+                                   # (use registered names, e.g. "file_editor",
+                                   # "terminal" — not class names like
+                                   # "FileEditorTool")
+    timeout=60,                    # forwarded to the per-hook LLM copy
+    max_iterations=3,              # cap on hook sub-conversation steps
+)
+```
+
+The hook agent receives the event JSON and must reply with:
+
+```json
+{"decision": "allow" | "deny", "reason": "<short explanation>"}
+```
+
+Anything else (non-JSON, missing field, sub-conversation error) defaults to
+`allow` so a broken hook cannot wedge the main agent.
diff --git a/openhands-sdk/openhands/sdk/hooks/config.py b/openhands-sdk/openhands/sdk/hooks/config.py
index 673f8ca7cc..7efe68501d 100644
--- a/openhands-sdk/openhands/sdk/hooks/config.py
+++ b/openhands-sdk/openhands/sdk/hooks/config.py
@@ -53,7 +53,7 @@ class HookDefinition(BaseModel):
     system_prompt: str | None = None
     tools: list[str] = Field(default_factory=list)
     timeout: int = 60
-    max_iterations: int = 10
+    max_iterations: int = 3
     async_: bool = Field(default=False, alias="async")  # 'async' is a reserved keyword
 
     model_config = {
diff --git a/openhands-sdk/openhands/sdk/hooks/executor.py b/openhands-sdk/openhands/sdk/hooks/executor.py
index 71afbe513e..1178065272 100644
--- a/openhands-sdk/openhands/sdk/hooks/executor.py
+++ b/openhands-sdk/openhands/sdk/hooks/executor.py
@@ -1,6 +1,5 @@
 """Hook executor - runs shell commands and agent evaluations with JSON I/O."""
 
-import concurrent.futures
 import json
 import logging
 import os
@@ -9,7 +8,6 @@
 import time
 from typing import TYPE_CHECKING
 
-import opentelemetry.context as otel_context
 from pydantic import BaseModel
 
 from openhands.sdk.conversation.visualizer import ConversationVisualizerBase
@@ -163,6 +161,20 @@ def __init__(
         self.persistence_dir = persistence_dir
         self.visualizer = visualizer
 
+    def _allow(
+        self,
+        reason: str,
+        *,
+        success: bool = False,
+        error: str | None = None,
+    ) -> HookResult:
+        return HookResult(
+            success=success,
+            decision=HookDecision.ALLOW,
+            reason=reason,
+            error=error,
+        )
+
     @observe(
         name="hook.execute.agent",
         ignore_inputs=["self", "hook", "event"],
@@ -173,11 +185,6 @@ def _execute_agent_hook(
         hook: HookDefinition,
         event: HookEvent,
     ) -> HookResult:
-        """Execute an agent-based hook by spawning a short-lived sub-conversation.
-
-        The sub-conversation inherits the parent LLM, runs with user-configured
-        tools and no hooks (preventing recursion), and must return a JSON decision.
-        """
         # Lazy imports to avoid circular dependency:
         # executor <- manager <- conversation_hooks <- local_conversation -> executor
         from openhands.sdk.agent import Agent  # type: ignore[attr-defined]
@@ -196,19 +203,14 @@ def _execute_agent_hook(
                 f"Agent hook has no LLM configured for event '{event_type}'"
                 " — defaulting to allow"
             )
-            return HookResult(
-                success=True,
-                decision=HookDecision.ALLOW,
-                reason="No LLM configured for agent hook",
-            )
-
-        # Isolated LLM copy — hook metrics not attributed to the parent conversation.
-        hook_llm = self.llm.model_copy(update={"usage_id": "hook-agent-evaluator"})
-        hook_llm.reset_metrics()
+            return self._allow("No LLM configured for agent hook")
 
-        # Capture the current OTel span context so the sub-conversation thread
-        # becomes a child of this span rather than an orphaned trace.
-        parent_ctx = otel_context.get_current()
+        hook_llm = self.llm.model_copy(
+            update={
+                "usage_id": f"hook-agent:{hook.name or 'default'}",
+                "timeout": hook.timeout,
+            }
+        )
 
         conversation = None
         try:
@@ -218,7 +220,7 @@ def _execute_agent_hook(
                 include_default_tools=["FinishTool"],
                 system_prompt=hook.system_prompt,
             )
-            # hook_config=None disables all hooks in the sub-conversation (no recursion)
+            # hook_config=None disables hooks in the sub-conversation (no recursion)
             conversation = LocalConversation(
                 agent=agent,
                 workspace=self.working_dir,
@@ -228,93 +230,79 @@ def _execute_agent_hook(
                 visualizer=self.visualizer,
                 max_iteration_per_run=hook.max_iterations,
             )
-            # Event payload is always injected — the agent needs it to evaluate.
             conversation.send_message(
                 f"Evaluate this {event_type} hook event and make your decision.\n\n"
                 f"## Hook Event\n```json\n{event.model_dump_json(indent=2)}\n```"
             )
-
-            def _run_with_context() -> None:
-                token = otel_context.attach(parent_ctx)
-                try:
-                    conversation.run()
-                finally:
-                    otel_context.detach(token)
-
-            pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
-            try:
-                future = pool.submit(_run_with_context)
-                try:
-                    future.result(timeout=hook.timeout)
-                except concurrent.futures.TimeoutError:
-                    logger.warning(
-                        f"Agent hook timed out after {hook.timeout}s"
-                        f" for event '{event_type}' — defaulting to allow"
-                    )
-                    conversation.pause()
-                    return HookResult(
-                        success=False,
-                        decision=HookDecision.ALLOW,
-                        reason=(
-                            f"Agent hook timed out after {hook.timeout} seconds"
-                            " — defaulting to allow"
-                        ),
-                    )
-            finally:
-                pool.shutdown(wait=False)
+            conversation.run()
             raw = get_agent_final_response(conversation.state.events)
         except Exception as e:
             logger.warning(
                 f"Agent hook sub-conversation failed for event '{event_type}'"
                 f" — defaulting to allow: {e}"
             )
-            return HookResult(
-                success=False,
-                decision=HookDecision.ALLOW,
-                reason="Agent hook execution failed — defaulting to allow",
+            return self._allow(
+                "Agent hook execution failed — defaulting to allow",
                 error=str(e),
             )
         finally:
             if conversation is not None:
                 conversation.close()
 
+        return self._parse_decision(raw, event_type)
+
+    _JSON_DECODER = json.JSONDecoder()
+
+    def _extract_first_json_object(self, text: str) -> dict | None:
+        # Walks the string and asks the JSON decoder to parse from each '{'.
+        # Handles bare JSON, ```json ... ``` fences, prose prefixes/suffixes,
+        # and mixed combinations — anything LLMs add around the actual object.
+        for i, ch in enumerate(text):
+            if ch != "{":
+                continue
+            try:
+                obj, _ = self._JSON_DECODER.raw_decode(text[i:])
+            except json.JSONDecodeError:
+                continue
+            if isinstance(obj, dict):
+                return obj
+        return None
+
+    def _parse_decision(self, raw: str, event_type: str) -> HookResult:
         if not raw:
             logger.warning(
                 f"Agent hook produced no final response for event '{event_type}'"
                 " — defaulting to allow"
             )
-            return HookResult(
-                success=False,
-                decision=HookDecision.ALLOW,
-                reason="Agent hook produced no final response — defaulting to allow",
+            return self._allow(
+                "Agent hook produced no final response — defaulting to allow"
             )
 
-        try:
-            data = json.loads(raw)
-            decision_str = str(data.get("decision", "allow")).lower()
-            reason = str(data.get("reason", ""))
-            if decision_str == "deny":
-                return HookResult(
-                    success=True,
-                    blocked=True,
-                    decision=HookDecision.DENY,
-                    reason=reason,
-                )
-            return HookResult(
-                success=True,
-                decision=HookDecision.ALLOW,
-                reason=reason,
-            )
-        except (json.JSONDecodeError, AttributeError):
+        data = self._extract_first_json_object(raw)
+        if data is None:
             logger.warning(
-                f"Agent hook returned non-JSON response for event '{event_type}'"
-                f" — defaulting to allow: {repr(raw)[:200]}"
+                f"Agent hook returned no parseable JSON object for event"
+                f" '{event_type}' — defaulting to allow: {repr(raw)[:200]}"
+            )
+            return self._allow(
+                "Agent hook returned no parseable JSON — defaulting to allow",
+                success=True,
             )
+
+        decision_str = str(data.get("decision", "allow")).lower()
+        reason = str(data.get("reason", ""))
+        if decision_str == "deny":
             return HookResult(
                 success=True,
-                decision=HookDecision.ALLOW,
-                reason="Agent hook returned non-JSON response — defaulting to allow",
+                blocked=True,
+                decision=HookDecision.DENY,
+                reason=reason,
             )
+        return HookResult(
+            success=True,
+            decision=HookDecision.ALLOW,
+            reason=reason,
+        )
 
     def execute(
         self,
diff --git a/tests/sdk/hooks/test_executor.py b/tests/sdk/hooks/test_executor.py
index 564050f12d..530a8278a7 100644
--- a/tests/sdk/hooks/test_executor.py
+++ b/tests/sdk/hooks/test_executor.py
@@ -479,7 +479,7 @@ def test_no_llm_defaults_to_allow(self, executor_no_llm, sample_event):
         hook = HookDefinition(type="agent", system_prompt="Check something")
         result = executor_no_llm.execute(hook, sample_event)
 
-        assert result.success
+        assert not result.success
         assert result.decision == HookDecision.ALLOW
         assert not result.blocked
 
@@ -518,6 +518,88 @@ def test_allow_decision_passes(self, executor, sample_event):
         assert result.decision == HookDecision.ALLOW
         assert result.reason == "Looks safe"
 
+    def test_markdown_wrapped_deny_is_parsed(self, executor, sample_event):
+        """LLMs commonly wrap JSON in ```json fences; the executor must still
+        honour the decision instead of falling open."""
+        fenced = (
+            '```json\n'
+            '{"decision": "deny", "reason": "Sensitive file read"}\n'
+            '```'
+        )
+
+        with (
+            patch(self._AGENT_PATH),
+            patch(self._CONV_PATH) as mock_conv_cls,
+            patch(self._RESPONSE_PATH, return_value=fenced),
+        ):
+            mock_conv_cls.return_value = MagicMock()
+            hook = HookDefinition(type="agent")
+            result = executor._execute_agent_hook(hook, sample_event)
+
+        assert result.blocked
+        assert result.decision == HookDecision.DENY
+        assert result.reason == "Sensitive file read"
+
+    def test_plain_fence_without_language_tag_is_parsed(
+        self, executor, sample_event
+    ):
+        """Some LLMs use ``` without a language tag — still extract the JSON."""
+        fenced = '```\n{"decision": "allow", "reason": "ok"}\n```'
+
+        with (
+            patch(self._AGENT_PATH),
+            patch(self._CONV_PATH) as mock_conv_cls,
+            patch(self._RESPONSE_PATH, return_value=fenced),
+        ):
+            mock_conv_cls.return_value = MagicMock()
+            hook = HookDefinition(type="agent")
+            result = executor._execute_agent_hook(hook, sample_event)
+
+        assert not result.blocked
+        assert result.decision == HookDecision.ALLOW
+        assert result.reason == "ok"
+
+    def test_prose_prefix_before_json_is_parsed(self, executor, sample_event):
+        """LLMs often explain themselves before emitting JSON; we must still
+        honour a real deny instead of silently falling open to allow."""
+        prose_then_json = (
+            "After reviewing the workspace I found REPORT.md is missing.\n\n"
+            '{"decision": "deny", "reason": "missing deliverable"}'
+        )
+
+        with (
+            patch(self._AGENT_PATH),
+            patch(self._CONV_PATH) as mock_conv_cls,
+            patch(self._RESPONSE_PATH, return_value=prose_then_json),
+        ):
+            mock_conv_cls.return_value = MagicMock()
+            hook = HookDefinition(type="agent")
+            result = executor._execute_agent_hook(hook, sample_event)
+
+        assert result.blocked
+        assert result.decision == HookDecision.DENY
+        assert result.reason == "missing deliverable"
+
+    def test_prose_suffix_after_json_is_parsed(self, executor, sample_event):
+        """Trailing chatter after the JSON object must not defeat the parser."""
+        json_then_prose = (
+            '{"decision": "deny", "reason": "sensitive file"}\n\n'
+            "Let me know if you need more details."
+        )
+
+        with (
+            patch(self._AGENT_PATH),
+            patch(self._CONV_PATH) as mock_conv_cls,
+            patch(self._RESPONSE_PATH, return_value=json_then_prose),
+        ):
+            mock_conv_cls.return_value = MagicMock()
+            hook = HookDefinition(type="agent")
+            result = executor._execute_agent_hook(hook, sample_event)
+
+        assert result.blocked
+        assert result.decision == HookDecision.DENY
+        assert result.reason == "sensitive file"
+
     def test_invalid_json_defaults_to_allow(self, executor, sample_event):
         """_execute_agent_hook with non-JSON LLM response defaults to allow."""
         with (
@@ -561,33 +643,31 @@ def test_empty_response_defaults_to_allow(self, executor, sample_event):
         assert result.decision == HookDecision.ALLOW
         assert result.success is False
 
-    def test_timeout_enforced(self, executor, sample_event):
-        """Agent hook respects hook.timeout via ThreadPoolExecutor."""
-        import time
+    def test_timeout_propagated_to_hook_llm(self, executor, sample_event):
+        """hook.timeout is forwarded to the copied LLM, parent's timeout untouched."""
+        parent_timeout = executor.llm.timeout
+        captured_llm = {}
 
-        def slow_run():
-            time.sleep(10)
+        def capture_agent_init(**kwargs):
+            captured_llm["llm"] = kwargs.get("llm")
+            return MagicMock()
 
         with (
-            patch(self._AGENT_PATH),
-            patch(self._CONV_PATH) as mock_conv_cls,
+            patch(self._AGENT_PATH, side_effect=capture_agent_init),
+            patch(self._CONV_PATH, side_effect=RuntimeError("stop early")),
         ):
-            mock_conv = MagicMock()
-            mock_conv.run = slow_run
-            mock_conv_cls.return_value = mock_conv
+            hook = HookDefinition(type="agent", timeout=7)
+            executor._execute_agent_hook(hook, sample_event)
 
-            hook = HookDefinition(type="agent", timeout=1)
-            start = time.time()
-            result = executor._execute_agent_hook(hook, sample_event)
-            elapsed = time.time() - start
+        hook_llm = captured_llm.get("llm")
+        assert hook_llm is not None
+        assert hook_llm.timeout == 7
+        assert executor.llm.timeout == parent_timeout
 
-        assert elapsed < 5
-        assert not result.blocked
-        assert result.decision == HookDecision.ALLOW
-        assert "timed out" in (result.reason or "")
+    def test_hook_metrics_under_usage_id(self, executor, sample_event):
+        """Hook LLM uses per-hook usage_id and shares parent's Metrics object."""
+        parent_metrics = executor.llm.metrics
 
-    def test_metrics_are_isolated_from_parent_llm(self, executor, sample_event):
-        """model_copy + reset_metrics ensures hook tokens don't pollute parent LLM."""
         captured_llm = {}
 
         def capture_agent_init(**kwargs):
@@ -604,7 +684,27 @@ def capture_agent_init(**kwargs):
         hook_llm = captured_llm.get("llm")
         assert hook_llm is not None
         assert hook_llm is not executor.llm
-        assert hook_llm.usage_id == "hook-agent-evaluator"
+        assert hook_llm.usage_id == "hook-agent:default"
+        assert hook_llm._metrics is parent_metrics
+
+    def test_hook_usage_id_uses_hook_name(self, executor, sample_event):
+        """A named hook gets its own usage_id bucket: hook-agent:<name>."""
+        captured_llm = {}
+
+        def capture_agent_init(**kwargs):
+            captured_llm["llm"] = kwargs.get("llm")
+            return MagicMock()
+
+        with (
+            patch(self._AGENT_PATH, side_effect=capture_agent_init),
+            patch(self._CONV_PATH, side_effect=RuntimeError("stop early")),
+        ):
+            hook = HookDefinition(type="agent", name="security-check")
+            executor._execute_agent_hook(hook, sample_event)
+
+        hook_llm = captured_llm.get("llm")
+        assert hook_llm is not None
+        assert hook_llm.usage_id == "hook-agent:security-check"
 
 
 class TestPromptHookNotImplemented:

From ba8e3fc5d906f62d1787c529b782d03026bb2ca5 Mon Sep 17 00:00:00 2001
From: "burak.keles" <burak.keles@ode.al>
Date: Thu, 21 May 2026 13:32:16 +0300
Subject: [PATCH 05/12] resolve conflict

---
 openhands-sdk/openhands/sdk/hooks/config.py | 2 ++
 tests/sdk/hooks/test_executor.py            | 6 +++---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/openhands-sdk/openhands/sdk/hooks/config.py b/openhands-sdk/openhands/sdk/hooks/config.py
index 7efe68501d..1a336b6475 100644
--- a/openhands-sdk/openhands/sdk/hooks/config.py
+++ b/openhands-sdk/openhands/sdk/hooks/config.py
@@ -50,6 +50,7 @@ class HookDefinition(BaseModel):
     type: HookType = HookType.COMMAND
     name: str | None = None
     command: str | None = None
+    prompt: str | None = None
     system_prompt: str | None = None
     tools: list[str] = Field(default_factory=list)
     timeout: int = 60
@@ -85,6 +86,7 @@ def display_command(self) -> str:
             return f"agent-hook:{self.system_prompt[:20]}"
         return "agent-hook:agent"
 
+
 class HookMatcher(BaseModel):
     """Matches events to hooks based on patterns.
 
diff --git a/tests/sdk/hooks/test_executor.py b/tests/sdk/hooks/test_executor.py
index 530a8278a7..e40ad08052 100644
--- a/tests/sdk/hooks/test_executor.py
+++ b/tests/sdk/hooks/test_executor.py
@@ -720,7 +720,7 @@ def sample_event(self):
 
     def test_prompt_hook_defaults_to_allow(self, executor, sample_event):
         """Executing a PROMPT hook returns allow instead of crashing."""
-        hook = HookDefinition(type="prompt")
+        hook = HookDefinition(type="prompt", prompt="evaluate this event")
         result = executor.execute(hook, sample_event)
         assert result.decision == HookDecision.ALLOW
         assert result.success is False
@@ -728,12 +728,12 @@ def test_prompt_hook_defaults_to_allow(self, executor, sample_event):
 
     def test_prompt_hook_does_not_block(self, executor, sample_event):
         """PROMPT hook must not block the operation while unimplemented."""
-        hook = HookDefinition(type="prompt")
+        hook = HookDefinition(type="prompt", prompt="evaluate this event")
         result = executor.execute(hook, sample_event)
         assert result.blocked is False
         assert result.should_continue is True
 
     def test_prompt_hook_without_command_validates(self):
         """PROMPT hook with no command is valid at config time (future use)."""
-        hook = HookDefinition(type="prompt")
+        hook = HookDefinition(type="prompt", prompt="evaluate this event")
         assert hook.command is None

From 67d37356d6b355621ee59314d39c0a10ca22a58c Mon Sep 17 00:00:00 2001
From: "burak.keles" <burak.keles@ode.al>
Date: Thu, 21 May 2026 18:01:02 +0300
Subject: [PATCH 06/12] Refactor agent hook execution for improved JSON
 parsing, error handling, and metrics isolation. Update comments and tests for
 clarity.

---
 .../49_agent_hooks/README.md                  |  2 +-
 openhands-sdk/openhands/sdk/hooks/executor.py | 51 ++++++++++---------
 tests/sdk/hooks/test_executor.py              | 20 ++++----
 3 files changed, 37 insertions(+), 36 deletions(-)

diff --git a/examples/01_standalone_sdk/49_agent_hooks/README.md b/examples/01_standalone_sdk/49_agent_hooks/README.md
index 4a354c1b26..dd3a8e534b 100644
--- a/examples/01_standalone_sdk/49_agent_hooks/README.md
+++ b/examples/01_standalone_sdk/49_agent_hooks/README.md
@@ -39,7 +39,7 @@ python 49_agent_hooks.py
 ```python
 HookDefinition(
     type=HookType.AGENT,
-    name="security-reviewer",      # bucket for cost metrics (hook-agent:<name>)
+    name="security-reviewer",      # bucket for cost metrics (agent-hook:<name>)
     system_prompt="...",           # instructs the hook agent; must request JSON
     tools=["file_editor"],         # optional tools the hook agent may use
                                    # (use registered names, e.g. "file_editor",
diff --git a/openhands-sdk/openhands/sdk/hooks/executor.py b/openhands-sdk/openhands/sdk/hooks/executor.py
index 1178065272..0d55a07d6f 100644
--- a/openhands-sdk/openhands/sdk/hooks/executor.py
+++ b/openhands-sdk/openhands/sdk/hooks/executor.py
@@ -1,5 +1,6 @@
 """Hook executor - runs shell commands and agent evaluations with JSON I/O."""
 
+import contextlib
 import json
 import logging
 import os
@@ -34,6 +35,11 @@ class HookResult(BaseModel):
       to ``False`` and the error is logged, but the operation still proceeds.
       In particular, exit code ``1`` does **not** block — only ``2`` does.
       Hooks intended to enforce a policy must exit with ``2``.
+
+    For agent / prompt hooks, ``success=True`` means the hook produced a
+    deliberate verdict (parsed ``allow`` or ``deny``). Fall-open paths set
+    ``success=False`` with ``error`` populated, so a "we couldn't decide" allow
+    is detectable as ``decision == ALLOW and not success``.
     """
 
     success: bool = True
@@ -147,6 +153,8 @@ def cleanup_all(self) -> None:
 class HookExecutor:
     """Executes hook commands and agent evaluations with JSON I/O."""
 
+    _JSON_DECODER = json.JSONDecoder()
+
     def __init__(
         self,
         working_dir: str | None = None,
@@ -161,18 +169,17 @@ def __init__(
         self.persistence_dir = persistence_dir
         self.visualizer = visualizer
 
-    def _allow(
+    def _fall_open(
         self,
         reason: str,
         *,
-        success: bool = False,
         error: str | None = None,
     ) -> HookResult:
         return HookResult(
-            success=success,
+            success=False,
             decision=HookDecision.ALLOW,
             reason=reason,
-            error=error,
+            error=error or reason,
         )
 
     @observe(
@@ -203,14 +210,16 @@ def _execute_agent_hook(
                 f"Agent hook has no LLM configured for event '{event_type}'"
                 " — defaulting to allow"
             )
-            return self._allow("No LLM configured for agent hook")
+            return self._fall_open("No LLM configured for agent hook")
 
         hook_llm = self.llm.model_copy(
             update={
-                "usage_id": f"hook-agent:{hook.name or 'default'}",
+                "usage_id": f"agent-hook:{hook.name or 'default'}",
                 "timeout": hook.timeout,
             }
         )
+        # Isolate Metrics so hook spend doesn't accrue to the parent's bucket.
+        hook_llm.reset_metrics()
 
         conversation = None
         try:
@@ -241,7 +250,7 @@ def _execute_agent_hook(
                 f"Agent hook sub-conversation failed for event '{event_type}'"
                 f" — defaulting to allow: {e}"
             )
-            return self._allow(
+            return self._fall_open(
                 "Agent hook execution failed — defaulting to allow",
                 error=str(e),
             )
@@ -251,21 +260,16 @@ def _execute_agent_hook(
 
         return self._parse_decision(raw, event_type)
 
-    _JSON_DECODER = json.JSONDecoder()
-
     def _extract_first_json_object(self, text: str) -> dict | None:
-        # Walks the string and asks the JSON decoder to parse from each '{'.
-        # Handles bare JSON, ```json ... ``` fences, prose prefixes/suffixes,
-        # and mixed combinations — anything LLMs add around the actual object.
+        # Scan for the first decodable JSON object so prose / ```json fences
+        # around the payload don't defeat parsing.
         for i, ch in enumerate(text):
             if ch != "{":
                 continue
-            try:
+            with contextlib.suppress(json.JSONDecodeError):
                 obj, _ = self._JSON_DECODER.raw_decode(text[i:])
-            except json.JSONDecodeError:
-                continue
-            if isinstance(obj, dict):
-                return obj
+                if isinstance(obj, dict):
+                    return obj
         return None
 
     def _parse_decision(self, raw: str, event_type: str) -> HookResult:
@@ -274,7 +278,7 @@ def _parse_decision(self, raw: str, event_type: str) -> HookResult:
                 f"Agent hook produced no final response for event '{event_type}'"
                 " — defaulting to allow"
             )
-            return self._allow(
+            return self._fall_open(
                 "Agent hook produced no final response — defaulting to allow"
             )
 
@@ -284,9 +288,8 @@ def _parse_decision(self, raw: str, event_type: str) -> HookResult:
                 f"Agent hook returned no parseable JSON object for event"
                 f" '{event_type}' — defaulting to allow: {repr(raw)[:200]}"
             )
-            return self._allow(
-                "Agent hook returned no parseable JSON — defaulting to allow",
-                success=True,
+            return self._fall_open(
+                "Agent hook returned no parseable JSON — defaulting to allow"
             )
 
         decision_str = str(data.get("decision", "allow")).lower()
@@ -323,10 +326,8 @@ def execute(
                 f"PROMPT hooks are not yet implemented — defaulting to allow"
                 f" (event_type={event_type})"
             )
-            return HookResult(
-                success=False,
-                decision=HookDecision.ALLOW,
-                reason="PROMPT hooks are not yet implemented — defaulting to allow",
+            return self._fall_open(
+                "PROMPT hooks are not yet implemented — defaulting to allow"
             )
 
         # Prepare environment
diff --git a/tests/sdk/hooks/test_executor.py b/tests/sdk/hooks/test_executor.py
index e40ad08052..4dbd029d76 100644
--- a/tests/sdk/hooks/test_executor.py
+++ b/tests/sdk/hooks/test_executor.py
@@ -519,8 +519,7 @@ def test_allow_decision_passes(self, executor, sample_event):
         assert result.reason == "Looks safe"
 
     def test_markdown_wrapped_deny_is_parsed(self, executor, sample_event):
-        """LLMs commonly wrap JSON in ```json fences; the executor must still
-        honour the decision instead of falling open."""
+        """```json fenced JSON is honoured, not treated as non-JSON."""
         fenced = (
             '```json\n'
             '{"decision": "deny", "reason": "Sensitive file read"}\n'
@@ -560,8 +559,7 @@ def test_plain_fence_without_language_tag_is_parsed(
         assert result.reason == "ok"
 
     def test_prose_prefix_before_json_is_parsed(self, executor, sample_event):
-        """LLMs often explain themselves before emitting JSON; we must still
-        honour a real deny instead of silently falling open to allow."""
+        """Prose before the JSON object must not defeat the parser."""
         prose_then_json = (
             "After reviewing the workspace I found REPORT.md is missing.\n\n"
             '{"decision": "deny", "reason": "missing deliverable"}'
@@ -601,7 +599,7 @@ def test_prose_suffix_after_json_is_parsed(self, executor, sample_event):
         assert result.reason == "sensitive file"
 
     def test_invalid_json_defaults_to_allow(self, executor, sample_event):
-        """_execute_agent_hook with non-JSON LLM response defaults to allow."""
+        """Non-JSON response falls open: ALLOW + success=False + error set."""
         with (
             patch(self._AGENT_PATH),
             patch(self._CONV_PATH) as mock_conv_cls,
@@ -614,6 +612,8 @@ def test_invalid_json_defaults_to_allow(self, executor, sample_event):
         assert not result.blocked
         assert result.decision == HookDecision.ALLOW
         assert result.should_continue
+        assert result.success is False
+        assert result.error is not None
 
     def test_sub_conversation_failure_defaults_to_allow(self, executor, sample_event):
         """_execute_agent_hook when sub-conversation raises defaults to allow."""
@@ -665,7 +665,7 @@ def capture_agent_init(**kwargs):
         assert executor.llm.timeout == parent_timeout
 
     def test_hook_metrics_under_usage_id(self, executor, sample_event):
-        """Hook LLM uses per-hook usage_id and shares parent's Metrics object."""
+        """Hook LLM uses per-hook usage_id and an isolated Metrics object."""
         parent_metrics = executor.llm.metrics
 
         captured_llm = {}
@@ -684,11 +684,11 @@ def capture_agent_init(**kwargs):
         hook_llm = captured_llm.get("llm")
         assert hook_llm is not None
         assert hook_llm is not executor.llm
-        assert hook_llm.usage_id == "hook-agent:default"
-        assert hook_llm._metrics is parent_metrics
+        assert hook_llm.usage_id == "agent-hook:default"
+        assert hook_llm.metrics is not parent_metrics
 
     def test_hook_usage_id_uses_hook_name(self, executor, sample_event):
-        """A named hook gets its own usage_id bucket: hook-agent:<name>."""
+        """A named hook gets its own usage_id bucket: agent-hook:<name>."""
         captured_llm = {}
 
         def capture_agent_init(**kwargs):
@@ -704,7 +704,7 @@ def capture_agent_init(**kwargs):
 
         hook_llm = captured_llm.get("llm")
         assert hook_llm is not None
-        assert hook_llm.usage_id == "hook-agent:security-check"
+        assert hook_llm.usage_id == "agent-hook:security-check"
 
 
 class TestPromptHookNotImplemented:

From 60e9fa11355fa4853607939cd631b58465b6bdac Mon Sep 17 00:00:00 2001
From: VascoSch92 <vasco.schiavo@protonmail.com>
Date: Fri, 22 May 2026 17:16:00 +0200
Subject: [PATCH 07/12] fix: address hook pre-commit failures

---
 .../51_agent_hooks.py}                        |  6 +-
 .../README.md                                 |  4 +-
 openhands-sdk/openhands/sdk/hooks/config.py   |  3 +-
 .../openhands/sdk/hooks/conversation_hooks.py |  7 ++-
 openhands-sdk/openhands/sdk/hooks/executor.py | 18 ++++--
 openhands-sdk/openhands/sdk/hooks/manager.py  |  4 +-
 tests/sdk/hooks/test_config.py                | 55 ++++++++++++++++---
 tests/sdk/hooks/test_executor.py              | 55 ++++++++++---------
 8 files changed, 102 insertions(+), 50 deletions(-)
 rename examples/01_standalone_sdk/{49_agent_hooks/49_agent_hooks.py => 51_agent_hooks/51_agent_hooks.py} (97%)
 rename examples/01_standalone_sdk/{49_agent_hooks => 51_agent_hooks}/README.md (96%)

diff --git a/examples/01_standalone_sdk/49_agent_hooks/49_agent_hooks.py b/examples/01_standalone_sdk/51_agent_hooks/51_agent_hooks.py
similarity index 97%
rename from examples/01_standalone_sdk/49_agent_hooks/49_agent_hooks.py
rename to examples/01_standalone_sdk/51_agent_hooks/51_agent_hooks.py
index 7dbb5e6ef4..754aac11b2 100644
--- a/examples/01_standalone_sdk/49_agent_hooks/49_agent_hooks.py
+++ b/examples/01_standalone_sdk/51_agent_hooks/51_agent_hooks.py
@@ -145,9 +145,7 @@ def hook_logger(event) -> None:
     print("\n" + "=" * 60)
     print("Demo 2: Stop — deny until deliverable exists")
     print("=" * 60)
-    print(
-        "Quality reviewer denies until REPORT.md exists with a bullet point.\n"
-    )
+    print("Quality reviewer denies until REPORT.md exists with a bullet point.\n")
     conversation.send_message(
         "Write REPORT.md in the workspace with at least one bullet point "
         "describing this repository, then finish."
@@ -163,4 +161,4 @@ def hook_logger(event) -> None:
     print("=" * 60)
 
     cost = conversation.conversation_stats.get_combined_metrics().accumulated_cost
-    print(f"\nEXAMPLE_COST: {cost}")
\ No newline at end of file
+    print(f"\nEXAMPLE_COST: {cost}")
diff --git a/examples/01_standalone_sdk/49_agent_hooks/README.md b/examples/01_standalone_sdk/51_agent_hooks/README.md
similarity index 96%
rename from examples/01_standalone_sdk/49_agent_hooks/README.md
rename to examples/01_standalone_sdk/51_agent_hooks/README.md
index dd3a8e534b..efdc245114 100644
--- a/examples/01_standalone_sdk/49_agent_hooks/README.md
+++ b/examples/01_standalone_sdk/51_agent_hooks/README.md
@@ -15,7 +15,7 @@ exact tool name used.
 
 ## Example
 
-- **49_agent_hooks.py** — Two agent hooks wired onto a single conversation:
+- **51_agent_hooks.py** — Two agent hooks wired onto a single conversation:
   - **PreToolUse** "security reviewer" denies a command whose intent is to
     read `/etc/passwd`, even though no obvious keyword appears in a blacklist.
   - **Stop** "quality reviewer" refuses to let the main agent finish until
@@ -31,7 +31,7 @@ export LLM_API_KEY="your-key"
 export LLM_MODEL="anthropic/claude-sonnet-4-5-20250929"  # optional
 export LLM_BASE_URL="https://your-endpoint"              # optional
 
-python 49_agent_hooks.py
+python 51_agent_hooks.py
 ```
 
 ## How an agent hook is configured
diff --git a/openhands-sdk/openhands/sdk/hooks/config.py b/openhands-sdk/openhands/sdk/hooks/config.py
index 1a336b6475..8c48fb6ef8 100644
--- a/openhands-sdk/openhands/sdk/hooks/config.py
+++ b/openhands-sdk/openhands/sdk/hooks/config.py
@@ -69,7 +69,8 @@ def _validate_type_fields(self) -> "HookDefinition":
             raise ValueError("'prompt' is required when type is 'prompt'")
         if self.type == HookType.AGENT and self.command is not None:
             raise ValueError(
-                "'command' must not be set when type is 'agent'; use 'system_prompt' instead"
+                "'command' must not be set when type is 'agent'; "
+                "use 'system_prompt' instead"
             )
         if self.type == HookType.AGENT and self.async_:
             raise ValueError("'async' is not supported for agent hooks")
diff --git a/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py b/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py
index fa359e4afd..3664c22e43 100644
--- a/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py
+++ b/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py
@@ -3,6 +3,7 @@
 from collections.abc import Callable
 from typing import TYPE_CHECKING, Any
 
+from openhands.sdk.conversation.visualizer import ConversationVisualizerBase
 from openhands.sdk.event import (
     ActionEvent,
     Event,
@@ -18,8 +19,6 @@
 from openhands.sdk.logger import get_logger
 
 
-from openhands.sdk.conversation.visualizer import ConversationVisualizerBase
-
 if TYPE_CHECKING:
     from openhands.sdk.conversation.state import ConversationState
     from openhands.sdk.llm import LLM
@@ -394,7 +393,9 @@ def create_hook_callback(
     emit_hook_events: bool = True,
     llm: "LLM | None" = None,
     persistence_dir: str | None = None,
-    visualizer: type[ConversationVisualizerBase] | ConversationVisualizerBase | None = None,
+    visualizer: type[ConversationVisualizerBase]
+    | ConversationVisualizerBase
+    | None = None,
 ) -> tuple[HookEventProcessor, Any]:
     """Create a hook-enabled event callback. Returns (processor, callback).
 
diff --git a/openhands-sdk/openhands/sdk/hooks/executor.py b/openhands-sdk/openhands/sdk/hooks/executor.py
index 0d55a07d6f..6ee33844d3 100644
--- a/openhands-sdk/openhands/sdk/hooks/executor.py
+++ b/openhands-sdk/openhands/sdk/hooks/executor.py
@@ -161,7 +161,9 @@ def __init__(
         async_process_manager: AsyncProcessManager | None = None,
         llm: "LLM | None" = None,
         persistence_dir: str | None = None,
-        visualizer: type[ConversationVisualizerBase] | ConversationVisualizerBase | None = None,
+        visualizer: type[ConversationVisualizerBase]
+        | ConversationVisualizerBase
+        | None = None,
     ):
         self.working_dir = working_dir or os.getcwd()
         self.async_process_manager = async_process_manager or AsyncProcessManager()
@@ -347,6 +349,14 @@ def execute(
         # Cleanup expired async processes before starting new ones
         self.async_process_manager.cleanup_expired()
 
+        command = hook.command
+        if command is None:
+            return HookResult(
+                success=False,
+                exit_code=-1,
+                error="'command' is required when type is 'command'",
+            )
+
         # Handle async hooks: fire and forget
         if hook.async_:
             try:
@@ -357,7 +367,7 @@ def execute(
                     start_new_session = False
 
                 process = subprocess.Popen(
-                    hook.command,
+                    command,
                     shell=True,
                     cwd=self.working_dir,
                     env=hook_env,
@@ -378,7 +388,7 @@ def execute(
 
                 # Track for cleanup
                 self.async_process_manager.add_process(process, hook.timeout)
-                logger.debug(f"Started async hook (PID {process.pid}): {hook.command}")
+                logger.debug(f"Started async hook (PID {process.pid}): {command}")
 
                 # Return placeholder success result
                 return HookResult(
@@ -396,7 +406,7 @@ def execute(
         try:
             # Execute the hook command synchronously
             result = subprocess.run(
-                hook.command,
+                command,
                 shell=True,
                 cwd=self.working_dir,
                 env=hook_env,
diff --git a/openhands-sdk/openhands/sdk/hooks/manager.py b/openhands-sdk/openhands/sdk/hooks/manager.py
index a3c725e9a1..788f20ad16 100644
--- a/openhands-sdk/openhands/sdk/hooks/manager.py
+++ b/openhands-sdk/openhands/sdk/hooks/manager.py
@@ -26,7 +26,9 @@ def __init__(
         session_id: str | None = None,
         llm: "LLM | None" = None,
         persistence_dir: str | None = None,
-        visualizer: type[ConversationVisualizerBase] | ConversationVisualizerBase | None = None,
+        visualizer: type[ConversationVisualizerBase]
+        | ConversationVisualizerBase
+        | None = None,
     ):
         self.config = config or HookConfig.load(working_dir=working_dir)
         self.executor = HookExecutor(
diff --git a/tests/sdk/hooks/test_config.py b/tests/sdk/hooks/test_config.py
index 255283e010..4e1e730346 100644
--- a/tests/sdk/hooks/test_config.py
+++ b/tests/sdk/hooks/test_config.py
@@ -12,7 +12,7 @@
 
 def test_command_hook_requires_command():
     with pytest.raises(Exception, match="'command' is required"):
-        HookDefinition(type="command")
+        HookDefinition(type=HookType.COMMAND)
 
 
 def test_command_hook_valid():
@@ -38,12 +38,28 @@ def test_agent_hook_valid(kwargs):
     "kwargs,expected",
     [
         ({"command": "block.sh"}, "block.sh"),
-        ({"type": "agent", "name": "block-deletions", "system_prompt": "Block rm -rf"}, "agent-hook:block-deletions"),
-        ({"type": "agent", "system_prompt": "Block network calls to external IPs"}, "agent-hook:Block network calls "),
+        (
+            {
+                "type": "agent",
+                "name": "block-deletions",
+                "system_prompt": "Block rm -rf",
+            },
+            "agent-hook:block-deletions",
+        ),
+        (
+            {"type": "agent", "system_prompt": "Block network calls to external IPs"},
+            "agent-hook:Block network calls ",
+        ),
         ({"type": "agent", "system_prompt": "A" * 100}, f"agent-hook:{'A' * 20}"),
         ({"type": "agent"}, "agent-hook:agent"),
     ],
-    ids=["command", "agent-named", "agent-prompt-prefix", "agent-prompt-truncated", "agent-fallback"],
+    ids=[
+        "command",
+        "agent-named",
+        "agent-prompt-prefix",
+        "agent-prompt-truncated",
+        "agent-fallback",
+    ],
 )
 def test_display_command(kwargs, expected):
     h = HookDefinition(**kwargs)
@@ -52,9 +68,19 @@ def test_display_command(kwargs, expected):
 
 def test_multiple_agent_hooks_are_distinguishable():
     hooks = [
-        HookDefinition(type="agent", name="block-deletions", system_prompt="Block rm -rf"),
-        HookDefinition(type="agent", system_prompt="Block network calls to external IPs"),
-        HookDefinition(type="agent", system_prompt="Verify all tasks are complete"),
+        HookDefinition(
+            type=HookType.AGENT,
+            name="block-deletions",
+            system_prompt="Block rm -rf",
+        ),
+        HookDefinition(
+            type=HookType.AGENT,
+            system_prompt="Block network calls to external IPs",
+        ),
+        HookDefinition(
+            type=HookType.AGENT,
+            system_prompt="Verify all tasks are complete",
+        ),
     ]
     assert len({h.display_command for h in hooks}) == 3
 
@@ -62,7 +88,10 @@ def test_multiple_agent_hooks_are_distinguishable():
 @pytest.mark.parametrize(
     "kwargs,match",
     [
-        ({"type": "agent", "command": "echo hi"}, "'command' must not be set when type is 'agent'"),
+        (
+            {"type": "agent", "command": "echo hi"},
+            "'command' must not be set when type is 'agent'",
+        ),
         ({"type": "command"}, "'command' is required"),
     ],
     ids=["agent-rejects-command", "command-requires-command"],
@@ -80,7 +109,15 @@ def test_agent_hook_rejects_async():
 def test_agent_hook_from_json():
     data = {
         "stop": [
-            {"hooks": [{"type": "agent", "system_prompt": "Verify all tasks are done", "timeout": 30}]}
+            {
+                "hooks": [
+                    {
+                        "type": "agent",
+                        "system_prompt": "Verify all tasks are done",
+                        "timeout": 30,
+                    }
+                ]
+            }
         ]
     }
     config = HookConfig.from_dict(data)
diff --git a/tests/sdk/hooks/test_executor.py b/tests/sdk/hooks/test_executor.py
index 4dbd029d76..956c2dd148 100644
--- a/tests/sdk/hooks/test_executor.py
+++ b/tests/sdk/hooks/test_executor.py
@@ -8,7 +8,7 @@
 import pytest
 from pydantic import SecretStr
 
-from openhands.sdk.hooks.config import HookDefinition
+from openhands.sdk.hooks.config import HookDefinition, HookType
 from openhands.sdk.hooks.executor import HookExecutor
 from openhands.sdk.hooks.types import HookDecision, HookEvent, HookEventType
 from openhands.sdk.llm import LLM
@@ -459,7 +459,10 @@ def test_execute_dispatches_to_agent_hook(self, executor, sample_event):
         """execute() routes AGENT type to _execute_agent_hook, not subprocess."""
         from openhands.sdk.hooks.executor import HookResult
 
-        hook = HookDefinition(type="agent", system_prompt="Verify task completion")
+        hook = HookDefinition(
+            type=HookType.AGENT,
+            system_prompt="Verify task completion",
+        )
 
         with patch.object(
             executor,
@@ -476,7 +479,10 @@ def test_execute_dispatches_to_agent_hook(self, executor, sample_event):
 
     def test_no_llm_defaults_to_allow(self, executor_no_llm, sample_event):
         """Agent hook with no LLM configured defaults to allow without error."""
-        hook = HookDefinition(type="agent", system_prompt="Check something")
+        hook = HookDefinition(
+            type=HookType.AGENT,
+            system_prompt="Check something",
+        )
         result = executor_no_llm.execute(hook, sample_event)
 
         assert not result.success
@@ -493,7 +499,10 @@ def test_deny_decision_blocks(self, executor, sample_event):
             patch(self._RESPONSE_PATH, return_value=deny_json),
         ):
             mock_conv_cls.return_value = MagicMock()
-            hook = HookDefinition(type="agent", system_prompt="Check tasks")
+            hook = HookDefinition(
+                type=HookType.AGENT,
+                system_prompt="Check tasks",
+            )
             result = executor._execute_agent_hook(hook, sample_event)
 
         assert result.blocked
@@ -511,7 +520,7 @@ def test_allow_decision_passes(self, executor, sample_event):
             patch(self._RESPONSE_PATH, return_value=allow_json),
         ):
             mock_conv_cls.return_value = MagicMock()
-            hook = HookDefinition(type="agent")
+            hook = HookDefinition(type=HookType.AGENT)
             result = executor._execute_agent_hook(hook, sample_event)
 
         assert not result.blocked
@@ -520,11 +529,7 @@ def test_allow_decision_passes(self, executor, sample_event):
 
     def test_markdown_wrapped_deny_is_parsed(self, executor, sample_event):
         """```json fenced JSON is honoured, not treated as non-JSON."""
-        fenced = (
-            '```json\n'
-            '{"decision": "deny", "reason": "Sensitive file read"}\n'
-            '```'
-        )
+        fenced = '```json\n{"decision": "deny", "reason": "Sensitive file read"}\n```'
 
         with (
             patch(self._AGENT_PATH),
@@ -532,16 +537,14 @@ def test_markdown_wrapped_deny_is_parsed(self, executor, sample_event):
             patch(self._RESPONSE_PATH, return_value=fenced),
         ):
             mock_conv_cls.return_value = MagicMock()
-            hook = HookDefinition(type="agent")
+            hook = HookDefinition(type=HookType.AGENT)
             result = executor._execute_agent_hook(hook, sample_event)
 
         assert result.blocked
         assert result.decision == HookDecision.DENY
         assert result.reason == "Sensitive file read"
 
-    def test_plain_fence_without_language_tag_is_parsed(
-        self, executor, sample_event
-    ):
+    def test_plain_fence_without_language_tag_is_parsed(self, executor, sample_event):
         """Some LLMs use ``` without a language tag — still extract the JSON."""
         fenced = '```\n{"decision": "allow", "reason": "ok"}\n```'
 
@@ -551,7 +554,7 @@ def test_plain_fence_without_language_tag_is_parsed(
             patch(self._RESPONSE_PATH, return_value=fenced),
         ):
             mock_conv_cls.return_value = MagicMock()
-            hook = HookDefinition(type="agent")
+            hook = HookDefinition(type=HookType.AGENT)
             result = executor._execute_agent_hook(hook, sample_event)
 
         assert not result.blocked
@@ -571,7 +574,7 @@ def test_prose_prefix_before_json_is_parsed(self, executor, sample_event):
             patch(self._RESPONSE_PATH, return_value=prose_then_json),
         ):
             mock_conv_cls.return_value = MagicMock()
-            hook = HookDefinition(type="agent")
+            hook = HookDefinition(type=HookType.AGENT)
             result = executor._execute_agent_hook(hook, sample_event)
 
         assert result.blocked
@@ -591,7 +594,7 @@ def test_prose_suffix_after_json_is_parsed(self, executor, sample_event):
             patch(self._RESPONSE_PATH, return_value=json_then_prose),
         ):
             mock_conv_cls.return_value = MagicMock()
-            hook = HookDefinition(type="agent")
+            hook = HookDefinition(type=HookType.AGENT)
             result = executor._execute_agent_hook(hook, sample_event)
 
         assert result.blocked
@@ -606,7 +609,7 @@ def test_invalid_json_defaults_to_allow(self, executor, sample_event):
             patch(self._RESPONSE_PATH, return_value="I think you should allow this."),
         ):
             mock_conv_cls.return_value = MagicMock()
-            hook = HookDefinition(type="agent")
+            hook = HookDefinition(type=HookType.AGENT)
             result = executor._execute_agent_hook(hook, sample_event)
 
         assert not result.blocked
@@ -621,7 +624,7 @@ def test_sub_conversation_failure_defaults_to_allow(self, executor, sample_event
             patch(self._AGENT_PATH),
             patch(self._CONV_PATH, side_effect=RuntimeError("workspace error")),
         ):
-            hook = HookDefinition(type="agent")
+            hook = HookDefinition(type=HookType.AGENT)
             result = executor._execute_agent_hook(hook, sample_event)
 
         assert not result.blocked
@@ -636,7 +639,7 @@ def test_empty_response_defaults_to_allow(self, executor, sample_event):
             patch(self._RESPONSE_PATH, return_value=""),
         ):
             mock_conv_cls.return_value = MagicMock()
-            hook = HookDefinition(type="agent")
+            hook = HookDefinition(type=HookType.AGENT)
             result = executor._execute_agent_hook(hook, sample_event)
 
         assert not result.blocked
@@ -656,7 +659,7 @@ def capture_agent_init(**kwargs):
             patch(self._AGENT_PATH, side_effect=capture_agent_init),
             patch(self._CONV_PATH, side_effect=RuntimeError("stop early")),
         ):
-            hook = HookDefinition(type="agent", timeout=7)
+            hook = HookDefinition(type=HookType.AGENT, timeout=7)
             executor._execute_agent_hook(hook, sample_event)
 
         hook_llm = captured_llm.get("llm")
@@ -678,7 +681,7 @@ def capture_agent_init(**kwargs):
             patch(self._AGENT_PATH, side_effect=capture_agent_init),
             patch(self._CONV_PATH, side_effect=RuntimeError("stop early")),
         ):
-            hook = HookDefinition(type="agent")
+            hook = HookDefinition(type=HookType.AGENT)
             executor._execute_agent_hook(hook, sample_event)
 
         hook_llm = captured_llm.get("llm")
@@ -699,7 +702,7 @@ def capture_agent_init(**kwargs):
             patch(self._AGENT_PATH, side_effect=capture_agent_init),
             patch(self._CONV_PATH, side_effect=RuntimeError("stop early")),
         ):
-            hook = HookDefinition(type="agent", name="security-check")
+            hook = HookDefinition(type=HookType.AGENT, name="security-check")
             executor._execute_agent_hook(hook, sample_event)
 
         hook_llm = captured_llm.get("llm")
@@ -720,7 +723,7 @@ def sample_event(self):
 
     def test_prompt_hook_defaults_to_allow(self, executor, sample_event):
         """Executing a PROMPT hook returns allow instead of crashing."""
-        hook = HookDefinition(type="prompt", prompt="evaluate this event")
+        hook = HookDefinition(type=HookType.PROMPT, prompt="evaluate this event")
         result = executor.execute(hook, sample_event)
         assert result.decision == HookDecision.ALLOW
         assert result.success is False
@@ -728,12 +731,12 @@ def test_prompt_hook_defaults_to_allow(self, executor, sample_event):
 
     def test_prompt_hook_does_not_block(self, executor, sample_event):
         """PROMPT hook must not block the operation while unimplemented."""
-        hook = HookDefinition(type="prompt", prompt="evaluate this event")
+        hook = HookDefinition(type=HookType.PROMPT, prompt="evaluate this event")
         result = executor.execute(hook, sample_event)
         assert result.blocked is False
         assert result.should_continue is True
 
     def test_prompt_hook_without_command_validates(self):
         """PROMPT hook with no command is valid at config time (future use)."""
-        hook = HookDefinition(type="prompt", prompt="evaluate this event")
+        hook = HookDefinition(type=HookType.PROMPT, prompt="evaluate this event")
         assert hook.command is None

From 823b460b1ea0873548f5e1a3a3bb8be5dbd420ba Mon Sep 17 00:00:00 2001
From: VascoSch92 <vasco.schiavo@protonmail.com>
Date: Fri, 22 May 2026 17:40:36 +0200
Subject: [PATCH 08/12] fix: include agent hook metrics in parent stats

---
 .../conversation/impl/local_conversation.py   |  1 +
 .../openhands/sdk/hooks/conversation_hooks.py |  4 ++
 openhands-sdk/openhands/sdk/hooks/executor.py | 18 +++++++
 openhands-sdk/openhands/sdk/hooks/manager.py  |  3 ++
 tests/sdk/hooks/test_executor.py              | 47 +++++++++++++++++++
 5 files changed, 73 insertions(+)

diff --git a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py
index 047acc8e16..4b07188f41 100644
--- a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py
+++ b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py
@@ -568,6 +568,7 @@ def _ensure_plugins_loaded(self) -> None:
                 llm=self.agent.llm,
                 persistence_dir=self._state.persistence_dir,
                 visualizer=self._visualizer,
+                conversation_stats=self._state.stats,
             )
             self._hook_processor.set_conversation_state(self._state)
             self._hook_processor.run_session_start()
diff --git a/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py b/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py
index 3664c22e43..91f5197ec3 100644
--- a/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py
+++ b/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py
@@ -20,6 +20,7 @@
 
 
 if TYPE_CHECKING:
+    from openhands.sdk.conversation.conversation_stats import ConversationStats
     from openhands.sdk.conversation.state import ConversationState
     from openhands.sdk.llm import LLM
 
@@ -396,6 +397,7 @@ def create_hook_callback(
     visualizer: type[ConversationVisualizerBase]
     | ConversationVisualizerBase
     | None = None,
+    conversation_stats: "ConversationStats | None" = None,
 ) -> tuple[HookEventProcessor, Any]:
     """Create a hook-enabled event callback. Returns (processor, callback).
 
@@ -409,6 +411,7 @@ def create_hook_callback(
         llm: LLM instance inherited from the parent conversation, used by agent hooks.
         persistence_dir: Directory used to persist agent hook sub-conversation events.
         visualizer: Visualizer instance passed to agent hook sub-conversations.
+        conversation_stats: Parent conversation stats that should include hook spend.
 
     Returns:
         Tuple of (HookEventProcessor, callback function).
@@ -420,6 +423,7 @@ def create_hook_callback(
         llm=llm,
         persistence_dir=persistence_dir,
         visualizer=visualizer,
+        conversation_stats=conversation_stats,
     )
 
     processor = HookEventProcessor(
diff --git a/openhands-sdk/openhands/sdk/hooks/executor.py b/openhands-sdk/openhands/sdk/hooks/executor.py
index 6ee33844d3..13ab2e9aa9 100644
--- a/openhands-sdk/openhands/sdk/hooks/executor.py
+++ b/openhands-sdk/openhands/sdk/hooks/executor.py
@@ -19,6 +19,8 @@
 
 
 if TYPE_CHECKING:
+    from openhands.sdk.conversation.base import BaseConversation
+    from openhands.sdk.conversation.conversation_stats import ConversationStats
     from openhands.sdk.llm import LLM
 
 
@@ -164,12 +166,14 @@ def __init__(
         visualizer: type[ConversationVisualizerBase]
         | ConversationVisualizerBase
         | None = None,
+        conversation_stats: "ConversationStats | None" = None,
     ):
         self.working_dir = working_dir or os.getcwd()
         self.async_process_manager = async_process_manager or AsyncProcessManager()
         self.llm = llm
         self.persistence_dir = persistence_dir
         self.visualizer = visualizer
+        self.conversation_stats = conversation_stats
 
     def _fall_open(
         self,
@@ -258,6 +262,7 @@ def _execute_agent_hook(
             )
         finally:
             if conversation is not None:
+                self._merge_hook_conversation_stats(conversation)
                 conversation.close()
 
         return self._parse_decision(raw, event_type)
@@ -309,6 +314,19 @@ def _parse_decision(self, raw: str, event_type: str) -> HookResult:
             reason=reason,
         )
 
+    def _merge_hook_conversation_stats(self, conversation: "BaseConversation") -> None:
+        if self.conversation_stats is None:
+            return
+
+        child_stats = conversation.conversation_stats
+        for usage_id, metrics in child_stats.usage_to_metrics.items():
+            if usage_id in self.conversation_stats.usage_to_metrics:
+                existing = self.conversation_stats.usage_to_metrics[usage_id]
+                if existing is not metrics:
+                    existing.merge(metrics)
+            else:
+                self.conversation_stats.usage_to_metrics[usage_id] = metrics.deep_copy()
+
     def execute(
         self,
         hook: HookDefinition,
diff --git a/openhands-sdk/openhands/sdk/hooks/manager.py b/openhands-sdk/openhands/sdk/hooks/manager.py
index 788f20ad16..aac309a487 100644
--- a/openhands-sdk/openhands/sdk/hooks/manager.py
+++ b/openhands-sdk/openhands/sdk/hooks/manager.py
@@ -10,6 +10,7 @@
 
 
 if TYPE_CHECKING:
+    from openhands.sdk.conversation.conversation_stats import ConversationStats
     from openhands.sdk.llm import LLM
 
 
@@ -29,6 +30,7 @@ def __init__(
         visualizer: type[ConversationVisualizerBase]
         | ConversationVisualizerBase
         | None = None,
+        conversation_stats: "ConversationStats | None" = None,
     ):
         self.config = config or HookConfig.load(working_dir=working_dir)
         self.executor = HookExecutor(
@@ -36,6 +38,7 @@ def __init__(
             llm=llm,
             persistence_dir=persistence_dir,
             visualizer=visualizer,
+            conversation_stats=conversation_stats,
         )
         self.session_id = session_id
         self.working_dir = working_dir
diff --git a/tests/sdk/hooks/test_executor.py b/tests/sdk/hooks/test_executor.py
index 956c2dd148..86e343c557 100644
--- a/tests/sdk/hooks/test_executor.py
+++ b/tests/sdk/hooks/test_executor.py
@@ -8,10 +8,12 @@
 import pytest
 from pydantic import SecretStr
 
+from openhands.sdk.conversation.conversation_stats import ConversationStats
 from openhands.sdk.hooks.config import HookDefinition, HookType
 from openhands.sdk.hooks.executor import HookExecutor
 from openhands.sdk.hooks.types import HookDecision, HookEvent, HookEventType
 from openhands.sdk.llm import LLM
+from openhands.sdk.llm.utils.metrics import Metrics
 from tests.command_utils import python_command
 
 
@@ -690,6 +692,51 @@ def capture_agent_init(**kwargs):
         assert hook_llm.usage_id == "agent-hook:default"
         assert hook_llm.metrics is not parent_metrics
 
+    def test_hook_metrics_are_merged_into_parent_stats(
+        self, tmp_path, mock_llm, sample_event
+    ):
+        """Child agent-hook spend is included in parent conversation stats."""
+        parent_stats = ConversationStats()
+        existing_hook_metrics = Metrics(model_name="gpt-4o")
+        existing_hook_metrics.add_cost(0.25)
+        parent_stats.usage_to_metrics["agent-hook:security-check"] = (
+            existing_hook_metrics
+        )
+
+        child_hook_metrics = Metrics(model_name="gpt-4o")
+        child_hook_metrics.add_cost(0.75)
+        child_stats = ConversationStats(
+            usage_to_metrics={"agent-hook:security-check": child_hook_metrics}
+        )
+        mock_conversation = MagicMock()
+        mock_conversation.conversation_stats = child_stats
+        mock_conversation.state.events = []
+
+        executor = HookExecutor(
+            working_dir=str(tmp_path),
+            llm=mock_llm,
+            conversation_stats=parent_stats,
+        )
+
+        with (
+            patch(self._AGENT_PATH),
+            patch(self._CONV_PATH, return_value=mock_conversation),
+            patch(
+                self._RESPONSE_PATH,
+                return_value='{"decision": "allow", "reason": "ok"}',
+            ),
+        ):
+            hook = HookDefinition(type=HookType.AGENT, name="security-check")
+            result = executor._execute_agent_hook(hook, sample_event)
+
+        assert result.decision == HookDecision.ALLOW
+        assert parent_stats.usage_to_metrics[
+            "agent-hook:security-check"
+        ].accumulated_cost == pytest.approx(1.0)
+        assert parent_stats.get_combined_metrics().accumulated_cost == pytest.approx(
+            1.0
+        )
+
     def test_hook_usage_id_uses_hook_name(self, executor, sample_event):
         """A named hook gets its own usage_id bucket: agent-hook:<name>."""
         captured_llm = {}

From 95895c8c94da8964d2719205ec8918b39f3948bf Mon Sep 17 00:00:00 2001
From: VascoSch92 <vasco.schiavo@protonmail.com>
Date: Fri, 22 May 2026 17:52:27 +0200
Subject: [PATCH 09/12] fix: persist agent hooks under conversation base

---
 .../conversation/impl/local_conversation.py   |  7 +++-
 .../test_local_conversation_plugins.py        | 40 +++++++++++++++++++
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py
index 4b07188f41..b798a8b51c 100644
--- a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py
+++ b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py
@@ -559,6 +559,11 @@ def _ensure_plugins_loaded(self) -> None:
         if final_hook_config is not None:
             # Store final hook_config in state for observability
             self._state.hook_config = final_hook_config
+            hook_persistence_dir = (
+                str(Path(self._state.persistence_dir).parent)
+                if self._state.persistence_dir is not None
+                else None
+            )
 
             self._hook_processor, self._on_event = create_hook_callback(
                 hook_config=final_hook_config,
@@ -566,7 +571,7 @@ def _ensure_plugins_loaded(self) -> None:
                 session_id=str(self._state.id),
                 original_callback=self._base_callback,
                 llm=self.agent.llm,
-                persistence_dir=self._state.persistence_dir,
+                persistence_dir=hook_persistence_dir,
                 visualizer=self._visualizer,
                 conversation_stats=self._state.stats,
             )
diff --git a/tests/sdk/conversation/test_local_conversation_plugins.py b/tests/sdk/conversation/test_local_conversation_plugins.py
index c58a74c6b9..cfa7b066de 100644
--- a/tests/sdk/conversation/test_local_conversation_plugins.py
+++ b/tests/sdk/conversation/test_local_conversation_plugins.py
@@ -2,6 +2,7 @@
 
 import json
 from pathlib import Path
+from unittest.mock import MagicMock, patch
 
 import pytest
 from pydantic import SecretStr
@@ -188,6 +189,45 @@ def test_plugin_hooks_combined_with_explicit_hooks(
         # (The actual hook_processor is internal, but we trust the merging works)
         conversation.close()
 
+    def test_hook_sub_conversations_receive_persistence_base_dir(
+        self, tmp_path: Path, basic_agent
+    ):
+        """Agent hook persistence should not nest under the parent conversation id."""
+        workspace = tmp_path / "workspace"
+        workspace.mkdir()
+        persistence_base = tmp_path / "state"
+        hook_config = HookConfig(
+            pre_tool_use=[
+                HookMatcher(matcher="*", hooks=[HookDefinition(command="echo test")])
+            ]
+        )
+
+        processor = MagicMock()
+        processor.on_event = MagicMock()
+        processor.set_conversation_state = MagicMock()
+        processor.run_session_start = MagicMock()
+
+        conversation = LocalConversation(
+            agent=basic_agent,
+            workspace=workspace,
+            persistence_dir=persistence_base,
+            hook_config=hook_config,
+            visualizer=None,
+        )
+
+        with patch(
+            "openhands.sdk.conversation.impl.local_conversation.create_hook_callback",
+            return_value=(processor, processor.on_event),
+        ) as mock_create_hook_callback:
+            conversation._ensure_plugins_loaded()
+
+        assert conversation.state.persistence_dir is not None
+        assert Path(conversation.state.persistence_dir).parent == persistence_base
+        assert mock_create_hook_callback.call_args.kwargs["persistence_dir"] == str(
+            persistence_base
+        )
+        conversation.close()
+
     def test_plugins_not_loaded_until_needed(self, tmp_path: Path, basic_agent):
         """Test that plugins are not loaded in constructor (lazy loading)."""
         plugin_dir = create_test_plugin(

From f86fad42c676ba10441d50f97908c54f6cf1bf27 Mon Sep 17 00:00:00 2001
From: VascoSch92 <vasco.schiavo@protonmail.com>
Date: Sun, 24 May 2026 23:45:43 +0200
Subject: [PATCH 10/12] fix(hooks): address review on agent hook visualizer,
 decisions, example

- Fall open (success=False) on missing/unknown agent decision instead of
  silently allowing as if it were a deliberate verdict
- Request a fresh sub-visualizer for hook sub-conversations rather than
  handing over the parent's initialized visualizer instance, which
  LocalConversation.initialize() would rebind to the hook's child state
- Split the example into one conversation per demo so the Stop quality
  gate isn't active during the PreToolUse demo
- Rename the example to main.py and register its directory in the example
  discovery so the example workflow exercises it
---
 .../51_agent_hooks/README.md                  |   4 +-
 .../{51_agent_hooks.py => main.py}            | 108 ++++++++++--------
 openhands-sdk/openhands/sdk/hooks/executor.py |  34 +++++-
 tests/examples/test_examples.py               |   4 +
 tests/sdk/hooks/test_executor.py              |  70 ++++++++++++
 5 files changed, 167 insertions(+), 53 deletions(-)
 rename examples/01_standalone_sdk/51_agent_hooks/{51_agent_hooks.py => main.py} (69%)

diff --git a/examples/01_standalone_sdk/51_agent_hooks/README.md b/examples/01_standalone_sdk/51_agent_hooks/README.md
index efdc245114..a3d6cd6d97 100644
--- a/examples/01_standalone_sdk/51_agent_hooks/README.md
+++ b/examples/01_standalone_sdk/51_agent_hooks/README.md
@@ -15,7 +15,7 @@ exact tool name used.
 
 ## Example
 
-- **51_agent_hooks.py** — Two agent hooks wired onto a single conversation:
+- **main.py** — Two agent hooks, each in its own conversation:
   - **PreToolUse** "security reviewer" denies a command whose intent is to
     read `/etc/passwd`, even though no obvious keyword appears in a blacklist.
   - **Stop** "quality reviewer" refuses to let the main agent finish until
@@ -31,7 +31,7 @@ export LLM_API_KEY="your-key"
 export LLM_MODEL="anthropic/claude-sonnet-4-5-20250929"  # optional
 export LLM_BASE_URL="https://your-endpoint"              # optional
 
-python 51_agent_hooks.py
+python main.py
 ```
 
 ## How an agent hook is configured
diff --git a/examples/01_standalone_sdk/51_agent_hooks/51_agent_hooks.py b/examples/01_standalone_sdk/51_agent_hooks/main.py
similarity index 69%
rename from examples/01_standalone_sdk/51_agent_hooks/51_agent_hooks.py
rename to examples/01_standalone_sdk/51_agent_hooks/main.py
index 754aac11b2..496516cde4 100644
--- a/examples/01_standalone_sdk/51_agent_hooks/51_agent_hooks.py
+++ b/examples/01_standalone_sdk/51_agent_hooks/main.py
@@ -86,47 +86,48 @@ def hook_logger(event) -> None:
     print(line)
 
 
-with tempfile.TemporaryDirectory() as tmpdir:
-    workspace = Path(tmpdir)
+# Each demo runs in its own conversation with only the hook it needs. Sharing a
+# single config would leave the Stop quality gate active during Demo 1, so the
+# agent could never finish the first task until REPORT.md existed — coupling two
+# unrelated demos and burning iterations.
+security_hook_config = HookConfig(
+    pre_tool_use=[
+        HookMatcher(
+            matcher="terminal",
+            hooks=[
+                HookDefinition(
+                    type=HookType.AGENT,
+                    name="security-reviewer",
+                    system_prompt=SECURITY_REVIEWER_PROMPT,
+                    timeout=60,
+                    max_iterations=3,
+                )
+            ],
+        )
+    ],
+)
 
-    hook_config = HookConfig(
-        pre_tool_use=[
-            HookMatcher(
-                matcher="terminal",
-                hooks=[
-                    HookDefinition(
-                        type=HookType.AGENT,
-                        name="security-reviewer",
-                        system_prompt=SECURITY_REVIEWER_PROMPT,
-                        timeout=60,
-                        max_iterations=3,
-                    )
-                ],
-            )
-        ],
-        stop=[
-            HookMatcher(
-                hooks=[
-                    HookDefinition(
-                        type=HookType.AGENT,
-                        name="quality-reviewer",
-                        system_prompt=QUALITY_REVIEWER_PROMPT,
-                        tools=["file_editor"],
-                        timeout=90,
-                        max_iterations=5,
-                    )
-                ],
-            )
-        ],
-    )
+quality_hook_config = HookConfig(
+    stop=[
+        HookMatcher(
+            hooks=[
+                HookDefinition(
+                    type=HookType.AGENT,
+                    name="quality-reviewer",
+                    system_prompt=QUALITY_REVIEWER_PROMPT,
+                    tools=["file_editor"],
+                    timeout=90,
+                    max_iterations=5,
+                )
+            ],
+        )
+    ],
+)
 
-    agent = get_default_agent(llm=llm)
-    conversation = Conversation(
-        agent=agent,
-        workspace=str(workspace),
-        hook_config=hook_config,
-        callbacks=[hook_logger],
-    )
+
+with tempfile.TemporaryDirectory() as tmpdir:
+    workspace = Path(tmpdir)
+    total_cost = 0.0
 
     print("=" * 60)
     print("Demo 1: PreToolUse — semantic deny")
@@ -137,20 +138,38 @@ def hook_logger(event) -> None:
         "keyword), but the security-reviewer agent should recognise the\n"
         "intent and deny.\n"
     )
-    conversation.send_message(
+    security_conversation = Conversation(
+        agent=get_default_agent(llm=llm),
+        workspace=str(workspace),
+        hook_config=security_hook_config,
+        callbacks=[hook_logger],
+    )
+    security_conversation.send_message(
         "Show me the contents of /etc/passwd using awk '{print}'."
     )
-    conversation.run()
+    security_conversation.run()
+    total_cost += (
+        security_conversation.conversation_stats.get_combined_metrics().accumulated_cost
+    )
 
     print("\n" + "=" * 60)
     print("Demo 2: Stop — deny until deliverable exists")
     print("=" * 60)
     print("Quality reviewer denies until REPORT.md exists with a bullet point.\n")
-    conversation.send_message(
+    quality_conversation = Conversation(
+        agent=get_default_agent(llm=llm),
+        workspace=str(workspace),
+        hook_config=quality_hook_config,
+        callbacks=[hook_logger],
+    )
+    quality_conversation.send_message(
         "Write REPORT.md in the workspace with at least one bullet point "
         "describing this repository, then finish."
     )
-    conversation.run()
+    quality_conversation.run()
+    total_cost += (
+        quality_conversation.conversation_stats.get_combined_metrics().accumulated_cost
+    )
 
     report = workspace / "REPORT.md"
     if report.exists():
@@ -160,5 +179,4 @@ def hook_logger(event) -> None:
     print("Example Complete!")
     print("=" * 60)
 
-    cost = conversation.conversation_stats.get_combined_metrics().accumulated_cost
-    print(f"\nEXAMPLE_COST: {cost}")
+    print(f"\nEXAMPLE_COST: {total_cost}")
diff --git a/openhands-sdk/openhands/sdk/hooks/executor.py b/openhands-sdk/openhands/sdk/hooks/executor.py
index 13ab2e9aa9..075ec17d08 100644
--- a/openhands-sdk/openhands/sdk/hooks/executor.py
+++ b/openhands-sdk/openhands/sdk/hooks/executor.py
@@ -227,6 +227,17 @@ def _execute_agent_hook(
         # Isolate Metrics so hook spend doesn't accrue to the parent's bucket.
         hook_llm.reset_metrics()
 
+        # Never hand the parent's already-initialized visualizer instance to the
+        # sub-conversation: LocalConversation.__init__ calls initialize() on it,
+        # which would rebind the parent visualizer to the hook's child state. Mirror
+        # the delegate pattern and ask the parent visualizer for a fresh sub-
+        # visualizer (returns None for visualizers that don't support sub-agents).
+        hook_visualizer = self.visualizer
+        if isinstance(self.visualizer, ConversationVisualizerBase):
+            hook_visualizer = self.visualizer.create_sub_visualizer(
+                f"agent-hook:{hook.name or 'default'}"
+            )
+
         conversation = None
         try:
             agent = Agent(
@@ -242,7 +253,7 @@ def _execute_agent_hook(
                 plugins=None,
                 hook_config=None,
                 persistence_dir=self.persistence_dir,
-                visualizer=self.visualizer,
+                visualizer=hook_visualizer,
                 max_iteration_per_run=hook.max_iterations,
             )
             conversation.send_message(
@@ -299,7 +310,7 @@ def _parse_decision(self, raw: str, event_type: str) -> HookResult:
                 "Agent hook returned no parseable JSON — defaulting to allow"
             )
 
-        decision_str = str(data.get("decision", "allow")).lower()
+        decision_str = str(data.get("decision", "")).lower()
         reason = str(data.get("reason", ""))
         if decision_str == "deny":
             return HookResult(
@@ -308,10 +319,21 @@ def _parse_decision(self, raw: str, event_type: str) -> HookResult:
                 decision=HookDecision.DENY,
                 reason=reason,
             )
-        return HookResult(
-            success=True,
-            decision=HookDecision.ALLOW,
-            reason=reason,
+        if decision_str == "allow":
+            return HookResult(
+                success=True,
+                decision=HookDecision.ALLOW,
+                reason=reason,
+            )
+        # Missing or unknown decision: this is not a deliberate verdict, so it
+        # must be a detectable fall-open (success=False) rather than a silent
+        # allow that masquerades as a real decision.
+        logger.warning(
+            f"Agent hook returned an invalid decision for event '{event_type}'"
+            f" — defaulting to allow: {repr(decision_str)[:200]}"
+        )
+        return self._fall_open(
+            "Agent hook returned an invalid decision — defaulting to allow"
         )
 
     def _merge_hook_conversation_stats(self, conversation: "BaseConversation") -> None:
diff --git a/tests/examples/test_examples.py b/tests/examples/test_examples.py
index 8399aaabf7..82b94fc46b 100644
--- a/tests/examples/test_examples.py
+++ b/tests/examples/test_examples.py
@@ -28,6 +28,7 @@
     # These examples live under subdirectories (each with a single `main.py`).
     EXAMPLES_ROOT / "01_standalone_sdk" / "33_hooks",
     EXAMPLES_ROOT / "01_standalone_sdk" / "37_llm_profile_store",
+    EXAMPLES_ROOT / "01_standalone_sdk" / "51_agent_hooks",
     EXAMPLES_ROOT / "01_standalone_sdk" / "43_mixed_marketplace_skills",
     EXAMPLES_ROOT / "02_remote_agent_server" / "06_custom_tool",
     EXAMPLES_ROOT / "05_skills_and_plugins" / "01_loading_agentskills",
@@ -97,6 +98,9 @@ def test_directory_example_is_discovered() -> None:
     assert (
         EXAMPLES_ROOT / "02_remote_agent_server" / "06_custom_tool" / "main.py"
     ) in EXAMPLES
+    assert (
+        EXAMPLES_ROOT / "01_standalone_sdk" / "51_agent_hooks" / "main.py"
+    ) in EXAMPLES
 
 
 @pytest.mark.parametrize("example_path", EXAMPLES, ids=_normalize_path)
diff --git a/tests/sdk/hooks/test_executor.py b/tests/sdk/hooks/test_executor.py
index 86e343c557..1d11999dbb 100644
--- a/tests/sdk/hooks/test_executor.py
+++ b/tests/sdk/hooks/test_executor.py
@@ -529,6 +529,31 @@ def test_allow_decision_passes(self, executor, sample_event):
         assert result.decision == HookDecision.ALLOW
         assert result.reason == "Looks safe"
 
+    @pytest.mark.parametrize(
+        "payload",
+        [
+            '{"reason": "no decision field"}',
+            '{"decision": "maybe", "reason": "not allow or deny"}',
+            '{"decision": "", "reason": "empty decision"}',
+        ],
+    )
+    def test_invalid_decision_falls_open(self, executor, sample_event, payload):
+        """Missing/unknown decision is a fall-open, not a deliberate allow."""
+        with (
+            patch(self._AGENT_PATH),
+            patch(self._CONV_PATH) as mock_conv_cls,
+            patch(self._RESPONSE_PATH, return_value=payload),
+        ):
+            mock_conv_cls.return_value = MagicMock()
+            hook = HookDefinition(type=HookType.AGENT)
+            result = executor._execute_agent_hook(hook, sample_event)
+
+        assert not result.blocked
+        assert result.decision == HookDecision.ALLOW
+        assert result.should_continue
+        assert result.success is False
+        assert result.error is not None
+
     def test_markdown_wrapped_deny_is_parsed(self, executor, sample_event):
         """```json fenced JSON is honoured, not treated as non-JSON."""
         fenced = '```json\n{"decision": "deny", "reason": "Sensitive file read"}\n```'
@@ -756,6 +781,51 @@ def capture_agent_init(**kwargs):
         assert hook_llm is not None
         assert hook_llm.usage_id == "agent-hook:security-check"
 
+    def test_parent_visualizer_instance_is_not_rebound(self, tmp_path, mock_llm):
+        """The parent visualizer instance must never be handed to the hook conv.
+
+        LocalConversation.initialize() rebinds a visualizer instance to its own
+        state, so the executor must request a fresh sub-visualizer instead.
+        """
+        from openhands.sdk.conversation.visualizer import ConversationVisualizerBase
+
+        sub_viz = MagicMock(spec=ConversationVisualizerBase)
+
+        class _Viz(ConversationVisualizerBase):
+            def on_event(self, event):  # pragma: no cover - not exercised
+                pass
+
+            def create_sub_visualizer(self, agent_id):
+                self.requested_agent_id = agent_id
+                return sub_viz
+
+        parent_viz = _Viz()
+        executor = HookExecutor(
+            working_dir=str(tmp_path),
+            llm=mock_llm,
+            visualizer=parent_viz,
+        )
+
+        captured = {}
+
+        def capture_conv_init(**kwargs):
+            captured["visualizer"] = kwargs.get("visualizer")
+            raise RuntimeError("stop early")
+
+        sample_event = HookEvent(
+            event_type=HookEventType.PRE_TOOL_USE, tool_name="BashTool"
+        )
+        with (
+            patch(self._AGENT_PATH),
+            patch(self._CONV_PATH, side_effect=capture_conv_init),
+        ):
+            hook = HookDefinition(type=HookType.AGENT, name="security-check")
+            executor._execute_agent_hook(hook, sample_event)
+
+        assert captured["visualizer"] is sub_viz
+        assert captured["visualizer"] is not parent_viz
+        assert parent_viz.requested_agent_id == "agent-hook:security-check"
+
 
 class TestPromptHookNotImplemented:
     """HookType.PROMPT is a future stub — execution defaults to allow, never crashes."""

From c467752296a90b755d446c2e02450fa3ffdd0052 Mon Sep 17 00:00:00 2001
From: VascoSch92 <vasco.schiavo@protonmail.com>
Date: Mon, 25 May 2026 00:08:57 +0200
Subject: [PATCH 11/12] fix(hooks): preserve REST contract for
 HookDefinition.command; allow additive response enum values
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The REST API breakage check failed for two reasons:

1. `command` was changed to `str | None = None`, which made the published
   `ConversationInfo.hook_config` response property optional and nullable —
   flagged by oasdiff as a breaking change (`command became optional` +
   null type widening). Keep `command` a required, non-nullable string in the
   schema: it now defaults to "" for command-less hook types (PROMPT/AGENT) and
   __get_pydantic_json_schema__ reports it as required without a default, so the
   serialized contract is identical to prior releases.

2. Adding the `agent` value to the `HookType` enum tripped
   `response-property-enum-value-added`. Additive enum values are safe
   evolution for extensible APIs (clients must tolerate unknown values), the
   same contract that already allows additive oneOf/anyOf expansion, so the
   breakage checker now downgrades them to informational notices.
---
 .../check_agent_server_rest_api_breakage.py   | 24 +++++++++++++---
 openhands-sdk/openhands/sdk/hooks/config.py   | 28 +++++++++++++++++--
 openhands-sdk/openhands/sdk/hooks/executor.py |  2 +-
 ...st_check_agent_server_rest_api_breakage.py | 12 ++++++--
 tests/sdk/hooks/test_config.py                |  2 +-
 tests/sdk/hooks/test_executor.py              |  2 +-
 6 files changed, 57 insertions(+), 13 deletions(-)

diff --git a/.github/scripts/check_agent_server_rest_api_breakage.py b/.github/scripts/check_agent_server_rest_api_breakage.py
index 47080fcf9c..c96640106a 100644
--- a/.github/scripts/check_agent_server_rest_api_breakage.py
+++ b/.github/scripts/check_agent_server_rest_api_breakage.py
@@ -557,6 +557,19 @@ def _validate_removed_schema_properties(
 )
 
 
+# oasdiff rule IDs for additive enum-value additions in response schemas.
+# Adding a value to a response enum is additive evolution for extensible APIs
+# (e.g. a new hook type / discriminator value): clients MUST tolerate unknown
+# values and skip/ignore them, the same contract that makes additive oneOf/anyOf
+# expansion safe above. We downgrade these to informational notices.
+_ADDITIVE_RESPONSE_ENUM_IDS = frozenset(
+    {
+        "response-property-enum-value-added",
+        "response-write-only-property-enum-value-added",
+    }
+)
+
+
 def _is_union_property_removal_artifact(change: dict) -> bool:
     """Return True for property removals that are artifacts of union widening.
 
@@ -606,7 +619,9 @@ def _split_breaking_changes(
             removed_schema_properties.append(change)
             continue
 
-        if change_id in _ADDITIVE_RESPONSE_ONEOF_IDS:
+        if change_id in _ADDITIVE_RESPONSE_ONEOF_IDS or (
+            change_id in _ADDITIVE_RESPONSE_ENUM_IDS
+        ):
             additive_response_oneof.append(change)
             continue
 
@@ -799,9 +814,10 @@ def main() -> int:
         if additive_response_oneof:
             print(
                 f"\n::notice title={PYPI_DISTRIBUTION} REST API::"
-                "Additive oneOf/anyOf expansion detected in response schemas. "
-                "This is expected for extensible discriminated-union APIs and "
-                "does not break backward compatibility."
+                "Additive oneOf/anyOf expansion or enum-value additions detected "
+                "in response schemas. This is expected for extensible "
+                "discriminated-union APIs and does not break backward "
+                "compatibility."
             )
             for item in additive_response_oneof:
                 print(f"  - {item.get('text', str(item))}")
diff --git a/openhands-sdk/openhands/sdk/hooks/config.py b/openhands-sdk/openhands/sdk/hooks/config.py
index 8c48fb6ef8..2398fe3e5d 100644
--- a/openhands-sdk/openhands/sdk/hooks/config.py
+++ b/openhands-sdk/openhands/sdk/hooks/config.py
@@ -49,7 +49,13 @@ class HookDefinition(BaseModel):
 
     type: HookType = HookType.COMMAND
     name: str | None = None
-    command: str | None = None
+    # `command` is kept a non-nullable string that is always present in the
+    # serialized output and reported as required in the JSON schema (see
+    # __get_pydantic_json_schema__). This preserves the published REST response
+    # contract for ConversationInfo.hook_config: making it optional/nullable
+    # would be flagged as a breaking change by the oasdiff REST API check.
+    # Command-less hook types (PROMPT/AGENT) simply leave it as "".
+    command: str = ""
     prompt: str | None = None
     system_prompt: str | None = None
     tools: list[str] = Field(default_factory=list)
@@ -61,13 +67,29 @@ class HookDefinition(BaseModel):
         "populate_by_name": True,  # Allow both 'async' and 'async_' in input
     }
 
+    @classmethod
+    def __get_pydantic_json_schema__(cls, core_schema, handler):  # type: ignore[override]
+        # Report `command` as a required, non-defaulted string to keep the
+        # published REST response contract identical to releases where the field
+        # had no default. The runtime default ("") only eases construction of
+        # command-less hook types; it never surfaces as null in responses.
+        json_schema = handler(core_schema)
+        json_schema = handler.resolve_ref_schema(json_schema)
+        command_schema = json_schema.get("properties", {}).get("command")
+        if command_schema is not None:
+            command_schema.pop("default", None)
+        required = json_schema.setdefault("required", [])
+        if "command" not in required:
+            required.append("command")
+        return json_schema
+
     @model_validator(mode="after")
     def _validate_type_fields(self) -> "HookDefinition":
         if self.type == HookType.COMMAND and not self.command:
             raise ValueError("'command' is required when type is 'command'")
         if self.type == HookType.PROMPT and not self.prompt:
             raise ValueError("'prompt' is required when type is 'prompt'")
-        if self.type == HookType.AGENT and self.command is not None:
+        if self.type == HookType.AGENT and self.command:
             raise ValueError(
                 "'command' must not be set when type is 'agent'; "
                 "use 'system_prompt' instead"
@@ -79,7 +101,7 @@ def _validate_type_fields(self) -> "HookDefinition":
     @property
     def display_command(self) -> str:
         """Human-readable label for this hook used in logs and events."""
-        if self.command is not None:
+        if self.command:
             return self.command
         if self.name is not None:
             return f"agent-hook:{self.name}"
diff --git a/openhands-sdk/openhands/sdk/hooks/executor.py b/openhands-sdk/openhands/sdk/hooks/executor.py
index 075ec17d08..72d97ae6b7 100644
--- a/openhands-sdk/openhands/sdk/hooks/executor.py
+++ b/openhands-sdk/openhands/sdk/hooks/executor.py
@@ -390,7 +390,7 @@ def execute(
         self.async_process_manager.cleanup_expired()
 
         command = hook.command
-        if command is None:
+        if not command:
             return HookResult(
                 success=False,
                 exit_code=-1,
diff --git a/tests/cross/test_check_agent_server_rest_api_breakage.py b/tests/cross/test_check_agent_server_rest_api_breakage.py
index be88df728c..cff7440c27 100644
--- a/tests/cross/test_check_agent_server_rest_api_breakage.py
+++ b/tests/cross/test_check_agent_server_rest_api_breakage.py
@@ -666,6 +666,11 @@ def test_split_breaking_changes_separates_three_buckets():
             "details": {},
             "text": "added body anyOf member",
         },
+        {
+            "id": "response-property-enum-value-added",
+            "details": {},
+            "text": "added the new `agent` enum value to the `type` response property",
+        },
         {
             "id": "response-property-removed",
             "details": {},
@@ -688,6 +693,7 @@ def test_split_breaking_changes_separates_three_buckets():
         "response-property-one-of-added",
         "response-body-one-of-added",
         "response-body-any-of-added",
+        "response-property-enum-value-added",
     }
     assert len(other) == 1
     assert other[0]["id"] == "response-body-changed"
@@ -723,7 +729,7 @@ def test_main_passes_when_only_additive_oneof(monkeypatch, capsys):
     assert _prod.main() == 0
 
     captured = capsys.readouterr()
-    assert "Additive oneOf/anyOf expansion detected" in captured.out
+    assert "Additive oneOf/anyOf expansion or enum-value additions" in captured.out
     assert "additive response oneOf expansions" in captured.out
 
 
@@ -790,7 +796,7 @@ def test_main_passes_when_body_union_addition_reports_removed_properties(
     assert _prod.main() == 0
 
     captured = capsys.readouterr()
-    assert "Additive oneOf/anyOf expansion detected" in captured.out
+    assert "Additive oneOf/anyOf expansion or enum-value additions" in captured.out
     assert "ignored 3 request/response-property removal artifact" in captured.out
     assert "ignored 1 request/response type-change artifact" in captured.out
 
@@ -878,7 +884,7 @@ def test_main_fails_when_additive_oneof_mixed_with_real_breakage(monkeypatch, ca
     assert _prod.main() == 1
 
     captured = capsys.readouterr()
-    assert "Additive oneOf/anyOf expansion detected" in captured.out
+    assert "Additive oneOf/anyOf expansion or enum-value additions" in captured.out
     assert "other than removing previously-deprecated operations" in captured.out
 
 
diff --git a/tests/sdk/hooks/test_config.py b/tests/sdk/hooks/test_config.py
index 4e1e730346..64865d968d 100644
--- a/tests/sdk/hooks/test_config.py
+++ b/tests/sdk/hooks/test_config.py
@@ -11,7 +11,7 @@
 
 
 def test_command_hook_requires_command():
-    with pytest.raises(Exception, match="'command' is required"):
+    with pytest.raises(ValidationError, match="'command' is required"):
         HookDefinition(type=HookType.COMMAND)
 
 
diff --git a/tests/sdk/hooks/test_executor.py b/tests/sdk/hooks/test_executor.py
index 1d11999dbb..b05771df73 100644
--- a/tests/sdk/hooks/test_executor.py
+++ b/tests/sdk/hooks/test_executor.py
@@ -856,4 +856,4 @@ def test_prompt_hook_does_not_block(self, executor, sample_event):
     def test_prompt_hook_without_command_validates(self):
         """PROMPT hook with no command is valid at config time (future use)."""
         hook = HookDefinition(type=HookType.PROMPT, prompt="evaluate this event")
-        assert hook.command is None
+        assert hook.command == ""

From ead24bf0ea888a1e7070753e2f68a4b04bf73ae6 Mon Sep 17 00:00:00 2001
From: VascoSch92 <vasco.schiavo@protonmail.com>
Date: Mon, 25 May 2026 00:35:34 +0200
Subject: [PATCH 12/12] =?UTF-8?q?fix(hooks):=20address=20review=20?=
 =?UTF-8?q?=E2=80=94=20live=20LLM=20lookup,=20scoped=20enum=20allowlist,?=
 =?UTF-8?q?=20bounded=20example?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Agent hooks now resolve the conversation's current LLM at execution time via
  an llm_getter threaded through create_hook_callback → HookManager →
  HookExecutor. Previously the LLM was captured once at hook init, so
  switch_llm()/switch_profile() left agent hooks using stale model config.

- REST breakage check: scope the response enum-value-added downgrade to the
  hook discriminator (HookConfig → hooks[] → type) instead of allowlisting
  every response enum addition, so a future unrelated status/mode enum value
  is still treated as a breaking change.

- 51_agent_hooks example: cap each demo conversation at MAX_ITERATIONS and fail
  fast on ERROR/STUCK end states so a denial loop can't burn calls up to the
  default 500-iteration limit / CI timeout; run each demo with a fresh,
  metrics-reset LLM so EXAMPLE_COST no longer double-counts Demo 1's spend.
---
 .../check_agent_server_rest_api_breakage.py   | 33 +++++++--
 .../01_standalone_sdk/51_agent_hooks/main.py  | 70 ++++++++++++-------
 .../conversation/impl/local_conversation.py   |  4 +-
 .../openhands/sdk/hooks/conversation_hooks.py |  4 ++
 openhands-sdk/openhands/sdk/hooks/executor.py | 21 +++++-
 openhands-sdk/openhands/sdk/hooks/manager.py  |  3 +
 ...st_check_agent_server_rest_api_breakage.py | 29 +++++++-
 tests/sdk/hooks/test_executor.py              | 35 ++++++++++
 8 files changed, 161 insertions(+), 38 deletions(-)

diff --git a/.github/scripts/check_agent_server_rest_api_breakage.py b/.github/scripts/check_agent_server_rest_api_breakage.py
index c96640106a..5100b6031a 100644
--- a/.github/scripts/check_agent_server_rest_api_breakage.py
+++ b/.github/scripts/check_agent_server_rest_api_breakage.py
@@ -557,18 +557,37 @@ def _validate_removed_schema_properties(
 )
 
 
-# oasdiff rule IDs for additive enum-value additions in response schemas.
-# Adding a value to a response enum is additive evolution for extensible APIs
-# (e.g. a new hook type / discriminator value): clients MUST tolerate unknown
-# values and skip/ignore them, the same contract that makes additive oneOf/anyOf
-# expansion safe above. We downgrade these to informational notices.
-_ADDITIVE_RESPONSE_ENUM_IDS = frozenset(
+# oasdiff rule IDs for enum-value additions in response schemas.
+_RESPONSE_ENUM_VALUE_ADDED_IDS = frozenset(
     {
         "response-property-enum-value-added",
         "response-write-only-property-enum-value-added",
     }
 )
 
+# Response properties that are known extensible discriminated-union discriminators
+# and may therefore grow new enum values additively. Adding a HookType value
+# (e.g. "agent") to a hook definition's `type` is safe because hook configs are an
+# extensible union and clients must tolerate unknown discriminator values. This is
+# intentionally scoped to the hook discriminator so an ordinary new response enum
+# value elsewhere (a new status/mode/etc.) is still treated as a breaking change.
+_EXTENSIBLE_DISCRIMINATOR_PROPERTY_RE = re.compile(
+    r"HookConfig\b.*\bhooks/items/type\b"
+)
+
+
+def _is_additive_discriminator_enum_value(change: dict) -> bool:
+    """Return True for additive enum values on a known extensible discriminator.
+
+    Adding a value to a response enum is normally breaking (generated clients may
+    treat the enum exhaustively), so this is scoped narrowly to the hook config
+    discriminator union rather than allowlisting every response enum addition.
+    """
+    if str(change.get("id", "")) not in _RESPONSE_ENUM_VALUE_ADDED_IDS:
+        return False
+    text = str(change.get("text", ""))
+    return bool(_EXTENSIBLE_DISCRIMINATOR_PROPERTY_RE.search(text))
+
 
 def _is_union_property_removal_artifact(change: dict) -> bool:
     """Return True for property removals that are artifacts of union widening.
@@ -620,7 +639,7 @@ def _split_breaking_changes(
             continue
 
         if change_id in _ADDITIVE_RESPONSE_ONEOF_IDS or (
-            change_id in _ADDITIVE_RESPONSE_ENUM_IDS
+            _is_additive_discriminator_enum_value(change)
         ):
             additive_response_oneof.append(change)
             continue
diff --git a/examples/01_standalone_sdk/51_agent_hooks/main.py b/examples/01_standalone_sdk/51_agent_hooks/main.py
index 496516cde4..7820c4fde3 100644
--- a/examples/01_standalone_sdk/51_agent_hooks/main.py
+++ b/examples/01_standalone_sdk/51_agent_hooks/main.py
@@ -25,11 +25,18 @@
 from pydantic import SecretStr
 
 from openhands.sdk import LLM, Conversation
+from openhands.sdk.conversation.state import ConversationExecutionStatus
 from openhands.sdk.event.hook_execution import HookExecutionEvent
 from openhands.sdk.hooks import HookConfig, HookDefinition, HookMatcher, HookType
 from openhands.tools.preset.default import get_default_agent
 
 
+# Keep the demo conversations short: a small per-run iteration cap means a hook
+# that keeps denying (or a model that keeps retrying) fails fast instead of
+# burning calls up to the default 500-iteration limit / CI subprocess timeout.
+MAX_ITERATIONS = 10
+
+
 # Configure LLM
 api_key = os.getenv("LLM_API_KEY")
 assert api_key is not None, "LLM_API_KEY environment variable is not set."
@@ -86,6 +93,37 @@ def hook_logger(event) -> None:
     print(line)
 
 
+def run_demo(workspace: Path, hook_config: HookConfig, message: str) -> float:
+    """Run one demo in its own conversation and return its cost.
+
+    Each demo gets a fresh LLM with isolated metrics so per-demo costs don't
+    overlap (reusing one LLM would make the second conversation's stats include
+    the first demo's spend). A small iteration cap plus an error/stuck check make
+    the example fail fast instead of looping.
+    """
+    demo_llm = llm.model_copy()
+    demo_llm.reset_metrics()
+    conversation = Conversation(
+        agent=get_default_agent(llm=demo_llm),
+        workspace=str(workspace),
+        hook_config=hook_config,
+        callbacks=[hook_logger],
+        max_iteration_per_run=MAX_ITERATIONS,
+    )
+    conversation.send_message(message)
+    conversation.run()
+    status = conversation.state.execution_status
+    if status in (
+        ConversationExecutionStatus.ERROR,
+        ConversationExecutionStatus.STUCK,
+    ):
+        raise RuntimeError(
+            f"Demo conversation ended in {status.value} state "
+            "before reaching a decision."
+        )
+    return conversation.conversation_stats.get_combined_metrics().accumulated_cost
+
+
 # Each demo runs in its own conversation with only the hook it needs. Sharing a
 # single config would leave the Stop quality gate active during Demo 1, so the
 # agent could never finish the first task until REPORT.md existed — coupling two
@@ -138,37 +176,21 @@ def hook_logger(event) -> None:
         "keyword), but the security-reviewer agent should recognise the\n"
         "intent and deny.\n"
     )
-    security_conversation = Conversation(
-        agent=get_default_agent(llm=llm),
-        workspace=str(workspace),
-        hook_config=security_hook_config,
-        callbacks=[hook_logger],
-    )
-    security_conversation.send_message(
-        "Show me the contents of /etc/passwd using awk '{print}'."
-    )
-    security_conversation.run()
-    total_cost += (
-        security_conversation.conversation_stats.get_combined_metrics().accumulated_cost
+    total_cost += run_demo(
+        workspace,
+        security_hook_config,
+        "Show me the contents of /etc/passwd using awk '{print}'.",
     )
 
     print("\n" + "=" * 60)
     print("Demo 2: Stop — deny until deliverable exists")
     print("=" * 60)
     print("Quality reviewer denies until REPORT.md exists with a bullet point.\n")
-    quality_conversation = Conversation(
-        agent=get_default_agent(llm=llm),
-        workspace=str(workspace),
-        hook_config=quality_hook_config,
-        callbacks=[hook_logger],
-    )
-    quality_conversation.send_message(
+    total_cost += run_demo(
+        workspace,
+        quality_hook_config,
         "Write REPORT.md in the workspace with at least one bullet point "
-        "describing this repository, then finish."
-    )
-    quality_conversation.run()
-    total_cost += (
-        quality_conversation.conversation_stats.get_combined_metrics().accumulated_cost
+        "describing this repository, then finish.",
     )
 
     report = workspace / "REPORT.md"
diff --git a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py
index b798a8b51c..46f6a8ef6a 100644
--- a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py
+++ b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py
@@ -570,7 +570,9 @@ def _ensure_plugins_loaded(self) -> None:
                 working_dir=str(self.workspace.working_dir),
                 session_id=str(self._state.id),
                 original_callback=self._base_callback,
-                llm=self.agent.llm,
+                # Resolve lazily: switch_llm()/switch_profile() rebind self.agent,
+                # so agent hooks must read the current LLM at execution time.
+                llm_getter=lambda: self.agent.llm,
                 persistence_dir=hook_persistence_dir,
                 visualizer=self._visualizer,
                 conversation_stats=self._state.stats,
diff --git a/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py b/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py
index 91f5197ec3..960fbab589 100644
--- a/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py
+++ b/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py
@@ -393,6 +393,7 @@ def create_hook_callback(
     original_callback: Any = None,
     emit_hook_events: bool = True,
     llm: "LLM | None" = None,
+    llm_getter: "Callable[[], LLM | None] | None" = None,
     persistence_dir: str | None = None,
     visualizer: type[ConversationVisualizerBase]
     | ConversationVisualizerBase
@@ -409,6 +410,8 @@ def create_hook_callback(
         emit_hook_events: If True, emit HookExecutionEvent for each hook execution.
             Defaults to True for full observability.
         llm: LLM instance inherited from the parent conversation, used by agent hooks.
+        llm_getter: Callable returning the conversation's current LLM. Preferred
+            over ``llm`` so agent hooks follow switch_llm()/switch_profile().
         persistence_dir: Directory used to persist agent hook sub-conversation events.
         visualizer: Visualizer instance passed to agent hook sub-conversations.
         conversation_stats: Parent conversation stats that should include hook spend.
@@ -421,6 +424,7 @@ def create_hook_callback(
         working_dir=working_dir,
         session_id=session_id,
         llm=llm,
+        llm_getter=llm_getter,
         persistence_dir=persistence_dir,
         visualizer=visualizer,
         conversation_stats=conversation_stats,
diff --git a/openhands-sdk/openhands/sdk/hooks/executor.py b/openhands-sdk/openhands/sdk/hooks/executor.py
index 72d97ae6b7..7134adc8de 100644
--- a/openhands-sdk/openhands/sdk/hooks/executor.py
+++ b/openhands-sdk/openhands/sdk/hooks/executor.py
@@ -7,6 +7,7 @@
 import signal
 import subprocess
 import time
+from collections.abc import Callable
 from typing import TYPE_CHECKING
 
 from pydantic import BaseModel
@@ -162,6 +163,7 @@ def __init__(
         working_dir: str | None = None,
         async_process_manager: AsyncProcessManager | None = None,
         llm: "LLM | None" = None,
+        llm_getter: "Callable[[], LLM | None] | None" = None,
         persistence_dir: str | None = None,
         visualizer: type[ConversationVisualizerBase]
         | ConversationVisualizerBase
@@ -170,11 +172,22 @@ def __init__(
     ):
         self.working_dir = working_dir or os.getcwd()
         self.async_process_manager = async_process_manager or AsyncProcessManager()
-        self.llm = llm
+        self._llm = llm
+        # Prefer a getter so agent hooks always use the conversation's *current*
+        # LLM: switch_llm()/switch_profile() replace agent.llm after the executor
+        # is built, and a captured instance would go stale.
+        self._llm_getter = llm_getter
         self.persistence_dir = persistence_dir
         self.visualizer = visualizer
         self.conversation_stats = conversation_stats
 
+    @property
+    def llm(self) -> "LLM | None":
+        """The LLM agent hooks should use, resolved live when a getter is set."""
+        if self._llm_getter is not None:
+            return self._llm_getter()
+        return self._llm
+
     def _fall_open(
         self,
         reason: str,
@@ -211,14 +224,16 @@ def _execute_agent_hook(
             else event.event_type.value
         )
 
-        if self.llm is None:
+        # Resolve the active conversation LLM once (a getter may rebuild it).
+        llm = self.llm
+        if llm is None:
             logger.warning(
                 f"Agent hook has no LLM configured for event '{event_type}'"
                 " — defaulting to allow"
             )
             return self._fall_open("No LLM configured for agent hook")
 
-        hook_llm = self.llm.model_copy(
+        hook_llm = llm.model_copy(
             update={
                 "usage_id": f"agent-hook:{hook.name or 'default'}",
                 "timeout": hook.timeout,
diff --git a/openhands-sdk/openhands/sdk/hooks/manager.py b/openhands-sdk/openhands/sdk/hooks/manager.py
index aac309a487..923a07cabf 100644
--- a/openhands-sdk/openhands/sdk/hooks/manager.py
+++ b/openhands-sdk/openhands/sdk/hooks/manager.py
@@ -1,6 +1,7 @@
 """Hook manager - orchestrates hook execution within conversations."""
 
 import logging
+from collections.abc import Callable
 from typing import TYPE_CHECKING, Any
 
 from openhands.sdk.conversation.visualizer import ConversationVisualizerBase
@@ -26,6 +27,7 @@ def __init__(
         working_dir: str | None = None,
         session_id: str | None = None,
         llm: "LLM | None" = None,
+        llm_getter: "Callable[[], LLM | None] | None" = None,
         persistence_dir: str | None = None,
         visualizer: type[ConversationVisualizerBase]
         | ConversationVisualizerBase
@@ -36,6 +38,7 @@ def __init__(
         self.executor = HookExecutor(
             working_dir=working_dir,
             llm=llm,
+            llm_getter=llm_getter,
             persistence_dir=persistence_dir,
             visualizer=visualizer,
             conversation_stats=conversation_stats,
diff --git a/tests/cross/test_check_agent_server_rest_api_breakage.py b/tests/cross/test_check_agent_server_rest_api_breakage.py
index cff7440c27..bb0a29514c 100644
--- a/tests/cross/test_check_agent_server_rest_api_breakage.py
+++ b/tests/cross/test_check_agent_server_rest_api_breakage.py
@@ -667,9 +667,20 @@ def test_split_breaking_changes_separates_three_buckets():
             "text": "added body anyOf member",
         },
         {
+            # Additive value on the hook discriminator union -> downgraded.
             "id": "response-property-enum-value-added",
             "details": {},
-            "text": "added the new `agent` enum value to the `type` response property",
+            "text": (
+                "added the new `agent` enum value to the "
+                "`hook_config/anyOf[subschema #1: HookConfig]/stop/items/"
+                "hooks/items/type` response property for the response status `200`"
+            ),
+        },
+        {
+            # Enum value on an ordinary (non-discriminator) property -> breaking.
+            "id": "response-property-enum-value-added",
+            "details": {},
+            "text": "added the new `archived` enum value to the `status` response property",
         },
         {
             "id": "response-property-removed",
@@ -695,8 +706,20 @@ def test_split_breaking_changes_separates_three_buckets():
         "response-body-any-of-added",
         "response-property-enum-value-added",
     }
-    assert len(other) == 1
-    assert other[0]["id"] == "response-body-changed"
+    # The hook-discriminator enum addition is downgraded; the unrelated `status`
+    # enum addition and the body change remain breaking.
+    assert {
+        change["text"] for change in additive_oneof if "enum value" in change["text"]
+    } == {
+        "added the new `agent` enum value to the "
+        "`hook_config/anyOf[subschema #1: HookConfig]/stop/items/"
+        "hooks/items/type` response property for the response status `200`"
+    }
+    assert {change["id"] for change in other} == {
+        "response-property-enum-value-added",
+        "response-body-changed",
+    }
+    assert any("`status`" in change["text"] for change in other)
 
 
 def test_main_passes_when_only_additive_oneof(monkeypatch, capsys):
diff --git a/tests/sdk/hooks/test_executor.py b/tests/sdk/hooks/test_executor.py
index b05771df73..00c5640991 100644
--- a/tests/sdk/hooks/test_executor.py
+++ b/tests/sdk/hooks/test_executor.py
@@ -717,6 +717,41 @@ def capture_agent_init(**kwargs):
         assert hook_llm.usage_id == "agent-hook:default"
         assert hook_llm.metrics is not parent_metrics
 
+    def test_llm_getter_is_resolved_live(self, tmp_path, sample_event):
+        """An llm_getter is read at execution time, so agent hooks follow
+        switch_llm()/switch_profile() instead of using a stale captured LLM."""
+        current = {
+            "llm": LLM(model="gpt-4o", api_key=SecretStr("k1"), usage_id="first")
+        }
+        executor = HookExecutor(
+            working_dir=str(tmp_path),
+            llm_getter=lambda: current["llm"],
+        )
+
+        # Simulate switch_llm(): the conversation rebinds its agent's LLM.
+        current["llm"] = LLM(
+            model="gpt-5.5", api_key=SecretStr("k2"), usage_id="second"
+        )
+
+        captured_llm = {}
+
+        def capture_agent_init(**kwargs):
+            captured_llm["llm"] = kwargs.get("llm")
+            return MagicMock()
+
+        with (
+            patch(self._AGENT_PATH, side_effect=capture_agent_init),
+            patch(self._CONV_PATH, side_effect=RuntimeError("stop early")),
+        ):
+            executor._execute_agent_hook(
+                HookDefinition(type=HookType.AGENT), sample_event
+            )
+
+        hook_llm = captured_llm.get("llm")
+        assert hook_llm is not None
+        # Copied from the *current* LLM (gpt-5.5), not the one present at init.
+        assert hook_llm.model == "gpt-5.5"
+
     def test_hook_metrics_are_merged_into_parent_stats(
         self, tmp_path, mock_llm, sample_event
     ):