diff --git a/.github/scripts/check_agent_server_rest_api_breakage.py b/.github/scripts/check_agent_server_rest_api_breakage.py index 47080fcf9c..5100b6031a 100644 --- a/.github/scripts/check_agent_server_rest_api_breakage.py +++ b/.github/scripts/check_agent_server_rest_api_breakage.py @@ -557,6 +557,38 @@ def _validate_removed_schema_properties( ) +# oasdiff rule IDs for enum-value additions in response schemas. +_RESPONSE_ENUM_VALUE_ADDED_IDS = frozenset( + { + "response-property-enum-value-added", + "response-write-only-property-enum-value-added", + } +) + +# Response properties that are known extensible discriminated-union discriminators +# and may therefore grow new enum values additively. Adding a HookType value +# (e.g. "agent") to a hook definition's `type` is safe because hook configs are an +# extensible union and clients must tolerate unknown discriminator values. This is +# intentionally scoped to the hook discriminator so an ordinary new response enum +# value elsewhere (a new status/mode/etc.) is still treated as a breaking change. +_EXTENSIBLE_DISCRIMINATOR_PROPERTY_RE = re.compile( + r"HookConfig\b.*\bhooks/items/type\b" +) + + +def _is_additive_discriminator_enum_value(change: dict) -> bool: + """Return True for additive enum values on a known extensible discriminator. + + Adding a value to a response enum is normally breaking (generated clients may + treat the enum exhaustively), so this is scoped narrowly to the hook config + discriminator union rather than allowlisting every response enum addition. + """ + if str(change.get("id", "")) not in _RESPONSE_ENUM_VALUE_ADDED_IDS: + return False + text = str(change.get("text", "")) + return bool(_EXTENSIBLE_DISCRIMINATOR_PROPERTY_RE.search(text)) + + def _is_union_property_removal_artifact(change: dict) -> bool: """Return True for property removals that are artifacts of union widening. @@ -606,7 +638,9 @@ def _split_breaking_changes( removed_schema_properties.append(change) continue - if change_id in _ADDITIVE_RESPONSE_ONEOF_IDS: + if change_id in _ADDITIVE_RESPONSE_ONEOF_IDS or ( + _is_additive_discriminator_enum_value(change) + ): additive_response_oneof.append(change) continue @@ -799,9 +833,10 @@ def main() -> int: if additive_response_oneof: print( f"\n::notice title={PYPI_DISTRIBUTION} REST API::" - "Additive oneOf/anyOf expansion detected in response schemas. " - "This is expected for extensible discriminated-union APIs and " - "does not break backward compatibility." + "Additive oneOf/anyOf expansion or enum-value additions detected " + "in response schemas. This is expected for extensible " + "discriminated-union APIs and does not break backward " + "compatibility." ) for item in additive_response_oneof: print(f" - {item.get('text', str(item))}") diff --git a/examples/01_standalone_sdk/51_agent_hooks/README.md b/examples/01_standalone_sdk/51_agent_hooks/README.md new file mode 100644 index 0000000000..a3d6cd6d97 --- /dev/null +++ b/examples/01_standalone_sdk/51_agent_hooks/README.md @@ -0,0 +1,60 @@ +# Agent-based Hooks Example + +This folder demonstrates the `type="agent"` hook — a lifecycle hook whose +decision is produced by an LLM-driven sub-agent rather than a shell script. + +For shell-command hooks see [`../33_hooks/`](../33_hooks). + +## Why an agent hook + +A shell-based PreToolUse hook can only block what its blacklist literally +matches. The agent rewrites `cat /etc/passwd` as `awk '{print}' /etc/passwd` +and slips through. An agent hook reasons about the **semantic intent** of the +command — "reading a sensitive system file" — and denies regardless of the +exact tool name used. + +## Example + +- **main.py** — Two agent hooks, each in its own conversation: + - **PreToolUse** "security reviewer" denies a command whose intent is to + read `/etc/passwd`, even though no obvious keyword appears in a blacklist. + - **Stop** "quality reviewer" refuses to let the main agent finish until + the required deliverable (`REPORT.md`) is present in the workspace. + +Each hook decision is printed to the console via a `HookExecutionEvent` +callback, so you can watch the allow/deny outcomes as the demo runs. + +## Running + +```bash +export LLM_API_KEY="your-key" +export LLM_MODEL="anthropic/claude-sonnet-4-5-20250929" # optional +export LLM_BASE_URL="https://your-endpoint" # optional + +python main.py +``` + +## How an agent hook is configured + +```python +HookDefinition( + type=HookType.AGENT, + name="security-reviewer", # bucket for cost metrics (agent-hook:) + system_prompt="...", # instructs the hook agent; must request JSON + tools=["file_editor"], # optional tools the hook agent may use + # (use registered names, e.g. "file_editor", + # "terminal" — not class names like + # "FileEditorTool") + timeout=60, # forwarded to the per-hook LLM copy + max_iterations=3, # cap on hook sub-conversation steps +) +``` + +The hook agent receives the event JSON and must reply with: + +```json +{"decision": "allow" | "deny", "reason": ""} +``` + +Anything else (non-JSON, missing field, sub-conversation error) defaults to +`allow` so a broken hook cannot wedge the main agent. diff --git a/examples/01_standalone_sdk/51_agent_hooks/main.py b/examples/01_standalone_sdk/51_agent_hooks/main.py new file mode 100644 index 0000000000..7820c4fde3 --- /dev/null +++ b/examples/01_standalone_sdk/51_agent_hooks/main.py @@ -0,0 +1,204 @@ +"""OpenHands Agent SDK — Agent-based Hooks Example + +Demonstrates the `type="agent"` hook, which evaluates lifecycle events with an +LLM-driven sub-agent instead of a shell script. The hook agent receives the +event JSON, reasons about it semantically, and replies with a decision payload: + + {"decision": "allow" | "deny", "reason": "..."} + +Two demos: + +- PreToolUse (security reviewer): inspects the INTENT of a terminal command, + not just its syntax. A command like `awk '{print}' /etc/passwd` would slip + past a blacklist of command names, but the agent hook recognises it as + reading a sensitive system file and denies it. +- Stop (quality reviewer): refuses to let the main agent finish until the + required deliverable exists in the workspace. + +For shell-command hooks see ../33_hooks/33_hooks.py. +""" + +import os +import tempfile +from pathlib import Path + +from pydantic import SecretStr + +from openhands.sdk import LLM, Conversation +from openhands.sdk.conversation.state import ConversationExecutionStatus +from openhands.sdk.event.hook_execution import HookExecutionEvent +from openhands.sdk.hooks import HookConfig, HookDefinition, HookMatcher, HookType +from openhands.tools.preset.default import get_default_agent + + +# Keep the demo conversations short: a small per-run iteration cap means a hook +# that keeps denying (or a model that keeps retrying) fails fast instead of +# burning calls up to the default 500-iteration limit / CI subprocess timeout. +MAX_ITERATIONS = 10 + + +# Configure LLM +api_key = os.getenv("LLM_API_KEY") +assert api_key is not None, "LLM_API_KEY environment variable is not set." +model = os.getenv("LLM_MODEL", "anthropic/claude-sonnet-4-5-20250929") +base_url = os.getenv("LLM_BASE_URL") + +llm = LLM( + usage_id="agent", + model=model, + base_url=base_url, + api_key=SecretStr(api_key), +) + +SECURITY_REVIEWER_PROMPT = """You are a security reviewer for a coding agent. +You receive a PreToolUse hook event describing a terminal command the agent +is about to run. + +Reason about the SEMANTIC INTENT of the command, not just its syntax. Block: +- reading sensitive system files (/etc/passwd, /etc/shadow, ~/.ssh/*, credentials) +- broad filesystem discovery outside the workspace (rg --files /etc, find /, ls /root) +- exfiltrating data over the network +- modifying system state (sudo, apt install, systemctl) + +Allow ordinary inspection, builds, tests, and edits inside the workspace. +When unsure, prefer allow. + +Reply with a single JSON object and nothing else: +{"decision": "allow" | "deny", "reason": ""} +""" + +QUALITY_REVIEWER_PROMPT = """You are a quality reviewer enforcing task completion. +You receive a Stop hook event when the main agent tries to finish. + +The task requires the file REPORT.md to exist in the workspace and contain at +least one bullet point describing the repository. Use the file_editor tool to +check whether the file exists and inspect its contents. + +If the deliverable is missing or empty, deny so the main agent keeps working. +Otherwise allow. + +Reply with a single JSON object and nothing else: +{"decision": "allow" | "deny", "reason": ""} +""" + + +def hook_logger(event) -> None: + """Surface each hook decision so the demo output is self-explanatory.""" + if not isinstance(event, HookExecutionEvent): + return + status = "DENY " if event.blocked else ("ALLOW" if event.success else "FAIL ") + line = f" [hook] {event.hook_event_type} {status} -> {event.hook_command}" + if event.reason: + line += f"\n reason: {event.reason}" + print(line) + + +def run_demo(workspace: Path, hook_config: HookConfig, message: str) -> float: + """Run one demo in its own conversation and return its cost. + + Each demo gets a fresh LLM with isolated metrics so per-demo costs don't + overlap (reusing one LLM would make the second conversation's stats include + the first demo's spend). A small iteration cap plus an error/stuck check make + the example fail fast instead of looping. + """ + demo_llm = llm.model_copy() + demo_llm.reset_metrics() + conversation = Conversation( + agent=get_default_agent(llm=demo_llm), + workspace=str(workspace), + hook_config=hook_config, + callbacks=[hook_logger], + max_iteration_per_run=MAX_ITERATIONS, + ) + conversation.send_message(message) + conversation.run() + status = conversation.state.execution_status + if status in ( + ConversationExecutionStatus.ERROR, + ConversationExecutionStatus.STUCK, + ): + raise RuntimeError( + f"Demo conversation ended in {status.value} state " + "before reaching a decision." + ) + return conversation.conversation_stats.get_combined_metrics().accumulated_cost + + +# Each demo runs in its own conversation with only the hook it needs. Sharing a +# single config would leave the Stop quality gate active during Demo 1, so the +# agent could never finish the first task until REPORT.md existed — coupling two +# unrelated demos and burning iterations. +security_hook_config = HookConfig( + pre_tool_use=[ + HookMatcher( + matcher="terminal", + hooks=[ + HookDefinition( + type=HookType.AGENT, + name="security-reviewer", + system_prompt=SECURITY_REVIEWER_PROMPT, + timeout=60, + max_iterations=3, + ) + ], + ) + ], +) + +quality_hook_config = HookConfig( + stop=[ + HookMatcher( + hooks=[ + HookDefinition( + type=HookType.AGENT, + name="quality-reviewer", + system_prompt=QUALITY_REVIEWER_PROMPT, + tools=["file_editor"], + timeout=90, + max_iterations=5, + ) + ], + ) + ], +) + + +with tempfile.TemporaryDirectory() as tmpdir: + workspace = Path(tmpdir) + total_cost = 0.0 + + print("=" * 60) + print("Demo 1: PreToolUse — semantic deny") + print("=" * 60) + print( + "Asking the agent to read /etc/passwd via awk. The literal command\n" + "wouldn't match a syntactic blacklist (no `cat`, no `/etc/shadow`\n" + "keyword), but the security-reviewer agent should recognise the\n" + "intent and deny.\n" + ) + total_cost += run_demo( + workspace, + security_hook_config, + "Show me the contents of /etc/passwd using awk '{print}'.", + ) + + print("\n" + "=" * 60) + print("Demo 2: Stop — deny until deliverable exists") + print("=" * 60) + print("Quality reviewer denies until REPORT.md exists with a bullet point.\n") + total_cost += run_demo( + workspace, + quality_hook_config, + "Write REPORT.md in the workspace with at least one bullet point " + "describing this repository, then finish.", + ) + + report = workspace / "REPORT.md" + if report.exists(): + print(f"\n[REPORT.md preview: {report.read_text()[:120]!r}...]") + + print("\n" + "=" * 60) + print("Example Complete!") + print("=" * 60) + + print(f"\nEXAMPLE_COST: {total_cost}") diff --git a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py index a6bb1ec60e..46f6a8ef6a 100644 --- a/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py +++ b/openhands-sdk/openhands/sdk/conversation/impl/local_conversation.py @@ -559,12 +559,23 @@ def _ensure_plugins_loaded(self) -> None: if final_hook_config is not None: # Store final hook_config in state for observability self._state.hook_config = final_hook_config + hook_persistence_dir = ( + str(Path(self._state.persistence_dir).parent) + if self._state.persistence_dir is not None + else None + ) self._hook_processor, self._on_event = create_hook_callback( hook_config=final_hook_config, working_dir=str(self.workspace.working_dir), session_id=str(self._state.id), original_callback=self._base_callback, + # Resolve lazily: switch_llm()/switch_profile() rebind self.agent, + # so agent hooks must read the current LLM at execution time. + llm_getter=lambda: self.agent.llm, + persistence_dir=hook_persistence_dir, + visualizer=self._visualizer, + conversation_stats=self._state.stats, ) self._hook_processor.set_conversation_state(self._state) self._hook_processor.run_session_start() diff --git a/openhands-sdk/openhands/sdk/hooks/config.py b/openhands-sdk/openhands/sdk/hooks/config.py index 594597cf8b..2398fe3e5d 100644 --- a/openhands-sdk/openhands/sdk/hooks/config.py +++ b/openhands-sdk/openhands/sdk/hooks/config.py @@ -41,40 +41,74 @@ class HookType(StrEnum): COMMAND = "command" # Shell command executed via subprocess PROMPT = "prompt" # LLM-based evaluation (future) + AGENT = "agent" # Agent-based evaluation with tool access class HookDefinition(BaseModel): """A single hook definition.""" type: HookType = HookType.COMMAND - command: str + name: str | None = None + # `command` is kept a non-nullable string that is always present in the + # serialized output and reported as required in the JSON schema (see + # __get_pydantic_json_schema__). This preserves the published REST response + # contract for ConversationInfo.hook_config: making it optional/nullable + # would be flagged as a breaking change by the oasdiff REST API check. + # Command-less hook types (PROMPT/AGENT) simply leave it as "". + command: str = "" prompt: str | None = None + system_prompt: str | None = None + tools: list[str] = Field(default_factory=list) timeout: int = 60 + max_iterations: int = 3 async_: bool = Field(default=False, alias="async") # 'async' is a reserved keyword model_config = { "populate_by_name": True, # Allow both 'async' and 'async_' in input } - @model_validator(mode="before") @classmethod - def _set_command_for_prompt_hooks(cls, data: Any) -> Any: - if ( - isinstance(data, dict) - and data.get("type") == "prompt" - and "command" not in data - ): - data["command"] = "" - return data + def __get_pydantic_json_schema__(cls, core_schema, handler): # type: ignore[override] + # Report `command` as a required, non-defaulted string to keep the + # published REST response contract identical to releases where the field + # had no default. The runtime default ("") only eases construction of + # command-less hook types; it never surfaces as null in responses. + json_schema = handler(core_schema) + json_schema = handler.resolve_ref_schema(json_schema) + command_schema = json_schema.get("properties", {}).get("command") + if command_schema is not None: + command_schema.pop("default", None) + required = json_schema.setdefault("required", []) + if "command" not in required: + required.append("command") + return json_schema @model_validator(mode="after") - def _check_required_fields(self) -> "HookDefinition": + def _validate_type_fields(self) -> "HookDefinition": if self.type == HookType.COMMAND and not self.command: raise ValueError("'command' is required when type is 'command'") if self.type == HookType.PROMPT and not self.prompt: raise ValueError("'prompt' is required when type is 'prompt'") + if self.type == HookType.AGENT and self.command: + raise ValueError( + "'command' must not be set when type is 'agent'; " + "use 'system_prompt' instead" + ) + if self.type == HookType.AGENT and self.async_: + raise ValueError("'async' is not supported for agent hooks") return self + @property + def display_command(self) -> str: + """Human-readable label for this hook used in logs and events.""" + if self.command: + return self.command + if self.name is not None: + return f"agent-hook:{self.name}" + if self.system_prompt: + return f"agent-hook:{self.system_prompt[:20]}" + return "agent-hook:agent" + class HookMatcher(BaseModel): """Matches events to hooks based on patterns. diff --git a/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py b/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py index 4eaac1c1ac..960fbab589 100644 --- a/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py +++ b/openhands-sdk/openhands/sdk/hooks/conversation_hooks.py @@ -3,6 +3,7 @@ from collections.abc import Callable from typing import TYPE_CHECKING, Any +from openhands.sdk.conversation.visualizer import ConversationVisualizerBase from openhands.sdk.event import ( ActionEvent, Event, @@ -19,7 +20,9 @@ if TYPE_CHECKING: + from openhands.sdk.conversation.conversation_stats import ConversationStats from openhands.sdk.conversation.state import ConversationState + from openhands.sdk.llm import LLM logger = get_logger(__name__) @@ -149,7 +152,7 @@ def _handle_pre_tool_use(self, event: ActionEvent) -> None: for hook, result in zip(hooks, results, strict=False): self._emit_hook_execution_event( hook_event_type=HookEventType.PRE_TOOL_USE, - hook_command=hook.command, + hook_command=hook.display_command, result=result, tool_name=tool_name, action_id=event.id, @@ -224,7 +227,7 @@ def _handle_post_tool_use(self, event: ObservationEvent) -> None: for hook, result in zip(hooks, results, strict=False): self._emit_hook_execution_event( hook_event_type=HookEventType.POST_TOOL_USE, - hook_command=hook.command, + hook_command=hook.display_command, result=result, tool_name=tool_name, action_id=action_event.id, @@ -266,7 +269,7 @@ def _handle_user_prompt_submit(self, event: MessageEvent) -> MessageEvent: for hook, result in zip(hooks, results, strict=False): self._emit_hook_execution_event( hook_event_type=HookEventType.USER_PROMPT_SUBMIT, - hook_command=hook.command, + hook_command=hook.display_command, result=result, message_id=event.id, hook_input={"message": message}, @@ -328,7 +331,7 @@ def run_session_start(self) -> None: for hook, result in zip(hooks, results, strict=False): self._emit_hook_execution_event( hook_event_type=HookEventType.SESSION_START, - hook_command=hook.command, + hook_command=hook.display_command, result=result, ) if result.error: @@ -342,7 +345,7 @@ def run_session_end(self) -> None: for hook, result in zip(hooks, results, strict=False): self._emit_hook_execution_event( hook_event_type=HookEventType.SESSION_END, - hook_command=hook.command, + hook_command=hook.display_command, result=result, ) if result.error: @@ -360,7 +363,7 @@ def run_stop(self, reason: str | None = None) -> tuple[bool, str | None]: for hook, result in zip(hooks, results, strict=False): self._emit_hook_execution_event( hook_event_type=HookEventType.STOP, - hook_command=hook.command, + hook_command=hook.display_command, result=result, hook_input={"reason": reason} if reason else None, ) @@ -389,6 +392,13 @@ def create_hook_callback( session_id: str | None = None, original_callback: Any = None, emit_hook_events: bool = True, + llm: "LLM | None" = None, + llm_getter: "Callable[[], LLM | None] | None" = None, + persistence_dir: str | None = None, + visualizer: type[ConversationVisualizerBase] + | ConversationVisualizerBase + | None = None, + conversation_stats: "ConversationStats | None" = None, ) -> tuple[HookEventProcessor, Any]: """Create a hook-enabled event callback. Returns (processor, callback). @@ -399,6 +409,12 @@ def create_hook_callback( original_callback: Callback to chain after hook processing. emit_hook_events: If True, emit HookExecutionEvent for each hook execution. Defaults to True for full observability. + llm: LLM instance inherited from the parent conversation, used by agent hooks. + llm_getter: Callable returning the conversation's current LLM. Preferred + over ``llm`` so agent hooks follow switch_llm()/switch_profile(). + persistence_dir: Directory used to persist agent hook sub-conversation events. + visualizer: Visualizer instance passed to agent hook sub-conversations. + conversation_stats: Parent conversation stats that should include hook spend. Returns: Tuple of (HookEventProcessor, callback function). @@ -407,6 +423,11 @@ def create_hook_callback( config=hook_config, working_dir=working_dir, session_id=session_id, + llm=llm, + llm_getter=llm_getter, + persistence_dir=persistence_dir, + visualizer=visualizer, + conversation_stats=conversation_stats, ) processor = HookEventProcessor( diff --git a/openhands-sdk/openhands/sdk/hooks/executor.py b/openhands-sdk/openhands/sdk/hooks/executor.py index 6f3ce897b1..7134adc8de 100644 --- a/openhands-sdk/openhands/sdk/hooks/executor.py +++ b/openhands-sdk/openhands/sdk/hooks/executor.py @@ -1,19 +1,30 @@ -"""Hook executor - runs shell commands with JSON I/O.""" +"""Hook executor - runs shell commands and agent evaluations with JSON I/O.""" +import contextlib import json import logging import os import signal import subprocess import time +from collections.abc import Callable +from typing import TYPE_CHECKING from pydantic import BaseModel -from openhands.sdk.hooks.config import HookDefinition +from openhands.sdk.conversation.visualizer import ConversationVisualizerBase +from openhands.sdk.hooks.config import HookDefinition, HookType from openhands.sdk.hooks.types import HookDecision, HookEvent +from openhands.sdk.observability.laminar import observe from openhands.sdk.utils import sanitized_env +if TYPE_CHECKING: + from openhands.sdk.conversation.base import BaseConversation + from openhands.sdk.conversation.conversation_stats import ConversationStats + from openhands.sdk.llm import LLM + + class HookResult(BaseModel): """Result from executing a hook. @@ -27,6 +38,11 @@ class HookResult(BaseModel): to ``False`` and the error is logged, but the operation still proceeds. In particular, exit code ``1`` does **not** block — only ``2`` does. Hooks intended to enforce a policy must exit with ``2``. + + For agent / prompt hooks, ``success=True`` means the hook produced a + deliberate verdict (parsed ``allow`` or ``deny``). Fall-open paths set + ``success=False`` with ``error`` populated, so a "we couldn't decide" allow + is detectable as ``decision == ALLOW and not success``. """ success: bool = True @@ -138,15 +154,215 @@ def cleanup_all(self) -> None: class HookExecutor: - """Executes hook commands with JSON I/O.""" + """Executes hook commands and agent evaluations with JSON I/O.""" + + _JSON_DECODER = json.JSONDecoder() def __init__( self, working_dir: str | None = None, async_process_manager: AsyncProcessManager | None = None, + llm: "LLM | None" = None, + llm_getter: "Callable[[], LLM | None] | None" = None, + persistence_dir: str | None = None, + visualizer: type[ConversationVisualizerBase] + | ConversationVisualizerBase + | None = None, + conversation_stats: "ConversationStats | None" = None, ): self.working_dir = working_dir or os.getcwd() self.async_process_manager = async_process_manager or AsyncProcessManager() + self._llm = llm + # Prefer a getter so agent hooks always use the conversation's *current* + # LLM: switch_llm()/switch_profile() replace agent.llm after the executor + # is built, and a captured instance would go stale. + self._llm_getter = llm_getter + self.persistence_dir = persistence_dir + self.visualizer = visualizer + self.conversation_stats = conversation_stats + + @property + def llm(self) -> "LLM | None": + """The LLM agent hooks should use, resolved live when a getter is set.""" + if self._llm_getter is not None: + return self._llm_getter() + return self._llm + + def _fall_open( + self, + reason: str, + *, + error: str | None = None, + ) -> HookResult: + return HookResult( + success=False, + decision=HookDecision.ALLOW, + reason=reason, + error=error or reason, + ) + + @observe( + name="hook.execute.agent", + ignore_inputs=["self", "hook", "event"], + ignore_output=True, + ) + def _execute_agent_hook( + self, + hook: HookDefinition, + event: HookEvent, + ) -> HookResult: + # Lazy imports to avoid circular dependency: + # executor <- manager <- conversation_hooks <- local_conversation -> executor + from openhands.sdk.agent import Agent # type: ignore[attr-defined] + from openhands.sdk.conversation.impl.local_conversation import LocalConversation + from openhands.sdk.conversation.response_utils import get_agent_final_response + from openhands.sdk.tool.spec import Tool + + event_type = ( + event.event_type + if isinstance(event.event_type, str) + else event.event_type.value + ) + + # Resolve the active conversation LLM once (a getter may rebuild it). + llm = self.llm + if llm is None: + logger.warning( + f"Agent hook has no LLM configured for event '{event_type}'" + " — defaulting to allow" + ) + return self._fall_open("No LLM configured for agent hook") + + hook_llm = llm.model_copy( + update={ + "usage_id": f"agent-hook:{hook.name or 'default'}", + "timeout": hook.timeout, + } + ) + # Isolate Metrics so hook spend doesn't accrue to the parent's bucket. + hook_llm.reset_metrics() + + # Never hand the parent's already-initialized visualizer instance to the + # sub-conversation: LocalConversation.__init__ calls initialize() on it, + # which would rebind the parent visualizer to the hook's child state. Mirror + # the delegate pattern and ask the parent visualizer for a fresh sub- + # visualizer (returns None for visualizers that don't support sub-agents). + hook_visualizer = self.visualizer + if isinstance(self.visualizer, ConversationVisualizerBase): + hook_visualizer = self.visualizer.create_sub_visualizer( + f"agent-hook:{hook.name or 'default'}" + ) + + conversation = None + try: + agent = Agent( + llm=hook_llm, + tools=[Tool(name=t) for t in hook.tools], + include_default_tools=["FinishTool"], + system_prompt=hook.system_prompt, + ) + # hook_config=None disables hooks in the sub-conversation (no recursion) + conversation = LocalConversation( + agent=agent, + workspace=self.working_dir, + plugins=None, + hook_config=None, + persistence_dir=self.persistence_dir, + visualizer=hook_visualizer, + max_iteration_per_run=hook.max_iterations, + ) + conversation.send_message( + f"Evaluate this {event_type} hook event and make your decision.\n\n" + f"## Hook Event\n```json\n{event.model_dump_json(indent=2)}\n```" + ) + conversation.run() + raw = get_agent_final_response(conversation.state.events) + except Exception as e: + logger.warning( + f"Agent hook sub-conversation failed for event '{event_type}'" + f" — defaulting to allow: {e}" + ) + return self._fall_open( + "Agent hook execution failed — defaulting to allow", + error=str(e), + ) + finally: + if conversation is not None: + self._merge_hook_conversation_stats(conversation) + conversation.close() + + return self._parse_decision(raw, event_type) + + def _extract_first_json_object(self, text: str) -> dict | None: + # Scan for the first decodable JSON object so prose / ```json fences + # around the payload don't defeat parsing. + for i, ch in enumerate(text): + if ch != "{": + continue + with contextlib.suppress(json.JSONDecodeError): + obj, _ = self._JSON_DECODER.raw_decode(text[i:]) + if isinstance(obj, dict): + return obj + return None + + def _parse_decision(self, raw: str, event_type: str) -> HookResult: + if not raw: + logger.warning( + f"Agent hook produced no final response for event '{event_type}'" + " — defaulting to allow" + ) + return self._fall_open( + "Agent hook produced no final response — defaulting to allow" + ) + + data = self._extract_first_json_object(raw) + if data is None: + logger.warning( + f"Agent hook returned no parseable JSON object for event" + f" '{event_type}' — defaulting to allow: {repr(raw)[:200]}" + ) + return self._fall_open( + "Agent hook returned no parseable JSON — defaulting to allow" + ) + + decision_str = str(data.get("decision", "")).lower() + reason = str(data.get("reason", "")) + if decision_str == "deny": + return HookResult( + success=True, + blocked=True, + decision=HookDecision.DENY, + reason=reason, + ) + if decision_str == "allow": + return HookResult( + success=True, + decision=HookDecision.ALLOW, + reason=reason, + ) + # Missing or unknown decision: this is not a deliberate verdict, so it + # must be a detectable fall-open (success=False) rather than a silent + # allow that masquerades as a real decision. + logger.warning( + f"Agent hook returned an invalid decision for event '{event_type}'" + f" — defaulting to allow: {repr(decision_str)[:200]}" + ) + return self._fall_open( + "Agent hook returned an invalid decision — defaulting to allow" + ) + + def _merge_hook_conversation_stats(self, conversation: "BaseConversation") -> None: + if self.conversation_stats is None: + return + + child_stats = conversation.conversation_stats + for usage_id, metrics in child_stats.usage_to_metrics.items(): + if usage_id in self.conversation_stats.usage_to_metrics: + existing = self.conversation_stats.usage_to_metrics[usage_id] + if existing is not metrics: + existing.merge(metrics) + else: + self.conversation_stats.usage_to_metrics[usage_id] = metrics.deep_copy() def execute( self, @@ -155,6 +371,22 @@ def execute( env: dict[str, str] | None = None, ) -> HookResult: """Execute a single hook.""" + if hook.type == HookType.AGENT: + return self._execute_agent_hook(hook, event) + if hook.type == HookType.PROMPT: + event_type = ( + event.event_type + if isinstance(event.event_type, str) + else event.event_type.value + ) + logger.warning( + f"PROMPT hooks are not yet implemented — defaulting to allow" + f" (event_type={event_type})" + ) + return self._fall_open( + "PROMPT hooks are not yet implemented — defaulting to allow" + ) + # Prepare environment hook_env = sanitized_env() hook_env["OPENHANDS_PROJECT_DIR"] = self.working_dir @@ -172,6 +404,14 @@ def execute( # Cleanup expired async processes before starting new ones self.async_process_manager.cleanup_expired() + command = hook.command + if not command: + return HookResult( + success=False, + exit_code=-1, + error="'command' is required when type is 'command'", + ) + # Handle async hooks: fire and forget if hook.async_: try: @@ -182,7 +422,7 @@ def execute( start_new_session = False process = subprocess.Popen( - hook.command, + command, shell=True, cwd=self.working_dir, env=hook_env, @@ -203,7 +443,7 @@ def execute( # Track for cleanup self.async_process_manager.add_process(process, hook.timeout) - logger.debug(f"Started async hook (PID {process.pid}): {hook.command}") + logger.debug(f"Started async hook (PID {process.pid}): {command}") # Return placeholder success result return HookResult( @@ -221,7 +461,7 @@ def execute( try: # Execute the hook command synchronously result = subprocess.run( - hook.command, + command, shell=True, cwd=self.working_dir, env=hook_env, diff --git a/openhands-sdk/openhands/sdk/hooks/manager.py b/openhands-sdk/openhands/sdk/hooks/manager.py index af57be2734..923a07cabf 100644 --- a/openhands-sdk/openhands/sdk/hooks/manager.py +++ b/openhands-sdk/openhands/sdk/hooks/manager.py @@ -1,13 +1,20 @@ """Hook manager - orchestrates hook execution within conversations.""" import logging -from typing import Any +from collections.abc import Callable +from typing import TYPE_CHECKING, Any +from openhands.sdk.conversation.visualizer import ConversationVisualizerBase from openhands.sdk.hooks.config import HookConfig from openhands.sdk.hooks.executor import HookExecutor, HookResult from openhands.sdk.hooks.types import HookEvent, HookEventType +if TYPE_CHECKING: + from openhands.sdk.conversation.conversation_stats import ConversationStats + from openhands.sdk.llm import LLM + + logger = logging.getLogger(__name__) @@ -19,9 +26,23 @@ def __init__( config: HookConfig | None = None, working_dir: str | None = None, session_id: str | None = None, + llm: "LLM | None" = None, + llm_getter: "Callable[[], LLM | None] | None" = None, + persistence_dir: str | None = None, + visualizer: type[ConversationVisualizerBase] + | ConversationVisualizerBase + | None = None, + conversation_stats: "ConversationStats | None" = None, ): self.config = config or HookConfig.load(working_dir=working_dir) - self.executor = HookExecutor(working_dir=working_dir) + self.executor = HookExecutor( + working_dir=working_dir, + llm=llm, + llm_getter=llm_getter, + persistence_dir=persistence_dir, + visualizer=visualizer, + conversation_stats=conversation_stats, + ) self.session_id = session_id self.working_dir = working_dir diff --git a/tests/cross/test_check_agent_server_rest_api_breakage.py b/tests/cross/test_check_agent_server_rest_api_breakage.py index be88df728c..bb0a29514c 100644 --- a/tests/cross/test_check_agent_server_rest_api_breakage.py +++ b/tests/cross/test_check_agent_server_rest_api_breakage.py @@ -666,6 +666,22 @@ def test_split_breaking_changes_separates_three_buckets(): "details": {}, "text": "added body anyOf member", }, + { + # Additive value on the hook discriminator union -> downgraded. + "id": "response-property-enum-value-added", + "details": {}, + "text": ( + "added the new `agent` enum value to the " + "`hook_config/anyOf[subschema #1: HookConfig]/stop/items/" + "hooks/items/type` response property for the response status `200`" + ), + }, + { + # Enum value on an ordinary (non-discriminator) property -> breaking. + "id": "response-property-enum-value-added", + "details": {}, + "text": "added the new `archived` enum value to the `status` response property", + }, { "id": "response-property-removed", "details": {}, @@ -688,9 +704,22 @@ def test_split_breaking_changes_separates_three_buckets(): "response-property-one-of-added", "response-body-one-of-added", "response-body-any-of-added", + "response-property-enum-value-added", + } + # The hook-discriminator enum addition is downgraded; the unrelated `status` + # enum addition and the body change remain breaking. + assert { + change["text"] for change in additive_oneof if "enum value" in change["text"] + } == { + "added the new `agent` enum value to the " + "`hook_config/anyOf[subschema #1: HookConfig]/stop/items/" + "hooks/items/type` response property for the response status `200`" + } + assert {change["id"] for change in other} == { + "response-property-enum-value-added", + "response-body-changed", } - assert len(other) == 1 - assert other[0]["id"] == "response-body-changed" + assert any("`status`" in change["text"] for change in other) def test_main_passes_when_only_additive_oneof(monkeypatch, capsys): @@ -723,7 +752,7 @@ def test_main_passes_when_only_additive_oneof(monkeypatch, capsys): assert _prod.main() == 0 captured = capsys.readouterr() - assert "Additive oneOf/anyOf expansion detected" in captured.out + assert "Additive oneOf/anyOf expansion or enum-value additions" in captured.out assert "additive response oneOf expansions" in captured.out @@ -790,7 +819,7 @@ def test_main_passes_when_body_union_addition_reports_removed_properties( assert _prod.main() == 0 captured = capsys.readouterr() - assert "Additive oneOf/anyOf expansion detected" in captured.out + assert "Additive oneOf/anyOf expansion or enum-value additions" in captured.out assert "ignored 3 request/response-property removal artifact" in captured.out assert "ignored 1 request/response type-change artifact" in captured.out @@ -878,7 +907,7 @@ def test_main_fails_when_additive_oneof_mixed_with_real_breakage(monkeypatch, ca assert _prod.main() == 1 captured = capsys.readouterr() - assert "Additive oneOf/anyOf expansion detected" in captured.out + assert "Additive oneOf/anyOf expansion or enum-value additions" in captured.out assert "other than removing previously-deprecated operations" in captured.out diff --git a/tests/examples/test_examples.py b/tests/examples/test_examples.py index 8399aaabf7..82b94fc46b 100644 --- a/tests/examples/test_examples.py +++ b/tests/examples/test_examples.py @@ -28,6 +28,7 @@ # These examples live under subdirectories (each with a single `main.py`). EXAMPLES_ROOT / "01_standalone_sdk" / "33_hooks", EXAMPLES_ROOT / "01_standalone_sdk" / "37_llm_profile_store", + EXAMPLES_ROOT / "01_standalone_sdk" / "51_agent_hooks", EXAMPLES_ROOT / "01_standalone_sdk" / "43_mixed_marketplace_skills", EXAMPLES_ROOT / "02_remote_agent_server" / "06_custom_tool", EXAMPLES_ROOT / "05_skills_and_plugins" / "01_loading_agentskills", @@ -97,6 +98,9 @@ def test_directory_example_is_discovered() -> None: assert ( EXAMPLES_ROOT / "02_remote_agent_server" / "06_custom_tool" / "main.py" ) in EXAMPLES + assert ( + EXAMPLES_ROOT / "01_standalone_sdk" / "51_agent_hooks" / "main.py" + ) in EXAMPLES @pytest.mark.parametrize("example_path", EXAMPLES, ids=_normalize_path) diff --git a/tests/sdk/conversation/test_local_conversation_plugins.py b/tests/sdk/conversation/test_local_conversation_plugins.py index c58a74c6b9..cfa7b066de 100644 --- a/tests/sdk/conversation/test_local_conversation_plugins.py +++ b/tests/sdk/conversation/test_local_conversation_plugins.py @@ -2,6 +2,7 @@ import json from pathlib import Path +from unittest.mock import MagicMock, patch import pytest from pydantic import SecretStr @@ -188,6 +189,45 @@ def test_plugin_hooks_combined_with_explicit_hooks( # (The actual hook_processor is internal, but we trust the merging works) conversation.close() + def test_hook_sub_conversations_receive_persistence_base_dir( + self, tmp_path: Path, basic_agent + ): + """Agent hook persistence should not nest under the parent conversation id.""" + workspace = tmp_path / "workspace" + workspace.mkdir() + persistence_base = tmp_path / "state" + hook_config = HookConfig( + pre_tool_use=[ + HookMatcher(matcher="*", hooks=[HookDefinition(command="echo test")]) + ] + ) + + processor = MagicMock() + processor.on_event = MagicMock() + processor.set_conversation_state = MagicMock() + processor.run_session_start = MagicMock() + + conversation = LocalConversation( + agent=basic_agent, + workspace=workspace, + persistence_dir=persistence_base, + hook_config=hook_config, + visualizer=None, + ) + + with patch( + "openhands.sdk.conversation.impl.local_conversation.create_hook_callback", + return_value=(processor, processor.on_event), + ) as mock_create_hook_callback: + conversation._ensure_plugins_loaded() + + assert conversation.state.persistence_dir is not None + assert Path(conversation.state.persistence_dir).parent == persistence_base + assert mock_create_hook_callback.call_args.kwargs["persistence_dir"] == str( + persistence_base + ) + conversation.close() + def test_plugins_not_loaded_until_needed(self, tmp_path: Path, basic_agent): """Test that plugins are not loaded in constructor (lazy loading).""" plugin_dir = create_test_plugin( diff --git a/tests/sdk/hooks/test_config.py b/tests/sdk/hooks/test_config.py index f5a1c832e4..64865d968d 100644 --- a/tests/sdk/hooks/test_config.py +++ b/tests/sdk/hooks/test_config.py @@ -10,6 +10,124 @@ from openhands.sdk.hooks.types import HookEventType +def test_command_hook_requires_command(): + with pytest.raises(ValidationError, match="'command' is required"): + HookDefinition(type=HookType.COMMAND) + + +def test_command_hook_valid(): + h = HookDefinition(command="echo hi") + assert h.type == HookType.COMMAND + assert h.command == "echo hi" + + +@pytest.mark.parametrize( + "kwargs", + [ + {"type": "agent", "system_prompt": "Block writes to /etc"}, + {"type": "agent"}, + ], + ids=["with-system-prompt", "without-system-prompt"], +) +def test_agent_hook_valid(kwargs): + h = HookDefinition(**kwargs) + assert h.type == HookType.AGENT + + +@pytest.mark.parametrize( + "kwargs,expected", + [ + ({"command": "block.sh"}, "block.sh"), + ( + { + "type": "agent", + "name": "block-deletions", + "system_prompt": "Block rm -rf", + }, + "agent-hook:block-deletions", + ), + ( + {"type": "agent", "system_prompt": "Block network calls to external IPs"}, + "agent-hook:Block network calls ", + ), + ({"type": "agent", "system_prompt": "A" * 100}, f"agent-hook:{'A' * 20}"), + ({"type": "agent"}, "agent-hook:agent"), + ], + ids=[ + "command", + "agent-named", + "agent-prompt-prefix", + "agent-prompt-truncated", + "agent-fallback", + ], +) +def test_display_command(kwargs, expected): + h = HookDefinition(**kwargs) + assert h.display_command == expected + + +def test_multiple_agent_hooks_are_distinguishable(): + hooks = [ + HookDefinition( + type=HookType.AGENT, + name="block-deletions", + system_prompt="Block rm -rf", + ), + HookDefinition( + type=HookType.AGENT, + system_prompt="Block network calls to external IPs", + ), + HookDefinition( + type=HookType.AGENT, + system_prompt="Verify all tasks are complete", + ), + ] + assert len({h.display_command for h in hooks}) == 3 + + +@pytest.mark.parametrize( + "kwargs,match", + [ + ( + {"type": "agent", "command": "echo hi"}, + "'command' must not be set when type is 'agent'", + ), + ({"type": "command"}, "'command' is required"), + ], + ids=["agent-rejects-command", "command-requires-command"], +) +def test_hook_definition_validation_errors(kwargs, match): + with pytest.raises(Exception, match=match): + HookDefinition(**kwargs) + + +def test_agent_hook_rejects_async(): + with pytest.raises(Exception, match="not supported for agent hooks"): + HookDefinition.model_validate({"type": "agent", "async": True}) + + +def test_agent_hook_from_json(): + data = { + "stop": [ + { + "hooks": [ + { + "type": "agent", + "system_prompt": "Verify all tasks are done", + "timeout": 30, + } + ] + } + ] + } + config = HookConfig.from_dict(data) + hooks = config.get_hooks_for_event(HookEventType.STOP) + assert len(hooks) == 1 + assert hooks[0].type == HookType.AGENT + assert hooks[0].system_prompt == "Verify all tasks are done" + assert hooks[0].timeout == 30 + + class TestHookMatcher: """Tests for HookMatcher pattern matching.""" diff --git a/tests/sdk/hooks/test_executor.py b/tests/sdk/hooks/test_executor.py index 26d31d0565..00c5640991 100644 --- a/tests/sdk/hooks/test_executor.py +++ b/tests/sdk/hooks/test_executor.py @@ -3,12 +3,17 @@ import json import subprocess from unittest import mock +from unittest.mock import MagicMock, patch import pytest +from pydantic import SecretStr -from openhands.sdk.hooks.config import HookDefinition +from openhands.sdk.conversation.conversation_stats import ConversationStats +from openhands.sdk.hooks.config import HookDefinition, HookType from openhands.sdk.hooks.executor import HookExecutor from openhands.sdk.hooks.types import HookDecision, HookEvent, HookEventType +from openhands.sdk.llm import LLM +from openhands.sdk.llm.utils.metrics import Metrics from tests.command_utils import python_command @@ -421,3 +426,469 @@ def test_cleanup_expired_keeps_active_processes(self, tmp_path): # Clean up for test teardown process.terminate() + + +class TestAgentHookExecution: + """Tests for HookType.AGENT execution path.""" + + # Patch at the package/module level so lazy `from X import Y` picks up the mock. + _AGENT_PATH = "openhands.sdk.agent.Agent" + _CONV_PATH = "openhands.sdk.conversation.impl.local_conversation.LocalConversation" + _RESPONSE_PATH = ( + "openhands.sdk.conversation.response_utils.get_agent_final_response" + ) + + @pytest.fixture + def mock_llm(self): + return LLM(model="gpt-4o", api_key=SecretStr("test-key"), usage_id="test") + + @pytest.fixture + def executor(self, tmp_path, mock_llm): + return HookExecutor(working_dir=str(tmp_path), llm=mock_llm) + + @pytest.fixture + def executor_no_llm(self, tmp_path): + return HookExecutor(working_dir=str(tmp_path), llm=None) + + @pytest.fixture + def sample_event(self): + return HookEvent( + event_type=HookEventType.STOP, + session_id="test-session", + ) + + def test_execute_dispatches_to_agent_hook(self, executor, sample_event): + """execute() routes AGENT type to _execute_agent_hook, not subprocess.""" + from openhands.sdk.hooks.executor import HookResult + + hook = HookDefinition( + type=HookType.AGENT, + system_prompt="Verify task completion", + ) + + with patch.object( + executor, + "_execute_agent_hook", + return_value=HookResult( + success=True, decision=HookDecision.ALLOW, reason="ok" + ), + ) as mock_agent: + result = executor.execute(hook, sample_event) + mock_agent.assert_called_once_with(hook, sample_event) + + assert result.decision == HookDecision.ALLOW + assert result.should_continue + + def test_no_llm_defaults_to_allow(self, executor_no_llm, sample_event): + """Agent hook with no LLM configured defaults to allow without error.""" + hook = HookDefinition( + type=HookType.AGENT, + system_prompt="Check something", + ) + result = executor_no_llm.execute(hook, sample_event) + + assert not result.success + assert result.decision == HookDecision.ALLOW + assert not result.blocked + + def test_deny_decision_blocks(self, executor, sample_event): + """_execute_agent_hook parsing deny JSON produces a blocked HookResult.""" + deny_json = '{"decision": "deny", "reason": "Tasks not complete"}' + + with ( + patch(self._AGENT_PATH), + patch(self._CONV_PATH) as mock_conv_cls, + patch(self._RESPONSE_PATH, return_value=deny_json), + ): + mock_conv_cls.return_value = MagicMock() + hook = HookDefinition( + type=HookType.AGENT, + system_prompt="Check tasks", + ) + result = executor._execute_agent_hook(hook, sample_event) + + assert result.blocked + assert result.decision == HookDecision.DENY + assert result.reason == "Tasks not complete" + assert not result.should_continue + + def test_allow_decision_passes(self, executor, sample_event): + """_execute_agent_hook parsing allow JSON produces a non-blocking HookResult.""" + allow_json = '{"decision": "allow", "reason": "Looks safe"}' + + with ( + patch(self._AGENT_PATH), + patch(self._CONV_PATH) as mock_conv_cls, + patch(self._RESPONSE_PATH, return_value=allow_json), + ): + mock_conv_cls.return_value = MagicMock() + hook = HookDefinition(type=HookType.AGENT) + result = executor._execute_agent_hook(hook, sample_event) + + assert not result.blocked + assert result.decision == HookDecision.ALLOW + assert result.reason == "Looks safe" + + @pytest.mark.parametrize( + "payload", + [ + '{"reason": "no decision field"}', + '{"decision": "maybe", "reason": "not allow or deny"}', + '{"decision": "", "reason": "empty decision"}', + ], + ) + def test_invalid_decision_falls_open(self, executor, sample_event, payload): + """Missing/unknown decision is a fall-open, not a deliberate allow.""" + with ( + patch(self._AGENT_PATH), + patch(self._CONV_PATH) as mock_conv_cls, + patch(self._RESPONSE_PATH, return_value=payload), + ): + mock_conv_cls.return_value = MagicMock() + hook = HookDefinition(type=HookType.AGENT) + result = executor._execute_agent_hook(hook, sample_event) + + assert not result.blocked + assert result.decision == HookDecision.ALLOW + assert result.should_continue + assert result.success is False + assert result.error is not None + + def test_markdown_wrapped_deny_is_parsed(self, executor, sample_event): + """```json fenced JSON is honoured, not treated as non-JSON.""" + fenced = '```json\n{"decision": "deny", "reason": "Sensitive file read"}\n```' + + with ( + patch(self._AGENT_PATH), + patch(self._CONV_PATH) as mock_conv_cls, + patch(self._RESPONSE_PATH, return_value=fenced), + ): + mock_conv_cls.return_value = MagicMock() + hook = HookDefinition(type=HookType.AGENT) + result = executor._execute_agent_hook(hook, sample_event) + + assert result.blocked + assert result.decision == HookDecision.DENY + assert result.reason == "Sensitive file read" + + def test_plain_fence_without_language_tag_is_parsed(self, executor, sample_event): + """Some LLMs use ``` without a language tag — still extract the JSON.""" + fenced = '```\n{"decision": "allow", "reason": "ok"}\n```' + + with ( + patch(self._AGENT_PATH), + patch(self._CONV_PATH) as mock_conv_cls, + patch(self._RESPONSE_PATH, return_value=fenced), + ): + mock_conv_cls.return_value = MagicMock() + hook = HookDefinition(type=HookType.AGENT) + result = executor._execute_agent_hook(hook, sample_event) + + assert not result.blocked + assert result.decision == HookDecision.ALLOW + assert result.reason == "ok" + + def test_prose_prefix_before_json_is_parsed(self, executor, sample_event): + """Prose before the JSON object must not defeat the parser.""" + prose_then_json = ( + "After reviewing the workspace I found REPORT.md is missing.\n\n" + '{"decision": "deny", "reason": "missing deliverable"}' + ) + + with ( + patch(self._AGENT_PATH), + patch(self._CONV_PATH) as mock_conv_cls, + patch(self._RESPONSE_PATH, return_value=prose_then_json), + ): + mock_conv_cls.return_value = MagicMock() + hook = HookDefinition(type=HookType.AGENT) + result = executor._execute_agent_hook(hook, sample_event) + + assert result.blocked + assert result.decision == HookDecision.DENY + assert result.reason == "missing deliverable" + + def test_prose_suffix_after_json_is_parsed(self, executor, sample_event): + """Trailing chatter after the JSON object must not defeat the parser.""" + json_then_prose = ( + '{"decision": "deny", "reason": "sensitive file"}\n\n' + "Let me know if you need more details." + ) + + with ( + patch(self._AGENT_PATH), + patch(self._CONV_PATH) as mock_conv_cls, + patch(self._RESPONSE_PATH, return_value=json_then_prose), + ): + mock_conv_cls.return_value = MagicMock() + hook = HookDefinition(type=HookType.AGENT) + result = executor._execute_agent_hook(hook, sample_event) + + assert result.blocked + assert result.decision == HookDecision.DENY + assert result.reason == "sensitive file" + + def test_invalid_json_defaults_to_allow(self, executor, sample_event): + """Non-JSON response falls open: ALLOW + success=False + error set.""" + with ( + patch(self._AGENT_PATH), + patch(self._CONV_PATH) as mock_conv_cls, + patch(self._RESPONSE_PATH, return_value="I think you should allow this."), + ): + mock_conv_cls.return_value = MagicMock() + hook = HookDefinition(type=HookType.AGENT) + result = executor._execute_agent_hook(hook, sample_event) + + assert not result.blocked + assert result.decision == HookDecision.ALLOW + assert result.should_continue + assert result.success is False + assert result.error is not None + + def test_sub_conversation_failure_defaults_to_allow(self, executor, sample_event): + """_execute_agent_hook when sub-conversation raises defaults to allow.""" + with ( + patch(self._AGENT_PATH), + patch(self._CONV_PATH, side_effect=RuntimeError("workspace error")), + ): + hook = HookDefinition(type=HookType.AGENT) + result = executor._execute_agent_hook(hook, sample_event) + + assert not result.blocked + assert result.decision == HookDecision.ALLOW + assert result.error is not None + + def test_empty_response_defaults_to_allow(self, executor, sample_event): + """_execute_agent_hook when agent produces no response defaults to allow.""" + with ( + patch(self._AGENT_PATH), + patch(self._CONV_PATH) as mock_conv_cls, + patch(self._RESPONSE_PATH, return_value=""), + ): + mock_conv_cls.return_value = MagicMock() + hook = HookDefinition(type=HookType.AGENT) + result = executor._execute_agent_hook(hook, sample_event) + + assert not result.blocked + assert result.decision == HookDecision.ALLOW + assert result.success is False + + def test_timeout_propagated_to_hook_llm(self, executor, sample_event): + """hook.timeout is forwarded to the copied LLM, parent's timeout untouched.""" + parent_timeout = executor.llm.timeout + captured_llm = {} + + def capture_agent_init(**kwargs): + captured_llm["llm"] = kwargs.get("llm") + return MagicMock() + + with ( + patch(self._AGENT_PATH, side_effect=capture_agent_init), + patch(self._CONV_PATH, side_effect=RuntimeError("stop early")), + ): + hook = HookDefinition(type=HookType.AGENT, timeout=7) + executor._execute_agent_hook(hook, sample_event) + + hook_llm = captured_llm.get("llm") + assert hook_llm is not None + assert hook_llm.timeout == 7 + assert executor.llm.timeout == parent_timeout + + def test_hook_metrics_under_usage_id(self, executor, sample_event): + """Hook LLM uses per-hook usage_id and an isolated Metrics object.""" + parent_metrics = executor.llm.metrics + + captured_llm = {} + + def capture_agent_init(**kwargs): + captured_llm["llm"] = kwargs.get("llm") + return MagicMock() + + with ( + patch(self._AGENT_PATH, side_effect=capture_agent_init), + patch(self._CONV_PATH, side_effect=RuntimeError("stop early")), + ): + hook = HookDefinition(type=HookType.AGENT) + executor._execute_agent_hook(hook, sample_event) + + hook_llm = captured_llm.get("llm") + assert hook_llm is not None + assert hook_llm is not executor.llm + assert hook_llm.usage_id == "agent-hook:default" + assert hook_llm.metrics is not parent_metrics + + def test_llm_getter_is_resolved_live(self, tmp_path, sample_event): + """An llm_getter is read at execution time, so agent hooks follow + switch_llm()/switch_profile() instead of using a stale captured LLM.""" + current = { + "llm": LLM(model="gpt-4o", api_key=SecretStr("k1"), usage_id="first") + } + executor = HookExecutor( + working_dir=str(tmp_path), + llm_getter=lambda: current["llm"], + ) + + # Simulate switch_llm(): the conversation rebinds its agent's LLM. + current["llm"] = LLM( + model="gpt-5.5", api_key=SecretStr("k2"), usage_id="second" + ) + + captured_llm = {} + + def capture_agent_init(**kwargs): + captured_llm["llm"] = kwargs.get("llm") + return MagicMock() + + with ( + patch(self._AGENT_PATH, side_effect=capture_agent_init), + patch(self._CONV_PATH, side_effect=RuntimeError("stop early")), + ): + executor._execute_agent_hook( + HookDefinition(type=HookType.AGENT), sample_event + ) + + hook_llm = captured_llm.get("llm") + assert hook_llm is not None + # Copied from the *current* LLM (gpt-5.5), not the one present at init. + assert hook_llm.model == "gpt-5.5" + + def test_hook_metrics_are_merged_into_parent_stats( + self, tmp_path, mock_llm, sample_event + ): + """Child agent-hook spend is included in parent conversation stats.""" + parent_stats = ConversationStats() + existing_hook_metrics = Metrics(model_name="gpt-4o") + existing_hook_metrics.add_cost(0.25) + parent_stats.usage_to_metrics["agent-hook:security-check"] = ( + existing_hook_metrics + ) + + child_hook_metrics = Metrics(model_name="gpt-4o") + child_hook_metrics.add_cost(0.75) + child_stats = ConversationStats( + usage_to_metrics={"agent-hook:security-check": child_hook_metrics} + ) + mock_conversation = MagicMock() + mock_conversation.conversation_stats = child_stats + mock_conversation.state.events = [] + + executor = HookExecutor( + working_dir=str(tmp_path), + llm=mock_llm, + conversation_stats=parent_stats, + ) + + with ( + patch(self._AGENT_PATH), + patch(self._CONV_PATH, return_value=mock_conversation), + patch( + self._RESPONSE_PATH, + return_value='{"decision": "allow", "reason": "ok"}', + ), + ): + hook = HookDefinition(type=HookType.AGENT, name="security-check") + result = executor._execute_agent_hook(hook, sample_event) + + assert result.decision == HookDecision.ALLOW + assert parent_stats.usage_to_metrics[ + "agent-hook:security-check" + ].accumulated_cost == pytest.approx(1.0) + assert parent_stats.get_combined_metrics().accumulated_cost == pytest.approx( + 1.0 + ) + + def test_hook_usage_id_uses_hook_name(self, executor, sample_event): + """A named hook gets its own usage_id bucket: agent-hook:.""" + captured_llm = {} + + def capture_agent_init(**kwargs): + captured_llm["llm"] = kwargs.get("llm") + return MagicMock() + + with ( + patch(self._AGENT_PATH, side_effect=capture_agent_init), + patch(self._CONV_PATH, side_effect=RuntimeError("stop early")), + ): + hook = HookDefinition(type=HookType.AGENT, name="security-check") + executor._execute_agent_hook(hook, sample_event) + + hook_llm = captured_llm.get("llm") + assert hook_llm is not None + assert hook_llm.usage_id == "agent-hook:security-check" + + def test_parent_visualizer_instance_is_not_rebound(self, tmp_path, mock_llm): + """The parent visualizer instance must never be handed to the hook conv. + + LocalConversation.initialize() rebinds a visualizer instance to its own + state, so the executor must request a fresh sub-visualizer instead. + """ + from openhands.sdk.conversation.visualizer import ConversationVisualizerBase + + sub_viz = MagicMock(spec=ConversationVisualizerBase) + + class _Viz(ConversationVisualizerBase): + def on_event(self, event): # pragma: no cover - not exercised + pass + + def create_sub_visualizer(self, agent_id): + self.requested_agent_id = agent_id + return sub_viz + + parent_viz = _Viz() + executor = HookExecutor( + working_dir=str(tmp_path), + llm=mock_llm, + visualizer=parent_viz, + ) + + captured = {} + + def capture_conv_init(**kwargs): + captured["visualizer"] = kwargs.get("visualizer") + raise RuntimeError("stop early") + + sample_event = HookEvent( + event_type=HookEventType.PRE_TOOL_USE, tool_name="BashTool" + ) + with ( + patch(self._AGENT_PATH), + patch(self._CONV_PATH, side_effect=capture_conv_init), + ): + hook = HookDefinition(type=HookType.AGENT, name="security-check") + executor._execute_agent_hook(hook, sample_event) + + assert captured["visualizer"] is sub_viz + assert captured["visualizer"] is not parent_viz + assert parent_viz.requested_agent_id == "agent-hook:security-check" + + +class TestPromptHookNotImplemented: + """HookType.PROMPT is a future stub — execution defaults to allow, never crashes.""" + + @pytest.fixture + def executor(self, tmp_path): + return HookExecutor(working_dir=str(tmp_path)) + + @pytest.fixture + def sample_event(self): + return HookEvent(event_type=HookEventType.PRE_TOOL_USE, tool_name="BashTool") + + def test_prompt_hook_defaults_to_allow(self, executor, sample_event): + """Executing a PROMPT hook returns allow instead of crashing.""" + hook = HookDefinition(type=HookType.PROMPT, prompt="evaluate this event") + result = executor.execute(hook, sample_event) + assert result.decision == HookDecision.ALLOW + assert result.success is False + assert "not yet implemented" in (result.reason or "") + + def test_prompt_hook_does_not_block(self, executor, sample_event): + """PROMPT hook must not block the operation while unimplemented.""" + hook = HookDefinition(type=HookType.PROMPT, prompt="evaluate this event") + result = executor.execute(hook, sample_event) + assert result.blocked is False + assert result.should_continue is True + + def test_prompt_hook_without_command_validates(self): + """PROMPT hook with no command is valid at config time (future use).""" + hook = HookDefinition(type=HookType.PROMPT, prompt="evaluate this event") + assert hook.command == ""