From 97400cd781e0ca90fe1b48237f7f3469d6ce8bbe Mon Sep 17 00:00:00 2001
From: dprevoznik <58714078+dprevoznik@users.noreply.github.com>
Date: Wed, 6 May 2026 22:47:45 +0000
Subject: [PATCH 1/7] update yutori template to n1.5

- bump model id from `n1-latest` to `n1.5-latest`
- send `extra_body.tool_set: browser_tools_core-20260403` to use the
  coordinate-based tool set
- send `extra_body.disable_tools` to explicitly exclude the DOM/Playwright
  tools (`extract_elements`, `find`, `set_element_value`, `execute_js`)
  since this template runs computer-use only
- rename `hover` to `mouse_move`
- rename `key_press` parameter `key_comb` to `key`
- drop `press_enter_after` and `clear_before_typing` from `type` (n1.5
  emits separate `key_press` actions instead)
- add new actions: `middle_click`, `mouse_down`, `mouse_up`, `hold_key`,
  `go_forward`
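- support optional `modifier` parameter on click actions via Kernel's
  `hold_keys`
- honor an optional `duration` (milliseconds) on `wait`; the Python
  handler keeps the previous 2-second pause when it is missing or
  non-positive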

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 pkg/create/templates.go                       |   4 +-
 pkg/templates/python/yutori/README.md         |  35 +++--
 pkg/templates/python/yutori/loop.py           |  29 +++-
 pkg/templates/python/yutori/main.py           |   4 +-
 pkg/templates/python/yutori/tools/__init__.py |   6 +-
 pkg/templates/python/yutori/tools/computer.py | 143 +++++++++++-------
 pkg/templates/typescript/yutori/README.md     |  35 +++--
 pkg/templates/typescript/yutori/index.ts      |   2 +-
 pkg/templates/typescript/yutori/loop.ts       |  36 +++--
 .../typescript/yutori/tools/computer.ts       | 143 ++++++++++++------
 10 files changed, 283 insertions(+), 154 deletions(-)

diff --git a/pkg/create/templates.go b/pkg/create/templates.go
index f8541a11..699b97f9 100644
--- a/pkg/create/templates.go
+++ b/pkg/create/templates.go
@@ -87,8 +87,8 @@ var Templates = map[string]TemplateInfo{
 		Languages:   []string{LanguageTypeScript, LanguagePython},
 	},
 	TemplateYutoriComputerUse: {
-		Name:        "Yutori n1 Computer Use",
-		Description: "Implements a Yutori n1 computer use agent",
+		Name:        "Yutori n1.5 Computer Use",
+		Description: "Implements a Yutori n1.5 computer use agent",
 		Languages:   []string{LanguageTypeScript, LanguagePython},
 	},
 	TemplateTzafonComputerUse: {
diff --git a/pkg/templates/python/yutori/README.md b/pkg/templates/python/yutori/README.md
index 7523aff2..8f7df74a 100644
--- a/pkg/templates/python/yutori/README.md
+++ b/pkg/templates/python/yutori/README.md
@@ -1,8 +1,8 @@
-# Kernel Python Sample App - Yutori n1 Computer Use
+# Kernel Python Sample App - Yutori n1.5 Computer Use
 
-This is a Kernel application that implements a prompt loop using Yutori's n1 computer use model with Kernel's Computer Controls API.
+This is a Kernel application that implements a prompt loop using Yutori's n1.5 computer use model with Kernel's Computer Controls API.
 
-[n1](https://yutori.com/blog/introducing-navigator) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots.
+[Navigator n1.5](https://docs.yutori.com/reference/n1-5) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots.
 
 ## Setup
 
@@ -55,9 +55,9 @@ kernel invoke python-yutori-cua cua-task --payload '{"query": "Enter https://exa
 
 ## Viewport Configuration
 
-Yutori n1 recommends a **1280×800 (WXGA, 16:10)** viewport for best grounding accuracy.
+Yutori n1.5 recommends a **1280×800 (WXGA, 16:10)** viewport for best grounding accuracy.
 
-> **Note:** n1 outputs coordinates in a 1000×1000 relative space, which are automatically scaled to the actual viewport dimensions.
+> **Note:** n1.5 outputs coordinates in a 1000×1000 relative space, which are automatically scaled to the actual viewport dimensions.
 
 See [Kernel Viewport Documentation](https://www.kernel.sh/docs/browsers/viewport) for all supported configurations.
 
@@ -65,25 +65,36 @@ See [Kernel Viewport Documentation](https://www.kernel.sh/docs/browsers/viewport
 
 Screenshots are automatically converted to WebP format for better compression across multi-step trajectories, as recommended by Yutori.
 
-## n1-latest Supported Actions
+## n1.5-latest Supported Actions
+
+This template uses the `browser_tools_core-20260403` tool set — coordinate-based browser actions that operate on screenshots only.
 
 | Action | Description |
 |--------|-------------|
-| `left_click` | Left mouse click at coordinates |
-| `double_click` | Double-click at coordinates |
-| `triple_click` | Triple-click at coordinates |
+| `left_click` | Left mouse click at coordinates (supports `modifier`) |
+| `double_click` | Double-click at coordinates (supports `modifier`) |
+| `triple_click` | Triple-click at coordinates (supports `modifier`) |
+| `middle_click` | Middle mouse click at coordinates |
 | `right_click` | Right mouse click at coordinates |
+| `mouse_move` | Move mouse to coordinates without clicking |
+| `mouse_down` | Press the left mouse button at coordinates |
+| `mouse_up` | Release the left mouse button at coordinates |
 | `scroll` | Scroll page in a direction |
 | `type` | Type text into focused element |
-| `key_press` | Send keyboard input |
-| `hover` | Move mouse without clicking |
+| `key_press` | Send a single key or key combination |
+| `hold_key` | Hold a key for a duration |
 | `drag` | Click-and-drag operation |
 | `wait` | Pause for UI to update |
 | `refresh` | Reload current page |
 | `go_back` | Navigate back in history |
+| `go_forward` | Navigate forward in history |
 | `goto_url` | Navigate to a URL |
 
+### Disabled tools
+
+The DOM/Playwright-based "expanded" tools (`extract_elements`, `find`, `set_element_value`, `execute_js`) are intentionally disabled via the `disable_tools` request parameter — this template runs computer-use only and does not expose a Playwright page to the model.
+
 ## Resources
 
-- [Yutori n1 API Documentation](https://docs.yutori.com/reference/n1)
+- [Yutori n1.5 API Documentation](https://docs.yutori.com/reference/n1-5)
 - [Kernel Documentation](https://www.kernel.sh/docs/quickstart)
diff --git a/pkg/templates/python/yutori/loop.py b/pkg/templates/python/yutori/loop.py
index 066aafb5..ad39024b 100644
--- a/pkg/templates/python/yutori/loop.py
+++ b/pkg/templates/python/yutori/loop.py
@@ -1,14 +1,14 @@
 """
-Yutori n1 Sampling Loop
+Yutori n1.5 Sampling Loop
 
-Implements the agent loop for Yutori's n1-latest computer use model.
-n1-latest uses an OpenAI-compatible API with tool_calls:
+Implements the agent loop for Yutori's n1.5-latest computer use model.
+n1.5-latest uses an OpenAI-compatible API with tool_calls:
 - Actions are returned via tool_calls in the assistant message
 - Tool results use role: "tool" with matching tool_call_id
 - The model stops by returning content without tool_calls
 - Coordinates are returned in 1000x1000 space and need scaling
 
-@see https://docs.yutori.com/reference/n1
+@see https://docs.yutori.com/reference/n1-5
 """
 
 import json
@@ -17,12 +17,18 @@
 from kernel import Kernel
 from openai import OpenAI
 
-from tools import ComputerTool, N1Action, ToolResult
+from tools import ComputerTool, N15Action, ToolResult
+
+# Tools that require a Playwright page / DOM access. The default core tool set
+# already excludes them, but we also list them in `disable_tools` so the
+# exclusion is explicit and survives if the default ever changes.
+DISABLED_TOOLS = ["extract_elements", "find", "set_element_value", "execute_js"]
+TOOL_SET = "browser_tools_core-20260403"
 
 
 async def sampling_loop(
     *,
-    model: str = "n1-latest",
+    model: str = "n1.5-latest",
     task: str,
     api_key: str,
     kernel: Kernel,
@@ -69,6 +75,13 @@ async def sampling_loop(
                 messages=conversation_messages,
                 max_completion_tokens=max_completion_tokens,
                 temperature=0.3,
+                # n1.5-specific knobs go in extra_body.
+                # tool_set selects the core (coordinate-based) tools.
+                # disable_tools is a defense-in-depth exclusion of DOM/Playwright tools.
+                extra_body={
+                    "tool_set": TOOL_SET,
+                    "disable_tools": DISABLED_TOOLS,
+                },
             )
         except Exception as api_error:
             print(f"API call failed: {api_error}")
@@ -108,7 +121,7 @@ async def sampling_loop(
                 })
                 continue
 
-            action: N1Action = {"action_type": action_name, **args}
+            action: N15Action = {"action_type": action_name, **args}
             print(f"Executing action: {action_name}", args)
 
             scaled_action = _scale_coordinates(action, viewport_width, viewport_height)
@@ -155,7 +168,7 @@ async def sampling_loop(
     }
 
 
-def _scale_coordinates(action: N1Action, viewport_width: int, viewport_height: int) -> N1Action:
+def _scale_coordinates(action: N15Action, viewport_width: int, viewport_height: int) -> N15Action:
     scaled = dict(action)
 
     if "coordinates" in scaled and scaled["coordinates"]:
diff --git a/pkg/templates/python/yutori/main.py b/pkg/templates/python/yutori/main.py
index 4679df8c..21543d9b 100644
--- a/pkg/templates/python/yutori/main.py
+++ b/pkg/templates/python/yutori/main.py
@@ -30,7 +30,7 @@ async def cua_task(
     payload: QueryInput,
 ) -> QueryOutput:
     """
-    Process a user query using Yutori n1 Computer Use with Kernel's browser automation.
+    Process a user query using Yutori n1.5 Computer Use with Kernel's browser automation.
 
     Args:
         ctx: Kernel context containing invocation information
@@ -58,7 +58,7 @@ async def cua_task(
         print("Kernel browser live view url:", session.live_view_url)
 
         loop_result = await sampling_loop(
-            model="n1-latest",
+            model="n1.5-latest",
             task=payload["query"],
             api_key=str(api_key),
             kernel=session.kernel,
diff --git a/pkg/templates/python/yutori/tools/__init__.py b/pkg/templates/python/yutori/tools/__init__.py
index 63da5188..5a1a4285 100644
--- a/pkg/templates/python/yutori/tools/__init__.py
+++ b/pkg/templates/python/yutori/tools/__init__.py
@@ -1,11 +1,11 @@
-"""Yutori n1 Computer Tools."""
+"""Yutori n1.5 Computer Tools."""
 
 from .base import ToolError, ToolResult
-from .computer import ComputerTool, N1Action
+from .computer import ComputerTool, N15Action
 
 __all__ = [
     "ToolError",
     "ToolResult",
     "ComputerTool",
-    "N1Action",
+    "N15Action",
 ]
diff --git a/pkg/templates/python/yutori/tools/computer.py b/pkg/templates/python/yutori/tools/computer.py
index d078aded..18b3504c 100644
--- a/pkg/templates/python/yutori/tools/computer.py
+++ b/pkg/templates/python/yutori/tools/computer.py
@@ -1,15 +1,17 @@
 """
-Yutori n1 Computer Tool
+Yutori n1.5 Computer Tool
 
-Maps n1-latest action format to Kernel's Computer Controls API.
+Maps n1.5-latest action format to Kernel's Computer Controls API.
 Screenshots are converted to WebP for better compression across multi-step trajectories.
+
+@see https://docs.yutori.com/reference/n1-5
 """
 
 import asyncio
 import base64
 import json
 from io import BytesIO
-from typing import Literal, TypedDict
+from typing import Any, Literal, TypedDict
 
 from kernel import Kernel
 from PIL import Image
@@ -20,33 +22,38 @@
 SCREENSHOT_DELAY_S = 0.3
 ACTION_DELAY_S = 0.3
 
-N1ActionType = Literal[
+N15ActionType = Literal[
     "left_click",
     "double_click",
     "triple_click",
+    "middle_click",
     "right_click",
+    "mouse_move",
+    "mouse_down",
+    "mouse_up",
     "scroll",
     "type",
     "key_press",
-    "hover",
+    "hold_key",
     "drag",
     "wait",
     "refresh",
     "go_back",
+    "go_forward",
     "goto_url",
 ]
 
 
-class N1Action(TypedDict, total=False):
-    action_type: N1ActionType
+class N15Action(TypedDict, total=False):
+    action_type: N15ActionType
     coordinates: tuple[int, int] | list[int]
     start_coordinates: tuple[int, int] | list[int]
     direction: Literal["up", "down", "left", "right"]
     amount: int
     text: str
-    press_enter_after: bool
-    clear_before_typing: bool
-    key_comb: str
+    key: str
+    modifier: str
+    duration: int
     url: str
 
 
@@ -97,22 +104,27 @@ def __init__(self, kernel: Kernel, session_id: str, width: int = 1280, height: i
         self.height = height
         self.kiosk_mode = kiosk_mode
 
-    async def execute(self, action: N1Action) -> ToolResult:
+    async def execute(self, action: N15Action) -> ToolResult:
         action_type = action.get("action_type")
 
         handlers = {
             "left_click": lambda a: self._handle_click(a, "left", 1),
             "double_click": lambda a: self._handle_click(a, "left", 2),
             "triple_click": lambda a: self._handle_click(a, "left", 3),
+            "middle_click": lambda a: self._handle_click(a, "middle", 1),
             "right_click": lambda a: self._handle_click(a, "right", 1),
+            "mouse_move": self._handle_mouse_move,
+            "mouse_down": lambda a: self._handle_mouse_button(a, "down"),
+            "mouse_up": lambda a: self._handle_mouse_button(a, "up"),
             "scroll": self._handle_scroll,
             "type": self._handle_type,
             "key_press": self._handle_key_press,
-            "hover": self._handle_hover,
+            "hold_key": self._handle_hold_key,
             "drag": self._handle_drag,
             "wait": self._handle_wait,
             "refresh": self._handle_refresh,
             "go_back": self._handle_go_back,
+            "go_forward": self._handle_go_forward,
             "goto_url": self._handle_goto_url,
         }
 
@@ -122,22 +134,51 @@ async def execute(self, action: N1Action) -> ToolResult:
 
         return await handler(action)
 
-    async def _handle_click(self, action: N1Action, button: str, num_clicks: int) -> ToolResult:
+    async def _handle_click(self, action: N15Action, button: str, num_clicks: int) -> ToolResult:
+        coords = self._get_coordinates(action.get("coordinates"))
+        modifier = action.get("modifier")
+        kwargs: dict[str, Any] = {
+            "x": coords["x"],
+            "y": coords["y"],
+            "button": button,
+            "click_type": "click",
+            "num_clicks": num_clicks,
+        }
+        if modifier:
+            kwargs["hold_keys"] = [self._map_key(modifier)]
+
+        self.kernel.browsers.computer.click_mouse(self.session_id, **kwargs)
+
+        await asyncio.sleep(SCREENSHOT_DELAY_S)
+        return await self.screenshot()
+
+    async def _handle_mouse_move(self, action: N15Action) -> ToolResult:
+        coords = self._get_coordinates(action.get("coordinates"))
+
+        self.kernel.browsers.computer.move_mouse(
+            self.session_id,
+            x=coords["x"],
+            y=coords["y"],
+        )
+
+        await asyncio.sleep(SCREENSHOT_DELAY_S)
+        return await self.screenshot()
+
+    async def _handle_mouse_button(self, action: N15Action, click_type: str) -> ToolResult:
         coords = self._get_coordinates(action.get("coordinates"))
 
         self.kernel.browsers.computer.click_mouse(
             self.session_id,
             x=coords["x"],
             y=coords["y"],
-            button=button,
-            click_type="click",
-            num_clicks=num_clicks,
+            button="left",
+            click_type=click_type,
         )
 
         await asyncio.sleep(SCREENSHOT_DELAY_S)
         return await self.screenshot()
 
-    async def _handle_scroll(self, action: N1Action) -> ToolResult:
+    async def _handle_scroll(self, action: N15Action) -> ToolResult:
         coords = self._get_coordinates(action.get("coordinates"))
         direction = action.get("direction")
         notches = max(action.get("amount", 3), 1)
@@ -171,45 +212,26 @@ async def _handle_scroll(self, action: N1Action) -> ToolResult:
         screenshot_result["output"] = f"Scrolled {notches} wheel unit(s) {direction}."
         return screenshot_result
 
-    async def _handle_type(self, action: N1Action) -> ToolResult:
+    async def _handle_type(self, action: N15Action) -> ToolResult:
         text = action.get("text")
         if not text:
             raise ToolError("text is required for type action")
 
-        if action.get("clear_before_typing"):
-            self.kernel.browsers.computer.press_key(
-                self.session_id,
-                keys=["ctrl+a"],
-            )
-            await asyncio.sleep(0.1)
-            self.kernel.browsers.computer.press_key(
-                self.session_id,
-                keys=["BackSpace"],
-            )
-            await asyncio.sleep(0.1)
-
         self.kernel.browsers.computer.type_text(
             self.session_id,
             text=text,
             delay=TYPING_DELAY_MS,
         )
 
-        if action.get("press_enter_after"):
-            await asyncio.sleep(0.1)
-            self.kernel.browsers.computer.press_key(
-                self.session_id,
-                keys=["Return"],
-            )
-
         await asyncio.sleep(SCREENSHOT_DELAY_S)
         return await self.screenshot()
 
-    async def _handle_key_press(self, action: N1Action) -> ToolResult:
-        key_comb = action.get("key_comb")
-        if not key_comb:
-            raise ToolError("key_comb is required for key_press action")
+    async def _handle_key_press(self, action: N15Action) -> ToolResult:
+        key = action.get("key")
+        if not key:
+            raise ToolError("key is required for key_press action")
 
-        mapped_key = self._map_key(key_comb)
+        mapped_key = self._map_key(key)
 
         self.kernel.browsers.computer.press_key(
             self.session_id,
@@ -219,19 +241,24 @@ async def _handle_key_press(self, action: N1Action) -> ToolResult:
         await asyncio.sleep(SCREENSHOT_DELAY_S)
         return await self.screenshot()
 
-    async def _handle_hover(self, action: N1Action) -> ToolResult:
-        coords = self._get_coordinates(action.get("coordinates"))
+    async def _handle_hold_key(self, action: N15Action) -> ToolResult:
+        key = action.get("key")
+        if not key:
+            raise ToolError("key is required for hold_key action")
 
-        self.kernel.browsers.computer.move_mouse(
+        mapped_key = self._map_key(key)
+        duration = action.get("duration") or 1000
+
+        self.kernel.browsers.computer.press_key(
             self.session_id,
-            x=coords["x"],
-            y=coords["y"],
+            keys=[mapped_key],
+            duration=duration,
         )
 
         await asyncio.sleep(SCREENSHOT_DELAY_S)
         return await self.screenshot()
 
-    async def _handle_drag(self, action: N1Action) -> ToolResult:
+    async def _handle_drag(self, action: N15Action) -> ToolResult:
         start_coords = self._get_coordinates(action.get("start_coordinates"))
         end_coords = self._get_coordinates(action.get("coordinates"))
 
@@ -244,11 +271,13 @@ async def _handle_drag(self, action: N1Action) -> ToolResult:
         await asyncio.sleep(SCREENSHOT_DELAY_S)
         return await self.screenshot()
 
-    async def _handle_wait(self, action: N1Action) -> ToolResult:
-        await asyncio.sleep(2)
+    async def _handle_wait(self, action: N15Action) -> ToolResult:
+        duration = action.get("duration")
+        seconds = (duration / 1000) if duration and duration > 0 else 2
+        await asyncio.sleep(seconds)
         return await self.screenshot()
 
-    async def _handle_refresh(self, action: N1Action) -> ToolResult:
+    async def _handle_refresh(self, action: N15Action) -> ToolResult:
         self.kernel.browsers.computer.press_key(
             self.session_id,
             keys=["F5"],
@@ -256,7 +285,7 @@ async def _handle_refresh(self, action: N1Action) -> ToolResult:
         await asyncio.sleep(2)
         return await self.screenshot()
 
-    async def _handle_go_back(self, action: N1Action) -> ToolResult:
+    async def _handle_go_back(self, action: N15Action) -> ToolResult:
         self.kernel.browsers.computer.press_key(
             self.session_id,
             keys=["alt+Left"],
@@ -264,7 +293,15 @@ async def _handle_go_back(self, action: N1Action) -> ToolResult:
         await asyncio.sleep(1.5)
         return await self.screenshot()
 
-    async def _handle_goto_url(self, action: N1Action) -> ToolResult:
+    async def _handle_go_forward(self, action: N15Action) -> ToolResult:
+        self.kernel.browsers.computer.press_key(
+            self.session_id,
+            keys=["alt+Right"],
+        )
+        await asyncio.sleep(1.5)
+        return await self.screenshot()
+
+    async def _handle_goto_url(self, action: N15Action) -> ToolResult:
         url = action.get("url")
         if not url:
             raise ToolError("url is required for goto_url action")
diff --git a/pkg/templates/typescript/yutori/README.md b/pkg/templates/typescript/yutori/README.md
index 92c009d3..2d9f781e 100644
--- a/pkg/templates/typescript/yutori/README.md
+++ b/pkg/templates/typescript/yutori/README.md
@@ -1,8 +1,8 @@
-# Kernel TypeScript Sample App - Yutori n1 Computer Use
+# Kernel TypeScript Sample App - Yutori n1.5 Computer Use
 
-This is a Kernel application that implements a prompt loop using Yutori's n1 computer use model with Kernel's Computer Controls API.
+This is a Kernel application that implements a prompt loop using Yutori's n1.5 computer use model with Kernel's Computer Controls API.
 
-[n1](https://yutori.com/blog/introducing-navigator) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots.
+[Navigator n1.5](https://docs.yutori.com/reference/n1-5) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots.
 
 ## Setup
 
@@ -55,9 +55,9 @@ kernel invoke ts-yutori-cua cua-task --payload '{"query": "Enter https://example
 
 ## Viewport Configuration
 
-Yutori n1 recommends a **1280×800 (WXGA, 16:10)** viewport for best grounding accuracy.
+Yutori n1.5 recommends a **1280×800 (WXGA, 16:10)** viewport for best grounding accuracy.
 
-> **Note:** n1 outputs coordinates in a 1000×1000 relative space, which are automatically scaled to the actual viewport dimensions.
+> **Note:** n1.5 outputs coordinates in a 1000×1000 relative space, which are automatically scaled to the actual viewport dimensions.
 
 See [Kernel Viewport Documentation](https://www.kernel.sh/docs/browsers/viewport) for all supported configurations.
 
@@ -65,25 +65,36 @@ See [Kernel Viewport Documentation](https://www.kernel.sh/docs/browsers/viewport
 
 Screenshots are automatically converted to WebP format for better compression across multi-step trajectories, as recommended by Yutori.
 
-## n1-latest Supported Actions
+## n1.5-latest Supported Actions
+
+This template uses the `browser_tools_core-20260403` tool set — coordinate-based browser actions that operate on screenshots only.
 
 | Action | Description |
 |--------|-------------|
-| `left_click` | Left mouse click at coordinates |
-| `double_click` | Double-click at coordinates |
-| `triple_click` | Triple-click at coordinates |
+| `left_click` | Left mouse click at coordinates (supports `modifier`) |
+| `double_click` | Double-click at coordinates (supports `modifier`) |
+| `triple_click` | Triple-click at coordinates (supports `modifier`) |
+| `middle_click` | Middle mouse click at coordinates |
 | `right_click` | Right mouse click at coordinates |
+| `mouse_move` | Move mouse to coordinates without clicking |
+| `mouse_down` | Press the left mouse button at coordinates |
+| `mouse_up` | Release the left mouse button at coordinates |
 | `scroll` | Scroll page in a direction |
 | `type` | Type text into focused element |
-| `key_press` | Send keyboard input |
-| `hover` | Move mouse without clicking |
+| `key_press` | Send a single key or key combination |
+| `hold_key` | Hold a key for a duration |
 | `drag` | Click-and-drag operation |
 | `wait` | Pause for UI to update |
 | `refresh` | Reload current page |
 | `go_back` | Navigate back in history |
+| `go_forward` | Navigate forward in history |
 | `goto_url` | Navigate to a URL |
 
+### Disabled tools
+
+The DOM/Playwright-based "expanded" tools (`extract_elements`, `find`, `set_element_value`, `execute_js`) are intentionally disabled via the `disable_tools` request parameter — this template runs computer-use only and does not expose a Playwright page to the model.
+
 ## Resources
 
-- [Yutori n1 API Documentation](https://docs.yutori.com/reference/n1)
+- [Yutori n1.5 API Documentation](https://docs.yutori.com/reference/n1-5)
 - [Kernel Documentation](https://www.kernel.sh/docs/quickstart)
diff --git a/pkg/templates/typescript/yutori/index.ts b/pkg/templates/typescript/yutori/index.ts
index 364e7237..c38a1b53 100644
--- a/pkg/templates/typescript/yutori/index.ts
+++ b/pkg/templates/typescript/yutori/index.ts
@@ -47,7 +47,7 @@ app.action<QueryInput, QueryOutput>(
     try {
       // Run the sampling loop
       const { finalAnswer, messages } = await samplingLoop({
-        model: 'n1-latest',
+        model: 'n1.5-latest',
         task: payload.query,
         apiKey: YUTORI_API_KEY,
         kernel,
diff --git a/pkg/templates/typescript/yutori/loop.ts b/pkg/templates/typescript/yutori/loop.ts
index e0f94795..f113b61e 100644
--- a/pkg/templates/typescript/yutori/loop.ts
+++ b/pkg/templates/typescript/yutori/loop.ts
@@ -1,19 +1,25 @@
 /**
- * Yutori n1 Sampling Loop
- * 
- * Implements the agent loop for Yutori's n1-latest computer use model.
- * n1-latest uses an OpenAI-compatible API with tool_calls:
+ * Yutori n1.5 Sampling Loop
+ *
+ * Implements the agent loop for Yutori's n1.5-latest computer use model.
+ * n1.5-latest uses an OpenAI-compatible API with tool_calls:
  * - Actions are returned via tool_calls in the assistant message
  * - Tool results use role: "tool" with matching tool_call_id
  * - The model stops by returning content without tool_calls
  * - Coordinates are returned in 1000x1000 space and need scaling
- * 
- * @see https://docs.yutori.com/reference/n1
+ *
+ * @see https://docs.yutori.com/reference/n1-5
  */
 
 import OpenAI from 'openai';
 import type { Kernel } from '@onkernel/sdk';
-import { ComputerTool, type N1Action, type ToolResult } from './tools/computer';
+import { ComputerTool, type N15Action, type ToolResult } from './tools/computer';
+
+// Tools that require a Playwright page / DOM access. The default core tool set
+// already excludes them, but we also list them in `disable_tools` so the
+// exclusion is explicit and survives if the default ever changes.
+const DISABLED_TOOLS = ['extract_elements', 'find', 'set_element_value', 'execute_js'];
+const TOOL_SET = 'browser_tools_core-20260403';
 
 interface SamplingLoopOptions {
   model?: string;
@@ -34,7 +40,7 @@ interface SamplingLoopResult {
 }
 
 export async function samplingLoop({
-  model = 'n1-latest',
+  model = 'n1.5-latest',
   task,
   apiKey,
   kernel,
@@ -85,6 +91,14 @@ export async function samplingLoop({
         messages: conversationMessages,
         max_completion_tokens: maxCompletionTokens,
         temperature: 0.3,
+        // n1.5-specific knobs go in extra_body (not yet in OpenAI SDK types).
+        // tool_set selects the core (coordinate-based) tools.
+        // disable_tools is a defense-in-depth exclusion of DOM/Playwright tools.
+        // @ts-expect-error extra_body is a Yutori extension
+        extra_body: {
+          tool_set: TOOL_SET,
+          disable_tools: DISABLED_TOOLS,
+        },
       });
     } catch (apiError) {
       console.error('API call failed:', apiError);
@@ -131,8 +145,8 @@ export async function samplingLoop({
         continue;
       }
 
-      const action: N1Action = {
-        action_type: actionName as N1Action['action_type'],
+      const action: N15Action = {
+        action_type: actionName as N15Action['action_type'],
         ...args,
       };
 
@@ -190,7 +204,7 @@ export async function samplingLoop({
   };
 }
 
-function scaleCoordinates(action: N1Action, viewportWidth: number, viewportHeight: number): N1Action {
+function scaleCoordinates(action: N15Action, viewportWidth: number, viewportHeight: number): N15Action {
   const scaled = { ...action };
 
   if (scaled.coordinates) {
diff --git a/pkg/templates/typescript/yutori/tools/computer.ts b/pkg/templates/typescript/yutori/tools/computer.ts
index 5ba8e09e..7e64b2bd 100644
--- a/pkg/templates/typescript/yutori/tools/computer.ts
+++ b/pkg/templates/typescript/yutori/tools/computer.ts
@@ -1,8 +1,10 @@
 /**
- * Yutori n1 Computer Tool
- * 
- * Maps n1-latest action format to Kernel's Computer Controls API.
+ * Yutori n1.5 Computer Tool
+ *
+ * Maps n1.5-latest action format to Kernel's Computer Controls API.
  * Screenshots are converted to WebP for better compression across multi-step trajectories.
+ *
+ * @see https://docs.yutori.com/reference/n1-5
  */
 
 import { Buffer } from 'buffer';
@@ -26,31 +28,36 @@ export class ToolError extends Error {
   }
 }
 
-export type N1ActionType =
+export type N15ActionType =
   | 'left_click'
   | 'double_click'
   | 'triple_click'
+  | 'middle_click'
   | 'right_click'
+  | 'mouse_move'
+  | 'mouse_down'
+  | 'mouse_up'
   | 'scroll'
   | 'type'
   | 'key_press'
-  | 'hover'
+  | 'hold_key'
   | 'drag'
   | 'wait'
   | 'refresh'
   | 'go_back'
+  | 'go_forward'
   | 'goto_url';
 
-export interface N1Action {
-  action_type: N1ActionType;
+export interface N15Action {
+  action_type: N15ActionType;
   coordinates?: [number, number];
   start_coordinates?: [number, number];
   direction?: 'up' | 'down' | 'left' | 'right';
   amount?: number;
   text?: string;
-  press_enter_after?: boolean;
-  clear_before_typing?: boolean;
-  key_comb?: string;
+  key?: string;
+  modifier?: string;
+  duration?: number;
   url?: string;
 }
 
@@ -107,7 +114,7 @@ export class ComputerTool {
     this.kioskMode = kioskMode;
   }
 
-  async execute(action: N1Action): Promise<ToolResult> {
+  async execute(action: N15Action): Promise<ToolResult> {
     const { action_type } = action;
 
     switch (action_type) {
@@ -117,24 +124,34 @@ export class ComputerTool {
         return this.handleClick(action, 'left', 2);
       case 'triple_click':
         return this.handleClick(action, 'left', 3);
+      case 'middle_click':
+        return this.handleClick(action, 'middle', 1);
       case 'right_click':
         return this.handleClick(action, 'right', 1);
+      case 'mouse_move':
+        return this.handleMouseMove(action);
+      case 'mouse_down':
+        return this.handleMouseButton(action, 'down');
+      case 'mouse_up':
+        return this.handleMouseButton(action, 'up');
       case 'scroll':
         return this.handleScroll(action);
       case 'type':
         return this.handleType(action);
       case 'key_press':
         return this.handleKeyPress(action);
-      case 'hover':
-        return this.handleHover(action);
+      case 'hold_key':
+        return this.handleHoldKey(action);
       case 'drag':
         return this.handleDrag(action);
       case 'wait':
-        return this.handleWait();
+        return this.handleWait(action);
       case 'refresh':
         return this.handleRefresh();
       case 'go_back':
         return this.handleGoBack();
+      case 'go_forward':
+        return this.handleGoForward();
       case 'goto_url':
         return this.handleGotoUrl(action);
       default:
@@ -142,22 +159,50 @@ export class ComputerTool {
     }
   }
 
-  private async handleClick(action: N1Action, button: 'left' | 'right', numClicks: number): Promise<ToolResult> {
+  private async handleClick(action: N15Action, button: 'left' | 'right' | 'middle', numClicks: number): Promise<ToolResult> {
     const coords = this.getCoordinates(action.coordinates);
-    
+    const holdKeys = action.modifier ? [this.mapKey(action.modifier)] : undefined;
+
     await this.kernel.browsers.computer.clickMouse(this.sessionId, {
       x: coords.x,
       y: coords.y,
       button,
       click_type: 'click',
       num_clicks: numClicks,
+      ...(holdKeys ? { hold_keys: holdKeys } : {}),
+    });
+
+    await this.sleep(SCREENSHOT_DELAY_MS);
+    return this.screenshot();
+  }
+
+  private async handleMouseMove(action: N15Action): Promise<ToolResult> {
+    const coords = this.getCoordinates(action.coordinates);
+
+    await this.kernel.browsers.computer.moveMouse(this.sessionId, {
+      x: coords.x,
+      y: coords.y,
+    });
+
+    await this.sleep(SCREENSHOT_DELAY_MS);
+    return this.screenshot();
+  }
+
+  private async handleMouseButton(action: N15Action, clickType: 'down' | 'up'): Promise<ToolResult> {
+    const coords = this.getCoordinates(action.coordinates);
+
+    await this.kernel.browsers.computer.clickMouse(this.sessionId, {
+      x: coords.x,
+      y: coords.y,
+      button: 'left',
+      click_type: clickType,
     });
 
     await this.sleep(SCREENSHOT_DELAY_MS);
     return this.screenshot();
   }
 
-  private async handleScroll(action: N1Action): Promise<ToolResult> {
+  private async handleScroll(action: N15Action): Promise<ToolResult> {
     const coords = this.getCoordinates(action.coordinates);
     const direction = action.direction;
     const notches = Math.max(action.amount ?? 3, 1);
@@ -199,46 +244,28 @@ export class ComputerTool {
     };
   }
 
-  private async handleType(action: N1Action): Promise<ToolResult> {
+  private async handleType(action: N15Action): Promise<ToolResult> {
     const text = action.text;
     if (!text) {
       throw new ToolError('text is required for type action');
     }
 
-    if (action.clear_before_typing) {
-      await this.kernel.browsers.computer.pressKey(this.sessionId, {
-        keys: ['ctrl+a'],
-      });
-      await this.sleep(100);
-      await this.kernel.browsers.computer.pressKey(this.sessionId, {
-        keys: ['BackSpace'],
-      });
-      await this.sleep(100);
-    }
-
     await this.kernel.browsers.computer.typeText(this.sessionId, {
       text,
       delay: TYPING_DELAY_MS,
     });
 
-    if (action.press_enter_after) {
-      await this.sleep(100);
-      await this.kernel.browsers.computer.pressKey(this.sessionId, {
-        keys: ['Return'],
-      });
-    }
-
     await this.sleep(SCREENSHOT_DELAY_MS);
     return this.screenshot();
   }
 
-  private async handleKeyPress(action: N1Action): Promise<ToolResult> {
-    const keyComb = action.key_comb;
-    if (!keyComb) {
-      throw new ToolError('key_comb is required for key_press action');
+  private async handleKeyPress(action: N15Action): Promise<ToolResult> {
+    const key = action.key;
+    if (!key) {
+      throw new ToolError('key is required for key_press action');
     }
 
-    const mappedKey = this.mapKey(keyComb);
+    const mappedKey = this.mapKey(key);
 
     await this.kernel.browsers.computer.pressKey(this.sessionId, {
       keys: [mappedKey],
@@ -248,19 +275,25 @@ export class ComputerTool {
     return this.screenshot();
   }
 
-  private async handleHover(action: N1Action): Promise<ToolResult> {
-    const coords = this.getCoordinates(action.coordinates);
+  private async handleHoldKey(action: N15Action): Promise<ToolResult> {
+    const key = action.key;
+    if (!key) {
+      throw new ToolError('key is required for hold_key action');
+    }
 
-    await this.kernel.browsers.computer.moveMouse(this.sessionId, {
-      x: coords.x,
-      y: coords.y,
+    const mappedKey = this.mapKey(key);
+    const durationMs = action.duration && action.duration > 0 ? action.duration : 1000;
+
+    await this.kernel.browsers.computer.pressKey(this.sessionId, {
+      keys: [mappedKey],
+      duration: durationMs,
     });
 
     await this.sleep(SCREENSHOT_DELAY_MS);
     return this.screenshot();
   }
 
-  private async handleDrag(action: N1Action): Promise<ToolResult> {
+  private async handleDrag(action: N15Action): Promise<ToolResult> {
     const startCoords = this.getCoordinates(action.start_coordinates);
     const endCoords = this.getCoordinates(action.coordinates);
 
@@ -273,8 +306,9 @@ export class ComputerTool {
     return this.screenshot();
   }
 
-  private async handleWait(): Promise<ToolResult> {
-    await this.sleep(2000);
+  private async handleWait(action: N15Action): Promise<ToolResult> {
+    const durationMs = action.duration && action.duration > 0 ? action.duration : 2000;
+    await this.sleep(durationMs);
     return this.screenshot();
   }
 
@@ -296,7 +330,16 @@ export class ComputerTool {
     return this.screenshot();
   }
 
-  private async handleGotoUrl(action: N1Action): Promise<ToolResult> {
+  private async handleGoForward(): Promise<ToolResult> {
+    await this.kernel.browsers.computer.pressKey(this.sessionId, {
+      keys: ['alt+Right'],
+    });
+
+    await this.sleep(1500);
+    return this.screenshot();
+  }
+
+  private async handleGotoUrl(action: N15Action): Promise<ToolResult> {
     const url = action.url;
     if (!url) {
       throw new ToolError('url is required for goto_url action');

From 0f8468bb529ee14664b9b05e104243d899235e67 Mon Sep 17 00:00:00 2001
From: dprevoznik <58714078+dprevoznik@users.noreply.github.com>
Date: Thu, 7 May 2026 16:06:11 +0000
Subject: [PATCH 2/7] wire scroll modifier through to hold_keys

n1.5's scroll action accepts an optional `modifier` (e.g., shift, which
makes browsers turn a vertical wheel into a horizontal scroll). Plumb
the modifier into Kernel's `ComputerScrollParams.hold_keys` so the
OS-level modifier+wheel event is dispatched correctly.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 pkg/templates/python/yutori/tools/computer.py  | 18 +++++++++++-------
 .../typescript/yutori/tools/computer.ts        |  3 +++
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/pkg/templates/python/yutori/tools/computer.py b/pkg/templates/python/yutori/tools/computer.py
index 18b3504c..5f285b87 100644
--- a/pkg/templates/python/yutori/tools/computer.py
+++ b/pkg/templates/python/yutori/tools/computer.py
@@ -199,13 +199,17 @@ async def _handle_scroll(self, action: N15Action) -> ToolResult:
         elif direction == "right":
             delta_x = notches
 
-        self.kernel.browsers.computer.scroll(
-            self.session_id,
-            x=coords["x"],
-            y=coords["y"],
-            delta_x=delta_x,
-            delta_y=delta_y,
-        )
+        modifier = action.get("modifier")
+        scroll_kwargs: dict[str, Any] = {
+            "x": coords["x"],
+            "y": coords["y"],
+            "delta_x": delta_x,
+            "delta_y": delta_y,
+        }
+        if modifier:
+            scroll_kwargs["hold_keys"] = [self._map_key(modifier)]
+
+        self.kernel.browsers.computer.scroll(self.session_id, **scroll_kwargs)
 
         await asyncio.sleep(SCREENSHOT_DELAY_S)
         screenshot_result = await self.screenshot()
diff --git a/pkg/templates/typescript/yutori/tools/computer.ts b/pkg/templates/typescript/yutori/tools/computer.ts
index 7e64b2bd..9cc792c5 100644
--- a/pkg/templates/typescript/yutori/tools/computer.ts
+++ b/pkg/templates/typescript/yutori/tools/computer.ts
@@ -229,11 +229,14 @@ export class ComputerTool {
         break;
     }
 
+    const holdKeys = action.modifier ? [this.mapKey(action.modifier)] : undefined;
+
     await this.kernel.browsers.computer.scroll(this.sessionId, {
       x: coords.x,
       y: coords.y,
       delta_x,
       delta_y,
+      ...(holdKeys ? { hold_keys: holdKeys } : {}),
     });
 
     await this.sleep(SCREENSHOT_DELAY_MS);

From 8d3f065ab820feb455f28cf16ea12d92fe4fcabd Mon Sep 17 00:00:00 2001
From: dprevoznik <58714078+dprevoznik@users.noreply.github.com>
Date: Thu, 7 May 2026 16:36:53 +0000
Subject: [PATCH 3/7] frame readme with n1.5 benchmarks and scope

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 pkg/templates/python/yutori/README.md     | 6 ++++--
 pkg/templates/typescript/yutori/README.md | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/pkg/templates/python/yutori/README.md b/pkg/templates/python/yutori/README.md
index 8f7df74a..3fd63d5b 100644
--- a/pkg/templates/python/yutori/README.md
+++ b/pkg/templates/python/yutori/README.md
@@ -1,8 +1,10 @@
 # Kernel Python Sample App - Yutori n1.5 Computer Use
 
-This is a Kernel application that implements a prompt loop using Yutori's n1.5 computer use model with Kernel's Computer Controls API.
+This Kernel app implements a prompt loop using Yutori's Navigator n1.5 with Kernel's Computer Controls API.
 
-[Navigator n1.5](https://docs.yutori.com/reference/n1-5) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots.
+[Navigator n1.5](https://yutori.com/blog/introducing-n1-5) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots. Yutori reports **94.5% on Online-Mind2Web** and **88.0% on Navi-Bench v2** at **$1.50 per 1M input tokens** — currently among the most accurate and cost-effective computer-use models.
+
+This template runs n1.5 in **computer-use-only mode**. n1.5 also supports a hybrid vision + DOM/JavaScript path (page-state extraction, custom JS, structured JSON output) for multi-field forms and bulk data extraction, but those tools are intentionally disabled here — see [Disabled tools](#disabled-tools).
 
 ## Setup
 
diff --git a/pkg/templates/typescript/yutori/README.md b/pkg/templates/typescript/yutori/README.md
index 2d9f781e..efdfbc61 100644
--- a/pkg/templates/typescript/yutori/README.md
+++ b/pkg/templates/typescript/yutori/README.md
@@ -1,8 +1,10 @@
 # Kernel TypeScript Sample App - Yutori n1.5 Computer Use
 
-This is a Kernel application that implements a prompt loop using Yutori's n1.5 computer use model with Kernel's Computer Controls API.
+This Kernel app implements a prompt loop using Yutori's Navigator n1.5 with Kernel's Computer Controls API.
 
-[Navigator n1.5](https://docs.yutori.com/reference/n1-5) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots.
+[Navigator n1.5](https://yutori.com/blog/introducing-n1-5) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots. Yutori reports **94.5% on Online-Mind2Web** and **88.0% on Navi-Bench v2** at **$1.50 per 1M input tokens** — currently among the most accurate and cost-effective computer-use models.
+
+This template runs n1.5 in **computer-use-only mode**. n1.5 also supports a hybrid vision + DOM/JavaScript path (page-state extraction, custom JS, structured JSON output) for multi-field forms and bulk data extraction, but those tools are intentionally disabled here — see [Disabled tools](#disabled-tools).
 
 ## Setup
 

From f4f31a6a4ca599876b93cb1e809a17364608c18c Mon Sep 17 00:00:00 2001
From: dprevoznik <58714078+dprevoznik@users.noreply.github.com>
Date: Thu, 7 May 2026 17:00:28 +0000
Subject: [PATCH 4/7] trim old screenshots from request history

Mirror yutori-sdk-python's reference loop: deep-copy a request-only
view of the messages and strip old image_url blocks once the JSON
payload exceeds ~9.5 MB. The most recent 6 screenshots are stripped
only if the payload is still too large without the older ones, and
the very latest screenshot is always preserved. The caller's full
history is left intact for the return value.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 pkg/templates/python/yutori/loop.py     |  93 ++++++++++++++++++++-
 pkg/templates/typescript/yutori/loop.ts | 104 +++++++++++++++++++++++-
 2 files changed, 195 insertions(+), 2 deletions(-)

diff --git a/pkg/templates/python/yutori/loop.py b/pkg/templates/python/yutori/loop.py
index ad39024b..447ebff6 100644
--- a/pkg/templates/python/yutori/loop.py
+++ b/pkg/templates/python/yutori/loop.py
@@ -11,6 +11,7 @@
 @see https://docs.yutori.com/reference/n1-5
 """
 
+import copy
 import json
 from typing import Any, Optional
 
@@ -25,6 +26,13 @@
 DISABLED_TOOLS = ["extract_elements", "find", "set_element_value", "execute_js"]
 TOOL_SET = "browser_tools_core-20260403"
 
+# Screenshot-trimming defaults mirror Yutori's reference loop:
+# https://github.com/yutori-ai/yutori-sdk-python/blob/main/yutori/navigator/payload.py
+# Trimming is size-triggered: once the payload exceeds MAX_REQUEST_BYTES, the oldest
+# screenshots go first, the newest KEEP_RECENT_SCREENSHOTS last; the latest always stays.
+MAX_REQUEST_BYTES = 9_500_000
+KEEP_RECENT_SCREENSHOTS = 6
+
 
 async def sampling_loop(
     *,
@@ -69,10 +77,14 @@ async def sampling_loop(
         iteration += 1
         print(f"\n=== Iteration {iteration} ===")
 
+        request_messages, dropped = _trimmed_for_request(conversation_messages)
+        if dropped:
+            print(f"Trimmed {dropped} old screenshot(s) to fit request size limit")
+
         try:
             response = client.chat.completions.create(
                 model=model,
-                messages=conversation_messages,
+                messages=request_messages,
                 max_completion_tokens=max_completion_tokens,
                 temperature=0.3,
                 # n1.5-specific knobs go in extra_body.
@@ -168,6 +180,85 @@ async def sampling_loop(
     }
 
 
+def _trimmed_for_request(
+    messages: list[dict[str, Any]],
+) -> tuple[list[dict[str, Any]], int]:
+    """Return a deep-copied messages list with old screenshots stripped to fit MAX_REQUEST_BYTES.
+
+    The most recent KEEP_RECENT_SCREENSHOTS screenshots are protected. The full
+    `messages` list is preserved unchanged for the caller's return value.
+    """
+    trimmed = copy.deepcopy(messages)
+    size = _estimate_size(trimmed)
+    if size <= MAX_REQUEST_BYTES:
+        return trimmed, 0
+
+    image_indices = [i for i, m in enumerate(trimmed) if _message_has_image(m)]
+    if not image_indices:
+        return trimmed, 0
+
+    protected = set(image_indices[-max(1, KEEP_RECENT_SCREENSHOTS):])
+    removed = 0
+
+    for idx in image_indices:
+        if size <= MAX_REQUEST_BYTES:
+            break
+        if idx in protected:
+            continue
+        if _strip_one_image(trimmed[idx]):
+            removed += 1
+            size = _estimate_size(trimmed)
+
+    # If still over, strip from the protected window too — but always keep the latest.
+    if size > MAX_REQUEST_BYTES:
+        last_idx = image_indices[-1]
+        for idx in image_indices:
+            if size <= MAX_REQUEST_BYTES:
+                break
+            if idx == last_idx:
+                continue
+            if _strip_one_image(trimmed[idx]):
+                removed += 1
+                size = _estimate_size(trimmed)
+
+    return trimmed, removed
+
+
+def _estimate_size(messages: list[dict[str, Any]]) -> int:
+    return len(json.dumps(messages, separators=(",", ":"), ensure_ascii=False).encode("utf-8"))
+
+
+def _message_has_image(msg: dict[str, Any]) -> bool:
+    content = msg.get("content")
+    if not isinstance(content, list):
+        return False
+    return any(isinstance(p, dict) and p.get("type") == "image_url" for p in content)
+
+
+def _strip_one_image(msg: dict[str, Any]) -> bool:
+    content = msg.get("content")
+    if not isinstance(content, list):
+        return False
+
+    removed = False
+    new_content: list[dict[str, Any]] = []
+    for part in content:
+        if not removed and isinstance(part, dict) and part.get("type") == "image_url":
+            removed = True
+            continue
+        new_content.append(part)
+
+    if not removed:
+        return False
+
+    has_text = any(isinstance(p, dict) and p.get("type") == "text" for p in new_content)
+    if not has_text:
+        new_content.append({"type": "text", "text": "Screenshot omitted to stay under request size limit."})
+
+    msg["content"] = new_content
+    return True
+
+
 def _scale_coordinates(action: N15Action, viewport_width: int, viewport_height: int) -> N15Action:
     scaled = dict(action)
 
diff --git a/pkg/templates/typescript/yutori/loop.ts b/pkg/templates/typescript/yutori/loop.ts
index f113b61e..a0f3aef8 100644
--- a/pkg/templates/typescript/yutori/loop.ts
+++ b/pkg/templates/typescript/yutori/loop.ts
@@ -21,6 +21,13 @@ import { ComputerTool, type N15Action, type ToolResult } from './tools/computer'
 const DISABLED_TOOLS = ['extract_elements', 'find', 'set_element_value', 'execute_js'];
 const TOOL_SET = 'browser_tools_core-20260403';
 
+// Screenshot-trimming defaults mirror Yutori's reference loop:
+// https://github.com/yutori-ai/yutori-sdk-python/blob/main/yutori/navigator/payload.py
+// Trimming is size-triggered: once the payload exceeds MAX_REQUEST_BYTES, the oldest
+// screenshots go first, the newest KEEP_RECENT_SCREENSHOTS last; the latest always stays.
+const MAX_REQUEST_BYTES = 9_500_000;
+const KEEP_RECENT_SCREENSHOTS = 6;
+
 interface SamplingLoopOptions {
   model?: string;
   task: string;
@@ -84,11 +91,16 @@ export async function samplingLoop({
     iteration++;
     console.log(`\n=== Iteration ${iteration} ===`);
 
+    const { messages: requestMessages, removed } = trimmedForRequest(conversationMessages);
+    if (removed > 0) {
+      console.log(`Trimmed ${removed} old screenshot(s) to fit request size limit`);
+    }
+
     let response;
     try {
       response = await client.chat.completions.create({
         model,
-        messages: conversationMessages,
+        messages: requestMessages,
         max_completion_tokens: maxCompletionTokens,
         temperature: 0.3,
         // n1.5-specific knobs go in extra_body (not yet in OpenAI SDK types).
@@ -223,3 +235,93 @@ function scaleCoordinates(action: N15Action, viewportWidth: number, viewportHeig
 
   return scaled;
 }
+
+interface ImagePart {
+  type: 'image_url';
+  image_url: { url: string };
+}
+
+interface TextPart {
+  type: 'text';
+  text: string;
+}
+
+type ContentPart = ImagePart | TextPart | Record<string, unknown>;
+
+function estimateSize(messages: OpenAI.ChatCompletionMessageParam[]): number {
+  return Buffer.byteLength(JSON.stringify(messages), 'utf-8');
+}
+
+function messageHasImage(msg: OpenAI.ChatCompletionMessageParam): boolean {
+  const content = (msg as { content?: unknown }).content;
+  if (!Array.isArray(content)) return false;
+  return content.some((p) => typeof p === 'object' && p !== null && (p as { type?: unknown }).type === 'image_url');
+}
+
+function stripOneImage(msg: OpenAI.ChatCompletionMessageParam): boolean {
+  const content = (msg as { content?: unknown }).content;
+  if (!Array.isArray(content)) return false;
+
+  let removed = false;
+  const next: ContentPart[] = [];
+  for (const part of content as ContentPart[]) {
+    if (!removed && typeof part === 'object' && part !== null && (part as { type?: unknown }).type === 'image_url') {
+      removed = true;
+      continue;
+    }
+    next.push(part);
+  }
+  if (!removed) return false;
+
+  const hasText = next.some((p) => typeof p === 'object' && p !== null && (p as { type?: unknown }).type === 'text');
+  if (!hasText) {
+    next.push({ type: 'text', text: 'Screenshot omitted to stay under request size limit.' });
+  }
+
+  (msg as { content: unknown }).content = next;
+  return true;
+}
+
+function trimmedForRequest(
+  messages: OpenAI.ChatCompletionMessageParam[],
+): { messages: OpenAI.ChatCompletionMessageParam[]; removed: number } {
+  // Deep-copy so the caller's full history is preserved unchanged.
+  const trimmed = JSON.parse(JSON.stringify(messages)) as OpenAI.ChatCompletionMessageParam[];
+
+  let size = estimateSize(trimmed);
+  if (size <= MAX_REQUEST_BYTES) return { messages: trimmed, removed: 0 };
+
+  const imageIndices: number[] = [];
+  for (let i = 0; i < trimmed.length; i++) {
+    if (messageHasImage(trimmed[i])) imageIndices.push(i);
+  }
+  if (imageIndices.length === 0) return { messages: trimmed, removed: 0 };
+
+  const keep = Math.max(1, KEEP_RECENT_SCREENSHOTS);
+  const protectedIdx = new Set(imageIndices.slice(-keep));
+  let removed = 0;
+
+  for (const idx of imageIndices) {
+    if (size <= MAX_REQUEST_BYTES) break;
+    if (protectedIdx.has(idx)) continue;
+    if (stripOneImage(trimmed[idx])) {
+      removed++;
+      size = estimateSize(trimmed);
+    }
+  }
+
+  // If still over, strip from the protected window too — but always keep the latest.
+  if (size > MAX_REQUEST_BYTES) {
+    const lastIdx = imageIndices[imageIndices.length - 1];
+    for (const idx of imageIndices) {
+      if (size <= MAX_REQUEST_BYTES) break;
+      if (idx === lastIdx) continue;
+      if (stripOneImage(trimmed[idx])) {
+        removed++;
+        size = estimateSize(trimmed);
+      }
+    }
+  }
+
+  return { messages: trimmed, removed };
+}

From d716a9af5478fce23c3d24a103d041192e0407b8 Mon Sep 17 00:00:00 2001
From: dprevoznik <58714078+dprevoznik@users.noreply.github.com>
Date: Thu, 7 May 2026 17:01:04 +0000
Subject: [PATCH 5/7] shrink post-action screenshot delay to 150ms
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

300ms after every action was conservative — at 50 iterations that is
~15s of pure wall-clock waiting before model calls. 150ms still gives
the page enough time to settle for typical interactions while halving
the per-step overhead.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 pkg/templates/python/yutori/tools/computer.py     | 2 +-
 pkg/templates/typescript/yutori/tools/computer.ts | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pkg/templates/python/yutori/tools/computer.py b/pkg/templates/python/yutori/tools/computer.py
index 5f285b87..58d5a4a3 100644
--- a/pkg/templates/python/yutori/tools/computer.py
+++ b/pkg/templates/python/yutori/tools/computer.py
@@ -19,7 +19,7 @@
 from .base import ToolError, ToolResult
 
 TYPING_DELAY_MS = 12
-SCREENSHOT_DELAY_S = 0.3
+SCREENSHOT_DELAY_S = 0.15
 ACTION_DELAY_S = 0.3
 
 N15ActionType = Literal[
diff --git a/pkg/templates/typescript/yutori/tools/computer.ts b/pkg/templates/typescript/yutori/tools/computer.ts
index 9cc792c5..13e144c0 100644
--- a/pkg/templates/typescript/yutori/tools/computer.ts
+++ b/pkg/templates/typescript/yutori/tools/computer.ts
@@ -12,7 +12,7 @@ import type { Kernel } from '@onkernel/sdk';
 import sharp from 'sharp';
 
 const TYPING_DELAY_MS = 12;
-const SCREENSHOT_DELAY_MS = 300;
+const SCREENSHOT_DELAY_MS = 150;
 const ACTION_DELAY_MS = 300;
 
 export interface ToolResult {

From 23b63f0026c6bbd5ed80c0aeccfd1abfcb566c6e Mon Sep 17 00:00:00 2001
From: dprevoznik <58714078+dprevoznik@users.noreply.github.com>
Date: Thu, 7 May 2026 18:27:28 +0000
Subject: [PATCH 6/7] fix(yutori): send tool_set + disable_tools at top level
 in TS loop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The openai-node SDK does not have a Python-style `extra_body` kwarg —
it serializes the body as-is. Passing `extra_body: {...}` as a body
field made Yutori receive a literal `{"extra_body": ...}` and silently
drop the tool_set pin and disable_tools defense-in-depth.

Hoist both fields and apply them via a typed spread (mirrors the
anthropic-computer-use loop pattern).
---
 pkg/templates/typescript/yutori/loop.ts | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/pkg/templates/typescript/yutori/loop.ts b/pkg/templates/typescript/yutori/loop.ts
index a0f3aef8..7db2e4ca 100644
--- a/pkg/templates/typescript/yutori/loop.ts
+++ b/pkg/templates/typescript/yutori/loop.ts
@@ -28,6 +28,11 @@ const TOOL_SET = 'browser_tools_core-20260403';
 const MAX_REQUEST_BYTES = 9_500_000;
 const KEEP_RECENT_SCREENSHOTS = 6;
 
+interface YutoriExtras {
+  tool_set: string;
+  disable_tools: string[];
+}
+
 interface SamplingLoopOptions {
   model?: string;
   task: string;
@@ -98,19 +103,21 @@ export async function samplingLoop({
 
     let response;
     try {
+      // n1.5-specific knobs (not in OpenAI SDK types). The openai-node SDK
+      // serializes the body as-is, so these go at the top level via a spread —
+      // unlike the Python SDK, there is no `extra_body` kwarg here.
+      // tool_set selects the core (coordinate-based) tools.
+      // disable_tools is a defense-in-depth exclusion of DOM/Playwright tools.
+      const yutoriExtras: YutoriExtras = {
+        tool_set: TOOL_SET,
+        disable_tools: DISABLED_TOOLS,
+      };
       response = await client.chat.completions.create({
         model,
         messages: requestMessages,
         max_completion_tokens: maxCompletionTokens,
         temperature: 0.3,
-        // n1.5-specific knobs go in extra_body (not yet in OpenAI SDK types).
-        // tool_set selects the core (coordinate-based) tools.
-        // disable_tools is a defense-in-depth exclusion of DOM/Playwright tools.
-        // @ts-expect-error extra_body is a Yutori extension
-        extra_body: {
-          tool_set: TOOL_SET,
-          disable_tools: DISABLED_TOOLS,
-        },
+        ...yutoriExtras,
       });
     } catch (apiError) {
       console.error('API call failed:', apiError);

From d2affba3c718815f5cdfd6a52db24fbfa752d8b8 Mon Sep 17 00:00:00 2001
From: dprevoznik <58714078+dprevoznik@users.noreply.github.com>
Date: Thu, 7 May 2026 18:27:32 +0000
Subject: [PATCH 7/7] fix(yutori): map standalone modifier names through
 MODIFIER_MAP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

key_press / hold_key compound keys already consulted MODIFIER_MAP, but
the new `modifier` parameter on click and scroll actions passes a bare
modifier name ("control", "meta", "command") that bypassed the lookup
and went out as-is. Kernel's hold_keys wants "ctrl" and "super" — so
ctrl-click and cmd-click silently dropped the modifier.

Unify the per-part mapping into a single helper applied to both the
compound and single-key paths in TS and Python.
---
 pkg/templates/python/yutori/tools/computer.py | 23 +++++++----------
 .../typescript/yutori/tools/computer.ts       | 25 ++++++++-----------
 2 files changed, 20 insertions(+), 28 deletions(-)

diff --git a/pkg/templates/python/yutori/tools/computer.py b/pkg/templates/python/yutori/tools/computer.py
index 58d5a4a3..afe41696 100644
--- a/pkg/templates/python/yutori/tools/computer.py
+++ b/pkg/templates/python/yutori/tools/computer.py
@@ -374,18 +374,13 @@ def _get_coordinates(
         return {"x": int(x), "y": int(y)}
 
     def _map_key(self, key: str) -> str:
+        def map_part(part: str) -> str:
+            trimmed = part.strip()
+            lower = trimmed.lower()
+            if lower in MODIFIER_MAP:
+                return MODIFIER_MAP[lower]
+            return KEY_MAP.get(trimmed, trimmed)
+
         if "+" in key:
-            parts = key.split("+")
-            mapped_parts = []
-            for part in parts:
-                trimmed = part.strip()
-                lower = trimmed.lower()
-                
-                if lower in MODIFIER_MAP:
-                    mapped_parts.append(MODIFIER_MAP[lower])
-                else:
-                    mapped_parts.append(KEY_MAP.get(trimmed, trimmed))
-            
-            return "+".join(mapped_parts)
-
-        return KEY_MAP.get(key, key)
+            return "+".join(map_part(p) for p in key.split("+"))
+        return map_part(key)
diff --git a/pkg/templates/typescript/yutori/tools/computer.ts b/pkg/templates/typescript/yutori/tools/computer.ts
index 13e144c0..d4a33573 100644
--- a/pkg/templates/typescript/yutori/tools/computer.ts
+++ b/pkg/templates/typescript/yutori/tools/computer.ts
@@ -414,22 +414,19 @@ export class ComputerTool {
   }
 
   private mapKey(key: string): string {
+    const mapPart = (part: string): string => {
+      const trimmed = part.trim();
+      const lower = trimmed.toLowerCase();
+      if (MODIFIER_MAP[lower]) {
+        return MODIFIER_MAP[lower];
+      }
+      return KEY_MAP[trimmed] || trimmed;
+    };
+
     if (key.includes('+')) {
-      const parts = key.split('+');
-      const mappedParts = parts.map(part => {
-        const trimmed = part.trim();
-        const lower = trimmed.toLowerCase();
-        
-        if (MODIFIER_MAP[lower]) {
-          return MODIFIER_MAP[lower];
-        }
-        
-        return KEY_MAP[trimmed] || trimmed;
-      });
-      return mappedParts.join('+');
+      return key.split('+').map(mapPart).join('+');
     }
-
-    return KEY_MAP[key] || key;
+    return mapPart(key);
   }
 
   private sleep(ms: number): Promise<void> {