kernel · dprevoznik · May 6, 2026 · May 7, 2026 · May 7, 2026 · May 7, 2026
diff --git a/pkg/create/templates.go b/pkg/create/templates.go
@@ -87,8 +87,8 @@ var Templates = map[string]TemplateInfo{
 		Languages:   []string{LanguageTypeScript, LanguagePython},
 	},
 	TemplateYutoriComputerUse: {
-		Name:        "Yutori n1 Computer Use",
-		Description: "Implements a Yutori n1 computer use agent",
+		Name:        "Yutori n1.5 Computer Use",
+		Description: "Implements a Yutori n1.5 computer use agent",
 		Languages:   []string{LanguageTypeScript, LanguagePython},
 	},
 	TemplateTzafonComputerUse: {

diff --git a/pkg/templates/python/yutori/README.md b/pkg/templates/python/yutori/README.md
@@ -1,8 +1,10 @@
-# Kernel Python Sample App - Yutori n1 Computer Use
+# Kernel Python Sample App - Yutori n1.5 Computer Use
 
-This is a Kernel application that implements a prompt loop using Yutori's n1 computer use model with Kernel's Computer Controls API.
+This Kernel app implements a prompt loop using Yutori's Navigator n1.5 with Kernel's Computer Controls API.
 
-[n1](https://yutori.com/blog/introducing-navigator) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots.
+[Navigator n1.5](https://yutori.com/blog/introducing-n1-5) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots. Yutori reports **94.5% on Online-Mind2Web** and **88.0% on Navi-Bench v2** at **$1.50 per 1M input tokens** — currently among the most accurate and cost-effective computer-use models.
+
+This template runs n1.5 in **computer-use-only mode**. n1.5 also supports a hybrid vision + DOM/JavaScript path (page-state extraction, custom JS, structured JSON output) for multi-field forms and bulk data extraction, but those tools are intentionally disabled here — see [Disabled tools](#disabled-tools).
 
 ## Setup
 
@@ -55,35 +57,46 @@ kernel invoke python-yutori-cua cua-task --payload '{"query": "Enter https://exa
 
 ## Viewport Configuration
 
-Yutori n1 recommends a **1280×800 (WXGA, 16:10)** viewport for best grounding accuracy.
+Yutori recommends a **1280×800 (WXGA, 16:10)** viewport for best grounding accuracy with n1.5.
 
-> **Note:** n1 outputs coordinates in a 1000×1000 relative space, which are automatically scaled to the actual viewport dimensions.
+> **Note:** n1.5 outputs coordinates in a 1000×1000 relative space, which are automatically scaled to the actual viewport dimensions.
 
 See [Kernel Viewport Documentation](https://www.kernel.sh/docs/browsers/viewport) for all supported configurations.
 
 ## Screenshots
 
 Screenshots are automatically converted to WebP format for better compression across multi-step trajectories, as recommended by Yutori.
 
-## n1-latest Supported Actions
+## n1.5-latest Supported Actions
+
+This template uses the `browser_tools_core-20260403` tool set — coordinate-based browser actions that operate on screenshots only.
 
 | Action | Description |
 |--------|-------------|
-| `left_click` | Left mouse click at coordinates |
-| `double_click` | Double-click at coordinates |
-| `triple_click` | Triple-click at coordinates |
+| `left_click` | Left mouse click at coordinates (supports `modifier`) |
+| `double_click` | Double-click at coordinates (supports `modifier`) |
+| `triple_click` | Triple-click at coordinates (supports `modifier`) |
+| `middle_click` | Middle mouse click at coordinates |
 | `right_click` | Right mouse click at coordinates |
+| `mouse_move` | Move mouse to coordinates without clicking |
+| `mouse_down` | Press the left mouse button at coordinates |
+| `mouse_up` | Release the left mouse button at coordinates |
 | `scroll` | Scroll page in a direction |
 | `type` | Type text into focused element |
-| `key_press` | Send keyboard input |
-| `hover` | Move mouse without clicking |
+| `key_press` | Send a single key or key combination |
+| `hold_key` | Hold a key for a duration |
 | `drag` | Click-and-drag operation |
 | `wait` | Pause for UI to update |
 | `refresh` | Reload current page |
 | `go_back` | Navigate back in history |
+| `go_forward` | Navigate forward in history |
 | `goto_url` | Navigate to a URL |
 
+### Disabled tools
+
+The DOM/Playwright-based "expanded" tools (`extract_elements`, `find`, `set_element_value`, `execute_js`) are intentionally disabled via the `disable_tools` request parameter — this template runs computer-use only and does not expose a Playwright page to the model.
+
 ## Resources
 
-- [Yutori n1 API Documentation](https://docs.yutori.com/reference/n1)
+- [Yutori n1.5 API Documentation](https://docs.yutori.com/reference/n1-5)
 - [Kernel Documentation](https://www.kernel.sh/docs/quickstart)
diff --git a/pkg/templates/python/yutori/loop.py b/pkg/templates/python/yutori/loop.py
@@ -1,28 +1,42 @@
 """
-Yutori n1 Sampling Loop
+Yutori n1.5 Sampling Loop
 
-Implements the agent loop for Yutori's n1-latest computer use model.
-n1-latest uses an OpenAI-compatible API with tool_calls:
+Implements the agent loop for Yutori's n1.5-latest computer use model.
+n1.5-latest uses an OpenAI-compatible API with tool_calls:
 - Actions are returned via tool_calls in the assistant message
 - Tool results use role: "tool" with matching tool_call_id
 - The model stops by returning content without tool_calls
 - Coordinates are returned in 1000x1000 space and need scaling
 
-@see https://docs.yutori.com/reference/n1
+@see https://docs.yutori.com/reference/n1-5
 """
 
+import copy
 import json
 from typing import Any, Optional
 
 from kernel import Kernel
 from openai import OpenAI
 
-from tools import ComputerTool, N1Action, ToolResult
+from tools import ComputerTool, N15Action, ToolResult
+
+# Tools that require a Playwright page / DOM access. The default core tool set
+# already excludes them, but we also list them in `disable_tools` so the
+# exclusion is explicit and survives if the default ever changes.
+DISABLED_TOOLS = ["extract_elements", "find", "set_element_value", "execute_js"]
+# Tool set identifier sent as `tool_set` in the request's extra_body.
+TOOL_SET = "browser_tools_core-20260403"
+
+# Screenshot-trimming defaults mirror Yutori's reference loop:
+# https://github.com/yutori-ai/yutori-sdk-python/blob/main/yutori/navigator/payload.py
+# Trimming is size-triggered — old screenshots are dropped only when the
+# serialized messages exceed MAX_REQUEST_BYTES. The KEEP_RECENT_SCREENSHOTS most
+# recent screenshot messages are stripped last, and only if the payload is
+# still too large; the latest screenshot message is never stripped.
+MAX_REQUEST_BYTES = 9_500_000
+KEEP_RECENT_SCREENSHOTS = 6
 
 
 async def sampling_loop(
     *,
-    model: str = "n1-latest",
+    model: str = "n1.5-latest",
     task: str,
     api_key: str,
     kernel: Kernel,
@@ -63,12 +77,23 @@ async def sampling_loop(
         iteration += 1
         print(f"\n=== Iteration {iteration} ===")
 
+        request_messages, dropped = _trimmed_for_request(conversation_messages)
+        if dropped:
+            print(f"Trimmed {dropped} old screenshot(s) to fit request size limit")
+
         try:
             response = client.chat.completions.create(
                 model=model,
-                messages=conversation_messages,
+                messages=request_messages,
                 max_completion_tokens=max_completion_tokens,
                 temperature=0.3,
+                # n1.5-specific knobs go in extra_body.
+                # tool_set selects the core (coordinate-based) tools.
+                # disable_tools is a defense-in-depth exclusion of DOM/Playwright tools.
+                extra_body={
+                    "tool_set": TOOL_SET,
+                    "disable_tools": DISABLED_TOOLS,
+                },
             )
         except Exception as api_error:
             print(f"API call failed: {api_error}")
@@ -108,7 +133,7 @@ async def sampling_loop(
                 })
                 continue
 
-            action: N1Action = {"action_type": action_name, **args}
+            action: N15Action = {"action_type": action_name, **args}
             print(f"Executing action: {action_name}", args)
 
             scaled_action = _scale_coordinates(action, viewport_width, viewport_height)
@@ -155,7 +180,86 @@ async def sampling_loop(
     }
 
 
-def _scale_coordinates(action: N1Action, viewport_width: int, viewport_height: int) -> N1Action:
+def _trimmed_for_request(
+    messages: list[dict[str, Any]],
+) -> tuple[list[dict[str, Any]], int]:
+    """Return a deep copy of `messages` with old screenshots stripped to fit MAX_REQUEST_BYTES.
+
+    The input list is never mutated; all stripping happens on the copy.
+
+    Trimming runs in up to two passes, each walking image-bearing messages
+    oldest first and stopping as soon as the estimated size fits:
+
+    1. Strip one image from each message outside the protected window (the
+       last ``max(1, KEEP_RECENT_SCREENSHOTS)`` image-bearing messages).
+    2. If the payload is still too large, strip one image from every
+       image-bearing message except the most recent one, protected window
+       included.
+
+    The result can still exceed MAX_REQUEST_BYTES when nothing further can be
+    stripped; no error is raised in that case.
+
+    Args:
+        messages: Chat messages. Images are detected as ``image_url`` parts
+            inside a list-valued ``content``.
+
+    Returns:
+        A ``(trimmed_messages, removed_count)`` tuple, where ``removed_count``
+        is the number of images stripped from the copy (0 when the payload
+        already fits or contains no images).
+    """
+    # Work on a copy so the caller's conversation history stays intact.
+    trimmed = copy.deepcopy(messages)
+    size = _estimate_size(trimmed)
+    if size <= MAX_REQUEST_BYTES:
+        return trimmed, 0
+
+    image_indices = [i for i, m in enumerate(trimmed) if _message_has_image(m)]
+    if not image_indices:
+        return trimmed, 0
+
+    # max(1, ...) guarantees the newest image-bearing message is always protected
+    # in the first pass, even if KEEP_RECENT_SCREENSHOTS is set to 0 or less.
+    protected = set(image_indices[-max(1, KEEP_RECENT_SCREENSHOTS):])
+    removed = 0
+
+    # Pass 1: oldest first, skipping the protected window.
+    for idx in image_indices:
+        if size <= MAX_REQUEST_BYTES:
+            break
+        if idx in protected:
+            continue
+        if _strip_one_image(trimmed[idx]):
+            removed += 1
+            # Re-measure the whole payload after every successful strip.
+            size = _estimate_size(trimmed)
+
+    # If still over, strip from the protected window too — but always keep the latest.
+    if size > MAX_REQUEST_BYTES:
+        last_idx = image_indices[-1]
+        for idx in image_indices:
+            if size <= MAX_REQUEST_BYTES:
+                break
+            if idx == last_idx:
+                continue
+            # Messages with no image left return False here and are skipped.
+            if _strip_one_image(trimmed[idx]):
+                removed += 1
+                size = _estimate_size(trimmed)
+
+    return trimmed, removed
+
+
+def _estimate_size(messages: list[dict[str, Any]]) -> int:
+    """Return the UTF-8 byte length of `messages` serialized as compact JSON.
+
+    Serialization uses no whitespace after separators and leaves non-ASCII
+    characters unescaped. Only the messages list is measured, so this is an
+    estimate of the request size rather than the exact body length.
+    """
+    return len(json.dumps(messages, separators=(",", ":"), ensure_ascii=False).encode("utf-8"))
+
+
+def _message_has_image(msg: dict[str, Any]) -> bool:
+    """Return True when `msg` has a list-valued ``content`` containing an ``image_url`` part.
+
+    Messages whose ``content`` is missing or is not a list (for example a
+    plain string) are reported as having no image.
+    """
+    content = msg.get("content")
+    if not isinstance(content, list):
+        return False
+    return any(isinstance(p, dict) and p.get("type") == "image_url" for p in content)
+
+
+def _strip_one_image(msg: dict[str, Any]) -> bool:
+    """Remove the first ``image_url`` part from `msg["content"]`, mutating `msg`.
+
+    Only one image is removed per call; any further images in the same message
+    are left in place. If the remaining content has no ``text`` part, a
+    placeholder text part is appended.
+
+    Returns:
+        True if an image was removed; False if ``content`` is not a list or
+        holds no ``image_url`` part (in which case `msg` is left untouched).
+    """
+    content = msg.get("content")
+    if not isinstance(content, list):
+        return False
+
+    removed = False
+    new_content: list[dict[str, Any]] = []
+    for part in content:
+        # Drop only the first image part; everything else is carried over in order.
+        if not removed and isinstance(part, dict) and part.get("type") == "image_url":
+            removed = True
+            continue
+        new_content.append(part)
+
+    if not removed:
+        return False
+
+    # Ensure the message still carries a text part once its image is gone.
+    has_text = any(isinstance(p, dict) and p.get("type") == "text" for p in new_content)
+    if not has_text:
+        new_content.append({"type": "text", "text": "Screenshot omitted to stay under request size limit."})
+
+    msg["content"] = new_content
+    return True
+
+
+def _scale_coordinates(action: N15Action, viewport_width: int, viewport_height: int) -> N15Action:
     scaled = dict(action)
 
     if "coordinates" in scaled and scaled["coordinates"]:

diff --git a/pkg/templates/python/yutori/main.py b/pkg/templates/python/yutori/main.py
@@ -30,7 +30,7 @@ async def cua_task(
     payload: QueryInput,
 ) -> QueryOutput:
     """
-    Process a user query using Yutori n1 Computer Use with Kernel's browser automation.
+    Process a user query using Yutori n1.5 Computer Use with Kernel's browser automation.
 
     Args:
         ctx: Kernel context containing invocation information
@@ -58,7 +58,7 @@ async def cua_task(
         print("Kernel browser live view url:", session.live_view_url)
 
         loop_result = await sampling_loop(
-            model="n1-latest",
+            model="n1.5-latest",
             task=payload["query"],
             api_key=str(api_key),
             kernel=session.kernel,

diff --git a/pkg/templates/python/yutori/tools/__init__.py b/pkg/templates/python/yutori/tools/__init__.py
@@ -1,11 +1,11 @@
-"""Yutori n1 Computer Tools."""
+"""Yutori n1.5 Computer Tools."""
 
 from .base import ToolError, ToolResult
-from .computer import ComputerTool, N1Action
+from .computer import ComputerTool, N15Action
 
 __all__ = [
     "ToolError",
     "ToolResult",
     "ComputerTool",
-    "N1Action",
+    "N15Action",
 ]