From 97400cd781e0ca90fe1b48237f7f3469d6ce8bbe Mon Sep 17 00:00:00 2001 From: dprevoznik <58714078+dprevoznik@users.noreply.github.com> Date: Wed, 6 May 2026 22:47:45 +0000 Subject: [PATCH 1/7] update yutori template to n1.5 - bump model id from `n1-latest` to `n1.5-latest` - send `extra_body.tool_set: browser_tools_core-20260403` to use the coordinate-based tool set - send `extra_body.disable_tools` to explicitly exclude the DOM/Playwright tools (`extract_elements`, `find`, `set_element_value`, `execute_js`) since this template runs computer-use only - rename `hover` to `mouse_move` - rename `key_press` parameter `key_comb` to `key` - drop `press_enter_after` and `clear_before_typing` from `type` (n1.5 emits separate `key_press` actions instead) - add new actions: `middle_click`, `mouse_down`, `mouse_up`, `hold_key`, `go_forward` - support optional `modifier` parameter on click actions via Kernel's `hold_keys` Co-Authored-By: Claude Opus 4.7 --- pkg/create/templates.go | 4 +- pkg/templates/python/yutori/README.md | 35 +++-- pkg/templates/python/yutori/loop.py | 29 +++- pkg/templates/python/yutori/main.py | 4 +- pkg/templates/python/yutori/tools/__init__.py | 6 +- pkg/templates/python/yutori/tools/computer.py | 143 +++++++++++------- pkg/templates/typescript/yutori/README.md | 35 +++-- pkg/templates/typescript/yutori/index.ts | 2 +- pkg/templates/typescript/yutori/loop.ts | 36 +++-- .../typescript/yutori/tools/computer.ts | 143 ++++++++++++------ 10 files changed, 283 insertions(+), 154 deletions(-) diff --git a/pkg/create/templates.go b/pkg/create/templates.go index f8541a11..699b97f9 100644 --- a/pkg/create/templates.go +++ b/pkg/create/templates.go @@ -87,8 +87,8 @@ var Templates = map[string]TemplateInfo{ Languages: []string{LanguageTypeScript, LanguagePython}, }, TemplateYutoriComputerUse: { - Name: "Yutori n1 Computer Use", - Description: "Implements a Yutori n1 computer use agent", + Name: "Yutori n1.5 Computer Use", + Description: "Implements a Yutori n1.5 
computer use agent", Languages: []string{LanguageTypeScript, LanguagePython}, }, TemplateTzafonComputerUse: { diff --git a/pkg/templates/python/yutori/README.md b/pkg/templates/python/yutori/README.md index 7523aff2..8f7df74a 100644 --- a/pkg/templates/python/yutori/README.md +++ b/pkg/templates/python/yutori/README.md @@ -1,8 +1,8 @@ -# Kernel Python Sample App - Yutori n1 Computer Use +# Kernel Python Sample App - Yutori n1.5 Computer Use -This is a Kernel application that implements a prompt loop using Yutori's n1 computer use model with Kernel's Computer Controls API. +This is a Kernel application that implements a prompt loop using Yutori's n1.5 computer use model with Kernel's Computer Controls API. -[n1](https://yutori.com/blog/introducing-navigator) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots. +[Navigator n1.5](https://docs.yutori.com/reference/n1-5) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots. ## Setup @@ -55,9 +55,9 @@ kernel invoke python-yutori-cua cua-task --payload '{"query": "Enter https://exa ## Viewport Configuration -Yutori n1 recommends a **1280×800 (WXGA, 16:10)** viewport for best grounding accuracy. +Yutori n1.5 recommends a **1280×800 (WXGA, 16:10)** viewport for best grounding accuracy. -> **Note:** n1 outputs coordinates in a 1000×1000 relative space, which are automatically scaled to the actual viewport dimensions. +> **Note:** n1.5 outputs coordinates in a 1000×1000 relative space, which are automatically scaled to the actual viewport dimensions. See [Kernel Viewport Documentation](https://www.kernel.sh/docs/browsers/viewport) for all supported configurations. @@ -65,25 +65,36 @@ See [Kernel Viewport Documentation](https://www.kernel.sh/docs/browsers/viewport Screenshots are automatically converted to WebP format for better compression across multi-step trajectories, as recommended by Yutori. 
-## n1-latest Supported Actions +## n1.5-latest Supported Actions + +This template uses the `browser_tools_core-20260403` tool set — coordinate-based browser actions that operate on screenshots only. | Action | Description | |--------|-------------| -| `left_click` | Left mouse click at coordinates | -| `double_click` | Double-click at coordinates | -| `triple_click` | Triple-click at coordinates | +| `left_click` | Left mouse click at coordinates (supports `modifier`) | +| `double_click` | Double-click at coordinates (supports `modifier`) | +| `triple_click` | Triple-click at coordinates (supports `modifier`) | +| `middle_click` | Middle mouse click at coordinates | | `right_click` | Right mouse click at coordinates | +| `mouse_move` | Move mouse to coordinates without clicking | +| `mouse_down` | Press the left mouse button at coordinates | +| `mouse_up` | Release the left mouse button at coordinates | | `scroll` | Scroll page in a direction | | `type` | Type text into focused element | -| `key_press` | Send keyboard input | -| `hover` | Move mouse without clicking | +| `key_press` | Send a single key or key combination | +| `hold_key` | Hold a key for a duration | | `drag` | Click-and-drag operation | | `wait` | Pause for UI to update | | `refresh` | Reload current page | | `go_back` | Navigate back in history | +| `go_forward` | Navigate forward in history | | `goto_url` | Navigate to a URL | +### Disabled tools + +The DOM/Playwright-based "expanded" tools (`extract_elements`, `find`, `set_element_value`, `execute_js`) are intentionally disabled via the `disable_tools` request parameter — this template runs computer-use only and does not expose a Playwright page to the model. 
+ ## Resources -- [Yutori n1 API Documentation](https://docs.yutori.com/reference/n1) +- [Yutori n1.5 API Documentation](https://docs.yutori.com/reference/n1-5) - [Kernel Documentation](https://www.kernel.sh/docs/quickstart) diff --git a/pkg/templates/python/yutori/loop.py b/pkg/templates/python/yutori/loop.py index 066aafb5..ad39024b 100644 --- a/pkg/templates/python/yutori/loop.py +++ b/pkg/templates/python/yutori/loop.py @@ -1,14 +1,14 @@ """ -Yutori n1 Sampling Loop +Yutori n1.5 Sampling Loop -Implements the agent loop for Yutori's n1-latest computer use model. -n1-latest uses an OpenAI-compatible API with tool_calls: +Implements the agent loop for Yutori's n1.5-latest computer use model. +n1.5-latest uses an OpenAI-compatible API with tool_calls: - Actions are returned via tool_calls in the assistant message - Tool results use role: "tool" with matching tool_call_id - The model stops by returning content without tool_calls - Coordinates are returned in 1000x1000 space and need scaling -@see https://docs.yutori.com/reference/n1 +@see https://docs.yutori.com/reference/n1-5 """ import json @@ -17,12 +17,18 @@ from kernel import Kernel from openai import OpenAI -from tools import ComputerTool, N1Action, ToolResult +from tools import ComputerTool, N15Action, ToolResult + +# Tools that require a Playwright page / DOM access. The default core tool set +# already excludes them, but we also list them in `disable_tools` so the +# exclusion is explicit and survives if the default ever changes. +DISABLED_TOOLS = ["extract_elements", "find", "set_element_value", "execute_js"] +TOOL_SET = "browser_tools_core-20260403" async def sampling_loop( *, - model: str = "n1-latest", + model: str = "n1.5-latest", task: str, api_key: str, kernel: Kernel, @@ -69,6 +75,13 @@ async def sampling_loop( messages=conversation_messages, max_completion_tokens=max_completion_tokens, temperature=0.3, + # n1.5-specific knobs go in extra_body. + # tool_set selects the core (coordinate-based) tools. 
+ # disable_tools is a defense-in-depth exclusion of DOM/Playwright tools. + extra_body={ + "tool_set": TOOL_SET, + "disable_tools": DISABLED_TOOLS, + }, ) except Exception as api_error: print(f"API call failed: {api_error}") @@ -108,7 +121,7 @@ async def sampling_loop( }) continue - action: N1Action = {"action_type": action_name, **args} + action: N15Action = {"action_type": action_name, **args} print(f"Executing action: {action_name}", args) scaled_action = _scale_coordinates(action, viewport_width, viewport_height) @@ -155,7 +168,7 @@ async def sampling_loop( } -def _scale_coordinates(action: N1Action, viewport_width: int, viewport_height: int) -> N1Action: +def _scale_coordinates(action: N15Action, viewport_width: int, viewport_height: int) -> N15Action: scaled = dict(action) if "coordinates" in scaled and scaled["coordinates"]: diff --git a/pkg/templates/python/yutori/main.py b/pkg/templates/python/yutori/main.py index 4679df8c..21543d9b 100644 --- a/pkg/templates/python/yutori/main.py +++ b/pkg/templates/python/yutori/main.py @@ -30,7 +30,7 @@ async def cua_task( payload: QueryInput, ) -> QueryOutput: """ - Process a user query using Yutori n1 Computer Use with Kernel's browser automation. + Process a user query using Yutori n1.5 Computer Use with Kernel's browser automation. 
Args: ctx: Kernel context containing invocation information @@ -58,7 +58,7 @@ async def cua_task( print("Kernel browser live view url:", session.live_view_url) loop_result = await sampling_loop( - model="n1-latest", + model="n1.5-latest", task=payload["query"], api_key=str(api_key), kernel=session.kernel, diff --git a/pkg/templates/python/yutori/tools/__init__.py b/pkg/templates/python/yutori/tools/__init__.py index 63da5188..5a1a4285 100644 --- a/pkg/templates/python/yutori/tools/__init__.py +++ b/pkg/templates/python/yutori/tools/__init__.py @@ -1,11 +1,11 @@ -"""Yutori n1 Computer Tools.""" +"""Yutori n1.5 Computer Tools.""" from .base import ToolError, ToolResult -from .computer import ComputerTool, N1Action +from .computer import ComputerTool, N15Action __all__ = [ "ToolError", "ToolResult", "ComputerTool", - "N1Action", + "N15Action", ] diff --git a/pkg/templates/python/yutori/tools/computer.py b/pkg/templates/python/yutori/tools/computer.py index d078aded..18b3504c 100644 --- a/pkg/templates/python/yutori/tools/computer.py +++ b/pkg/templates/python/yutori/tools/computer.py @@ -1,15 +1,17 @@ """ -Yutori n1 Computer Tool +Yutori n1.5 Computer Tool -Maps n1-latest action format to Kernel's Computer Controls API. +Maps n1.5-latest action format to Kernel's Computer Controls API. Screenshots are converted to WebP for better compression across multi-step trajectories. 
+ +@see https://docs.yutori.com/reference/n1-5 """ import asyncio import base64 import json from io import BytesIO -from typing import Literal, TypedDict +from typing import Any, Literal, TypedDict from kernel import Kernel from PIL import Image @@ -20,33 +22,38 @@ SCREENSHOT_DELAY_S = 0.3 ACTION_DELAY_S = 0.3 -N1ActionType = Literal[ +N15ActionType = Literal[ "left_click", "double_click", "triple_click", + "middle_click", "right_click", + "mouse_move", + "mouse_down", + "mouse_up", "scroll", "type", "key_press", - "hover", + "hold_key", "drag", "wait", "refresh", "go_back", + "go_forward", "goto_url", ] -class N1Action(TypedDict, total=False): - action_type: N1ActionType +class N15Action(TypedDict, total=False): + action_type: N15ActionType coordinates: tuple[int, int] | list[int] start_coordinates: tuple[int, int] | list[int] direction: Literal["up", "down", "left", "right"] amount: int text: str - press_enter_after: bool - clear_before_typing: bool - key_comb: str + key: str + modifier: str + duration: int url: str @@ -97,22 +104,27 @@ def __init__(self, kernel: Kernel, session_id: str, width: int = 1280, height: i self.height = height self.kiosk_mode = kiosk_mode - async def execute(self, action: N1Action) -> ToolResult: + async def execute(self, action: N15Action) -> ToolResult: action_type = action.get("action_type") handlers = { "left_click": lambda a: self._handle_click(a, "left", 1), "double_click": lambda a: self._handle_click(a, "left", 2), "triple_click": lambda a: self._handle_click(a, "left", 3), + "middle_click": lambda a: self._handle_click(a, "middle", 1), "right_click": lambda a: self._handle_click(a, "right", 1), + "mouse_move": self._handle_mouse_move, + "mouse_down": lambda a: self._handle_mouse_button(a, "down"), + "mouse_up": lambda a: self._handle_mouse_button(a, "up"), "scroll": self._handle_scroll, "type": self._handle_type, "key_press": self._handle_key_press, - "hover": self._handle_hover, + "hold_key": self._handle_hold_key, "drag": 
self._handle_drag, "wait": self._handle_wait, "refresh": self._handle_refresh, "go_back": self._handle_go_back, + "go_forward": self._handle_go_forward, "goto_url": self._handle_goto_url, } @@ -122,22 +134,51 @@ async def execute(self, action: N1Action) -> ToolResult: return await handler(action) - async def _handle_click(self, action: N1Action, button: str, num_clicks: int) -> ToolResult: + async def _handle_click(self, action: N15Action, button: str, num_clicks: int) -> ToolResult: + coords = self._get_coordinates(action.get("coordinates")) + modifier = action.get("modifier") + kwargs: dict[str, Any] = { + "x": coords["x"], + "y": coords["y"], + "button": button, + "click_type": "click", + "num_clicks": num_clicks, + } + if modifier: + kwargs["hold_keys"] = [self._map_key(modifier)] + + self.kernel.browsers.computer.click_mouse(self.session_id, **kwargs) + + await asyncio.sleep(SCREENSHOT_DELAY_S) + return await self.screenshot() + + async def _handle_mouse_move(self, action: N15Action) -> ToolResult: + coords = self._get_coordinates(action.get("coordinates")) + + self.kernel.browsers.computer.move_mouse( + self.session_id, + x=coords["x"], + y=coords["y"], + ) + + await asyncio.sleep(SCREENSHOT_DELAY_S) + return await self.screenshot() + + async def _handle_mouse_button(self, action: N15Action, click_type: str) -> ToolResult: coords = self._get_coordinates(action.get("coordinates")) self.kernel.browsers.computer.click_mouse( self.session_id, x=coords["x"], y=coords["y"], - button=button, - click_type="click", - num_clicks=num_clicks, + button="left", + click_type=click_type, ) await asyncio.sleep(SCREENSHOT_DELAY_S) return await self.screenshot() - async def _handle_scroll(self, action: N1Action) -> ToolResult: + async def _handle_scroll(self, action: N15Action) -> ToolResult: coords = self._get_coordinates(action.get("coordinates")) direction = action.get("direction") notches = max(action.get("amount", 3), 1) @@ -171,45 +212,26 @@ async def _handle_scroll(self, 
action: N1Action) -> ToolResult: screenshot_result["output"] = f"Scrolled {notches} wheel unit(s) {direction}." return screenshot_result - async def _handle_type(self, action: N1Action) -> ToolResult: + async def _handle_type(self, action: N15Action) -> ToolResult: text = action.get("text") if not text: raise ToolError("text is required for type action") - if action.get("clear_before_typing"): - self.kernel.browsers.computer.press_key( - self.session_id, - keys=["ctrl+a"], - ) - await asyncio.sleep(0.1) - self.kernel.browsers.computer.press_key( - self.session_id, - keys=["BackSpace"], - ) - await asyncio.sleep(0.1) - self.kernel.browsers.computer.type_text( self.session_id, text=text, delay=TYPING_DELAY_MS, ) - if action.get("press_enter_after"): - await asyncio.sleep(0.1) - self.kernel.browsers.computer.press_key( - self.session_id, - keys=["Return"], - ) - await asyncio.sleep(SCREENSHOT_DELAY_S) return await self.screenshot() - async def _handle_key_press(self, action: N1Action) -> ToolResult: - key_comb = action.get("key_comb") - if not key_comb: - raise ToolError("key_comb is required for key_press action") + async def _handle_key_press(self, action: N15Action) -> ToolResult: + key = action.get("key") + if not key: + raise ToolError("key is required for key_press action") - mapped_key = self._map_key(key_comb) + mapped_key = self._map_key(key) self.kernel.browsers.computer.press_key( self.session_id, @@ -219,19 +241,24 @@ async def _handle_key_press(self, action: N1Action) -> ToolResult: await asyncio.sleep(SCREENSHOT_DELAY_S) return await self.screenshot() - async def _handle_hover(self, action: N1Action) -> ToolResult: - coords = self._get_coordinates(action.get("coordinates")) + async def _handle_hold_key(self, action: N15Action) -> ToolResult: + key = action.get("key") + if not key: + raise ToolError("key is required for hold_key action") - self.kernel.browsers.computer.move_mouse( + mapped_key = self._map_key(key) + duration = action.get("duration") or 
1000 + + self.kernel.browsers.computer.press_key( self.session_id, - x=coords["x"], - y=coords["y"], + keys=[mapped_key], + duration=duration, ) await asyncio.sleep(SCREENSHOT_DELAY_S) return await self.screenshot() - async def _handle_drag(self, action: N1Action) -> ToolResult: + async def _handle_drag(self, action: N15Action) -> ToolResult: start_coords = self._get_coordinates(action.get("start_coordinates")) end_coords = self._get_coordinates(action.get("coordinates")) @@ -244,11 +271,13 @@ async def _handle_drag(self, action: N1Action) -> ToolResult: await asyncio.sleep(SCREENSHOT_DELAY_S) return await self.screenshot() - async def _handle_wait(self, action: N1Action) -> ToolResult: - await asyncio.sleep(2) + async def _handle_wait(self, action: N15Action) -> ToolResult: + duration = action.get("duration") + seconds = (duration / 1000) if duration and duration > 0 else 2 + await asyncio.sleep(seconds) return await self.screenshot() - async def _handle_refresh(self, action: N1Action) -> ToolResult: + async def _handle_refresh(self, action: N15Action) -> ToolResult: self.kernel.browsers.computer.press_key( self.session_id, keys=["F5"], @@ -256,7 +285,7 @@ async def _handle_refresh(self, action: N1Action) -> ToolResult: await asyncio.sleep(2) return await self.screenshot() - async def _handle_go_back(self, action: N1Action) -> ToolResult: + async def _handle_go_back(self, action: N15Action) -> ToolResult: self.kernel.browsers.computer.press_key( self.session_id, keys=["alt+Left"], @@ -264,7 +293,15 @@ async def _handle_go_back(self, action: N1Action) -> ToolResult: await asyncio.sleep(1.5) return await self.screenshot() - async def _handle_goto_url(self, action: N1Action) -> ToolResult: + async def _handle_go_forward(self, action: N15Action) -> ToolResult: + self.kernel.browsers.computer.press_key( + self.session_id, + keys=["alt+Right"], + ) + await asyncio.sleep(1.5) + return await self.screenshot() + + async def _handle_goto_url(self, action: N15Action) -> 
ToolResult: url = action.get("url") if not url: raise ToolError("url is required for goto_url action") diff --git a/pkg/templates/typescript/yutori/README.md b/pkg/templates/typescript/yutori/README.md index 92c009d3..2d9f781e 100644 --- a/pkg/templates/typescript/yutori/README.md +++ b/pkg/templates/typescript/yutori/README.md @@ -1,8 +1,8 @@ -# Kernel TypeScript Sample App - Yutori n1 Computer Use +# Kernel TypeScript Sample App - Yutori n1.5 Computer Use -This is a Kernel application that implements a prompt loop using Yutori's n1 computer use model with Kernel's Computer Controls API. +This is a Kernel application that implements a prompt loop using Yutori's n1.5 computer use model with Kernel's Computer Controls API. -[n1](https://yutori.com/blog/introducing-navigator) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots. +[Navigator n1.5](https://docs.yutori.com/reference/n1-5) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots. ## Setup @@ -55,9 +55,9 @@ kernel invoke ts-yutori-cua cua-task --payload '{"query": "Enter https://example ## Viewport Configuration -Yutori n1 recommends a **1280×800 (WXGA, 16:10)** viewport for best grounding accuracy. +Yutori n1.5 recommends a **1280×800 (WXGA, 16:10)** viewport for best grounding accuracy. -> **Note:** n1 outputs coordinates in a 1000×1000 relative space, which are automatically scaled to the actual viewport dimensions. +> **Note:** n1.5 outputs coordinates in a 1000×1000 relative space, which are automatically scaled to the actual viewport dimensions. See [Kernel Viewport Documentation](https://www.kernel.sh/docs/browsers/viewport) for all supported configurations. @@ -65,25 +65,36 @@ See [Kernel Viewport Documentation](https://www.kernel.sh/docs/browsers/viewport Screenshots are automatically converted to WebP format for better compression across multi-step trajectories, as recommended by Yutori. 
-## n1-latest Supported Actions +## n1.5-latest Supported Actions + +This template uses the `browser_tools_core-20260403` tool set — coordinate-based browser actions that operate on screenshots only. | Action | Description | |--------|-------------| -| `left_click` | Left mouse click at coordinates | -| `double_click` | Double-click at coordinates | -| `triple_click` | Triple-click at coordinates | +| `left_click` | Left mouse click at coordinates (supports `modifier`) | +| `double_click` | Double-click at coordinates (supports `modifier`) | +| `triple_click` | Triple-click at coordinates (supports `modifier`) | +| `middle_click` | Middle mouse click at coordinates | | `right_click` | Right mouse click at coordinates | +| `mouse_move` | Move mouse to coordinates without clicking | +| `mouse_down` | Press the left mouse button at coordinates | +| `mouse_up` | Release the left mouse button at coordinates | | `scroll` | Scroll page in a direction | | `type` | Type text into focused element | -| `key_press` | Send keyboard input | -| `hover` | Move mouse without clicking | +| `key_press` | Send a single key or key combination | +| `hold_key` | Hold a key for a duration | | `drag` | Click-and-drag operation | | `wait` | Pause for UI to update | | `refresh` | Reload current page | | `go_back` | Navigate back in history | +| `go_forward` | Navigate forward in history | | `goto_url` | Navigate to a URL | +### Disabled tools + +The DOM/Playwright-based "expanded" tools (`extract_elements`, `find`, `set_element_value`, `execute_js`) are intentionally disabled via the `disable_tools` request parameter — this template runs computer-use only and does not expose a Playwright page to the model. 
+ ## Resources -- [Yutori n1 API Documentation](https://docs.yutori.com/reference/n1) +- [Yutori n1.5 API Documentation](https://docs.yutori.com/reference/n1-5) - [Kernel Documentation](https://www.kernel.sh/docs/quickstart) diff --git a/pkg/templates/typescript/yutori/index.ts b/pkg/templates/typescript/yutori/index.ts index 364e7237..c38a1b53 100644 --- a/pkg/templates/typescript/yutori/index.ts +++ b/pkg/templates/typescript/yutori/index.ts @@ -47,7 +47,7 @@ app.action( try { // Run the sampling loop const { finalAnswer, messages } = await samplingLoop({ - model: 'n1-latest', + model: 'n1.5-latest', task: payload.query, apiKey: YUTORI_API_KEY, kernel, diff --git a/pkg/templates/typescript/yutori/loop.ts b/pkg/templates/typescript/yutori/loop.ts index e0f94795..f113b61e 100644 --- a/pkg/templates/typescript/yutori/loop.ts +++ b/pkg/templates/typescript/yutori/loop.ts @@ -1,19 +1,25 @@ /** - * Yutori n1 Sampling Loop - * - * Implements the agent loop for Yutori's n1-latest computer use model. - * n1-latest uses an OpenAI-compatible API with tool_calls: + * Yutori n1.5 Sampling Loop + * + * Implements the agent loop for Yutori's n1.5-latest computer use model. + * n1.5-latest uses an OpenAI-compatible API with tool_calls: * - Actions are returned via tool_calls in the assistant message * - Tool results use role: "tool" with matching tool_call_id * - The model stops by returning content without tool_calls * - Coordinates are returned in 1000x1000 space and need scaling - * - * @see https://docs.yutori.com/reference/n1 + * + * @see https://docs.yutori.com/reference/n1-5 */ import OpenAI from 'openai'; import type { Kernel } from '@onkernel/sdk'; -import { ComputerTool, type N1Action, type ToolResult } from './tools/computer'; +import { ComputerTool, type N15Action, type ToolResult } from './tools/computer'; + +// Tools that require a Playwright page / DOM access. 
The default core tool set +// already excludes them, but we also list them in `disable_tools` so the +// exclusion is explicit and survives if the default ever changes. +const DISABLED_TOOLS = ['extract_elements', 'find', 'set_element_value', 'execute_js']; +const TOOL_SET = 'browser_tools_core-20260403'; interface SamplingLoopOptions { model?: string; @@ -34,7 +40,7 @@ interface SamplingLoopResult { } export async function samplingLoop({ - model = 'n1-latest', + model = 'n1.5-latest', task, apiKey, kernel, @@ -85,6 +91,14 @@ export async function samplingLoop({ messages: conversationMessages, max_completion_tokens: maxCompletionTokens, temperature: 0.3, + // n1.5-specific knobs go in extra_body (not yet in OpenAI SDK types). + // tool_set selects the core (coordinate-based) tools. + // disable_tools is a defense-in-depth exclusion of DOM/Playwright tools. + // @ts-expect-error extra_body is a Yutori extension + extra_body: { + tool_set: TOOL_SET, + disable_tools: DISABLED_TOOLS, + }, }); } catch (apiError) { console.error('API call failed:', apiError); @@ -131,8 +145,8 @@ export async function samplingLoop({ continue; } - const action: N1Action = { - action_type: actionName as N1Action['action_type'], + const action: N15Action = { + action_type: actionName as N15Action['action_type'], ...args, }; @@ -190,7 +204,7 @@ export async function samplingLoop({ }; } -function scaleCoordinates(action: N1Action, viewportWidth: number, viewportHeight: number): N1Action { +function scaleCoordinates(action: N15Action, viewportWidth: number, viewportHeight: number): N15Action { const scaled = { ...action }; if (scaled.coordinates) { diff --git a/pkg/templates/typescript/yutori/tools/computer.ts b/pkg/templates/typescript/yutori/tools/computer.ts index 5ba8e09e..7e64b2bd 100644 --- a/pkg/templates/typescript/yutori/tools/computer.ts +++ b/pkg/templates/typescript/yutori/tools/computer.ts @@ -1,8 +1,10 @@ /** - * Yutori n1 Computer Tool - * - * Maps n1-latest action format to 
Kernel's Computer Controls API. + * Yutori n1.5 Computer Tool + * + * Maps n1.5-latest action format to Kernel's Computer Controls API. * Screenshots are converted to WebP for better compression across multi-step trajectories. + * + * @see https://docs.yutori.com/reference/n1-5 */ import { Buffer } from 'buffer'; @@ -26,31 +28,36 @@ export class ToolError extends Error { } } -export type N1ActionType = +export type N15ActionType = | 'left_click' | 'double_click' | 'triple_click' + | 'middle_click' | 'right_click' + | 'mouse_move' + | 'mouse_down' + | 'mouse_up' | 'scroll' | 'type' | 'key_press' - | 'hover' + | 'hold_key' | 'drag' | 'wait' | 'refresh' | 'go_back' + | 'go_forward' | 'goto_url'; -export interface N1Action { - action_type: N1ActionType; +export interface N15Action { + action_type: N15ActionType; coordinates?: [number, number]; start_coordinates?: [number, number]; direction?: 'up' | 'down' | 'left' | 'right'; amount?: number; text?: string; - press_enter_after?: boolean; - clear_before_typing?: boolean; - key_comb?: string; + key?: string; + modifier?: string; + duration?: number; url?: string; } @@ -107,7 +114,7 @@ export class ComputerTool { this.kioskMode = kioskMode; } - async execute(action: N1Action): Promise { + async execute(action: N15Action): Promise { const { action_type } = action; switch (action_type) { @@ -117,24 +124,34 @@ export class ComputerTool { return this.handleClick(action, 'left', 2); case 'triple_click': return this.handleClick(action, 'left', 3); + case 'middle_click': + return this.handleClick(action, 'middle', 1); case 'right_click': return this.handleClick(action, 'right', 1); + case 'mouse_move': + return this.handleMouseMove(action); + case 'mouse_down': + return this.handleMouseButton(action, 'down'); + case 'mouse_up': + return this.handleMouseButton(action, 'up'); case 'scroll': return this.handleScroll(action); case 'type': return this.handleType(action); case 'key_press': return this.handleKeyPress(action); - case 
'hover': - return this.handleHover(action); + case 'hold_key': + return this.handleHoldKey(action); case 'drag': return this.handleDrag(action); case 'wait': - return this.handleWait(); + return this.handleWait(action); case 'refresh': return this.handleRefresh(); case 'go_back': return this.handleGoBack(); + case 'go_forward': + return this.handleGoForward(); case 'goto_url': return this.handleGotoUrl(action); default: @@ -142,22 +159,50 @@ export class ComputerTool { } } - private async handleClick(action: N1Action, button: 'left' | 'right', numClicks: number): Promise { + private async handleClick(action: N15Action, button: 'left' | 'right' | 'middle', numClicks: number): Promise { const coords = this.getCoordinates(action.coordinates); - + const holdKeys = action.modifier ? [this.mapKey(action.modifier)] : undefined; + await this.kernel.browsers.computer.clickMouse(this.sessionId, { x: coords.x, y: coords.y, button, click_type: 'click', num_clicks: numClicks, + ...(holdKeys ? { hold_keys: holdKeys } : {}), + }); + + await this.sleep(SCREENSHOT_DELAY_MS); + return this.screenshot(); + } + + private async handleMouseMove(action: N15Action): Promise { + const coords = this.getCoordinates(action.coordinates); + + await this.kernel.browsers.computer.moveMouse(this.sessionId, { + x: coords.x, + y: coords.y, + }); + + await this.sleep(SCREENSHOT_DELAY_MS); + return this.screenshot(); + } + + private async handleMouseButton(action: N15Action, clickType: 'down' | 'up'): Promise { + const coords = this.getCoordinates(action.coordinates); + + await this.kernel.browsers.computer.clickMouse(this.sessionId, { + x: coords.x, + y: coords.y, + button: 'left', + click_type: clickType, }); await this.sleep(SCREENSHOT_DELAY_MS); return this.screenshot(); } - private async handleScroll(action: N1Action): Promise { + private async handleScroll(action: N15Action): Promise { const coords = this.getCoordinates(action.coordinates); const direction = action.direction; const notches = 
Math.max(action.amount ?? 3, 1); @@ -199,46 +244,28 @@ export class ComputerTool { }; } - private async handleType(action: N1Action): Promise { + private async handleType(action: N15Action): Promise { const text = action.text; if (!text) { throw new ToolError('text is required for type action'); } - if (action.clear_before_typing) { - await this.kernel.browsers.computer.pressKey(this.sessionId, { - keys: ['ctrl+a'], - }); - await this.sleep(100); - await this.kernel.browsers.computer.pressKey(this.sessionId, { - keys: ['BackSpace'], - }); - await this.sleep(100); - } - await this.kernel.browsers.computer.typeText(this.sessionId, { text, delay: TYPING_DELAY_MS, }); - if (action.press_enter_after) { - await this.sleep(100); - await this.kernel.browsers.computer.pressKey(this.sessionId, { - keys: ['Return'], - }); - } - await this.sleep(SCREENSHOT_DELAY_MS); return this.screenshot(); } - private async handleKeyPress(action: N1Action): Promise { - const keyComb = action.key_comb; - if (!keyComb) { - throw new ToolError('key_comb is required for key_press action'); + private async handleKeyPress(action: N15Action): Promise { + const key = action.key; + if (!key) { + throw new ToolError('key is required for key_press action'); } - const mappedKey = this.mapKey(keyComb); + const mappedKey = this.mapKey(key); await this.kernel.browsers.computer.pressKey(this.sessionId, { keys: [mappedKey], @@ -248,19 +275,25 @@ export class ComputerTool { return this.screenshot(); } - private async handleHover(action: N1Action): Promise { - const coords = this.getCoordinates(action.coordinates); + private async handleHoldKey(action: N15Action): Promise { + const key = action.key; + if (!key) { + throw new ToolError('key is required for hold_key action'); + } - await this.kernel.browsers.computer.moveMouse(this.sessionId, { - x: coords.x, - y: coords.y, + const mappedKey = this.mapKey(key); + const durationMs = action.duration && action.duration > 0 ? 
action.duration : 1000; + + await this.kernel.browsers.computer.pressKey(this.sessionId, { + keys: [mappedKey], + duration: durationMs, }); await this.sleep(SCREENSHOT_DELAY_MS); return this.screenshot(); } - private async handleDrag(action: N1Action): Promise { + private async handleDrag(action: N15Action): Promise { const startCoords = this.getCoordinates(action.start_coordinates); const endCoords = this.getCoordinates(action.coordinates); @@ -273,8 +306,9 @@ export class ComputerTool { return this.screenshot(); } - private async handleWait(): Promise { - await this.sleep(2000); + private async handleWait(action: N15Action): Promise { + const durationMs = action.duration && action.duration > 0 ? action.duration : 2000; + await this.sleep(durationMs); return this.screenshot(); } @@ -296,7 +330,16 @@ export class ComputerTool { return this.screenshot(); } - private async handleGotoUrl(action: N1Action): Promise { + private async handleGoForward(): Promise { + await this.kernel.browsers.computer.pressKey(this.sessionId, { + keys: ['alt+Right'], + }); + + await this.sleep(1500); + return this.screenshot(); + } + + private async handleGotoUrl(action: N15Action): Promise { const url = action.url; if (!url) { throw new ToolError('url is required for goto_url action'); From 0f8468bb529ee14664b9b05e104243d899235e67 Mon Sep 17 00:00:00 2001 From: dprevoznik <58714078+dprevoznik@users.noreply.github.com> Date: Thu, 7 May 2026 16:06:11 +0000 Subject: [PATCH 2/7] wire scroll modifier through to hold_keys n1.5's scroll action accepts an optional `modifier` (e.g., shift) that on browsers translates a vertical wheel into a horizontal scroll. Plumb it into Kernel's `ComputerScrollParams.hold_keys` so the OS-level modifier+wheel event is dispatched correctly. 
Co-Authored-By: Claude Opus 4.7 --- pkg/templates/python/yutori/tools/computer.py | 18 +++++++++++------- .../typescript/yutori/tools/computer.ts | 3 +++ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/pkg/templates/python/yutori/tools/computer.py b/pkg/templates/python/yutori/tools/computer.py index 18b3504c..5f285b87 100644 --- a/pkg/templates/python/yutori/tools/computer.py +++ b/pkg/templates/python/yutori/tools/computer.py @@ -199,13 +199,17 @@ async def _handle_scroll(self, action: N15Action) -> ToolResult: elif direction == "right": delta_x = notches - self.kernel.browsers.computer.scroll( - self.session_id, - x=coords["x"], - y=coords["y"], - delta_x=delta_x, - delta_y=delta_y, - ) + modifier = action.get("modifier") + scroll_kwargs: dict[str, Any] = { + "x": coords["x"], + "y": coords["y"], + "delta_x": delta_x, + "delta_y": delta_y, + } + if modifier: + scroll_kwargs["hold_keys"] = [self._map_key(modifier)] + + self.kernel.browsers.computer.scroll(self.session_id, **scroll_kwargs) await asyncio.sleep(SCREENSHOT_DELAY_S) screenshot_result = await self.screenshot() diff --git a/pkg/templates/typescript/yutori/tools/computer.ts b/pkg/templates/typescript/yutori/tools/computer.ts index 7e64b2bd..9cc792c5 100644 --- a/pkg/templates/typescript/yutori/tools/computer.ts +++ b/pkg/templates/typescript/yutori/tools/computer.ts @@ -229,11 +229,14 @@ export class ComputerTool { break; } + const holdKeys = action.modifier ? [this.mapKey(action.modifier)] : undefined; + await this.kernel.browsers.computer.scroll(this.sessionId, { x: coords.x, y: coords.y, delta_x, delta_y, + ...(holdKeys ? 
{ hold_keys: holdKeys } : {}), }); await this.sleep(SCREENSHOT_DELAY_MS); From 8d3f065ab820feb455f28cf16ea12d92fe4fcabd Mon Sep 17 00:00:00 2001 From: dprevoznik <58714078+dprevoznik@users.noreply.github.com> Date: Thu, 7 May 2026 16:36:53 +0000 Subject: [PATCH 3/7] frame readme with n1.5 benchmarks and scope Co-Authored-By: Claude Opus 4.7 --- pkg/templates/python/yutori/README.md | 6 ++++-- pkg/templates/typescript/yutori/README.md | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/pkg/templates/python/yutori/README.md b/pkg/templates/python/yutori/README.md index 8f7df74a..3fd63d5b 100644 --- a/pkg/templates/python/yutori/README.md +++ b/pkg/templates/python/yutori/README.md @@ -1,8 +1,10 @@ # Kernel Python Sample App - Yutori n1.5 Computer Use -This is a Kernel application that implements a prompt loop using Yutori's n1.5 computer use model with Kernel's Computer Controls API. +This Kernel app implements a prompt loop using Yutori's Navigator n1.5 with Kernel's Computer Controls API. -[Navigator n1.5](https://docs.yutori.com/reference/n1-5) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots. +[Navigator n1.5](https://yutori.com/blog/introducing-n1-5) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots. Yutori reports **94.5% on Online-Mind2Web** and **88.0% on Navi-Bench v2** at **$1.50 per 1M input tokens** — currently among the most accurate and cost-effective computer-use models. + +This template runs n1.5 in **computer-use-only mode**. n1.5 also supports a hybrid vision + DOM/JavaScript path (page-state extraction, custom JS, structured JSON output) for multi-field forms and bulk data extraction, but those tools are intentionally disabled here — see [Disabled tools](#disabled-tools). 
## Setup diff --git a/pkg/templates/typescript/yutori/README.md b/pkg/templates/typescript/yutori/README.md index 2d9f781e..efdfbc61 100644 --- a/pkg/templates/typescript/yutori/README.md +++ b/pkg/templates/typescript/yutori/README.md @@ -1,8 +1,10 @@ # Kernel TypeScript Sample App - Yutori n1.5 Computer Use -This is a Kernel application that implements a prompt loop using Yutori's n1.5 computer use model with Kernel's Computer Controls API. +This Kernel app implements a prompt loop using Yutori's Navigator n1.5 with Kernel's Computer Controls API. -[Navigator n1.5](https://docs.yutori.com/reference/n1-5) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots. +[Navigator n1.5](https://yutori.com/blog/introducing-n1-5) is Yutori's pixels-to-actions LLM that predicts browser actions from screenshots. Yutori reports **94.5% on Online-Mind2Web** and **88.0% on Navi-Bench v2** at **$1.50 per 1M input tokens** — currently among the most accurate and cost-effective computer-use models. + +This template runs n1.5 in **computer-use-only mode**. n1.5 also supports a hybrid vision + DOM/JavaScript path (page-state extraction, custom JS, structured JSON output) for multi-field forms and bulk data extraction, but those tools are intentionally disabled here — see [Disabled tools](#disabled-tools). ## Setup From f4f31a6a4ca599876b93cb1e809a17364608c18c Mon Sep 17 00:00:00 2001 From: dprevoznik <58714078+dprevoznik@users.noreply.github.com> Date: Thu, 7 May 2026 17:00:28 +0000 Subject: [PATCH 4/7] trim old screenshots from request history Mirror yutori-sdk-python's reference loop: deep-copy a request-only view of the messages and strip old image_url blocks once the JSON payload exceeds ~9.5 MB, while always preserving the most recent 6 screenshots and the very latest one. The caller's full history is left intact for the return value. 
Co-Authored-By: Claude Opus 4.7 --- pkg/templates/python/yutori/loop.py | 93 ++++++++++++++++++++- pkg/templates/typescript/yutori/loop.ts | 104 +++++++++++++++++++++++- 2 files changed, 195 insertions(+), 2 deletions(-) diff --git a/pkg/templates/python/yutori/loop.py b/pkg/templates/python/yutori/loop.py index ad39024b..447ebff6 100644 --- a/pkg/templates/python/yutori/loop.py +++ b/pkg/templates/python/yutori/loop.py @@ -11,6 +11,7 @@ @see https://docs.yutori.com/reference/n1-5 """ +import copy import json from typing import Any, Optional @@ -25,6 +26,13 @@ DISABLED_TOOLS = ["extract_elements", "find", "set_element_value", "execute_js"] TOOL_SET = "browser_tools_core-20260403" +# Screenshot-trimming defaults mirror Yutori's reference loop: +# https://github.com/yutori-ai/yutori-sdk-python/blob/main/yutori/navigator/payload.py +# Trimming is size-triggered — we only drop old screenshots when the payload +# exceeds MAX_REQUEST_BYTES, and we always keep at least KEEP_RECENT_SCREENSHOTS. +MAX_REQUEST_BYTES = 9_500_000 +KEEP_RECENT_SCREENSHOTS = 6 + async def sampling_loop( *, @@ -69,10 +77,14 @@ async def sampling_loop( iteration += 1 print(f"\n=== Iteration {iteration} ===") + request_messages, dropped = _trimmed_for_request(conversation_messages) + if dropped: + print(f"Trimmed {dropped} old screenshot(s) to fit request size limit") + try: response = client.chat.completions.create( model=model, - messages=conversation_messages, + messages=request_messages, max_completion_tokens=max_completion_tokens, temperature=0.3, # n1.5-specific knobs go in extra_body. @@ -168,6 +180,85 @@ async def sampling_loop( } +def _trimmed_for_request( + messages: list[dict[str, Any]], +) -> tuple[list[dict[str, Any]], int]: + """Return a deep-copied messages list with old screenshots stripped to fit MAX_REQUEST_BYTES. + + The most recent KEEP_RECENT_SCREENSHOTS screenshots are protected. The full + `messages` list is preserved unchanged for the caller's return value. 
+ """ + trimmed = copy.deepcopy(messages) + size = _estimate_size(trimmed) + if size <= MAX_REQUEST_BYTES: + return trimmed, 0 + + image_indices = [i for i, m in enumerate(trimmed) if _message_has_image(m)] + if not image_indices: + return trimmed, 0 + + protected = set(image_indices[-max(1, KEEP_RECENT_SCREENSHOTS):]) + removed = 0 + + for idx in image_indices: + if size <= MAX_REQUEST_BYTES: + break + if idx in protected: + continue + if _strip_one_image(trimmed[idx]): + removed += 1 + size = _estimate_size(trimmed) + + # If still over, strip from the protected window too — but always keep the latest. + if size > MAX_REQUEST_BYTES: + last_idx = image_indices[-1] + for idx in image_indices: + if size <= MAX_REQUEST_BYTES: + break + if idx == last_idx: + continue + if _strip_one_image(trimmed[idx]): + removed += 1 + size = _estimate_size(trimmed) + + return trimmed, removed + + +def _estimate_size(messages: list[dict[str, Any]]) -> int: + return len(json.dumps(messages, separators=(",", ":"), ensure_ascii=False).encode("utf-8")) + + +def _message_has_image(msg: dict[str, Any]) -> bool: + content = msg.get("content") + if not isinstance(content, list): + return False + return any(isinstance(p, dict) and p.get("type") == "image_url" for p in content) + + +def _strip_one_image(msg: dict[str, Any]) -> bool: + content = msg.get("content") + if not isinstance(content, list): + return False + + removed = False + new_content: list[dict[str, Any]] = [] + for part in content: + if not removed and isinstance(part, dict) and part.get("type") == "image_url": + removed = True + continue + new_content.append(part) + + if not removed: + return False + + has_text = any(isinstance(p, dict) and p.get("type") == "text" for p in new_content) + if not has_text: + new_content.append({"type": "text", "text": "Screenshot omitted to stay under request size limit."}) + + msg["content"] = new_content + return True + + def _scale_coordinates(action: N15Action, viewport_width: int, 
viewport_height: int) -> N15Action: scaled = dict(action) diff --git a/pkg/templates/typescript/yutori/loop.ts b/pkg/templates/typescript/yutori/loop.ts index f113b61e..a0f3aef8 100644 --- a/pkg/templates/typescript/yutori/loop.ts +++ b/pkg/templates/typescript/yutori/loop.ts @@ -21,6 +21,13 @@ import { ComputerTool, type N15Action, type ToolResult } from './tools/computer' const DISABLED_TOOLS = ['extract_elements', 'find', 'set_element_value', 'execute_js']; const TOOL_SET = 'browser_tools_core-20260403'; +// Screenshot-trimming defaults mirror Yutori's reference loop: +// https://github.com/yutori-ai/yutori-sdk-python/blob/main/yutori/navigator/payload.py +// Trimming is size-triggered — we only drop old screenshots when the payload +// exceeds MAX_REQUEST_BYTES, and we always keep at least KEEP_RECENT_SCREENSHOTS. +const MAX_REQUEST_BYTES = 9_500_000; +const KEEP_RECENT_SCREENSHOTS = 6; + interface SamplingLoopOptions { model?: string; task: string; @@ -84,11 +91,16 @@ export async function samplingLoop({ iteration++; console.log(`\n=== Iteration ${iteration} ===`); + const { messages: requestMessages, removed } = trimmedForRequest(conversationMessages); + if (removed > 0) { + console.log(`Trimmed ${removed} old screenshot(s) to fit request size limit`); + } + let response; try { response = await client.chat.completions.create({ model, - messages: conversationMessages, + messages: requestMessages, max_completion_tokens: maxCompletionTokens, temperature: 0.3, // n1.5-specific knobs go in extra_body (not yet in OpenAI SDK types). 
@@ -223,3 +235,93 @@ function scaleCoordinates(action: N15Action, viewportWidth: number, viewportHeig return scaled; } + +interface ImagePart { + type: 'image_url'; + image_url: { url: string }; +} + +interface TextPart { + type: 'text'; + text: string; +} + +type ContentPart = ImagePart | TextPart | Record<string, unknown>; + +function estimateSize(messages: OpenAI.ChatCompletionMessageParam[]): number { + return Buffer.byteLength(JSON.stringify(messages), 'utf-8'); +} + +function messageHasImage(msg: OpenAI.ChatCompletionMessageParam): boolean { + const content = (msg as { content?: unknown }).content; + if (!Array.isArray(content)) return false; + return content.some((p) => typeof p === 'object' && p !== null && (p as { type?: unknown }).type === 'image_url'); +} + +function stripOneImage(msg: OpenAI.ChatCompletionMessageParam): boolean { + const content = (msg as { content?: unknown }).content; + if (!Array.isArray(content)) return false; + + let removed = false; + const next: ContentPart[] = []; + for (const part of content as ContentPart[]) { + if (!removed && typeof part === 'object' && part !== null && (part as { type?: unknown }).type === 'image_url') { + removed = true; + continue; + } + next.push(part); + } + if (!removed) return false; + + const hasText = next.some((p) => typeof p === 'object' && p !== null && (p as { type?: unknown }).type === 'text'); + if (!hasText) { + next.push({ type: 'text', text: 'Screenshot omitted to stay under request size limit.' }); + } + + (msg as { content: unknown }).content = next; + return true; +} + +function trimmedForRequest( + messages: OpenAI.ChatCompletionMessageParam[], +): { messages: OpenAI.ChatCompletionMessageParam[]; removed: number } { + // Deep-copy so the caller's full history is preserved unchanged.
+ const trimmed = JSON.parse(JSON.stringify(messages)) as OpenAI.ChatCompletionMessageParam[]; + + let size = estimateSize(trimmed); + if (size <= MAX_REQUEST_BYTES) return { messages: trimmed, removed: 0 }; + + const imageIndices: number[] = []; + for (let i = 0; i < trimmed.length; i++) { + if (messageHasImage(trimmed[i])) imageIndices.push(i); + } + if (imageIndices.length === 0) return { messages: trimmed, removed: 0 }; + + const keep = Math.max(1, KEEP_RECENT_SCREENSHOTS); + const protectedIdx = new Set(imageIndices.slice(-keep)); + let removed = 0; + + for (const idx of imageIndices) { + if (size <= MAX_REQUEST_BYTES) break; + if (protectedIdx.has(idx)) continue; + if (stripOneImage(trimmed[idx])) { + removed++; + size = estimateSize(trimmed); + } + } + + // If still over, strip from the protected window too — but always keep the latest. + if (size > MAX_REQUEST_BYTES) { + const lastIdx = imageIndices[imageIndices.length - 1]; + for (const idx of imageIndices) { + if (size <= MAX_REQUEST_BYTES) break; + if (idx === lastIdx) continue; + if (stripOneImage(trimmed[idx])) { + removed++; + size = estimateSize(trimmed); + } + } + } + + return { messages: trimmed, removed }; +} From d716a9af5478fce23c3d24a103d041192e0407b8 Mon Sep 17 00:00:00 2001 From: dprevoznik <58714078+dprevoznik@users.noreply.github.com> Date: Thu, 7 May 2026 17:01:04 +0000 Subject: [PATCH 5/7] shrink post-action screenshot delay to 150ms MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 300ms after every action was conservative — at 50 iterations that is ~15s of pure wall-clock waiting before model calls. 150ms still gives the page enough time to settle for typical interactions while halving the per-step overhead. 
Co-Authored-By: Claude Opus 4.7 --- pkg/templates/python/yutori/tools/computer.py | 2 +- pkg/templates/typescript/yutori/tools/computer.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/templates/python/yutori/tools/computer.py b/pkg/templates/python/yutori/tools/computer.py index 5f285b87..58d5a4a3 100644 --- a/pkg/templates/python/yutori/tools/computer.py +++ b/pkg/templates/python/yutori/tools/computer.py @@ -19,7 +19,7 @@ from .base import ToolError, ToolResult TYPING_DELAY_MS = 12 -SCREENSHOT_DELAY_S = 0.3 +SCREENSHOT_DELAY_S = 0.15 ACTION_DELAY_S = 0.3 N15ActionType = Literal[ diff --git a/pkg/templates/typescript/yutori/tools/computer.ts b/pkg/templates/typescript/yutori/tools/computer.ts index 9cc792c5..13e144c0 100644 --- a/pkg/templates/typescript/yutori/tools/computer.ts +++ b/pkg/templates/typescript/yutori/tools/computer.ts @@ -12,7 +12,7 @@ import type { Kernel } from '@onkernel/sdk'; import sharp from 'sharp'; const TYPING_DELAY_MS = 12; -const SCREENSHOT_DELAY_MS = 300; +const SCREENSHOT_DELAY_MS = 150; const ACTION_DELAY_MS = 300; export interface ToolResult { From 23b63f0026c6bbd5ed80c0aeccfd1abfcb566c6e Mon Sep 17 00:00:00 2001 From: dprevoznik <58714078+dprevoznik@users.noreply.github.com> Date: Thu, 7 May 2026 18:27:28 +0000 Subject: [PATCH 6/7] fix(yutori): send tool_set + disable_tools at top level in TS loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The openai-node SDK does not have a Python-style `extra_body` kwarg — it serializes the body as-is. Passing `extra_body: {...}` as a body field made Yutori receive a literal `{"extra_body": ...}` and silently drop the tool_set pin and disable_tools defense-in-depth. Hoist both fields and apply them via a typed spread (mirrors the anthropic-computer-use loop pattern). 
--- pkg/templates/typescript/yutori/loop.ts | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/pkg/templates/typescript/yutori/loop.ts b/pkg/templates/typescript/yutori/loop.ts index a0f3aef8..7db2e4ca 100644 --- a/pkg/templates/typescript/yutori/loop.ts +++ b/pkg/templates/typescript/yutori/loop.ts @@ -28,6 +28,11 @@ const TOOL_SET = 'browser_tools_core-20260403'; const MAX_REQUEST_BYTES = 9_500_000; const KEEP_RECENT_SCREENSHOTS = 6; +interface YutoriExtras { + tool_set: string; + disable_tools: string[]; +} + interface SamplingLoopOptions { model?: string; task: string; @@ -98,19 +103,21 @@ export async function samplingLoop({ let response; try { + // n1.5-specific knobs (not in OpenAI SDK types). The openai-node SDK + // serializes the body as-is, so these go at the top level via a spread — + // unlike the Python SDK, there is no `extra_body` kwarg here. + // tool_set selects the core (coordinate-based) tools. + // disable_tools is a defense-in-depth exclusion of DOM/Playwright tools. + const yutoriExtras: YutoriExtras = { + tool_set: TOOL_SET, + disable_tools: DISABLED_TOOLS, + }; response = await client.chat.completions.create({ model, messages: requestMessages, max_completion_tokens: maxCompletionTokens, temperature: 0.3, - // n1.5-specific knobs go in extra_body (not yet in OpenAI SDK types). - // tool_set selects the core (coordinate-based) tools. - // disable_tools is a defense-in-depth exclusion of DOM/Playwright tools. 
- // @ts-expect-error extra_body is a Yutori extension - extra_body: { - tool_set: TOOL_SET, - disable_tools: DISABLED_TOOLS, - }, + ...yutoriExtras, }); } catch (apiError) { console.error('API call failed:', apiError); From d2affba3c718815f5cdfd6a52db24fbfa752d8b8 Mon Sep 17 00:00:00 2001 From: dprevoznik <58714078+dprevoznik@users.noreply.github.com> Date: Thu, 7 May 2026 18:27:32 +0000 Subject: [PATCH 7/7] fix(yutori): map standalone modifier names through MODIFIER_MAP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit key_press / hold_key compound keys already consulted MODIFIER_MAP, but the new `modifier` parameter on click and scroll actions passes a bare modifier name ("control", "meta", "command") that bypassed the lookup and went out as-is. Kernel's hold_keys wants "ctrl" and "super" — so ctrl-click and cmd-click silently dropped the modifier. Unify the per-part mapping into a single helper applied to both the compound and single-key paths in TS and Python. 
--- pkg/templates/python/yutori/tools/computer.py | 23 +++++++---------- .../typescript/yutori/tools/computer.ts | 25 ++++++++----------- 2 files changed, 20 insertions(+), 28 deletions(-) diff --git a/pkg/templates/python/yutori/tools/computer.py b/pkg/templates/python/yutori/tools/computer.py index 58d5a4a3..afe41696 100644 --- a/pkg/templates/python/yutori/tools/computer.py +++ b/pkg/templates/python/yutori/tools/computer.py @@ -374,18 +374,13 @@ def _get_coordinates( return {"x": int(x), "y": int(y)} def _map_key(self, key: str) -> str: + def map_part(part: str) -> str: + trimmed = part.strip() + lower = trimmed.lower() + if lower in MODIFIER_MAP: + return MODIFIER_MAP[lower] + return KEY_MAP.get(trimmed, trimmed) + if "+" in key: - parts = key.split("+") - mapped_parts = [] - for part in parts: - trimmed = part.strip() - lower = trimmed.lower() - - if lower in MODIFIER_MAP: - mapped_parts.append(MODIFIER_MAP[lower]) - else: - mapped_parts.append(KEY_MAP.get(trimmed, trimmed)) - - return "+".join(mapped_parts) - - return KEY_MAP.get(key, key) + return "+".join(map_part(p) for p in key.split("+")) + return map_part(key) diff --git a/pkg/templates/typescript/yutori/tools/computer.ts b/pkg/templates/typescript/yutori/tools/computer.ts index 13e144c0..d4a33573 100644 --- a/pkg/templates/typescript/yutori/tools/computer.ts +++ b/pkg/templates/typescript/yutori/tools/computer.ts @@ -414,22 +414,19 @@ export class ComputerTool { } private mapKey(key: string): string { + const mapPart = (part: string): string => { + const trimmed = part.trim(); + const lower = trimmed.toLowerCase(); + if (MODIFIER_MAP[lower]) { + return MODIFIER_MAP[lower]; + } + return KEY_MAP[trimmed] || trimmed; + }; + if (key.includes('+')) { - const parts = key.split('+'); - const mappedParts = parts.map(part => { - const trimmed = part.trim(); - const lower = trimmed.toLowerCase(); - - if (MODIFIER_MAP[lower]) { - return MODIFIER_MAP[lower]; - } - - return KEY_MAP[trimmed] || trimmed; - }); - 
return mappedParts.join('+'); + return key.split('+').map(mapPart).join('+'); } - - return KEY_MAP[key] || key; + return mapPart(key); } private sleep(ms: number): Promise<void> {