From 4dbb65873951093cf5d6bc0bd7fc2542e1b46ed8 Mon Sep 17 00:00:00 2001 From: Reagan Hsu Date: Tue, 26 May 2026 17:34:29 -0700 Subject: [PATCH 01/17] Add login fenced block parser + payload schema --- app/src/renderer/hub/chat-v2/htmlBlocks.ts | 58 ++++++++++++++++++++-- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/app/src/renderer/hub/chat-v2/htmlBlocks.ts b/app/src/renderer/hub/chat-v2/htmlBlocks.ts index 624da8e4..d213c281 100644 --- a/app/src/renderer/hub/chat-v2/htmlBlocks.ts +++ b/app/src/renderer/hub/chat-v2/htmlBlocks.ts @@ -113,13 +113,27 @@ export interface AskFormPayload { questions: AskQuestion[]; } -export type FenceTag = 'html' | 'htmlview' | 'options' | 'ask'; +/** + * Payload for a ```login fence — credential collection during browsing. + * `site` is the brand token (e.g. "Amazon", not "amazon.com"). `url` is + * required so the manual-login escape hatch knows where to deep-link. + */ +export interface LoginPayload { + site: string; + url: string; + prompt?: string; + usernameLabel?: string; + passwordLabel?: string; +} + +export type FenceTag = 'html' | 'htmlview' | 'options' | 'ask' | 'login'; export type ExtractEvent = | { kind: 'text'; text: string } | { kind: 'html_block'; content: string; tag: 'html' | 'htmlview'; complete: boolean } | { kind: 'option_list'; complete: boolean; raw: string; parsed: OptionListPayload | null; error?: string } - | { kind: 'ask_form'; complete: boolean; raw: string; parsed: AskFormPayload | null; error?: string }; + | { kind: 'ask_form'; complete: boolean; raw: string; parsed: AskFormPayload | null; error?: string } + | { kind: 'login_form'; complete: boolean; raw: string; parsed: LoginPayload | null; error?: string }; /** * Stateful, chunk-fed extractor. Safe to call `feed` once per streamed @@ -244,8 +258,8 @@ export function extractAll(chunks: string[]): ExtractEvent[] { * newline arrives in the next chunk); LAX also accepts end-of-input * (used only during the final `end()` flush). 
*/ -const OPENER_STRICT = /(^|\n)```(html|htmlview|options|ask)[ \t]*\r?\n/; -const OPENER_LAX = /(^|\n)```(html|htmlview|options|ask)[ \t]*(\r?\n|$)/; +const OPENER_STRICT = /(^|\n)```(html|htmlview|options|ask|login)[ \t]*\r?\n/; +const OPENER_LAX = /(^|\n)```(html|htmlview|options|ask|login)[ \t]*(\r?\n|$)/; function findOpener(buf: string, flush: boolean): { start: number; end: number; tag: FenceTag } | null { const re = flush ? OPENER_LAX : OPENER_STRICT; @@ -321,9 +335,45 @@ function emitBlock(tag: FenceTag, content: string, complete: boolean): ExtractEv const { parsed, error } = parseAskForm(content, { partial: !complete }); return { kind: 'ask_form', complete, raw: content, parsed, error }; } + if (tag === 'login') { + const { parsed, error } = parseLoginBlock(content); + return { kind: 'login_form', complete, raw: content, parsed, error }; + } return { kind: 'html_block', content, tag: tag as 'html' | 'htmlview', complete }; } +/** + * Parse + validate a login block body. The payload is a small flat object; + * there's no partial-streaming path because the body is tiny and the form + * has nothing useful to render until the closing fence resolves the JSON. + */ +export function parseLoginBlock(raw: string): { parsed: LoginPayload | null; error?: string } { + let data: unknown; + try { + data = JSON.parse(raw); + } catch { + return { parsed: null, error: 'invalid json' }; + } + if (!data || typeof data !== 'object' || Array.isArray(data)) { + return { parsed: null, error: 'expected json object at top level' }; + } + const obj = data as Record; + const site = typeof obj.site === 'string' ? obj.site.trim() : ''; + const url = typeof obj.url === 'string' ? 
obj.url.trim() : ''; + if (!site) return { parsed: null, error: 'missing required field "site"' }; + if (!url) return { parsed: null, error: 'missing required field "url"' }; + if (!isAbsoluteHttpUrl(url)) return { parsed: null, error: 'url must be an absolute http(s) URL' }; + return { + parsed: { + site, + url, + prompt: typeof obj.prompt === 'string' && obj.prompt.trim().length > 0 ? obj.prompt.trim() : undefined, + usernameLabel: typeof obj.usernameLabel === 'string' && obj.usernameLabel.trim().length > 0 ? obj.usernameLabel.trim() : undefined, + passwordLabel: typeof obj.passwordLabel === 'string' && obj.passwordLabel.trim().length > 0 ? obj.passwordLabel.trim() : undefined, + }, + }; +} + /** * Parse + validate an options block body. Returns the canonical payload * with defaults filled in, or an error string explaining why it was From 875323ecc7d9d4c16571d40424637e187950f2f1 Mon Sep 17 00:00:00 2001 From: Reagan Hsu Date: Tue, 26 May 2026 17:34:32 -0700 Subject: [PATCH 02/17] Add LoginForm component for in-chat credential entry --- app/src/renderer/hub/chat-v2/LoginForm.tsx | 304 +++++++++++++++++++++ app/src/renderer/hub/chat-v2/loginForm.css | 255 +++++++++++++++++ 2 files changed, 559 insertions(+) create mode 100644 app/src/renderer/hub/chat-v2/LoginForm.tsx create mode 100644 app/src/renderer/hub/chat-v2/loginForm.css diff --git a/app/src/renderer/hub/chat-v2/LoginForm.tsx b/app/src/renderer/hub/chat-v2/LoginForm.tsx new file mode 100644 index 00000000..6eab017a --- /dev/null +++ b/app/src/renderer/hub/chat-v2/LoginForm.tsx @@ -0,0 +1,304 @@ +/** + * LoginForm — credential entry surface rendered for a `login` fenced block. + * + * Mirrors the OptionList / AskForm pattern: agent emits the fence, the + * component renders in place, and on submit we resume the session with a + * structured user turn. The agent reads the credentials on its next turn + * and types them into the live browser view. 
+ * + * Two affordances: + * - Primary: username + password fields → "Login for :\n…" turn. + * - Escape: "Log in manually in the browser" link → opens the site + * externally and resumes with "I'll log in to myself". + * + * Persistence is transcript-derived (same as OptionList): on reload we + * read the next user turn and reconstruct the receipt view. The + * receipt NEVER re-renders the plaintext password — only the agent's + * raw transcript carries it. + */ + +import React, { useCallback, useEffect, useMemo, useRef, useState } from 'react'; +import type { LoginPayload } from './htmlBlocks'; +import './loginForm.css'; + +interface Props { + payload: LoginPayload | null; + complete: boolean; + error?: string; + sessionId?: string; + /** User reply turn that follows this form, if any. */ + nextUserText?: string | null; +} + +type Mode = 'live' | 'submitted-credentials'; + +export function LoginForm(props: Props): React.ReactElement { + const { payload, complete, error, sessionId, nextUserText } = props; + if (!payload) { + if (complete && error) { + return ( +
+
login block ignored: {error}
+
+ ); + } + return ; + } + return ; +} + +function LoginFormSkeleton(): React.ReactElement { + return ( +
+
+
+
+
+ ); +} + +interface ReadyProps { + payload: LoginPayload; + sessionId?: string; + nextUserText?: string | null; +} + +function siteFaviconUrl(url: string): string | undefined { + try { + const { hostname } = new URL(url); + return `https://www.google.com/s2/favicons?domain=${hostname}&sz=64`; + } catch { + return undefined; + } +} + +function siteHostname(url: string): string | undefined { + try { + return new URL(url).hostname.replace(/^www\./, ''); + } catch { + return undefined; + } +} + +function LoginFormReady({ payload, sessionId, nextUserText }: ReadyProps): React.ReactElement { + const { site, url, prompt, usernameLabel, passwordLabel } = payload; + const usernameLbl = usernameLabel || 'Username'; + const passwordLbl = passwordLabel || 'Password'; + + const transcriptMode = useMemo( + () => deriveLoginSubmissionMode(nextUserText, site), + [nextUserText, site], + ); + + const [username, setUsername] = useState(''); + const [password, setPassword] = useState(''); + const [showPassword, setShowPassword] = useState(false); + const [mode, setMode] = useState(transcriptMode ?? 'live'); + const [submitError, setSubmitError] = useState(null); + const [localSubmit, setLocalSubmit] = useState(false); + const usernameRef = useRef(null); + + // Late-arriving transcript: hydrate if it shows up after first paint. + useEffect(() => { + if (!transcriptMode || localSubmit) return; + setMode(transcriptMode); + }, [transcriptMode, localSubmit]); + + // Auto-focus the username field on first mount (live state only). 
+ useEffect(() => { + if (mode === 'live' && usernameRef.current) { + usernameRef.current.focus({ preventScroll: true }); + } + }, [mode]); + + const canSubmit = username.trim().length > 0 && password.length > 0; + + const submitCredentials = useCallback(async (): Promise => { + if (!canSubmit || mode !== 'live') return; + if (!sessionId) { + setSubmitError('no active session'); + return; + } + const message = `Login for ${site}:\nusername: ${username}\npassword: ${password}`; + setLocalSubmit(true); + setMode('submitted-credentials'); + setSubmitError(null); + try { + const result = await window.electronAPI?.sessions?.resume(sessionId, message); + if (result?.error) { + setSubmitError(result.error); + setMode('live'); + setLocalSubmit(false); + } + } catch (err) { + setSubmitError((err as Error).message); + setMode('live'); + setLocalSubmit(false); + } + }, [canSubmit, mode, sessionId, site, username, password]); + + const openBrowserView = useCallback((): void => { + // Same action the chatbar's BrowserPreview thumbnail performs — opens + // the in-app WebContentsView so the user can log in directly there. + // No synthetic chat message; the user signals "done" by typing their + // own next message. + window.dispatchEvent(new CustomEvent('chatv2:open-browser')); + }, []); + + if (mode === 'submitted-credentials') { + return ( +
+
+
+ Submitted credentials for {site} +
+ +
+ agent will type these into the live tab +
+
+
+ ); + } + + return ( +
+
+
+ {prompt || `Sign in to ${site}`} +
+ +
+ +
{ e.preventDefault(); void submitCredentials(); }} + > + + + + + + +
+ +
+ + {submitError && ( +
{submitError}
+ )} + +
+ ); +} + +/** Hostname + favicon row under the prompt — same credibility cue + * OptionList uses for its "Results from " attribution. */ +function SiteAttribution({ url, site }: { url: string; site: string }): React.ReactElement | null { + const [broken, setBroken] = useState(false); + const host = siteHostname(url); + const src = siteFaviconUrl(url); + if (!host) return null; + return ( +
+ {src && !broken && ( + setBroken(true)} + /> + )} + + {site} + + {host} + +
+ ); +} + +function ManualLoginLink({ site, onClick }: { site: string; onClick: () => void }): React.ReactElement { + return ( + + ); +} + +/** + * Detect whether `text` is the user-reply turn that follows this login + * form and, if so, which mode the form should restore to. Returns null + * when the text isn't a login reply for this `site`. + * + * Exported for tests. + */ +export function deriveLoginSubmissionMode( + text: string | null | undefined, + site: string, +): 'submitted-credentials' | null { + if (!text) return null; + if (text.trimStart().startsWith(`Login for ${site}:`)) return 'submitted-credentials'; + return null; +} diff --git a/app/src/renderer/hub/chat-v2/loginForm.css b/app/src/renderer/hub/chat-v2/loginForm.css new file mode 100644 index 00000000..51a4ac07 --- /dev/null +++ b/app/src/renderer/hub/chat-v2/loginForm.css @@ -0,0 +1,255 @@ +/* LoginForm — credential entry surface. See LoginForm.tsx. + * Uses the same design-system tokens as OptionList / AskForm so theme + * flips (light/dark) come for free. 
*/ + +.chatv2-login { + clear: both; + display: flex; + flex-direction: column; + gap: 14px; + margin-top: 18px; + padding: 18px 20px 16px; + background: var(--color-bg-base); + border: 1px solid var(--color-border-subtle); + border-radius: var(--radius-lg, 10px); + max-width: 440px; +} + +.chatv2-login__head { + display: flex; + flex-direction: column; + gap: 3px; + min-width: 0; +} +.chatv2-login__prompt { + font-size: 15px; + font-weight: 600; + color: var(--color-fg-primary); + letter-spacing: -0.005em; + line-height: 1.3; +} + +.chatv2-login__form { + display: flex; + flex-direction: column; + gap: 12px; +} + +.chatv2-login__field { + display: flex; + flex-direction: column; + gap: 6px; +} +.chatv2-login__field-label { + font-size: 12px; + font-weight: 500; + color: var(--color-fg-secondary); + letter-spacing: 0.01em; +} + +.chatv2-login__input { + width: 100%; + background: var(--color-bg-base); + border: 1px solid var(--color-border-subtle); + border-radius: var(--radius-sm, 5px); + padding: 9px 11px; + font-family: inherit; + font-size: 13.5px; + color: var(--color-fg-primary); + transition: border-color var(--duration-fast, 100ms) var(--ease-out, ease); +} +.chatv2-login__input:focus { + outline: none; + border-color: var(--color-border-strong); +} +.chatv2-login__input:disabled { + opacity: 0.55; + cursor: not-allowed; +} + +.chatv2-login__password-wrap { + position: relative; + display: flex; + align-items: center; +} +.chatv2-login__password-wrap .chatv2-login__input { + padding-right: 52px; +} +.chatv2-login__reveal { + position: absolute; + right: 8px; + top: 50%; + transform: translateY(-50%); + background: transparent; + border: 0; + color: var(--color-fg-tertiary); + font: inherit; + font-size: 11.5px; + cursor: pointer; + padding: 4px 6px; + border-radius: var(--radius-xs, 3px); +} +.chatv2-login__reveal:hover { + color: var(--color-fg-primary); + background: color-mix(in srgb, var(--color-fg-primary) 6%, transparent); +} + +.chatv2-login__foot { 
+ display: flex; + align-items: center; + gap: 14px; + padding-top: 6px; + flex-wrap: wrap; +} +.chatv2-login__submit { + background: var(--color-fg-primary); + color: var(--color-bg-base); + border: 0; + padding: 11px 22px; + border-radius: 999px; + font-family: inherit; + font-size: 13.5px; + font-weight: 600; + cursor: pointer; + transition: filter var(--duration-fast, 100ms) var(--ease-out, ease); +} +.chatv2-login__submit:hover:not(:disabled) { filter: brightness(0.93); } +.chatv2-login__submit:disabled { + background: transparent; + color: var(--color-fg-tertiary); + border: 1px solid var(--color-border-subtle); + cursor: not-allowed; +} + +.chatv2-login__manual-link { + align-self: flex-start; + display: inline-flex; + align-items: center; + gap: 4px; + background: transparent; + border: 0; + color: var(--color-fg-secondary); + font-family: inherit; + font-size: 12.5px; + cursor: pointer; + padding: 4px 0; +} +.chatv2-login__manual-link > span { + text-decoration: underline; + text-underline-offset: 3px; +} +.chatv2-login__manual-link:hover { + color: var(--color-fg-primary); +} +.chatv2-login__manual-link-icon { + width: 13px; + height: 13px; + border-radius: 2px; + object-fit: contain; + flex-shrink: 0; +} +.chatv2-login__manual-link-arrow { + width: 12px; + height: 12px; + flex-shrink: 0; + opacity: 0; + transform: translate(-2px, 2px); + transition: + opacity var(--duration-fast, 100ms) var(--ease-out, ease), + transform var(--duration-fast, 100ms) var(--ease-out, ease); +} +.chatv2-login__manual-link:hover .chatv2-login__manual-link-arrow, +.chatv2-login__manual-link:focus-visible .chatv2-login__manual-link-arrow { + opacity: 0.8; + transform: translate(0, 0); +} + +/* Site attribution row: favicon + bold brand · hostname. + * Same credibility cue OptionList uses ("Results from "). 
*/ +.chatv2-login__attribution { + display: inline-flex; + align-items: center; + gap: 6px; + font-size: 12px; + line-height: 1.4; + color: var(--color-fg-secondary); +} +.chatv2-login__attribution-icon { + width: 12px; + height: 12px; + border-radius: 2px; + object-fit: contain; + flex-shrink: 0; +} +.chatv2-login__attribution-text { + display: inline-flex; + align-items: baseline; + gap: 6px; + min-width: 0; +} +.chatv2-login__attribution-site { + color: var(--color-fg-primary); + font-weight: 600; +} +.chatv2-login__attribution-sep { + color: var(--color-fg-quaternary, var(--color-fg-tertiary)); +} +.chatv2-login__attribution-host { + font-size: 11.5px; + color: var(--color-fg-tertiary); +} + +.chatv2-login__submit-error { + font-size: 12px; + color: var(--color-status-danger, #ff7a7a); + margin-top: 4px; +} + +/* Post-submit collapsed view — dashed border receipt, no password echo. */ +.chatv2-login--submitted { + background: transparent; + border-style: dashed; + padding: 12px 16px; +} +.chatv2-login__receipt { + display: flex; + flex-direction: column; + gap: 3px; + min-width: 0; +} +.chatv2-login__receipt-title { + font-size: 13.5px; + font-weight: 600; + color: var(--color-fg-primary); +} +.chatv2-login__receipt-meta { + font-size: 12px; + color: var(--color-fg-tertiary); +} + +/* Skeleton state */ +.chatv2-login__skel-line, +.chatv2-login__skel-input { + border-radius: var(--radius-xs, 3px); + background: linear-gradient(110deg, + color-mix(in srgb, var(--color-fg-primary) 6%, transparent) 30%, + color-mix(in srgb, var(--color-fg-primary) 16%, transparent) 50%, + color-mix(in srgb, var(--color-fg-primary) 6%, transparent) 70%); + background-size: 200% 100%; + animation: chatv2-login-sk 1.4s infinite linear; +} +.chatv2-login__skel-line { height: 14px; width: 60%; } +.chatv2-login__skel-input { height: 36px; width: 100%; border-radius: var(--radius-sm, 5px); } +@keyframes chatv2-login-sk { + 0% { background-position: 100% 0; } + 100% { background-position: 
-100% 0; } +} + +.chatv2-login__error { + font-family: var(--font-mono, ui-monospace, SFMono-Regular, Menlo, monospace); + font-size: 12px; + color: var(--color-fg-tertiary); + padding: 10px 12px; + border: 1px dashed var(--color-border-subtle); + border-radius: var(--radius-md, 6px); +} From bc4e5e801f3728a8eccc0595d898dd308a1069f8 Mon Sep 17 00:00:00 2001 From: Reagan Hsu Date: Tue, 26 May 2026 17:34:38 -0700 Subject: [PATCH 03/17] Wire LoginForm into chat turn dispatch + browser-view event listener --- app/src/renderer/hub/chat/ChatPane.tsx | 8 ++++++++ app/src/renderer/hub/chat/ChatTurn.tsx | 15 ++++++++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/app/src/renderer/hub/chat/ChatPane.tsx b/app/src/renderer/hub/chat/ChatPane.tsx index eb663475..4d45ef4f 100644 --- a/app/src/renderer/hub/chat/ChatPane.tsx +++ b/app/src/renderer/hub/chat/ChatPane.tsx @@ -45,6 +45,14 @@ export function ChatPane({ sessionId, onSwitchToBrowser, onExit }: ChatPaneProps return () => { cancelled = true; }; }, [sessionId]); + // Listen for in-renderer requests to switch to the browser view (emitted + // by structured blocks like LoginForm's "log in myself" escape hatch). 
+ useEffect(() => { + const handler = (): void => { onSwitchToBrowser(); }; + window.addEventListener('chatv2:open-browser', handler); + return () => window.removeEventListener('chatv2:open-browser', handler); + }, [onSwitchToBrowser]); + const header = useSessionsStore( useShallow((s): { prompt: string; diff --git a/app/src/renderer/hub/chat/ChatTurn.tsx b/app/src/renderer/hub/chat/ChatTurn.tsx index babe211e..9b27c942 100644 --- a/app/src/renderer/hub/chat/ChatTurn.tsx +++ b/app/src/renderer/hub/chat/ChatTurn.tsx @@ -15,6 +15,7 @@ import { extractAll } from '../chat-v2/htmlBlocks'; import { HtmlBlock } from '../chat-v2/HtmlBlock'; import { OptionList } from '../chat-v2/OptionList'; import { AskForm } from '../chat-v2/AskForm'; +import { LoginForm } from '../chat-v2/LoginForm'; const USER_BUBBLE_CLAMP_LINES = 10; const USER_BUBBLE_CLAMP_CHARS = 600; @@ -423,7 +424,7 @@ function StreamingProse({ // `htmlview`, and `options` fences and emits structured events for // each. Cheap to run (regex-based, pure) — re-execute on every render. const events = extractAll([target]); - const hasStructuredBlock = events.some((e) => e.kind === 'html_block' || e.kind === 'option_list' || e.kind === 'ask_form'); + const hasStructuredBlock = events.some((e) => e.kind === 'html_block' || e.kind === 'option_list' || e.kind === 'ask_form' || e.kind === 'login_form'); // Hook must run unconditionally (rules-of-hooks); result is only consumed // on the no-structured-block branch below. 
const shown = useTypewriter(target, 110, done); @@ -466,6 +467,18 @@ function StreamingProse({ /> ); } + if (e.kind === 'login_form') { + return ( + + ); + } return ( Date: Tue, 26 May 2026 17:34:42 -0700 Subject: [PATCH 04/17] Add login block guidance to engine system prompts --- app/src/main/hl/engines/browsercode/adapter.ts | 3 ++- app/src/main/hl/engines/claude-code/adapter.ts | 3 ++- app/src/main/hl/engines/codex/adapter.ts | 3 ++- app/src/main/hl/engines/skillIndexPrompt.ts | 17 +++++++++++++++++ 4 files changed, 23 insertions(+), 3 deletions(-) diff --git a/app/src/main/hl/engines/browsercode/adapter.ts b/app/src/main/hl/engines/browsercode/adapter.ts index 5aeb06de..b05d6289 100644 --- a/app/src/main/hl/engines/browsercode/adapter.ts +++ b/app/src/main/hl/engines/browsercode/adapter.ts @@ -8,7 +8,7 @@ import { register } from '../registry'; import { applyBrowserHarnessEnv } from '../browserHarnessEnv'; -import { buildSkillIndexPrompt, SKILL_DISCOVERY_AND_LIFECYCLE_LINES, htmlBlockGuidanceLines, optionsBlockGuidanceLines, askBlockGuidanceLines } from '../skillIndexPrompt'; +import { buildSkillIndexPrompt, SKILL_DISCOVERY_AND_LIFECYCLE_LINES, htmlBlockGuidanceLines, optionsBlockGuidanceLines, askBlockGuidanceLines, loginBlockGuidanceLines } from '../skillIndexPrompt'; import { resolveThemeMode } from '../../../themeMode'; import { enrichedEnv } from '../pathEnrich'; import { runCliCapture } from '../cliSpawn'; @@ -179,6 +179,7 @@ const browserCodeAdapter: EngineAdapter = { ...htmlBlockGuidanceLines(resolveThemeMode()), ...optionsBlockGuidanceLines(), ...askBlockGuidanceLines(), + ...loginBlockGuidanceLines(), "Use the `browser-harness-js` CLI for browser actions. 
Start with `browser-harness-js 'await connectToAssignedTarget()'`.", 'Do not use old helpers.js convenience APIs for browser control.', 'Do not edit harness files unless the user asks or a confirmed Browser Harness JS defect blocks the task.', diff --git a/app/src/main/hl/engines/claude-code/adapter.ts b/app/src/main/hl/engines/claude-code/adapter.ts index 1fd45a96..ec2c3f58 100644 --- a/app/src/main/hl/engines/claude-code/adapter.ts +++ b/app/src/main/hl/engines/claude-code/adapter.ts @@ -13,7 +13,7 @@ import { mainLogger } from '../../../logger'; import { register } from '../registry'; import { applyBrowserHarnessEnv } from '../browserHarnessEnv'; -import { buildSkillIndexPrompt, SKILL_DISCOVERY_AND_LIFECYCLE_LINES, htmlBlockGuidanceLines, optionsBlockGuidanceLines, askBlockGuidanceLines } from '../skillIndexPrompt'; +import { buildSkillIndexPrompt, SKILL_DISCOVERY_AND_LIFECYCLE_LINES, htmlBlockGuidanceLines, optionsBlockGuidanceLines, askBlockGuidanceLines, loginBlockGuidanceLines } from '../skillIndexPrompt'; import { resolveThemeMode } from '../../../themeMode'; import { enrichedEnv } from '../pathEnrich'; import { runCliCapture, spawnCli } from '../cliSpawn'; @@ -126,6 +126,7 @@ const claudeCodeAdapter: EngineAdapter = { ...htmlBlockGuidanceLines(resolveThemeMode()), ...optionsBlockGuidanceLines(), ...askBlockGuidanceLines(), + ...loginBlockGuidanceLines(), "Use the `browser-harness-js` CLI for browser actions. 
Start with `browser-harness-js 'await connectToAssignedTarget()'`.", 'Do not use old helpers.js convenience APIs for browser control.', 'Do not edit harness files unless the user asks or a confirmed Browser Harness JS defect blocks the task.', diff --git a/app/src/main/hl/engines/codex/adapter.ts b/app/src/main/hl/engines/codex/adapter.ts index 2d818e39..2c062350 100644 --- a/app/src/main/hl/engines/codex/adapter.ts +++ b/app/src/main/hl/engines/codex/adapter.ts @@ -21,7 +21,7 @@ import path from 'node:path'; import { mainLogger } from '../../../logger'; import { register } from '../registry'; import { applyBrowserHarnessEnv } from '../browserHarnessEnv'; -import { buildSkillIndexPrompt, SKILL_DISCOVERY_AND_LIFECYCLE_LINES, htmlBlockGuidanceLines, optionsBlockGuidanceLines, askBlockGuidanceLines } from '../skillIndexPrompt'; +import { buildSkillIndexPrompt, SKILL_DISCOVERY_AND_LIFECYCLE_LINES, htmlBlockGuidanceLines, optionsBlockGuidanceLines, askBlockGuidanceLines, loginBlockGuidanceLines } from '../skillIndexPrompt'; import { resolveThemeMode } from '../../../themeMode'; import { enrichedEnv } from '../pathEnrich'; import { runCliCapture } from '../cliSpawn'; @@ -117,6 +117,7 @@ const codexAdapter: EngineAdapter = { ...htmlBlockGuidanceLines(resolveThemeMode()), ...optionsBlockGuidanceLines(), ...askBlockGuidanceLines(), + ...loginBlockGuidanceLines(), "Use the `browser-harness-js` CLI for browser actions. 
Start with `browser-harness-js 'await connectToAssignedTarget()'`.", 'Do not use old helpers.js convenience APIs for browser control.', 'Do not edit harness files unless the user asks or a confirmed Browser Harness JS defect blocks the task.', diff --git a/app/src/main/hl/engines/skillIndexPrompt.ts b/app/src/main/hl/engines/skillIndexPrompt.ts index d70b9583..08e09a59 100644 --- a/app/src/main/hl/engines/skillIndexPrompt.ts +++ b/app/src/main/hl/engines/skillIndexPrompt.ts @@ -98,6 +98,23 @@ export function askBlockGuidanceLines(): string[] { ]; } +/** + * Provider-neutral nudge for the `login` fenced block — the renderer + * surfaces it as a username/password form with a "log in manually in the + * browser" escape hatch. The agent reads the credentials from the next + * user turn and types them into the live browser view. See the + * `login-block` interaction skill for the full schema and the + * manual-login fallback contract. + */ +export function loginBlockGuidanceLines(): string[] { + return [ + 'When the live browser hits a login wall and you need the user to provide credentials, emit a ```login fenced block carrying JSON: { site, url, prompt?, usernameLabel?, passwordLabel? }. `site` is the brand token (e.g. "Amazon", not "amazon.com"); `url` is the absolute http(s) login URL.', + 'The `login` block ENDS YOUR TURN. After emitting it, do not call any more tools — stop and wait for the user. Their reply arrives as "Login for :\\nusername: \\npassword:

" — type these verbatim into the username/password fields of the live tab, then submit. Do NOT echo the password back in your own response.', + 'The form also offers the user a "log in on myself" affordance that opens the in-app browser view directly; if they take that path you will not get a structured reply, just whatever they type next (e.g. "done"). Treat any plain follow-up message as the signal to resume.', + 'Use `login` only for real credential walls. For multiple-choice disambiguation, use `ask`; for picking among visible options, use `options`. See the `login-block` interaction skill for the full schema and worked examples.', + ]; +} + function normalizeSlash(value: string): string { return value.split(path.sep).join('/'); } From c571027ca86d264a0a612c9db7e3bab00e38559b Mon Sep 17 00:00:00 2001 From: Reagan Hsu Date: Tue, 26 May 2026 19:53:56 -0700 Subject: [PATCH 05/17] Add capture fenced block parser + payload schema --- app/src/renderer/hub/chat-v2/htmlBlocks.ts | 60 ++++++++++++++-- app/tests/unit/chat-v2/captureBlocks.test.ts | 75 ++++++++++++++++++++ 2 files changed, 131 insertions(+), 4 deletions(-) create mode 100644 app/tests/unit/chat-v2/captureBlocks.test.ts diff --git a/app/src/renderer/hub/chat-v2/htmlBlocks.ts b/app/src/renderer/hub/chat-v2/htmlBlocks.ts index d213c281..f6a38b94 100644 --- a/app/src/renderer/hub/chat-v2/htmlBlocks.ts +++ b/app/src/renderer/hub/chat-v2/htmlBlocks.ts @@ -126,14 +126,30 @@ export interface LoginPayload { passwordLabel?: string; } -export type FenceTag = 'html' | 'htmlview' | 'options' | 'ask' | 'login'; +/** + * Payload for a ```capture fence — recaptcha 3×3 (or NxM) tile picker. + * `image` is an absolute path under the harness outputs dir; the renderer + * loads it via `chatfile://files`. The renderer slices that single + * image visually into rows×cols tiles via CSS `background-position`; no + * pre-cropping happens server-side. 
Selection is returned to the agent as + * a list of tile indices (0-based, left-to-right, top-to-bottom). + */ +export interface CapturePayload { + image: string; + prompt?: string; + rows: number; + cols: number; +} + +export type FenceTag = 'html' | 'htmlview' | 'options' | 'ask' | 'login' | 'capture'; export type ExtractEvent = | { kind: 'text'; text: string } | { kind: 'html_block'; content: string; tag: 'html' | 'htmlview'; complete: boolean } | { kind: 'option_list'; complete: boolean; raw: string; parsed: OptionListPayload | null; error?: string } | { kind: 'ask_form'; complete: boolean; raw: string; parsed: AskFormPayload | null; error?: string } - | { kind: 'login_form'; complete: boolean; raw: string; parsed: LoginPayload | null; error?: string }; + | { kind: 'login_form'; complete: boolean; raw: string; parsed: LoginPayload | null; error?: string } + | { kind: 'capture_block'; complete: boolean; raw: string; parsed: CapturePayload | null; error?: string }; /** * Stateful, chunk-fed extractor. Safe to call `feed` once per streamed @@ -258,8 +274,8 @@ export function extractAll(chunks: string[]): ExtractEvent[] { * newline arrives in the next chunk); LAX also accepts end-of-input * (used only during the final `end()` flush). */ -const OPENER_STRICT = /(^|\n)```(html|htmlview|options|ask|login)[ \t]*\r?\n/; -const OPENER_LAX = /(^|\n)```(html|htmlview|options|ask|login)[ \t]*(\r?\n|$)/; +const OPENER_STRICT = /(^|\n)```(html|htmlview|options|ask|login|capture)[ \t]*\r?\n/; +const OPENER_LAX = /(^|\n)```(html|htmlview|options|ask|login|capture)[ \t]*(\r?\n|$)/; function findOpener(buf: string, flush: boolean): { start: number; end: number; tag: FenceTag } | null { const re = flush ? 
OPENER_LAX : OPENER_STRICT; @@ -339,6 +355,10 @@ function emitBlock(tag: FenceTag, content: string, complete: boolean): ExtractEv const { parsed, error } = parseLoginBlock(content); return { kind: 'login_form', complete, raw: content, parsed, error }; } + if (tag === 'capture') { + const { parsed, error } = parseCaptureBlock(content); + return { kind: 'capture_block', complete, raw: content, parsed, error }; + } return { kind: 'html_block', content, tag: tag as 'html' | 'htmlview', complete }; } @@ -347,6 +367,38 @@ function emitBlock(tag: FenceTag, content: string, complete: boolean): ExtractEv * there's no partial-streaming path because the body is tiny and the form * has nothing useful to render until the closing fence resolves the JSON. */ +/** + * Parse + validate a capture block body. Small flat JSON; no streaming + * partial path since the body is tiny and the picker can't render + * meaningfully until the image path is in hand. + */ +export function parseCaptureBlock(raw: string): { parsed: CapturePayload | null; error?: string } { + let data: unknown; + try { + data = JSON.parse(raw); + } catch { + return { parsed: null, error: 'invalid json' }; + } + if (!data || typeof data !== 'object' || Array.isArray(data)) { + return { parsed: null, error: 'expected json object at top level' }; + } + const obj = data as Record; + const image = typeof obj.image === 'string' ? obj.image.trim() : ''; + if (!image) return { parsed: null, error: 'missing required field "image"' }; + const rowsRaw = typeof obj.rows === 'number' && Number.isFinite(obj.rows) ? Math.floor(obj.rows) : 3; + const colsRaw = typeof obj.cols === 'number' && Number.isFinite(obj.cols) ? Math.floor(obj.cols) : 3; + const rows = Math.min(8, Math.max(1, rowsRaw)); + const cols = Math.min(8, Math.max(1, colsRaw)); + return { + parsed: { + image, + prompt: typeof obj.prompt === 'string' && obj.prompt.trim().length > 0 ? 
obj.prompt.trim() : undefined, + rows, + cols, + }, + }; +} + export function parseLoginBlock(raw: string): { parsed: LoginPayload | null; error?: string } { let data: unknown; try { diff --git a/app/tests/unit/chat-v2/captureBlocks.test.ts b/app/tests/unit/chat-v2/captureBlocks.test.ts new file mode 100644 index 00000000..b7243346 --- /dev/null +++ b/app/tests/unit/chat-v2/captureBlocks.test.ts @@ -0,0 +1,75 @@ +/** + * Streaming + parsing contract for ```capture fenced blocks. + * + * Mirrors askBlocks.test.ts in shape: pathological 1-char chunking still + * produces a single `capture_block` event with a parsed CapturePayload, + * and the parseCaptureBlock guard rejects malformed bodies cleanly. + */ + +import { describe, expect, it } from 'vitest'; +import { extractAll, parseCaptureBlock, type ExtractEvent } from '@/renderer/hub/chat-v2/htmlBlocks'; + +function stream1(s: string): string[] { + return s.split(''); +} + +describe('capture fence — streaming', () => { + const fence = '```capture\n{"image":"/tmp/grid.png","prompt":"Select motorcycles","rows":3,"cols":3}\n```'; + const wrapped = `Here it is:\n\n${fence}\n\nPress verify when done.`; + + it('emits a single capture_block event under 1-char chunking', () => { + const events = extractAll(stream1(wrapped)); + const captures = events.filter((e): e is Extract => e.kind === 'capture_block'); + expect(captures).toHaveLength(1); + expect(captures[0].complete).toBe(true); + expect(captures[0].parsed).not.toBeNull(); + expect(captures[0].parsed?.image).toBe('/tmp/grid.png'); + expect(captures[0].parsed?.prompt).toBe('Select motorcycles'); + expect(captures[0].parsed?.rows).toBe(3); + expect(captures[0].parsed?.cols).toBe(3); + }); + + it('preserves the surrounding text on either side', () => { + const events = extractAll(stream1(wrapped)); + const texts = events.filter((e) => e.kind === 'text').map((e) => 'text' in e ? 
e.text : ''); + expect(texts.join('')).toContain('Here it is:'); + expect(texts.join('')).toContain('Press verify when done.'); + }); +}); + +describe('parseCaptureBlock', () => { + it('accepts a minimal valid payload (just image)', () => { + const { parsed, error } = parseCaptureBlock('{"image":"/tmp/g.png"}'); + expect(error).toBeUndefined(); + expect(parsed).toEqual({ image: '/tmp/g.png', prompt: undefined, rows: 3, cols: 3 }); + }); + + it('clamps absurd rows/cols into the [1, 8] range', () => { + const { parsed } = parseCaptureBlock('{"image":"/tmp/g.png","rows":99,"cols":0}'); + expect(parsed?.rows).toBe(8); + expect(parsed?.cols).toBe(1); + }); + + it('rejects missing image', () => { + const { parsed, error } = parseCaptureBlock('{"prompt":"x"}'); + expect(parsed).toBeNull(); + expect(error).toMatch(/image/); + }); + + it('rejects malformed JSON', () => { + const { parsed, error } = parseCaptureBlock('{not json'); + expect(parsed).toBeNull(); + expect(error).toMatch(/json/i); + }); + + it('rejects a top-level array', () => { + const { parsed, error } = parseCaptureBlock('[]'); + expect(parsed).toBeNull(); + expect(error).toMatch(/object/); + }); + + it('drops a blank prompt', () => { + const { parsed } = parseCaptureBlock('{"image":"/x.png","prompt":" "}'); + expect(parsed?.prompt).toBeUndefined(); + }); +}); From 8686f36107acc2e28543c27ab06336b9cbb12f5e Mon Sep 17 00:00:00 2001 From: Reagan Hsu Date: Tue, 26 May 2026 19:53:56 -0700 Subject: [PATCH 06/17] Add CaptureBlock component for in-chat captcha tile picker --- app/src/renderer/hub/chat-v2/CaptureBlock.tsx | 266 ++++++++++++++++++ app/src/renderer/hub/chat-v2/captureBlock.css | 238 ++++++++++++++++ app/tests/unit/chat-v2/CaptureBlock.spec.tsx | 143 ++++++++++ 3 files changed, 647 insertions(+) create mode 100644 app/src/renderer/hub/chat-v2/CaptureBlock.tsx create mode 100644 app/src/renderer/hub/chat-v2/captureBlock.css create mode 100644 app/tests/unit/chat-v2/CaptureBlock.spec.tsx diff --git 
a/app/src/renderer/hub/chat-v2/CaptureBlock.tsx b/app/src/renderer/hub/chat-v2/CaptureBlock.tsx new file mode 100644 index 00000000..e8488211 --- /dev/null +++ b/app/src/renderer/hub/chat-v2/CaptureBlock.tsx @@ -0,0 +1,266 @@ +/** + * CaptureBlock — recaptcha-style NxM tile picker rendered from a single + * screenshot the agent emitted in a ```capture fence. + * + * The one image is sliced into rows×cols equal tiles purely via CSS + * (`background-size` + `background-position`). The user toggles tiles; + * submission returns the selected tile indices as the next user turn + * shaped as: + * + * Captcha selected tiles: 0, 2, 6 + * + * The agent retains the captcha's page coordinates (it captured them + * to clip the screenshot) and converts indices into per-tile click + * coordinates on its side — the renderer never sends pixels back. + */ + +import React, { useCallback, useEffect, useMemo, useRef, useState } from 'react'; +import type { CapturePayload } from './htmlBlocks'; +import { getSubmissionRecord, recordSubmission, submissionKey } from './optionListStore'; +import './captureBlock.css'; + +interface Props { + payload: CapturePayload | null; + complete: boolean; + error?: string; + sessionId?: string; + nextUserText?: string | null; +} + +const SUBMIT_PREFIX = 'Captcha selected tiles:'; + +export function CaptureBlock(props: Props): React.ReactElement { + const { payload, complete, error, sessionId, nextUserText } = props; + if (!payload) { + if (complete && error) { + return ( +

+
capture block ignored: {error}
+
+ ); + } + return ; + } + return ; +} + +function CaptureSkeleton(): React.ReactElement { + return ( +
+
+ {Array.from({ length: 9 }).map((_, i) => ( +
+ ))} +
+
+ ); +} + +interface ReadyProps { + payload: CapturePayload; + sessionId?: string; + streaming?: boolean; + nextUserText?: string | null; +} + +function CaptureReady({ payload, sessionId, streaming, nextUserText }: ReadyProps): React.ReactElement { + const { image, prompt, rows, cols } = payload; + const tileCount = rows * cols; + + // Stable cache key so re-mounts in the same renderer session preserve state. + const cacheKey = useMemo( + () => `capture:${submissionKey(sessionId, [image, String(rows), String(cols)])}`, + [sessionId, image, rows, cols], + ); + const cachedRecord = useMemo(() => getSubmissionRecord(cacheKey), [cacheKey]); + + // Transcript-derived submission — reconstruct selection from a prior + // "Captcha selected tiles: …" reply. Wins over the in-memory cache so + // reopened sessions stay correct without DB persistence. + const transcriptSelection = useMemo( + () => deriveCaptureSubmission(nextUserText, tileCount), + [nextUserText, tileCount], + ); + + const [selected, setSelected] = useState>(() => { + if (transcriptSelection) return new Set(transcriptSelection); + if (cachedRecord) { + const set = new Set(); + for (const id of cachedRecord.selectedIds) { + const n = Number.parseInt(id, 10); + if (Number.isInteger(n) && n >= 0 && n < tileCount) set.add(n); + } + return set; + } + return new Set(); + }); + const [submitted, setSubmitted] = useState( + transcriptSelection !== null || cachedRecord !== null, + ); + const [submitError, setSubmitError] = useState(null); + const [localSubmit, setLocalSubmit] = useState(false); + // Per-tile press-animation state so the scale-down feels tactile rather + // than relying on CSS :active (which doesn't trigger on keyboard activation). + const [pressing, setPressing] = useState(null); + const gridRef = useRef(null); + + // Late-arriving transcript hydration — same pattern as AskForm. 
+ useEffect(() => { + if (!transcriptSelection || localSubmit) return; + setSelected(new Set(transcriptSelection)); + setSubmitted(true); + }, [transcriptSelection, localSubmit]); + + const toggle = useCallback((idx: number): void => { + if (submitted) return; + setSelected((prev) => { + const next = new Set(prev); + if (next.has(idx)) next.delete(idx); + else next.add(idx); + return next; + }); + }, [submitted]); + + const submit = useCallback(async (): Promise => { + if (submitted) return; + if (!sessionId) { + setSubmitError('no active session'); + return; + } + const sortedIndices = Array.from(selected).sort((a, b) => a - b); + const message = sortedIndices.length === 0 + ? `${SUBMIT_PREFIX} (none)` + : `${SUBMIT_PREFIX} ${sortedIndices.join(', ')}`; + setLocalSubmit(true); + setSubmitted(true); + setSubmitError(null); + try { + const result = await window.electronAPI?.sessions?.resume(sessionId, message); + if (result?.error) { + setSubmitError(result.error); + setSubmitted(false); + setLocalSubmit(false); + } else { + recordSubmission(cacheKey, sortedIndices.map((n) => String(n))); + } + } catch (err) { + setSubmitError((err as Error).message); + setSubmitted(false); + setLocalSubmit(false); + } + }, [submitted, sessionId, selected, cacheKey]); + + const imageSrc = useMemo(() => { + // The image path may already be a URL (chatfile://, data:, http(s):); only + // wrap raw absolute paths into chatfile://files. + if (/^[a-z]+:/i.test(image)) return image; + return `chatfile://files${encodeURI(image)}`; + }, [image]); + + const submitLabel = submitted + ? 'Sent to agent' + : selected.size === 0 + ? 'Confirm (no tiles)' + : `Confirm ${selected.size} tile${selected.size === 1 ? '' : 's'}`; + + return ( +
+ {prompt && ( +
{prompt}
+ )} +
+ {Array.from({ length: tileCount }).map((_, i) => { + const row = Math.floor(i / cols); + const col = i % cols; + // CSS background-size of (cols*100%, rows*100%) stretches the one image + // to span the whole grid; background-position picks each cell's slice. + const bgX = cols === 1 ? 0 : (col / (cols - 1)) * 100; + const bgY = rows === 1 ? 0 : (row / (rows - 1)) * 100; + const isSel = selected.has(i); + const isPressing = pressing === i; + return ( + + ); + })}
+
+ + {submitError && ( + {submitError} + )} +
+
+ ); +} + +/** + * Reverse of the submit format: parse "Captcha selected tiles: 0, 2, 6" + * (or "Captcha selected tiles: (none)") into a Set of tile + * indices. Returns null when text isn't a capture reply. + * + * Exported for tests. + */ +export function deriveCaptureSubmission( + text: string | null | undefined, + tileCount: number, +): Set | null { + if (!text) return null; + const head = text.trimStart(); + if (!head.startsWith(SUBMIT_PREFIX)) return null; + const rest = head.slice(SUBMIT_PREFIX.length).trim(); + if (rest.length === 0) return new Set(); + if (rest.toLowerCase() === '(none)') return new Set(); + const out = new Set(); + for (const part of rest.split(/[,\s]+/)) { + if (!part) continue; + const n = Number.parseInt(part, 10); + if (Number.isInteger(n) && n >= 0 && n < tileCount) out.add(n); + } + return out; +} diff --git a/app/src/renderer/hub/chat-v2/captureBlock.css b/app/src/renderer/hub/chat-v2/captureBlock.css new file mode 100644 index 00000000..c36dcf0b --- /dev/null +++ b/app/src/renderer/hub/chat-v2/captureBlock.css @@ -0,0 +1,238 @@ +/* CaptureBlock — recaptcha-style NxM tile picker. + * + * Uses design-system semantic tokens from theme.shell.css; auto-flips + * with [data-mode="light"] so no light-mode block is needed here. + * + * The grid is one sliced into rows×cols tiles entirely via CSS + * background-position. See CaptureBlock.tsx for the math. 
+ */ + +.chatv2-capture { + clear: both; + display: flex; + flex-direction: column; + gap: 12px; + margin-top: 18px; + padding: 4px 0; +} + +.chatv2-capture__prompt { + font-size: 15px; + font-weight: 600; + color: var(--color-fg-primary); + letter-spacing: -0.005em; + line-height: 1.35; +} + +.chatv2-capture__grid { + display: grid; + grid-template-columns: repeat(var(--cols, 3), 1fr); + grid-auto-rows: 1fr; + gap: 4px; + width: 100%; + max-width: 360px; + aspect-ratio: var(--cols, 3) / var(--rows, 3); + border-radius: 10px; + overflow: hidden; + background: var(--color-surface-2, #1a1a1a); + outline: none; +} + +.chatv2-capture__tile { + position: relative; + border: none; + padding: 0; + margin: 0; + background-color: var(--color-surface-1, #111); + background-repeat: no-repeat; + cursor: pointer; + overflow: hidden; + outline: none; + /* Snap to integer pixels so adjacent tiles share an exact edge — no + * sub-pixel seams that show the slicing. */ + transform: translateZ(0); + transition: transform 120ms cubic-bezier(0.2, 0.7, 0.2, 1), + filter 140ms ease-out, + box-shadow 140ms ease-out; + animation: chatv2-capture-tile-in 280ms cubic-bezier(0.2, 0.8, 0.2, 1) both; + animation-delay: calc(var(--tile-index, 0) * 28ms); + will-change: transform; +} + +@keyframes chatv2-capture-tile-in { + from { opacity: 0; transform: translateY(6px) scale(0.96); } + to { opacity: 1; transform: translateY(0) scale(1); } +} + +.chatv2-capture__tile:hover:not(:disabled) { + filter: brightness(1.08); + z-index: 1; +} + +.chatv2-capture__tile:focus-visible { + box-shadow: inset 0 0 0 2px var(--color-fg-primary); + z-index: 2; +} + +/* Pointer-down: snap down 6% with a fast curve. Pointer-up returns + * through the transform transition above for a tactile bounce-back. 
*/ +.chatv2-capture__tile[data-pressing="true"]:not(:disabled) { + transform: scale(0.94); + transition-duration: 70ms; + filter: brightness(0.95); +} + +/* Selected state — neutral b/w treatment that matches OptionList / + * AskForm. Ring uses the foreground color (mode-inverted: white on dark, + * black on light), the wash is a soft neutral dim, and the check chip + * pill is fg-on-bg so it reads the same way in both modes. */ +.chatv2-capture__tile[data-selected="true"] { + box-shadow: inset 0 0 0 3px var(--color-fg-primary); +} +.chatv2-capture__tile[data-selected="true"]::after { + content: ''; + position: absolute; + inset: 0; + background: rgba(0, 0, 0, 0.32); + pointer-events: none; + animation: chatv2-capture-wash-in 160ms ease-out both; +} +@keyframes chatv2-capture-wash-in { + from { opacity: 0; } + to { opacity: 1; } +} + +.chatv2-capture__check { + position: absolute; + top: 6px; + right: 6px; + width: 22px; + height: 22px; + border-radius: 11px; + background: var(--color-fg-primary); + color: var(--color-bg-base); + display: flex; + align-items: center; + justify-content: center; + transform: scale(0); + transition: transform 180ms cubic-bezier(0.34, 1.56, 0.64, 1); + z-index: 2; + pointer-events: none; + box-shadow: 0 1px 4px rgba(0, 0, 0, 0.35); +} +.chatv2-capture__tile[data-selected="true"] .chatv2-capture__check { + transform: scale(1); +} + +.chatv2-capture__tile:disabled { + cursor: default; + filter: none; +} + +/* Skeleton variant — placeholder grid while the fence is still streaming + * (no image yet). 
*/ +.chatv2-capture__grid--skel .chatv2-capture__tile--skel { + background: linear-gradient( + 90deg, + var(--color-surface-1, #111) 0%, + var(--color-surface-2, #1a1a1a) 50%, + var(--color-surface-1, #111) 100% + ); + background-size: 200% 100%; + animation: chatv2-capture-shimmer 1.4s linear infinite; + cursor: default; +} +@keyframes chatv2-capture-shimmer { + from { background-position: 200% 0; } + to { background-position: -200% 0; } +} + +/* Footer — Confirm + error hint. Mirrors the option-list / ask-form foot + * so the chat surface stays visually consistent. */ +.chatv2-capture__foot { + display: flex; + align-items: center; + gap: 10px; + margin-top: 6px; +} + +/* Mirrors OptionList: fg-on-bg solid pill when live, transparent + * outline-only pill when locked. No color accents — keeps the simple + * b/w aesthetic consistent across all chat-v2 pickers. */ +.chatv2-capture__submit { + appearance: none; + background: var(--color-fg-primary); + color: var(--color-bg-base); + border: 0; + border-radius: 999px; + padding: 11px 24px; + font: 600 13.5px var(--font-ui, system-ui, sans-serif); + cursor: pointer; + transition: filter var(--duration-fast, 100ms) var(--ease-out, ease); +} +.chatv2-capture__submit:hover:not(:disabled) { + filter: brightness(0.93); +} +.chatv2-capture__submit:disabled { + background: transparent; + color: var(--color-fg-tertiary); + border: 1px solid var(--color-border-subtle); + cursor: not-allowed; +} + +.chatv2-capture__hint { + font-size: 12.5px; + color: var(--color-fg-tertiary, #888); +} +.chatv2-capture__hint--err { + color: #ff7a7a; +} + +/* Submitted state — the user has confirmed, agent is processing. + * + * No color shift on the chrome (b/w, consistent with options / askform); + * only selected tiles turn success-green (below). Two further signals: + * 1) unselected tiles are heavily desaturated and dimmed, so the + * surviving "selected" tiles read as the only live content. 
+ * 2) the foot button switches to the OptionList disabled pill — + * transparent body, tertiary outline, "Sent to agent" text. */ +.chatv2-capture--submitted .chatv2-capture__tile[data-selected="false"] { + filter: grayscale(1) brightness(0.42); + opacity: 0.5; +} +/* Selected tiles flip to a success-green ring + chip on submit so the + * "you finalized these" reading is unambiguous. The foot button stays + * b/w (matches OptionList) — only the tile state carries the success + * color, not the surrounding chrome. */ +.chatv2-capture--submitted .chatv2-capture__tile[data-selected="true"] { + box-shadow: inset 0 0 0 3px #3ecf8e; +} +.chatv2-capture--submitted .chatv2-capture__tile[data-selected="true"]::after { + background: linear-gradient(180deg, rgba(20, 110, 70, 0.30), rgba(20, 110, 70, 0.45)); +} +.chatv2-capture--submitted .chatv2-capture__tile[data-selected="true"] .chatv2-capture__check { + background: #3ecf8e; + color: #062213; +} + +.chatv2-capture__error { + font-size: 12.5px; + color: #ff7a7a; + padding: 6px 0; +} + +@media (prefers-reduced-motion: reduce) { + .chatv2-capture__tile { + animation: none; + transition: none; + } + .chatv2-capture__tile[data-selected="true"]::after { + animation: none; + } + .chatv2-capture__check { + transition: none; + } + .chatv2-capture__grid--skel .chatv2-capture__tile--skel { + animation: none; + } +} diff --git a/app/tests/unit/chat-v2/CaptureBlock.spec.tsx b/app/tests/unit/chat-v2/CaptureBlock.spec.tsx new file mode 100644 index 00000000..ca9c0e1d --- /dev/null +++ b/app/tests/unit/chat-v2/CaptureBlock.spec.tsx @@ -0,0 +1,143 @@ +// @vitest-environment jsdom + +import React, { act } from 'react'; +import { createRoot, type Root } from 'react-dom/client'; +import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; +import { CaptureBlock, deriveCaptureSubmission } from '@/renderer/hub/chat-v2/CaptureBlock'; +import type { CapturePayload } from '@/renderer/hub/chat-v2/htmlBlocks'; +import { 
_resetSubmissionCacheForTests } from '@/renderer/hub/chat-v2/optionListStore'; + +(globalThis as { IS_REACT_ACT_ENVIRONMENT?: boolean }).IS_REACT_ACT_ENVIRONMENT = true; + +type ResumeMock = ReturnType; + +function installBridge(): ResumeMock { + const resume: ResumeMock = vi.fn(async () => ({ ok: true })); + (globalThis as unknown as { window: Window }).window = globalThis.window; + // @ts-expect-error — minimal stub + (globalThis as { window: Window }).window.electronAPI = { sessions: { resume } }; + return resume; +} + +function renderCapture(payload: CapturePayload, sessionId = 'session-1', nextUserText?: string): { container: HTMLDivElement; root: Root } { + const container = document.createElement('div'); + document.body.appendChild(container); + const root = createRoot(container); + act(() => { + root.render( + , + ); + }); + return { container, root }; +} + +function tiles(container: HTMLDivElement): HTMLButtonElement[] { + return Array.from(container.querySelectorAll('button.chatv2-capture__tile')); +} + +function submitBtn(container: HTMLDivElement): HTMLButtonElement { + const btn = container.querySelector('button.chatv2-capture__submit'); + if (!btn) throw new Error('submit button not found'); + return btn; +} + +const basePayload: CapturePayload = { + image: '/tmp/grid.png', + prompt: 'Select motorcycles', + rows: 3, + cols: 3, +}; + +let mounted: Root | null = null; + +beforeEach(() => { + _resetSubmissionCacheForTests(); + installBridge(); +}); + +afterEach(() => { + if (mounted) { + act(() => { mounted!.unmount(); }); + mounted = null; + } + document.body.innerHTML = ''; +}); + +describe('CaptureBlock', () => { + it('renders 9 tile buttons for a 3x3 grid', () => { + const { container, root } = renderCapture(basePayload); + mounted = root; + expect(tiles(container)).toHaveLength(9); + }); + + it('positions each tile background by index', () => { + const { container, root } = renderCapture(basePayload); + mounted = root; + const ts = tiles(container); 
+ expect(ts[0].style.backgroundPosition).toBe('0% 0%'); + expect(ts[4].style.backgroundPosition).toBe('50% 50%'); + expect(ts[8].style.backgroundPosition).toBe('100% 100%'); + expect(ts[0].style.backgroundSize).toBe('300% 300%'); + }); + + it('embeds chatfile:// prefix for raw absolute paths', () => { + const { container, root } = renderCapture(basePayload); + mounted = root; + const ts = tiles(container); + expect(ts[0].style.backgroundImage).toContain('chatfile://files/tmp/grid.png'); + }); + + it('toggles selection on click and submits sorted indices', async () => { + const resume = installBridge(); + const { container, root } = renderCapture(basePayload); + mounted = root; + const ts = tiles(container); + await act(async () => { ts[6].click(); }); + await act(async () => { ts[0].click(); }); + await act(async () => { ts[2].click(); }); + expect(ts[0].getAttribute('data-selected')).toBe('true'); + expect(ts[2].getAttribute('data-selected')).toBe('true'); + expect(ts[6].getAttribute('data-selected')).toBe('true'); + await act(async () => { submitBtn(container).click(); }); + // Allow the resume promise + state setState in the same microtask to flush. 
+ await act(async () => {}); + expect(resume).toHaveBeenCalledTimes(1); + expect(resume).toHaveBeenCalledWith('session-1', 'Captcha selected tiles: 0, 2, 6'); + }); + + it('submits "(none)" when no tiles are selected', async () => { + const resume = installBridge(); + const { container, root } = renderCapture(basePayload); + mounted = root; + await act(async () => { submitBtn(container).click(); }); + await act(async () => {}); + expect(resume).toHaveBeenCalledWith('session-1', 'Captcha selected tiles: (none)'); + }); + + it('hydrates the answered state from transcript text', () => { + const { container, root } = renderCapture(basePayload, 'session-1', 'Captcha selected tiles: 1, 5'); + mounted = root; + const ts = tiles(container); + expect(ts[1].getAttribute('data-selected')).toBe('true'); + expect(ts[5].getAttribute('data-selected')).toBe('true'); + expect(ts[1].disabled).toBe(true); + expect(submitBtn(container).disabled).toBe(true); + expect(submitBtn(container).textContent).toContain('Sent to agent'); + }); +}); + +describe('deriveCaptureSubmission', () => { + it('parses (none) reply as an empty selection', () => { + const out = deriveCaptureSubmission('Captcha selected tiles: (none)', 9); + expect(out?.size).toBe(0); + }); + + it('ignores out-of-range indices', () => { + const out = deriveCaptureSubmission('Captcha selected tiles: 0, 9, 42', 9); + expect(out && Array.from(out)).toEqual([0]); + }); + + it('returns null for non-capture replies', () => { + expect(deriveCaptureSubmission('hello there', 9)).toBeNull(); + }); +}); From 897ac19bd7daad4fc5b6533131a490a01f943e3f Mon Sep 17 00:00:00 2001 From: Reagan Hsu Date: Tue, 26 May 2026 19:53:56 -0700 Subject: [PATCH 07/17] Wire CaptureBlock into chat turn dispatch --- app/src/renderer/hub/chat/ChatTurn.tsx | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/app/src/renderer/hub/chat/ChatTurn.tsx b/app/src/renderer/hub/chat/ChatTurn.tsx index 9b27c942..e163e231 100644 --- 
a/app/src/renderer/hub/chat/ChatTurn.tsx +++ b/app/src/renderer/hub/chat/ChatTurn.tsx @@ -16,6 +16,7 @@ import { HtmlBlock } from '../chat-v2/HtmlBlock'; import { OptionList } from '../chat-v2/OptionList'; import { AskForm } from '../chat-v2/AskForm'; import { LoginForm } from '../chat-v2/LoginForm'; +import { CaptureBlock } from '../chat-v2/CaptureBlock'; const USER_BUBBLE_CLAMP_LINES = 10; const USER_BUBBLE_CLAMP_CHARS = 600; @@ -424,7 +425,7 @@ function StreamingProse({ // `htmlview`, and `options` fences and emits structured events for // each. Cheap to run (regex-based, pure) — re-execute on every render. const events = extractAll([target]); - const hasStructuredBlock = events.some((e) => e.kind === 'html_block' || e.kind === 'option_list' || e.kind === 'ask_form' || e.kind === 'login_form'); + const hasStructuredBlock = events.some((e) => e.kind === 'html_block' || e.kind === 'option_list' || e.kind === 'ask_form' || e.kind === 'login_form' || e.kind === 'capture_block'); // Hook must run unconditionally (rules-of-hooks); result is only consumed // on the no-structured-block branch below. 
const shown = useTypewriter(target, 110, done); @@ -479,6 +480,18 @@ function StreamingProse({ /> ); } + if (e.kind === 'capture_block') { + return ( + + ); + } return ( Date: Tue, 26 May 2026 19:53:56 -0700 Subject: [PATCH 08/17] Add capture block guidance to interaction-skills --- .../stock/interaction-skills/capture-block.md | 100 ++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 app/src/main/hl/stock/interaction-skills/capture-block.md diff --git a/app/src/main/hl/stock/interaction-skills/capture-block.md b/app/src/main/hl/stock/interaction-skills/capture-block.md new file mode 100644 index 00000000..86f813b1 --- /dev/null +++ b/app/src/main/hl/stock/interaction-skills/capture-block.md @@ -0,0 +1,100 @@ +# Capture block — recaptcha 3×3 tile picker + +When you hit a Google reCAPTCHA image challenge ("Select all squares with +motorcycles", "Select all images with traffic lights"), emit a fenced +` ```capture ` block with a single screenshot of the 3×3 tile grid. The +renderer slices that one image into 9 clickable tiles on the user's +screen. After the user picks tiles and confirms, you receive a new user +message listing the selected tile indices. + +## When to use it + +- Google reCAPTCHA v2 image challenges (3×3 grid, sometimes 4×4). +- Any captcha that asks "click all tiles containing X". + +Do NOT use it for: + +- Text or audio captchas. +- hCaptcha / Cloudflare Turnstile / Funcaptcha. +- Drag-puzzle or rotation captchas. +- General "look at this image" questions (use an inline image instead). + +## The flow + +1. Find the captcha iframe. Record its **page-coordinate bounding box** + for the **grid area only** (the 3×3 image grid). Exclude the prompt + header and the Verify/Audio/Reload toolbar — the renderer slices the + image into equal tiles, so any non-tile pixels in the screenshot + throw off the tile boundaries the user sees. +2. Call `Page.captureScreenshot` with a `clip` rect equal to that + bounding box. 
The wrapper auto-saves the PNG into the session's + outputs dir and returns the path. +3. Emit: + ``` + ```capture + { + "prompt": "Select all images with motorcycles", + "image": "/abs/path/to/recaptcha-grid.png", + "rows": 3, + "cols": 3 + } + ``` + ``` +4. **Stop calling tools.** Your turn ends. The browser session stays + warm. When the user clicks tiles and confirms, you receive a reply: + > Captcha selected tiles: 0, 2, 6 + + Indices are 0-based, left-to-right, top-to-bottom: + ``` + 0 1 2 + 3 4 5 + 6 7 8 + ``` +5. Convert each index back to its tile center using the grid bounding + box you saved in step 1: + ```js + const tileW = grid.w / cols // cols = 3 + const tileH = grid.h / rows // rows = 3 + for (const i of indices) { + const row = Math.floor(i / cols) + const col = i % cols + const cx = grid.x + (col + 0.5) * tileW + const cy = grid.y + (row + 0.5) * tileH + await session.Input.dispatchMouseEvent({ type: 'mousePressed', x: cx, y: cy, button: 'left', clickCount: 1 }) + await session.Input.dispatchMouseEvent({ type: 'mouseReleased', x: cx, y: cy, button: 'left', clickCount: 1 }) + } + ``` +6. Press Verify, wait for the page to settle, and continue. + +## Fields + +| field | required | notes | +|----------|----------|-------| +| `image` | **yes** | Absolute path to the screenshot file (under the outputs dir). | +| `prompt` | no | The captcha's instruction text ("Select all motorcycles"). | +| `rows` | no | Default `3`. Set to `4` for the rare 4×4 challenge. | +| `cols` | no | Default `3`. | + +## The turn-ending rule + +After the closing ` ``` `, **stop**. No more tool calls. Wait for the +user's reply. + +## "(none)" replies + +If the user confirms without picking any tile, you get: + +> Captcha selected tiles: (none) + +This usually means the prompt has no matches in the visible grid. +reCAPTCHA's UI expects a "Skip" / fresh challenge in that case — press +Verify anyway; the challenge will either accept (rare) or refresh to a +new image set. 
+ +## Banned + +- Including anything beyond the tile grid in the screenshot (prompt header, + Verify button). The renderer assumes the image is the grid only. +- Multiple `capture` fences in one turn. One challenge at a time. +- Inline base64 in `image` — the renderer expects an absolute path; the + `Page.captureScreenshot` wrapper already saves to a real file. From 7525ede470e09f90b739d2e904753eb40050f677 Mon Sep 17 00:00:00 2001 From: Reagan Hsu Date: Tue, 26 May 2026 23:57:08 -0700 Subject: [PATCH 09/17] Restore OptionList outer chrome and add streaming skeleton cards The outer bordered container around the picker was removed in 9bdcd99; bring it back so the option grid reads as a card on the chat surface again. Also render two skeleton cards at the tail while the fence is still streaming, mirroring AskForm's TRAILING_SKELETONS pattern, so the UI doesn't appear frozen between option arrivals. --- app/src/renderer/hub/chat-v2/OptionList.tsx | 9 +++++++++ app/src/renderer/hub/chat-v2/optionList.css | 21 ++++++++++++++++----- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/app/src/renderer/hub/chat-v2/OptionList.tsx b/app/src/renderer/hub/chat-v2/OptionList.tsx index c6976149..22ca48f3 100644 --- a/app/src/renderer/hub/chat-v2/OptionList.tsx +++ b/app/src/renderer/hub/chat-v2/OptionList.tsx @@ -555,6 +555,15 @@ function OptionListReady({ payload, sessionId, streaming, cancelled, nextUserTex /> ); })} + {streaming && Array.from({ length: 2 }).map((_, i) => ( +