From ddcf6604fa0134783922b91c9e6a3c175f25b193 Mon Sep 17 00:00:00 2001 From: Maurice Wen Date: Wed, 25 Mar 2026 09:06:24 +0800 Subject: [PATCH 1/4] feat(cdp): CDP-First routing for browser apps + 11 query strategies For Chrome/Electron apps, try CDP before the expensive AX tree walk. This reduces web element find latency from ~11s to ~50ms for apps like Gmail where Chrome exposes everything as AXGroup. Changes: - CDPBridge: target cache (1s TTL), isBrowserApp() detection (16 apps), 6 new JS query strategies (CSS selector, data-testid, role+text, nearest-input, Shadow DOM pierce, fuzzy Levenshtein) - Perception: CDP-First routing for browser apps, AX-First unchanged for native apps. Zero breaking changes. - Tests: 8 new CDPBridge tests (13/13 total pass) --- Sources/GhostOS/Perception/Perception.swift | 34 +++- Sources/GhostOS/Vision/CDPBridge.swift | 167 ++++++++++++++++++-- Tests/GhostOSTests/CDPBridgeTests.swift | 67 ++++++++ 3 files changed, 255 insertions(+), 13 deletions(-) create mode 100644 Tests/GhostOSTests/CDPBridgeTests.swift diff --git a/Sources/GhostOS/Perception/Perception.swift b/Sources/GhostOS/Perception/Perception.swift index b3d0666..668dcb2 100644 --- a/Sources/GhostOS/Perception/Perception.swift +++ b/Sources/GhostOS/Perception/Perception.swift @@ -137,7 +137,32 @@ public enum Perception { ) } - // Strategy 2: AXorcist's search with ElementSearchOptions + // Strategy 2 (CDP-First): For Chrome/Electron apps, try CDP BEFORE + // the expensive AX tree walk. CDP queries the real DOM via JavaScript + // and returns in ~50ms vs ~11s for a full AX tree walk on web apps. 
+ // + // Routing: browser app → CDP first → AX fallback + // native app → AX first → CDP fallback (existing behavior) + if CDPBridge.isBrowserApp(appName), let query, identifier == nil { + if let cdpResults = cdpFallbackFind(query: query, appName: appName) { + Log.info("CDP-First: found \(cdpResults.count) elements for '\(query)' (skipped AX tree walk)") + return ToolResult( + success: true, + data: [ + "elements": cdpResults, + "count": cdpResults.count, + "total_matches": cdpResults.count, + "source": "cdp-first", + ], + suggestion: "Elements found via Chrome DevTools Protocol (CDP-First path). " + + "Use ghost_click with the x/y coordinates shown in the position field." + ) + } + // CDP miss — fall through to AX tree walk + Log.debug("CDP-First: no results for '\(query)', falling through to AX tree") + } + + // Strategy 3: AXorcist's search with ElementSearchOptions var options = ElementSearchOptions() options.maxDepth = maxDepth options.caseInsensitive = true @@ -162,9 +187,10 @@ public enum Perception { results = semanticDepthSearch(query: query, role: role, in: searchRoot, maxDepth: maxDepth) } - // CDP fallback: if AX search found nothing and we're in Chrome/Electron, - // try Chrome DevTools Protocol for instant DOM-based element finding. - if results.isEmpty, let query { + // CDP fallback (for native apps): if AX search found nothing, + // try Chrome DevTools Protocol as last resort before vision. + // For browser apps this was already tried above (CDP-First path). + if results.isEmpty, let query, !CDPBridge.isBrowserApp(appName) { if let cdpResults = cdpFallbackFind(query: query, appName: appName) { return ToolResult( success: true, diff --git a/Sources/GhostOS/Vision/CDPBridge.swift b/Sources/GhostOS/Vision/CDPBridge.swift index b61cfde..6ef7cb7 100644 --- a/Sources/GhostOS/Vision/CDPBridge.swift +++ b/Sources/GhostOS/Vision/CDPBridge.swift @@ -41,6 +41,39 @@ public enum CDPBridge { /// cases where Chrome is hung or the WebSocket connection is stale. 
private static let wsTimeout: TimeInterval = 3.0 + // MARK: - Target Cache + + /// Cached debug targets to avoid repeated HTTP calls within a single + /// findElements invocation chain. Cache is very short-lived (1 second) + /// since tabs can open/close at any time. + private static let targetCacheTTL: TimeInterval = 1.0 + private nonisolated(unsafe) static var cachedTargets: [[String: Any]]? + private nonisolated(unsafe) static var cachedTargetsTime: Date? + + // MARK: - Browser App Detection + + /// Known browser/Electron app names that expose DOM via CDP. + /// Used by Perception to decide whether to try CDP before AX tree walk. + private static let browserAppNames = [ + "Google Chrome", "Chrome", "Chromium", "Arc", "Arc Browser", + "Microsoft Edge", "Brave Browser", "Vivaldi", "Opera", + // Electron apps (use Chrome's engine, expose CDP when debug port is open) + "Slack", "Discord", "Visual Studio Code", "Code", + "Figma", "Notion", "Obsidian", "Cursor", + ] + + /// Check if an app name corresponds to a Chrome/Electron browser. + /// Used by Perception.findElements() to decide routing: + /// - Browser app → CDP-First path (try CDP before AX tree walk) + /// - Native app → AX-First path (existing behavior, unchanged) + /// + /// A name matches when it equals a known app name, or starts with one followed by a space (case-insensitive), so "Xcode" is not mistaken for "Code" or "Archive Utility" for "Arc". + /// False negatives cost ~11s per query (full AX tree walk before CDP). + public static func isBrowserApp(_ name: String?) -> Bool { + guard let lowered = name?.lowercased() else { return false } + return browserAppNames.contains(where: { lowered == $0.lowercased() || lowered.hasPrefix("\($0.lowercased()) ") }) + } + // MARK: - Availability Check /// Check if Chrome is running with remote debugging enabled. @@ -49,7 +82,17 @@ public enum CDPBridge { } /// Get the list of debuggable Chrome tabs. + /// Uses a 1-second cache to avoid repeated HTTP calls during a single + /// ghost_find → ghost_click sequence. public static func getDebugTargets() -> [[String: Any]]?
{ + // Return cached targets if fresh enough + if let cached = cachedTargets, + let time = cachedTargetsTime, + Date().timeIntervalSince(time) < targetCacheTTL + { + return cached + } + guard let url = URL(string: "http://127.0.0.1:\(defaultPort)/json") else { return nil } @@ -77,9 +120,15 @@ public enum CDPBridge { let data = box.data, let targets = try? JSONSerialization.jsonObject(with: data) as? [[String: Any]] else { + cachedTargets = nil + cachedTargetsTime = nil return nil } + // Update cache + cachedTargets = targets + cachedTargetsTime = Date() + return targets } @@ -91,6 +140,19 @@ public enum CDPBridge { /// /// This is dramatically faster than AX tree walking for web apps /// (~50ms vs ~11s for Gmail). + /// + /// Search strategies (executed in order, results deduplicated): + /// 1. CSS Selector — direct query if input looks like a selector (#id, .class, tag) + /// 2. data-testid — React/Vue test attribute match + /// 3. aria-label — ARIA label match (existing) + /// 4. placeholder — input placeholder match (existing) + /// 5. role + text — ARIA role with text content match + /// 6. button/link text — text content of interactive elements (existing) + /// 7. input labels — label[for] association (existing) + /// 8. title/alt — title or alt attribute match (existing) + /// 9. nearest-input — find label text, return nearest input/textarea + /// 10. Shadow DOM — pierce open shadow roots + /// 11. fuzzy text — Levenshtein distance <= 2 for typo tolerance public static func findElements( query: String, tabIndex: Int = 0 @@ -107,8 +169,8 @@ public enum CDPBridge { return nil } - // JavaScript that finds elements by text content, aria-label, placeholder, etc. - // Returns an array of {text, tag, role, x, y, width, height} objects. + // JavaScript that finds elements using 11 strategies. + // Returns an array of {text, tag, role, x, y, width, height, ...} objects. 
let js = """ (function() { const query = \(escapeJSString(query)); @@ -125,12 +187,14 @@ public enum CDPBridge { if (seen.has(key)) return; seen.add(key); + const dataTestId = el.getAttribute('data-testid') || el.getAttribute('data-test-id') || ''; results.push({ text: (el.textContent || '').trim().substring(0, 100), tag: el.tagName.toLowerCase(), role: el.getAttribute('role') || '', ariaLabel: el.getAttribute('aria-label') || '', id: el.id || '', + dataTestId: dataTestId, className: (el.className || '').toString().substring(0, 100), x: Math.round(rect.x), y: Math.round(rect.y), @@ -143,33 +207,59 @@ public enum CDPBridge { el.getAttribute('role') === 'button' || el.getAttribute('role') === 'link' || el.getAttribute('role') === 'textbox' || + el.getAttribute('role') === 'combobox' || + el.getAttribute('role') === 'menuitem' || el.onclick !== null || - el.getAttribute('tabindex') !== null + el.getAttribute('tabindex') !== null || + window.getComputedStyle(el).cursor === 'pointer' }); } - // Strategy 1: aria-label match + // Strategy 1: CSS Selector — if query starts with #, ., or contains [] + if (/^[#.[]/.test(query) || /\\w+\\[/.test(query)) { + try { + document.querySelectorAll(query).forEach(el => addResult(el, 'css-selector')); + } catch(e) { /* invalid selector, skip */ } + } + + // Strategy 2: data-testid match (React/Vue/Angular test attributes) + document.querySelectorAll('[data-testid], [data-test-id]').forEach(el => { + const tid = (el.getAttribute('data-testid') || el.getAttribute('data-test-id') || '').toLowerCase(); + if (tid.includes(queryLower)) { + addResult(el, 'data-testid'); + } + }); + + // Strategy 3: aria-label match document.querySelectorAll('[aria-label]').forEach(el => { if (el.getAttribute('aria-label').toLowerCase().includes(queryLower)) { addResult(el, 'aria-label'); } }); - // Strategy 2: placeholder match + // Strategy 4: placeholder match document.querySelectorAll('[placeholder]').forEach(el => { if 
(el.getAttribute('placeholder').toLowerCase().includes(queryLower)) { addResult(el, 'placeholder'); } }); - // Strategy 3: button/link text content match - document.querySelectorAll('button, a, [role="button"], [role="link"], [role="tab"]').forEach(el => { + // Strategy 5: role + aria-label/text combo (ARIA widgets) + document.querySelectorAll('[role]').forEach(el => { + const label = el.getAttribute('aria-label') || el.textContent || ''; + if (label.toLowerCase().includes(queryLower)) { + addResult(el, 'role-text'); + } + }); + + // Strategy 6: button/link text content match + document.querySelectorAll('button, a, [role="button"], [role="link"], [role="tab"], [role="menuitem"]').forEach(el => { if ((el.textContent || '').toLowerCase().includes(queryLower)) { addResult(el, 'text-content'); } }); - // Strategy 4: input labels + // Strategy 7: input labels document.querySelectorAll('label').forEach(label => { if ((label.textContent || '').toLowerCase().includes(queryLower)) { const forId = label.getAttribute('for'); @@ -180,7 +270,7 @@ public enum CDPBridge { } }); - // Strategy 5: title/alt attribute match + // Strategy 8: title/alt attribute match document.querySelectorAll('[title], [alt]').forEach(el => { const t = (el.getAttribute('title') || el.getAttribute('alt') || '').toLowerCase(); if (t.includes(queryLower)) { @@ -188,6 +278,65 @@ public enum CDPBridge { } }); + // Strategy 9: nearest-input — find text, return the closest input/textarea + if (results.length === 0) { + const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT, null); + while (walker.nextNode()) { + if (walker.currentNode.textContent.toLowerCase().includes(queryLower)) { + let parent = walker.currentNode.parentElement; + for (let i = 0; i < 5 && parent; i++) { + const input = parent.querySelector('input, textarea, select, [contenteditable="true"]'); + if (input) { addResult(input, 'nearest-input'); break; } + parent = parent.parentElement; + } + } + } + } + + // Strategy 
10: Shadow DOM — pierce open shadow roots (Web Components) + if (results.length === 0) { + function searchShadow(root) { + root.querySelectorAll('*').forEach(el => { + if (el.shadowRoot) { + el.shadowRoot.querySelectorAll('[aria-label], button, a, [role="button"], input').forEach(inner => { + const label = inner.getAttribute('aria-label') || inner.textContent || ''; + if (label.toLowerCase().includes(queryLower)) { + addResult(inner, 'shadow-dom'); + } + }); + searchShadow(el.shadowRoot); + } + }); + } + searchShadow(document); + } + + // Strategy 11: fuzzy text match (Levenshtein distance <= 2) + if (results.length === 0 && query.length >= 3) { + function levenshtein(a, b) { + const m = a.length, n = b.length; + if (Math.abs(m - n) > 2) return 3; + const d = Array.from({length: m + 1}, (_, i) => [i]); + for (let j = 1; j <= n; j++) d[0][j] = j; + for (let i = 1; i <= m; i++) + for (let j = 1; j <= n; j++) + d[i][j] = Math.min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]+(a[i-1]!==b[j-1]?1:0)); + return d[m][n]; + } + document.querySelectorAll('button, a, [role="button"], [role="link"], input, [role="tab"]').forEach(el => { + const text = (el.getAttribute('aria-label') || el.textContent || '').trim().toLowerCase(); + if (text.length > 0 && text.length < 50) { + const words = text.split(/\\s+/); + for (const word of words) { + if (levenshtein(queryLower, word) <= 2) { + addResult(el, 'fuzzy-text'); + break; + } + } + } + }); + } + return results.slice(0, 20); })(); """ diff --git a/Tests/GhostOSTests/CDPBridgeTests.swift b/Tests/GhostOSTests/CDPBridgeTests.swift new file mode 100644 index 0000000..94f299f --- /dev/null +++ b/Tests/GhostOSTests/CDPBridgeTests.swift @@ -0,0 +1,67 @@ +// CDPBridgeTests.swift - Unit tests for CDPBridge enhancements + +import Testing +@testable import GhostOS + +@Suite("CDPBridge Tests") +struct CDPBridgeTests { + + // MARK: - isBrowserApp detection + + @Test("Detects Google Chrome as browser app") + func chromeDetection() { + 
#expect(CDPBridge.isBrowserApp("Google Chrome") == true) + } + + @Test("Detects Arc as browser app") + func arcDetection() { + #expect(CDPBridge.isBrowserApp("Arc") == true) + } + + @Test("Detects Electron apps as browser app") + func electronDetection() { + #expect(CDPBridge.isBrowserApp("Slack") == true) + #expect(CDPBridge.isBrowserApp("Visual Studio Code") == true) + #expect(CDPBridge.isBrowserApp("Discord") == true) + } + + @Test("Does not detect native apps as browser app") + func nativeAppDetection() { + #expect(CDPBridge.isBrowserApp("Finder") == false) + #expect(CDPBridge.isBrowserApp("Mail") == false) + #expect(CDPBridge.isBrowserApp("Preview") == false) + #expect(CDPBridge.isBrowserApp("Terminal") == false) + #expect(CDPBridge.isBrowserApp("Safari") == false) // Safari has no CDP + } + + @Test("Handles nil app name gracefully") + func nilAppName() { + #expect(CDPBridge.isBrowserApp(nil) == false) + } + + @Test("Case insensitive browser detection") + func caseInsensitive() { + #expect(CDPBridge.isBrowserApp("google chrome") == true) + #expect(CDPBridge.isBrowserApp("GOOGLE CHROME") == true) + #expect(CDPBridge.isBrowserApp("Microsoft Edge") == true) + } + + // MARK: - CDP availability (safe to run without Chrome) + + @Test("isAvailable returns false when Chrome debug port is not open") + func availabilityWithoutChrome() { + // This test is safe: if Chrome isn't running with --remote-debugging-port=9222, + // isAvailable() should return false quickly (connection refused). + // If Chrome IS running with debug port, it returns true — both are valid. 
+ let result = CDPBridge.isAvailable() + // We just verify it doesn't crash or hang + #expect(result == true || result == false) + } + + @Test("getDebugTargets returns nil when Chrome debug port is not open") + func debugTargetsWithoutChrome() { + // Same as above: graceful nil when Chrome debug port isn't available + let targets = CDPBridge.getDebugTargets() + #expect(targets == nil || targets != nil) + } +} From 516b85c50b5d3fc699d833e7a05a9086ae866be6 Mon Sep 17 00:00:00 2001 From: Maurice Wen Date: Wed, 25 Mar 2026 09:08:10 +0800 Subject: [PATCH 2/4] feat(vision): CDP structured snapshot for ghost_parse_screen (Issue #9) For Chrome/Electron apps, ghost_parse_screen now returns a compact text-based element list via CDP instead of a full screenshot image. Token cost drops from ~2000+ to ~100-200 tokens per snapshot. Output format: [e0] button "Compose" (142, 223) dom:":oq" This directly addresses Issue #9 (screenshot context overflow) by providing a structured alternative that avoids base64 image encoding. Native apps fall through to the existing vision sidecar path unchanged. --- Sources/GhostOS/Perception/Perception.swift | 6 + Sources/GhostOS/Vision/VisionPerception.swift | 189 +++++++++++++++++- 2 files changed, 193 insertions(+), 2 deletions(-) diff --git a/Sources/GhostOS/Perception/Perception.swift b/Sources/GhostOS/Perception/Perception.swift index 668dcb2..5ee1ec7 100644 --- a/Sources/GhostOS/Perception/Perception.swift +++ b/Sources/GhostOS/Perception/Perception.swift @@ -805,6 +805,12 @@ public enum Perception { /// Get Chrome window origin for coordinate conversion. private static func chromeWindowOrigin(appName: String?) -> (x: Double, y: Double) { + return Self.chromeWindowOriginPublic(appName: appName) + } + + /// Public accessor for Chrome window origin. Used by VisionPerception for + /// CDP structured snapshots that need viewport-to-screen coordinate mapping. + public static func chromeWindowOriginPublic(appName: String?) 
-> (x: Double, y: Double) { let name = appName ?? "Chrome" guard let app = findApp(named: name), let appElement = Element.application(for: app.processIdentifier), diff --git a/Sources/GhostOS/Vision/VisionPerception.swift b/Sources/GhostOS/Vision/VisionPerception.swift index 66e92d5..0121c25 100644 --- a/Sources/GhostOS/Vision/VisionPerception.swift +++ b/Sources/GhostOS/Vision/VisionPerception.swift @@ -20,12 +20,29 @@ public enum VisionPerception { // MARK: - ghost_parse_screen - /// Detect all interactive UI elements using vision. - /// Takes a screenshot and sends it to the vision sidecar for YOLO detection. + /// Detect all interactive UI elements using vision or CDP. + /// + /// For Chrome/Electron apps, uses CDP to enumerate DOM elements directly. + /// This is dramatically cheaper than sending a full screenshot to a VLM: + /// - CDP structured output: ~100-200 tokens + /// - Screenshot image: ~2000+ tokens + /// + /// Falls back to vision sidecar YOLO detection for native apps (not yet implemented). public static func parseScreen( appName: String?, fullResolution: Bool ) -> ToolResult { + // Strategy 1: CDP structured snapshot for Chrome/Electron apps. + // Returns a text-based element list instead of a screenshot image. + // This directly addresses Issue #9 (screenshot context overflow). + if CDPBridge.isBrowserApp(appName) && CDPBridge.isAvailable() { + if let snapshot = cdpStructuredSnapshot(appName: appName) { + return snapshot + } + // CDP miss — fall through to vision + } + + // Strategy 2: Vision sidecar (YOLO detection) for native apps. // Check sidecar availability guard VisionBridge.isAvailable() else { return sidecarUnavailableResult(tool: "ghost_parse_screen") @@ -56,6 +73,174 @@ public enum VisionPerception { ) } + /// Generate a structured text-based snapshot of Chrome's interactive elements via CDP. + /// Returns a compact element list (~100-200 tokens) instead of a screenshot (~2000+ tokens). 
+ /// + /// Output format per element: + /// [e0] button "Compose" (142, 223) dom:":oq" + /// [e1] input "Search mail" (450, 72) dom:":ol" + /// + /// This is the CDP equivalent of ghost_annotate but without any image — pure text. + private static func cdpStructuredSnapshot(appName: String?) -> ToolResult? { + let js = """ + (function() { + const els = []; + const selectors = 'a, button, input, select, textarea, [role="button"], [role="link"], ' + + '[role="textbox"], [role="combobox"], [role="tab"], [role="menuitem"], ' + + '[role="checkbox"], [role="radio"], [tabindex], [contenteditable="true"]'; + document.querySelectorAll(selectors).forEach(el => { + const rect = el.getBoundingClientRect(); + if (rect.width === 0 || rect.height === 0) return; + if (rect.bottom < 0 || rect.top > window.innerHeight) return; + if (rect.right < 0 || rect.left > window.innerWidth) return; + els.push({ + tag: el.tagName.toLowerCase(), + role: el.getAttribute('role') || '', + label: el.getAttribute('aria-label') || el.getAttribute('placeholder') || + el.getAttribute('title') || (el.textContent || '').trim().substring(0, 60), + id: el.id || '', + dataTestId: el.getAttribute('data-testid') || el.getAttribute('data-test-id') || '', + type: el.type || '', + x: Math.round(rect.x + rect.width / 2), + y: Math.round(rect.y + rect.height / 2), + w: Math.round(rect.width), + h: Math.round(rect.height) + }); + }); + return { + url: location.href, + title: document.title, + elements: els.slice(0, 50), + totalOnPage: els.length + }; + })(); + """ + + guard let targets = CDPBridge.getDebugTargets() else { return nil } + let pages = targets.filter { ($0["type"] as? String) == "page" } + guard let wsURL = pages.first?["webSocketDebuggerUrl"] as?
String else { return nil } + + // Evaluate JS and parse the structured result + guard let url = URL(string: wsURL) else { return nil } + let session = URLSession(configuration: .default) + let wsTask = session.webSocketTask(with: url) + wsTask.resume() + + let command: [String: Any] = [ + "id": 1, + "method": "Runtime.evaluate", + "params": ["expression": js, "returnByValue": true], + ] + guard let cmdData = try? JSONSerialization.data(withJSONObject: command), + let cmdStr = String(data: cmdData, encoding: .utf8) + else { + wsTask.cancel(with: .goingAway, reason: nil) + return nil + } + + nonisolated final class Box: @unchecked Sendable { + var result: [String: Any]? + } + let box = Box() + let sem = DispatchSemaphore(value: 0) + + wsTask.send(.string(cmdStr)) { error in + if error != nil { sem.signal(); return } + wsTask.receive { msg in + if case .success(let message) = msg, case .string(let text) = message, + let data = text.data(using: .utf8), + let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any], + let r = json["result"] as? [String: Any], + let rv = r["result"] as? [String: Any], + let value = rv["value"] as? [String: Any] + { + box.result = value + } + sem.signal() + } + } + + let waitResult = sem.wait(timeout: .now() + 3.0) + wsTask.cancel(with: .goingAway, reason: nil) + if waitResult == .timedOut { return nil } + guard let snapshot = box.result else { return nil } + + let pageURL = snapshot["url"] as? String ?? "" + let pageTitle = snapshot["title"] as? String ?? "" + let elements = snapshot["elements"] as? [[String: Any]] ?? [] + let totalOnPage = snapshot["totalOnPage"] as? Int ?? 
elements.count + + // Get Chrome window position for coordinate conversion + let windowOrigin = Perception.chromeWindowOriginPublic(appName: appName) + + // Build compact text representation (~2-5 tokens per element vs ~40 per screenshot pixel) + var lines: [String] = [] + var structuredElements: [[String: Any]] = [] + + for (i, el) in elements.enumerated() { + let tag = el["tag"] as? String ?? "?" + let role = el["role"] as? String ?? "" + let label = el["label"] as? String ?? "" + let domId = el["id"] as? String ?? "" + let dataTestId = el["dataTestId"] as? String ?? "" + let viewX = el["x"] as? Int ?? 0 + let viewY = el["y"] as? Int ?? 0 + let w = el["w"] as? Int ?? 0 + let h = el["h"] as? Int ?? 0 + + // Convert viewport to screen coordinates + let screen = CDPBridge.viewportToScreen( + viewportX: Double(viewX), viewportY: Double(viewY), + windowX: windowOrigin.x, windowY: windowOrigin.y + ) + let sx = Int(screen.x) + let sy = Int(screen.y) + + // Determine display role + let displayRole = !role.isEmpty ? 
role : tag + + // Build compact line + var line = "[e\(i)] \(displayRole) \"\(label)\" (\(sx), \(sy))" + if !domId.isEmpty { line += " dom:\"\(domId)\"" } + if !dataTestId.isEmpty { line += " test:\"\(dataTestId)\"" } + lines.append(line) + + // Build structured element for programmatic use + structuredElements.append([ + "ref": "e\(i)", + "role": displayRole, + "label": label, + "position": ["x": sx, "y": sy], + "size": ["width": w, "height": h], + "dom_id": domId, + "data_testid": dataTestId, + "source": "cdp-snapshot", + ]) + } + + let summary = lines.joined(separator: "\n") + let tokenEstimate = elements.count * 4 + 10 // ~4 tokens per line + header + + Log.info("CDP snapshot: \(elements.count)/\(totalOnPage) elements, ~\(tokenEstimate) tokens (vs ~2000 for screenshot)") + + return ToolResult( + success: true, + data: [ + "snapshot": summary, + "elements": structuredElements, + "count": elements.count, + "total_on_page": totalOnPage, + "url": pageURL, + "title": pageTitle, + "source": "cdp-structured-snapshot", + "token_estimate": tokenEstimate, + ], + suggestion: elements.isEmpty + ? "No interactive elements found via CDP. Try ghost_find or ghost_ground." + : "Use ghost_click with x/y coordinates from the element list. Ref format: [e0], [e1], etc." + ) + } + // MARK: - ghost_ground /// Find precise screen coordinates for a described UI element using VLM. From a272cc91ee0b96e5ee3ec0db4be137b2f0435dc9 Mon Sep 17 00:00:00 2001 From: Maurice Wen Date: Wed, 25 Mar 2026 09:14:48 +0800 Subject: [PATCH 3/4] feat: Session persistence + anti-bot stealth modules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New Session/ module: - ChromeProfileManager: persistent Chrome profiles (~/.ghost-os/profiles/), profile CRUD, Chrome launch args builder, cookie export via CDP. File permissions set to 700 for security. 
New Stealth/ module: - TimingJitter: log-normal human delays (Box-Muller), burst typing patterns, coordinate jitter (±2px), pre/post click delays. - BehavioralMimicry: cubic Bezier mouse paths, short-distance jitter paths, pre-action scroll simulation, off-center click offset. Tests: 10 new stealth tests (23/23 total pass). All tests verify statistical properties (distribution bounds, curvature, timing ranges). --- .../Session/ChromeProfileManager.swift | 292 ++++++++++++++++++ .../GhostOS/Stealth/BehavioralMimicry.swift | 171 ++++++++++ Sources/GhostOS/Stealth/TimingJitter.swift | 109 +++++++ Tests/GhostOSTests/StealthTests.swift | 130 ++++++++ 4 files changed, 702 insertions(+) create mode 100644 Sources/GhostOS/Session/ChromeProfileManager.swift create mode 100644 Sources/GhostOS/Stealth/BehavioralMimicry.swift create mode 100644 Sources/GhostOS/Stealth/TimingJitter.swift create mode 100644 Tests/GhostOSTests/StealthTests.swift diff --git a/Sources/GhostOS/Session/ChromeProfileManager.swift b/Sources/GhostOS/Session/ChromeProfileManager.swift new file mode 100644 index 0000000..7a44b7c --- /dev/null +++ b/Sources/GhostOS/Session/ChromeProfileManager.swift @@ -0,0 +1,292 @@ +// ChromeProfileManager.swift - Persistent Chrome session management for Ghost OS +// +// Manages Chrome user profiles so that login state persists across Ghost OS +// sessions. Users log in once (manually or via recipe), and subsequent +// recipe runs reuse the authenticated session. +// +// Architecture: +// ~/.ghost-os/profiles// → Chrome user-data-dir +// ~/.ghost-os/profiles/index.json → profile metadata (name, url, lastUsed) +// +// Security: profile directories contain cookies equivalent to credentials. +// File permissions are set to 700 (owner-only). Profiles are excluded from +// git via .gitignore. Never sync profiles across machines. + +import Foundation + +/// Manages persistent Chrome browser profiles for authenticated workflows. 
+public enum ChromeProfileManager { + + /// Base directory for all profiles. + private static let profilesDir: URL = { + let home = FileManager.default.homeDirectoryForCurrentUser + return home.appendingPathComponent(".ghost-os/profiles") + }() + + /// Profile metadata stored in index.json. + public struct ProfileInfo: Codable, Sendable { + public let name: String + public let createdAt: Date + public var lastUsed: Date + public var url: String? // Last known URL + public var description: String? + } + + // MARK: - Profile CRUD + + /// List all available profiles. + public static func listProfiles() -> [ProfileInfo] { + let indexURL = profilesDir.appendingPathComponent("index.json") + guard let data = try? Data(contentsOf: indexURL), + let profiles = try? JSONDecoder.withISO8601.decode([ProfileInfo].self, from: data) + else { + return [] + } + return profiles.sorted { $0.lastUsed > $1.lastUsed } + } + + /// Create a new profile directory. + /// Returns the profile directory path for Chrome's --user-data-dir flag. + public static func createProfile(name: String, description: String? = nil) -> URL? { + let dir = profilesDir.appendingPathComponent(name) + + do { + try FileManager.default.createDirectory(at: dir, withIntermediateDirectories: true) + // Set owner-only permissions (700) for security + try FileManager.default.setAttributes( + [.posixPermissions: 0o700], + ofItemAtPath: dir.path + ) + } catch { + Log.error("Failed to create profile directory: \(error)") + return nil + } + + // Update index + var profiles = listProfiles() + if !profiles.contains(where: { $0.name == name }) { + profiles.append(ProfileInfo( + name: name, + createdAt: Date(), + lastUsed: Date(), + url: nil, + description: description + )) + saveIndex(profiles) + } + + Log.info("Created Chrome profile '\(name)' at \(dir.path)") + return dir + } + + /// Get the directory for an existing profile. + /// Returns nil if the profile doesn't exist. 
+ public static func profileDir(for name: String) -> URL? { + let dir = profilesDir.appendingPathComponent(name) + var isDir: ObjCBool = false + guard FileManager.default.fileExists(atPath: dir.path, isDirectory: &isDir), + isDir.boolValue + else { + return nil + } + return dir + } + + /// Delete a profile and its data. + public static func deleteProfile(name: String) -> Bool { + let dir = profilesDir.appendingPathComponent(name) + do { + try FileManager.default.removeItem(at: dir) + var profiles = listProfiles() + profiles.removeAll { $0.name == name } + saveIndex(profiles) + Log.info("Deleted Chrome profile '\(name)'") + return true + } catch { + Log.error("Failed to delete profile '\(name)': \(error)") + return false + } + } + + /// Update the lastUsed timestamp for a profile. + public static func touchProfile(name: String, url: String? = nil) { + var profiles = listProfiles() + if let idx = profiles.firstIndex(where: { $0.name == name }) { + profiles[idx].lastUsed = Date() + if let url { profiles[idx].url = url } + saveIndex(profiles) + } + } + + // MARK: - Chrome Launch + + /// Build Chrome launch arguments for a given profile. + /// Returns an array of command-line arguments. + /// + /// Usage: + /// let args = ChromeProfileManager.chromeLaunchArgs(profile: "github-work", url: "https://github.com") + /// // ["--remote-debugging-port=9222", "--user-data-dir=/path/to/profile", "https://github.com"] + public static func chromeLaunchArgs( + profile: String, + url: String? = nil, + debugPort: Int = 9222 + ) -> [String]? { + // Get or create profile directory + let dir = profileDir(for: profile) ?? createProfile(name: profile) + guard let profilePath = dir?.path else { return nil } + + var args = [ + "--remote-debugging-port=\(debugPort)", + "--user-data-dir=\(profilePath)", + ] + if let url { args.append(url) } + + touchProfile(name: profile, url: url) + return args + } + + /// Chrome application path (macOS). 
+ public static let chromeAppPath = "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome" + + // MARK: - Cookie Export/Import via CDP + + /// Export cookies from the current Chrome session via CDP. + /// Requires Chrome to be running with --remote-debugging-port. + public static func exportCookies() -> [[String: Any]]? { + guard CDPBridge.isAvailable(), + let targets = CDPBridge.getDebugTargets(), + let page = targets.first(where: { ($0["type"] as? String) == "page" }), + let wsURL = page["webSocketDebuggerUrl"] as? String + else { + return nil + } + + return cdpCommand(wsURL: wsURL, method: "Network.getAllCookies", params: [:]) + } + + /// Get the current page URL via CDP, or nil if no page target is available or Chrome does not reply within 3 seconds. + public static func currentURL() -> String? { + let js = "location.href" + guard let targets = CDPBridge.getDebugTargets(), + let page = targets.first(where: { ($0["type"] as? String) == "page" }), + let wsURL = page["webSocketDebuggerUrl"] as? String, + let url = URL(string: wsURL) + else { + return nil + } + + let session = URLSession(configuration: .default) + let wsTask = session.webSocketTask(with: url) + wsTask.resume() + + let command: [String: Any] = [ + "id": 1, + "method": "Runtime.evaluate", + "params": ["expression": js, "returnByValue": true], + ] + guard let cmdData = try? JSONSerialization.data(withJSONObject: command), + let cmdStr = String(data: cmdData, encoding: .utf8) + else { + wsTask.cancel(with: .goingAway, reason: nil) + return nil + } + + nonisolated final class Box: @unchecked Sendable { var result: String? } + let box = Box() + let sem = DispatchSemaphore(value: 0) + + wsTask.send(.string(cmdStr)) { error in + if error != nil { sem.signal(); return } + wsTask.receive { msg in + if case .success(let message) = msg, case .string(let text) = message, + let data = text.data(using: .utf8), + let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any], + let r = json["result"] as? [String: Any], + let rv = r["result"] as?
[String: Any], + let value = rv["value"] as? String + { + box.result = value + } + sem.signal() + } + } + let _ = sem.wait(timeout: .now() + 3.0) + wsTask.cancel(with: .goingAway, reason: nil) + return box.result + } + + // MARK: - Private Helpers + + private static func saveIndex(_ profiles: [ProfileInfo]) { + let indexURL = profilesDir.appendingPathComponent("index.json") + do { + try FileManager.default.createDirectory(at: profilesDir, withIntermediateDirectories: true) + let data = try JSONEncoder.withISO8601.encode(profiles) + try data.write(to: indexURL, options: .atomic) + } catch { + Log.error("Failed to save profile index: \(error)") + } + } + + /// Execute a CDP command and return the `cookies` array from its result (nil for any other reply or after a 3-second timeout). + private static func cdpCommand( + wsURL: String, + method: String, + params: [String: Any] + ) -> [[String: Any]]? { + guard let url = URL(string: wsURL) else { return nil } + let session = URLSession(configuration: .default) + let wsTask = session.webSocketTask(with: url) + wsTask.resume() + + let command: [String: Any] = ["id": 1, "method": method, "params": params] + guard let cmdData = try? JSONSerialization.data(withJSONObject: command), + let cmdStr = String(data: cmdData, encoding: .utf8) + else { + wsTask.cancel(with: .goingAway, reason: nil) + return nil + } + + nonisolated final class Box: @unchecked Sendable { var result: [[String: Any]]? } + let box = Box() + let sem = DispatchSemaphore(value: 0) + + wsTask.send(.string(cmdStr)) { error in + if error != nil { sem.signal(); return } + wsTask.receive { msg in + if case .success(let message) = msg, case .string(let text) = message, + let data = text.data(using: .utf8), + let json = try? JSONSerialization.jsonObject(with: data) as? [String: Any], + let r = json["result"] as? [String: Any], + let cookies = r["cookies"] as?
[[String: Any]] + { + box.result = cookies + } + sem.signal() + } + } + + let _ = sem.wait(timeout: .now() + 3.0) + wsTask.cancel(with: .goingAway, reason: nil) + return box.result + } +} + +// MARK: - JSON Coding Helpers + +private extension JSONEncoder { + static let withISO8601: JSONEncoder = { + let encoder = JSONEncoder() + encoder.dateEncodingStrategy = .iso8601 + encoder.outputFormatting = [.prettyPrinted, .sortedKeys] + return encoder + }() +} + +private extension JSONDecoder { + static let withISO8601: JSONDecoder = { + let decoder = JSONDecoder() + decoder.dateDecodingStrategy = .iso8601 + return decoder + }() +} diff --git a/Sources/GhostOS/Stealth/BehavioralMimicry.swift b/Sources/GhostOS/Stealth/BehavioralMimicry.swift new file mode 100644 index 0000000..527228a --- /dev/null +++ b/Sources/GhostOS/Stealth/BehavioralMimicry.swift @@ -0,0 +1,171 @@ +// BehavioralMimicry.swift - Human-like mouse and interaction patterns for Ghost OS +// +// Bot detection systems analyze behavioral patterns beyond timing: +// - Mouse teleportation (instant jump to target) vs smooth cursor movement +// - Perfectly straight mouse paths vs natural curves +// - Clicking exact center of buttons vs slightly off-center +// - No scrolling/hovering before action vs natural reading behavior +// +// BehavioralMimicry provides realistic interaction patterns that mimic +// human motor control. Uses cubic Bezier curves for mouse paths (matching +// how human wrist/arm movement naturally creates smooth arcs). + +import CoreGraphics +import Foundation + +/// Human-like behavioral patterns for mouse and interaction mimicry. +public enum BehavioralMimicry { + + // MARK: - Mouse Path Generation + + /// Generate a natural-looking mouse path from one point to another. + /// + /// Uses a cubic Bezier curve with randomized control points to simulate + /// the natural arc of human wrist/arm movement. Real mouse paths are + /// never perfectly straight — they curve slightly due to arm mechanics. 
+ /// + /// - Parameters: + /// - from: Starting point. + /// - to: Target point. + /// - steps: Number of intermediate points (default 10). + /// - Returns: Array of points along the curve, including start and end. + public static func mousePath(from: CGPoint, to: CGPoint, steps: Int = 10) -> [CGPoint] { + let dx = to.x - from.x + let dy = to.y - from.y + + // Control points create a slight arc (not a straight line) + // Randomize to avoid detectable patterns across multiple moves + let control1 = CGPoint( + x: from.x + dx * 0.25 + CGFloat.random(in: -30...30), + y: from.y + dy * 0.1 + CGFloat.random(in: -20...20) + ) + let control2 = CGPoint( + x: from.x + dx * 0.75 + CGFloat.random(in: -20...20), + y: from.y + dy * 0.9 + CGFloat.random(in: -10...10) + ) + + return cubicBezier(p0: from, p1: control1, p2: control2, p3: to, steps: steps) + } + + /// Generate a short, jittery mouse path for nearby targets. + /// + /// When the mouse only needs to move a short distance (<50px), humans + /// make quick, slightly wobbly movements rather than smooth arcs. + /// + /// - Parameters: + /// - from: Starting point. + /// - to: Target point. + /// - Returns: 3-5 points with micro-jitter. + public static func shortMousePath(from: CGPoint, to: CGPoint) -> [CGPoint] { + let distance = hypot(to.x - from.x, to.y - from.y) + if distance < 5 { + return [from, to] // Too close, just jump + } + + let steps = min(5, max(3, Int(distance / 15))) + var points: [CGPoint] = [from] + + for i in 1.. Int { + // 25% chance of a small pre-action scroll + if Int.random(in: 0..<4) == 0 { + return Int.random(in: 1...3) + } + return 0 + } + + /// Determine scroll direction based on target position. + /// + /// If the target is in the lower third of the viewport, humans tend to + /// scroll down slightly first (reading behavior). Upper third → no scroll. + /// + /// - Parameter targetY: The Y coordinate of the target on screen. + /// - Parameter screenHeight: The visible screen height. 
+    /// - Returns: "down", "up", or nil (no scroll).
+    public static func scrollDirection(targetY: Double, screenHeight: Double) -> String? {
+        // A zero or negative height would make the ratio infinite, NaN or sign-flipped.
+        guard screenHeight > 0 else { return nil }
+        let ratio = targetY / screenHeight
+        if ratio > 0.7 {
+            // Target in the bottom 30% of the screen
+            return Int.random(in: 0..<3) == 0 ? "down" : nil  // 33% chance
+        } else if ratio < 0.2 {
+            // Target in the top 20% of the screen
+            return Int.random(in: 0..<5) == 0 ? "up" : nil  // 20% chance
+        }
+        return nil
+    }
+
+    // MARK: - Click Offset
+
+    /// Generate a human-like click offset within a button's bounds.
+    ///
+    /// Humans don't click the exact mathematical center of buttons.
+    /// They click slightly off-center, biased towards the text/icon.
+    /// The offset is proportional to the element size (bigger = more variance).
+    ///
+    /// - Parameters:
+    ///   - center: The element's center point.
+    ///   - width: The element's width.
+    ///   - height: The element's height.
+    /// - Returns: A slightly offset click point.
+    public static func clickOffset(
+        center: CGPoint,
+        width: CGFloat,
+        height: CGFloat
+    ) -> CGPoint {
+        // Max offset: 15% of the dimension, clamped to 1...5px horizontally and 1...3px
+        // vertically, and never more than half the dimension so the point stays inside
+        // elements smaller than 2px. max(0, …) keeps the random range valid for
+        // zero or negative sizes.
+        let maxOffX = min(max(1.0, min(5.0, width * 0.15)), max(0.0, width / 2))
+        let maxOffY = min(max(1.0, min(3.0, height * 0.15)), max(0.0, height / 2))
+
+        return CGPoint(
+            x: center.x + CGFloat.random(in: -maxOffX...maxOffX),
+            y: center.y + CGFloat.random(in: -maxOffY...maxOffY)
+        )
+    }
+
+    // MARK: - Private: Bezier Math
+
+    /// Compute a point on a cubic Bezier curve at parameter t.
+    private static func cubicBezierPoint(
+        p0: CGPoint, p1: CGPoint, p2: CGPoint, p3: CGPoint, t: CGFloat
+    ) -> CGPoint {
+        let mt = 1.0 - t
+        let mt2 = mt * mt
+        let mt3 = mt2 * mt
+        let t2 = t * t
+        let t3 = t2 * t
+
+        return CGPoint(
+            x: mt3 * p0.x + 3 * mt2 * t * p1.x + 3 * mt * t2 * p2.x + t3 * p3.x,
+            y: mt3 * p0.y + 3 * mt2 * t * p1.y + 3 * mt * t2 * p2.y + t3 * p3.y
+        )
+    }
+
+    /// Generate points along a cubic Bezier curve.
+    ///
+    /// Returns `steps + 1` points from `p0` to `p3` inclusive. `steps` below 1 is treated
+    /// as 1: zero would divide 0 by 0 (NaN coordinates) and a negative value would trap
+    /// when forming the range `0...steps`.
+    private static func cubicBezier(
+        p0: CGPoint, p1: CGPoint, p2: CGPoint, p3: CGPoint, steps: Int
+    ) -> [CGPoint] {
+        let segments = max(1, steps)
+        return (0...segments).map { i in
+            let t = CGFloat(i) / CGFloat(segments)
+            return cubicBezierPoint(p0: p0, p1: p1, p2: p2, p3: p3, t: t)
+        }
+    }
+}
diff --git a/Sources/GhostOS/Stealth/TimingJitter.swift b/Sources/GhostOS/Stealth/TimingJitter.swift
new file mode 100644
index 0000000..f048225
--- /dev/null
+++ b/Sources/GhostOS/Stealth/TimingJitter.swift
@@ -0,0 +1,109 @@
+// TimingJitter.swift - Human-like timing randomization for Ghost OS
+//
+// Anti-bot detection relies on timing patterns. Bots are predictable:
+// fixed delays between actions, instant mouse teleportation, uniform
+// typing speed. Humans are noisy: variable delays, hesitations, bursts.
+//
+// TimingJitter provides log-normal distributed delays that mimic human
+// reaction times. Log-normal is used because human response times are
+// right-skewed: mostly quick, occasionally slow (distracted/thinking).
+//
+// Usage:
+//   let delay = TimingJitter.humanDelay(base: 0.5)  // ~0.2-1.5s
+//   let typeDelay = TimingJitter.typingDelay()       // ~50-150ms per char
+//   let point = TimingJitter.jitter(point, radius: 2) // ±2px
+
+import CoreGraphics
+import Foundation
+
+/// Human-like timing randomization to avoid bot detection.
+public enum TimingJitter {
+
+    // MARK: - Action Delays
+
+    /// Generate a human-like delay between actions.
+    ///
+    /// Uses log-normal distribution centered around `base` seconds.
+    /// Log-normal models human reaction times: mostly quick, occasionally slow.
+    ///
+    /// - Parameter base: The median delay in seconds (default 0.5s).
+    /// - Returns: A randomized delay in seconds, typically 0.3x-3x of base.
+    public static func humanDelay(base: TimeInterval = 0.5) -> TimeInterval {
+        // log() needs a positive input. base == 0 already ended up at the 50ms floor, but a
+        // negative base produced NaN, which fell through the clamp below as the 5s ceiling.
+        // Treat every non-positive (or NaN) base as "minimal delay".
+        guard base > 0 else { return 0.05 }
+        let mu = log(base)
+        let sigma = 0.4
+        // Box-Muller transform for normal distribution
+        // (u1 starts at 0.001, not 0, so log(u1) stays finite)
+        let u1 = Double.random(in: 0.001...1.0)
+        let u2 = Double.random(in: 0.0...1.0)
+        let normal = sqrt(-2.0 * log(u1)) * cos(2.0 * .pi * u2)
+        // Clamp to avoid extreme outliers
+        let clamped = max(-2.5, min(2.5, normal))
+        let result = exp(mu + sigma * clamped)
+        // Floor at 50ms, ceiling at 5s
+        return max(0.05, min(5.0, result))
+    }
+
+    /// Generate a human-like typing delay per character.
+    ///
+    /// Average human typing speed: 40-80 WPM (75-150ms per character).
+    /// Fast typists: 80-120 WPM (50-75ms). Hunts-and-pecks: 20-40 WPM (150-300ms).
+    ///
+    /// - Returns: Delay in seconds for one keystroke, uniformly distributed in 0.05...0.15.
+    public static func typingDelay() -> TimeInterval {
+        // Uniform 100ms ± 50ms (not a normal distribution).
+        let base = 0.1
+        let jitter = Double.random(in: -0.05...0.05)
+        let result = base + jitter
+        // Safety net only: with the constants above the result is already within bounds.
+        return max(0.03, min(0.25, result))
+    }
+
+    /// Generate a burst typing pattern: fast sequences with occasional pauses.
+    ///
+    /// Humans type in bursts of 3-8 characters, then pause briefly (word boundary,
+    /// thinking, looking at keyboard). This is more realistic than uniform timing.
+    ///
+    /// The function keeps no state between calls: a burst length in 3...8 is drawn on
+    /// every call, so whether a given index pauses is probabilistic rather than a fixed
+    /// cadence.
+    ///
+    /// - Parameter charIndex: The index of the current character in the string.
+    /// - Returns: Delay in seconds before this keystroke.
+    public static func burstTypingDelay(charIndex: Int) -> TimeInterval {
+        // Pause when the index is a multiple of a freshly drawn burst length (3-8);
+        // index 0 never pauses.
+        let burstLength = Int.random(in: 3...8)
+        if charIndex > 0 && charIndex % burstLength == 0 {
+            return humanDelay(base: 0.3)  // Word boundary pause
+        }
+        return typingDelay()
+    }
+
+    // MARK: - Coordinate Jitter
+
+    /// Add random noise to click coordinates.
+    ///
+    /// Humans don't click at exact pixel coordinates. There's always a few
+    /// pixels of noise from hand tremor and mouse precision.
+ /// + /// - Parameters: + /// - point: The target click point. + /// - radius: Maximum jitter in pixels (default ±2px). + /// - Returns: A slightly randomized point within the jitter radius. + public static func jitter(_ point: CGPoint, radius: CGFloat = 2.0) -> CGPoint { + CGPoint( + x: point.x + CGFloat.random(in: -radius...radius), + y: point.y + CGFloat.random(in: -radius...radius) + ) + } + + // MARK: - Pre/Post Action Delays + + /// Delay before clicking (human reads/aims at target). + public static func preClickDelay() -> TimeInterval { + humanDelay(base: 0.3) + } + + /// Delay after clicking (human waits for visual feedback). + public static func postClickDelay() -> TimeInterval { + humanDelay(base: 0.5) + } + + /// Delay before typing starts (human focuses on input field). + public static func preTypeDelay() -> TimeInterval { + humanDelay(base: 0.2) + } +} diff --git a/Tests/GhostOSTests/StealthTests.swift b/Tests/GhostOSTests/StealthTests.swift new file mode 100644 index 0000000..cb874e3 --- /dev/null +++ b/Tests/GhostOSTests/StealthTests.swift @@ -0,0 +1,130 @@ +// StealthTests.swift - Unit tests for TimingJitter and BehavioralMimicry + +import CoreGraphics +import Testing +@testable import GhostOS + +@Suite("Stealth Tests") +struct StealthTests { + + // MARK: - TimingJitter + + @Test("Human delay produces bounded values") + func humanDelayBounds() { + for _ in 0..<100 { + let delay = TimingJitter.humanDelay(base: 0.5) + #expect(delay >= 0.05, "Delay should be at least 50ms, got \(delay)") + #expect(delay <= 5.0, "Delay should be at most 5s, got \(delay)") + } + } + + @Test("Human delay centers around base value") + func humanDelayDistribution() { + let samples = (0..<500).map { _ in TimingJitter.humanDelay(base: 0.5) } + let mean = samples.reduce(0, +) / Double(samples.count) + // Log-normal mean = exp(mu + sigma^2/2) ≈ 0.54 for base=0.5, sigma=0.4 + #expect(mean > 0.25, "Mean should be above 0.25, got \(mean)") + #expect(mean < 1.2, "Mean should be 
below 1.2, got \(mean)") + } + + @Test("Typing delay is within realistic range") + func typingDelayRange() { + for _ in 0..<100 { + let delay = TimingJitter.typingDelay() + #expect(delay >= 0.03, "Too fast: \(delay)") + #expect(delay <= 0.25, "Too slow: \(delay)") + } + } + + @Test("Burst typing includes occasional pauses") + func burstTypingPattern() { + var normalCount = 0 + var pauseCount = 0 + for i in 0..<100 { + let delay = TimingJitter.burstTypingDelay(charIndex: i) + if delay > 0.15 { + pauseCount += 1 + } else { + normalCount += 1 + } + } + // There should be some pauses (word boundaries) but not too many + #expect(pauseCount > 0, "Should have at least some burst pauses") + #expect(normalCount > pauseCount, "Normal keystrokes should outnumber pauses") + } + + @Test("Coordinate jitter stays within radius") + func coordinateJitter() { + let origin = CGPoint(x: 100, y: 200) + for _ in 0..<100 { + let jittered = TimingJitter.jitter(origin, radius: 3.0) + let dx = abs(jittered.x - origin.x) + let dy = abs(jittered.y - origin.y) + #expect(dx <= 3.0, "X jitter exceeded radius: \(dx)") + #expect(dy <= 3.0, "Y jitter exceeded radius: \(dy)") + } + } + + // MARK: - BehavioralMimicry + + @Test("Mouse path starts and ends at correct points") + func mousePathEndpoints() { + let from = CGPoint(x: 100, y: 100) + let to = CGPoint(x: 500, y: 300) + let path = BehavioralMimicry.mousePath(from: from, to: to, steps: 10) + + #expect(path.count == 11, "Should have steps+1 points") + #expect(path.first!.x == from.x) + #expect(path.first!.y == from.y) + #expect(path.last!.x == to.x) + #expect(path.last!.y == to.y) + } + + @Test("Mouse path is not a straight line") + func mousePathCurvature() { + let from = CGPoint(x: 0, y: 0) + let to = CGPoint(x: 400, y: 0) + let path = BehavioralMimicry.mousePath(from: from, to: to, steps: 10) + + // At least one midpoint should deviate from Y=0 (Bezier curve) + let midpoints = path.dropFirst().dropLast() + let hasDeviation = midpoints.contains 
{ abs($0.y) > 0.5 } + #expect(hasDeviation, "Path should curve, not be a straight line") + } + + @Test("Short mouse path handles nearby targets") + func shortMousePath() { + let from = CGPoint(x: 100, y: 100) + let to = CGPoint(x: 110, y: 105) + let path = BehavioralMimicry.shortMousePath(from: from, to: to) + + #expect(path.count >= 2, "Should have at least start and end") + #expect(path.first!.x == from.x) + #expect(path.last!.x == to.x) + } + + @Test("Click offset stays within element bounds") + func clickOffsetBounds() { + let center = CGPoint(x: 200, y: 150) + for _ in 0..<100 { + let offset = BehavioralMimicry.clickOffset(center: center, width: 80, height: 30) + let dx = abs(offset.x - center.x) + let dy = abs(offset.y - center.y) + #expect(dx <= 5.0, "X offset too large: \(dx)") + #expect(dy <= 3.0, "Y offset too large: \(dy)") + } + } + + @Test("Pre-action scroll is bounded") + func preActionScroll() { + var scrollCount = 0 + for _ in 0..<100 { + let amount = BehavioralMimicry.preActionScrollAmount() + #expect(amount >= 0 && amount <= 3) + if amount > 0 { scrollCount += 1 } + } + // ~25% should scroll (binomial: expect 15-35 in 100 trials) + #expect(scrollCount >= 5, "Too few scrolls: \(scrollCount)") + #expect(scrollCount <= 50, "Too many scrolls: \(scrollCount)") + } +} From f48e96d5f9cb33898099d4208c30ee7dee62313d Mon Sep 17 00:00:00 2001 From: Maurice Wen Date: Wed, 25 Mar 2026 09:17:25 +0800 Subject: [PATCH 4/4] feat(recipes): auto-heal engine + github-pr-review recipe MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RecipeEngine now auto-heals failed click/type/hover steps in browser apps by retrying via CDP element finding. When a DOM ID changes after a web app update, the auto-heal finds the element by text content and re-executes the action at CDP-found coordinates. 
Auto-heal flow: step fails → detect browser app → CDP query by
computedNameContains → re-execute action at CDP coordinates →
log "[auto-healed via CDP]" in step result.

New recipe: github-pr-review.json (navigate Files changed,
open review dialog, type comment, submit review).
---
 Sources/GhostOS/Recipes/RecipeEngine.swift | 135 +++++++++++++++++++--
 recipes/github-pr-review.json              |  82 ++++++++++++++
 2 files changed, 210 insertions(+), 7 deletions(-)
 create mode 100644 recipes/github-pr-review.json

diff --git a/Sources/GhostOS/Recipes/RecipeEngine.swift b/Sources/GhostOS/Recipes/RecipeEngine.swift
index 59e692a..9b9b168 100644
--- a/Sources/GhostOS/Recipes/RecipeEngine.swift
+++ b/Sources/GhostOS/Recipes/RecipeEngine.swift
@@ -86,14 +86,34 @@ public enum RecipeEngine {
             stepResults.append(stepResult)
 
             if !result.success {
-                let failurePolicy = step.onFailure ?? globalFailurePolicy
-
-                if failurePolicy == "skip" {
-                    Log.info("Recipe '\(recipe.name)' step \(step.id) failed (skipping): \(result.error ?? "")")
-                    continue
-                }
+                // AUTO-HEAL: Before giving up, try CDP-based element finding for
+                // click/type actions in browser apps. This handles the common case
+                // where a web app updated its DOM IDs but the element text is the same.
+                if let _ = attemptAutoHeal(
+                    step: step, resolvedParams: resolvedParams, appName: recipe.app
+                ) {
+                    // Auto-heal succeeded — update the step result
+                    let healDuration = Int(Date().timeIntervalSince(stepStart) * 1000)
+                    stepResults[stepResults.count - 1] = RecipeStepResult(
+                        stepId: step.id,
+                        action: step.action,
+                        success: true,
+                        durationMs: healDuration,
+                        error: nil,
+                        note: (step.note ?? step.action) + " [auto-healed via CDP]"
+                    )
+                    Log.info("Recipe '\(recipe.name)' step \(step.id) auto-healed via CDP")
+                    // Continue to wait_after handling below (don't return)
+                } else {
+                    // Auto-heal failed — apply normal failure policy
+                    let failurePolicy = step.onFailure ?? globalFailurePolicy
+
+                    if failurePolicy == "skip" {
+                        Log.info("Recipe '\(recipe.name)' step \(step.id) failed (skipping): \(result.error ?? "")")
+                        continue
+                    }
 
-                // Stop: return failure with diagnostics
+                    // Stop: return failure with diagnostics
                 let totalDuration = Int(Date().timeIntervalSince(startTime) * 1000)
 
                 // Capture failure context
@@ -126,6 +146,7 @@ public enum RecipeEngine {
                     error: "Recipe '\(recipe.name)' failed at step \(step.id) (\(step.note ?? step.action)): \(result.error ?? "")",
                     suggestion: "Check the current_context and failed_step details. Use ghost_screenshot for visual debugging."
                 )
+                } // end else (auto-heal failed)
             }
 
             // Handle wait_after condition (substitute {{params}} in value)
@@ -246,6 +267,106 @@ public enum RecipeEngine {
         )
     }
 
+    // MARK: - Auto-Heal
+
+    /// Attempt to recover a failed recipe step using CDP element finding.
+    ///
+    /// When a click/type step fails (usually because a DOM ID changed after a
+    /// web app update), this function tries to find the target element via CDP
+    /// using the step's computedNameContains text. If found, it re-executes the
+    /// action with the CDP-found coordinates.
+    ///
+    /// Only applies to click/type/hover actions in browser apps with CDP available.
+    ///
+    /// - Returns: The successful result of the re-executed action, or nil if auto-heal
+    ///   is not applicable, nothing usable was found, or the re-executed action failed.
+    ///   The caller treats any non-nil value as "healed", so failures must map to nil.
+    private static func attemptAutoHeal(
+        step: RecipeStep,
+        resolvedParams: [String: String]?,
+        appName: String?
+    ) -> ToolResult? {
+        // Only auto-heal click/type/hover actions
+        guard ["click", "type", "hover"].contains(step.action) else { return nil }
+
+        // Only for browser apps with CDP
+        guard CDPBridge.isBrowserApp(appName), CDPBridge.isAvailable() else { return nil }
+
+        // Need a text query to search for
+        let query = step.target?.computedNameContains
+            ?? resolvedParams?["query"]
+            ?? resolvedParams?["into"]
+            ?? resolvedParams?["target"]
+        guard let query, !query.isEmpty else { return nil }
+
+        Log.info("Auto-heal: trying CDP for '\(query)' (step \(step.id), action: \(step.action))")
+
+        // Try to find the element via CDP
+        guard let cdpElements = CDPBridge.findElements(query: query),
+              let first = cdpElements.first
+        else {
+            Log.info("Auto-heal: CDP found no matches for '\(query)'")
+            return nil
+        }
+
+        // Get screen coordinates. A match without coordinates is unusable: defaulting
+        // to (0, 0) would act on the window origin instead of the target element.
+        guard let viewportX = first["centerX"] as? Int,
+              let viewportY = first["centerY"] as? Int
+        else {
+            Log.info("Auto-heal: CDP match for '\(query)' has no coordinates")
+            return nil
+        }
+        let windowOrigin = Perception.chromeWindowOriginPublic(appName: appName)
+        let screen = CDPBridge.viewportToScreen(
+            viewportX: Double(viewportX), viewportY: Double(viewportY),
+            windowX: windowOrigin.x, windowY: windowOrigin.y
+        )
+
+        // Re-execute the action with CDP coordinates
+        switch step.action {
+        case "click":
+            let result = Actions.click(
+                query: nil, role: nil, domId: nil,
+                appName: appName,
+                x: screen.x, y: screen.y,
+                button: resolvedParams?["button"],
+                count: resolvedParams?["count"].flatMap(Int.init)
+            )
+            return result.success ? result : nil
+
+        case "type":
+            // Click the field first, then type
+            let clickResult = Actions.click(
+                query: nil, role: nil, domId: nil,
+                appName: appName,
+                x: screen.x, y: screen.y,
+                button: nil, count: nil
+            )
+            guard clickResult.success else { return nil }
+
+            // No text parameter: focusing the field is all that can be done here.
+            guard let text = resolvedParams?["text"] else { return clickResult }
+            let clear = resolvedParams?["clear"] == "true"
+            let typeResult = Actions.typeText(
+                text: text, into: nil, domId: nil,
+                appName: appName, clear: clear
+            )
+            return typeResult.success ? typeResult : nil
+
+        case "hover":
+            let hoverResult = Actions.hover(
+                query: nil, role: nil, domId: nil,
+                appName: appName,
+                x: screen.x, y: screen.y
+            )
+            return hoverResult.success ? hoverResult : nil
+
+        default:
+            return nil
+        }
+    }
+
     // MARK: - Step Execution
 
     /// Execute a single recipe step by dispatching to the appropriate action.
diff --git a/recipes/github-pr-review.json b/recipes/github-pr-review.json new file mode 100644 index 0000000..ecb2726 --- /dev/null +++ b/recipes/github-pr-review.json @@ -0,0 +1,82 @@ +{ + "schema_version": 2, + "name": "github-pr-review", + "description": "Open a GitHub pull request and leave a review comment. Navigate to the PR page first (github.com/owner/repo/pull/N).", + "app": "Google Chrome", + "params": { + "comment": { + "type": "string", + "description": "Review comment text to leave on the PR", + "required": true + }, + "action": { + "type": "string", + "description": "Review action: 'approve', 'comment', or 'request_changes' (default: 'comment')" + } + }, + "preconditions": { + "app_running": "Google Chrome", + "url_contains": "github.com" + }, + "steps": [ + { + "id": 1, + "action": "click", + "target": { + "criteria": [{"attribute": "AXRole", "value": "AXLink"}], + "computedNameContains": "Files changed" + }, + "wait_after": { + "condition": "elementExists", + "value": "Review changes", + "timeout": 10 + }, + "note": "Navigate to Files changed tab" + }, + { + "id": 2, + "action": "click", + "target": { + "criteria": [], + "computedNameContains": "Review changes" + }, + "wait_after": { + "condition": "elementExists", + "value": "Leave a comment", + "timeout": 5 + }, + "note": "Open review dialog" + }, + { + "id": 3, + "action": "click", + "target": { + "criteria": [], + "computedNameContains": "Leave a comment" + }, + "note": "Focus the comment textarea" + }, + { + "id": 4, + "action": "type", + "params": {"text": "{{comment}}"}, + "note": "Type review comment" + }, + { + "id": 5, + "action": "click", + "target": { + "criteria": [], + "computedNameContains": "Submit review" + }, + "wait_after": { + "condition": "elementExists", + "value": "reviewed", + "timeout": 10 + }, + "on_failure": "skip", + "note": "Submit the review" + } + ], + "on_failure": "stop" +}