Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion Sources/GhostOS/Learning/EventHandlers.swift
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,13 @@ nonisolated enum EventHandlers {
if flags.contains(.maskShift) { mods.append("shift") }
if flags.contains(.maskAlternate) { mods.append("option") }
if flags.contains(.maskControl) { mods.append("control") }
let hasShortcutModifier = flags.contains(.maskCommand)
|| flags.contains(.maskAlternate)
|| flags.contains(.maskControl)

let chars = keyChars(from: event)

if let chars, mods.isEmpty {
if let chars, !hasShortcutModifier {
// Coalesce character keys into pending buffer
recorder.withLock { session in
guard session != nil else { return }
Expand Down
63 changes: 49 additions & 14 deletions Sources/GhostOS/Learning/LearningRecorder.swift
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ nonisolated public final class LearningRecorder: @unchecked Sendable {
private var eventTap: CFMachPort?
private var learningRunLoop: CFRunLoop?
private var learningThread: Thread?
private var startupCancelled = false

// Keystroke coalescing -- only access within withLock or flushPending* (caller holds lock)
internal var pendingKeystrokes: String = ""
Expand Down Expand Up @@ -64,27 +65,37 @@ nonisolated public final class LearningRecorder: @unchecked Sendable {
if session != nil { os_unfair_lock_unlock(&lock); return .alreadyRecording }
session = LearningSession(taskDescription: taskDescription)
lastRecordedAppName = ""
startupCancelled = false
os_unfair_lock_unlock(&lock)

let thread = Thread { [weak self] in self?.runLearningThread() }
let startupSignal = DispatchSemaphore(value: 0)
let thread = Thread { [weak self] in self?.runLearningThread(startupSignal: startupSignal) }
thread.name = "ghost-learning"
thread.qualityOfService = .userInteractive
learningThread = thread
thread.start()

// Busy-wait up to 500ms for tap creation
for _ in 0..<50 {
Thread.sleep(forTimeInterval: 0.01)
os_unfair_lock_lock(&lock)
let ready = eventTap != nil
os_unfair_lock_unlock(&lock)
if ready { learningLog("INFO", "Learning: recording started"); return nil }
}
let signaled = startupSignal.wait(timeout: .now() + .milliseconds(500)) == .success

os_unfair_lock_lock(&lock)
let failed = eventTap == nil
if failed { session = nil }
let failed = !signaled || eventTap == nil
if failed {
session = nil
startupCancelled = true
}
os_unfair_lock_unlock(&lock)

if failed {
for _ in 0..<50 {
if learningThread?.isFinished == true { break }
Thread.sleep(forTimeInterval: 0.01)
}
if learningThread?.isFinished == true {
learningThread = nil
learningRunLoop = nil
}
}
if !failed { learningLog("INFO", "Learning: recording started") }
return failed ? .inputMonitoringNotGranted : nil
}

Expand All @@ -97,6 +108,7 @@ nonisolated public final class LearningRecorder: @unchecked Sendable {
let actions = cur.actions
let result = cur
session = nil
startupCancelled = false
os_unfair_lock_unlock(&lock)

if let rl = learningRunLoop { CFRunLoopStop(rl) }
Expand All @@ -120,7 +132,15 @@ nonisolated public final class LearningRecorder: @unchecked Sendable {

// MARK: - Background Thread

private func runLearningThread() {
private func runLearningThread(startupSignal: DispatchSemaphore) {
os_unfair_lock_lock(&lock)
let cancelledBeforeStart = startupCancelled || session == nil
os_unfair_lock_unlock(&lock)
if cancelledBeforeStart {
startupSignal.signal()
return
}

var mask: CGEventMask = 0
for t: CGEventType in [.keyDown, .leftMouseDown, .leftMouseUp, .rightMouseDown, .rightMouseUp, .scrollWheel] {
mask |= (1 << t.rawValue)
Expand All @@ -131,13 +151,25 @@ nonisolated public final class LearningRecorder: @unchecked Sendable {
eventsOfInterest: mask, callback: learningEventCallback, userInfo: userInfo
) else {
learningLog("ERROR", "Learning: CGEvent tap creation failed (Input Monitoring not granted?)")
os_unfair_lock_lock(&lock); session = nil; os_unfair_lock_unlock(&lock)
os_unfair_lock_lock(&lock)
session = nil
startupCancelled = true
os_unfair_lock_unlock(&lock)
startupSignal.signal()
return
}

let source = CFMachPortCreateRunLoopSource(nil, tap, 0)
let rl = CFRunLoopGetCurrent()!
os_unfair_lock_lock(&lock); eventTap = tap; learningRunLoop = rl; os_unfair_lock_unlock(&lock)
os_unfair_lock_lock(&lock)
let cancelledAfterCreate = startupCancelled || session == nil
if !cancelledAfterCreate {
eventTap = tap
learningRunLoop = rl
}
os_unfair_lock_unlock(&lock)
startupSignal.signal()
if cancelledAfterCreate { return }

CFRunLoopAddSource(rl, source, .commonModes)
CGEvent.tapEnable(tap: tap, enable: true)
Expand All @@ -160,6 +192,9 @@ nonisolated public final class LearningRecorder: @unchecked Sendable {
CFRunLoopRemoveSource(rl, source, .commonModes)
os_unfair_lock_lock(&lock)
eventTap = nil
learningRunLoop = nil
learningThread = nil
startupCancelled = false
invalidateTimer(&keystrokeFlushTimer)
invalidateTimer(&scrollFlushTimer)
invalidateTimer(&maxDurationTimer)
Expand Down
74 changes: 73 additions & 1 deletion Sources/GhostOS/Recipes/RecipeEngine.swift
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,42 @@ public enum RecipeEngine {
count: params["count"].flatMap(Int.init)
)

case "vision_click":
guard let description = params["description"] ?? params["query"] ?? params["target"] else {
return ToolResult(
success: false,
error: "Step \(step.id): 'vision_click' action requires 'description' param"
)
}

let cropBox = parseCropBox(params["crop_box"], appName: stepApp)
let grounded = VisionPerception.groundElement(
description: description,
appName: stepApp,
cropBox: cropBox
)
guard grounded.success,
let x = grounded.data?["x"] as? Double,
let y = grounded.data?["y"] as? Double
else {
return ToolResult(
success: false,
error: grounded.error ?? "Step \(step.id): vision grounding failed",
suggestion: grounded.suggestion
)
}

return Actions.click(
query: nil,
role: nil,
domId: nil,
appName: stepApp,
x: x,
y: y,
button: params["button"],
count: params["count"].flatMap(Int.init)
)

case "type":
guard let text = params["text"] else {
return ToolResult(success: false, error: "Step \(step.id): 'type' action requires 'text' param")
Expand Down Expand Up @@ -374,11 +410,47 @@ public enum RecipeEngine {
return ToolResult(
success: false,
error: "Unknown recipe action: '\(step.action)'",
suggestion: "Valid actions: click, type, press, hotkey, focus, scroll, hover, long_press, drag, wait"
suggestion: "Valid actions: click, vision_click, type, press, hotkey, focus, scroll, hover, long_press, drag, wait"
)
}
}

/// Parses a recipe `crop_box` parameter into a four-element box.
///
/// `rawValue` must be a comma-separated list of exactly four finite numbers
/// (surrounding spaces/tabs around each number are ignored). Elements 0 and 2
/// are horizontal values, elements 1 and 3 are vertical values.
///
/// When every number lies in `0...1`, the values are treated as ratios of the
/// focused window of `appName`: each is scaled by the window's width/height and
/// offset by the window's position. Otherwise the numbers are returned unchanged.
///
/// - Parameters:
///   - rawValue: Raw `crop_box` string, e.g. `"0.68,0.38,0.98,0.86"`.
///   - appName: App whose focused window is used to resolve normalized values.
/// - Returns: The four resolved numbers, or `nil` when `rawValue` is absent or
///   malformed, or when a normalized box cannot be resolved because the app,
///   its focused window, or the window's position/size is unavailable.
private static func parseCropBox(_ rawValue: String?, appName: String?) -> [Double]? {
    guard let rawValue else { return nil }

    // Keep empty fields so "a,,b,c,d" is rejected by the count check instead of
    // silently collapsing into four values with shifted positions.
    let fields = rawValue.split(separator: ",", omittingEmptySubsequences: false)
    guard fields.count == 4 else { return nil }

    // Every field must parse; dropping a bad token would misalign the box.
    // Non-finite values ("nan", "inf") parse as Double but are not usable bounds.
    let numbers = fields.compactMap { Double($0.trimmingCharacters(in: .whitespaces)) }
    guard numbers.count == fields.count, numbers.allSatisfy(\.isFinite) else { return nil }

    // Treat normalized values (0...1) as ratios of the focused window frame.
    let isNormalized = numbers.allSatisfy { (0.0...1.0).contains($0) }
    guard isNormalized else { return numbers }

    guard let appName,
          let app = Perception.findApp(named: appName),
          let appElement = Element.application(for: app.processIdentifier),
          let window = appElement.focusedWindow(),
          let pos = window.position(),
          let size = window.size()
    else {
        return nil
    }

    let originX = Double(pos.x)
    let originY = Double(pos.y)
    let width = Double(size.width)
    let height = Double(size.height)

    return [
        originX + numbers[0] * width,
        originY + numbers[1] * height,
        originX + numbers[2] * width,
        originY + numbers[3] * height,
    ]
}

// MARK: - Wait After

/// Handle a step's wait_after condition.
Expand Down
28 changes: 28 additions & 0 deletions recipes/linkedin-submit-comment.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"schema_version": 2,
"name": "linkedin-submit-comment",
"description": "Submit an already-open LinkedIn comment draft in Chrome by visually locating the blue 评论 button inside the active comment composer.",
"app": "Google Chrome",
"preconditions": {
"app_running": "Google Chrome",
"url_contains": "linkedin.com"
},
"steps": [
{
"id": 1,
"action": "vision_click",
"on_failure": "skip",
"note": "Click the blue 评论 submit button in the open LinkedIn comment composer",
"params": {
"description": "blue 评论 submit button at the bottom right of the open LinkedIn comment box",
"crop_box": "0.68,0.38,0.98,0.86"
},
"wait_after": {
"condition": "elementGone",
"value": "内容创建文本编辑器",
"timeout": 2
}
}
],
"on_failure": "stop"
}