Skip to content

Commit be6fa6c

Browse files
k4cper-g and claude
committed
Fix long text input dropping characters on Windows
_type() now tries ValuePattern.SetValue() before falling back to SendInput keyboard simulation. This bypasses the message queue entirely for elements that support it (e.g. Notepad). The SendInput fallback also removes chunking in favor of a single atomic call. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent ecef1e1 commit be6fa6c

File tree

2 files changed

+144
-31
lines changed

2 files changed

+144
-31
lines changed

cup/actions/_macos.py

Lines changed: 44 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,30 @@ def _get_element_center(element) -> tuple[float, float] | None:
208208
return x + w / 2.0, y + h / 2.0
209209

210210

211+
def _get_element_center_or_parent(element) -> tuple[float, float] | None:
    """Get the center of an element, walking up its parents if needed.

    Some elements (e.g., offscreen web content nodes in Safari) don't
    report valid bounds. Starting at *element* itself, this walks up the
    ``AXParent`` chain and returns the first center that
    ``_get_element_center`` is able to compute.

    There is no further fallback (for example to the window center): if no
    node yields a center, the result is ``None``.

    Args:
        element: The accessibility element to start from.

    Returns:
        The ``(x, y)`` center of the first node in the chain for which
        ``_get_element_center`` returns a value, or ``None`` when the chain
        ends, a parent lookup fails, or the depth limit is reached first.
    """
    from ApplicationServices import AXUIElementCopyAttributeValue, kAXErrorSuccess

    # Upper bound on how many nodes are inspected; guards against cycles in
    # the parent chain turning this into an infinite loop.
    max_depth = 20

    current = element
    for _ in range(max_depth):
        center = _get_element_center(current)
        if center is not None:
            return center

        # No usable bounds on this node: move one level up the hierarchy.
        err, parent = AXUIElementCopyAttributeValue(current, "AXParent", None)
        if err != kAXErrorSuccess or parent is None:
            break
        current = parent

    return None
233+
234+
211235
def _send_mouse_click(
212236
x: float,
213237
y: float,
@@ -305,30 +329,39 @@ def _send_mouse_long_press(x: float, y: float, duration: float = 0.8) -> None:
305329

306330

307331
def _send_scroll(x: float, y: float, direction: str, amount: int = 5) -> None:
308-
"""Send scroll event at screen coordinates via Quartz CGEvents."""
332+
"""Send scroll event at screen coordinates via Quartz CGEvents.
333+
334+
Uses pixel-based scrolling (kCGScrollEventUnitPixel) for reliable
335+
scrolling across all apps. Line-based scrolling (kCGScrollEventUnitLine)
336+
is unreliable in apps like Safari where line units may be interpreted
337+
as tiny or zero-pixel movements.
338+
"""
309339
from Quartz import (
310340
CGEventCreateScrollWheelEvent,
311341
CGEventPost,
312342
CGEventSetLocation,
313343
CGPointMake,
314344
kCGHIDEventTap,
315-
kCGScrollEventUnitLine,
345+
kCGScrollEventUnitPixel,
316346
)
317347

318348
point = CGPointMake(x, y)
319349

350+
# Convert line amount to pixels (~80px per line is a reasonable default)
351+
pixel_amount = amount * 80
352+
320353
if direction == "up":
321-
dy, dx = amount, 0
354+
dy, dx = pixel_amount, 0
322355
elif direction == "down":
323-
dy, dx = -amount, 0
356+
dy, dx = -pixel_amount, 0
324357
elif direction == "left":
325-
dy, dx = 0, amount
358+
dy, dx = 0, pixel_amount
326359
elif direction == "right":
327-
dy, dx = 0, -amount
360+
dy, dx = 0, -pixel_amount
328361
else:
329362
dy, dx = 0, 0
330363

331-
event = CGEventCreateScrollWheelEvent(None, kCGScrollEventUnitLine, 2, dy, dx)
364+
event = CGEventCreateScrollWheelEvent(None, kCGScrollEventUnitPixel, 2, dy, dx)
332365
CGEventSetLocation(event, point)
333366
CGEventPost(kCGHIDEventTap, event)
334367
time.sleep(0.02)
@@ -707,8 +740,10 @@ def _select(self, element) -> ActionResult:
707740
return self._click(element)
708741

709742
def _scroll(self, element, direction: str) -> ActionResult:
710-
# Get element center for scroll target
711-
center = _get_element_center(element)
743+
# Get element center for scroll target, walking up parents if needed.
744+
# Some elements (e.g., offscreen nodes in Safari) have no bounds,
745+
# so we fall back to the nearest ancestor with valid bounds.
746+
center = _get_element_center(element) or _get_element_center_or_parent(element)
712747
if center:
713748
try:
714749
_send_scroll(center[0], center[1], direction)

cup/actions/_windows.py

Lines changed: 100 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -259,33 +259,67 @@ def _send_unicode_string(text: str) -> None:
259259
Unlike _send_key_combo which maps characters to virtual key codes
260260
(breaking special characters like :, /, -, .), this sends each
261261
character as a Unicode scan code — preserving all characters exactly.
262+
263+
Control characters (newlines, tabs) are sent as virtual-key presses
264+
(VK_RETURN, VK_TAB) because many Windows apps — including the modern
265+
Windows 11 Notepad — do not interpret these when delivered as Unicode
266+
scan codes via KEYEVENTF_UNICODE.
267+
268+
All events are sent in a single atomic SendInput call (no chunking),
269+
followed by one brief pause so the target app can process them.
262270
"""
263-
inputs = []
264-
for char in text:
265-
code = ord(char)
266-
# Key down
267-
inp_down = INPUT()
268-
inp_down.type = INPUT_KEYBOARD
269-
inp_down._input.ki.wVk = 0
270-
inp_down._input.ki.wScan = code
271-
inp_down._input.ki.dwFlags = KEYEVENTF_UNICODE
272-
inputs.append(inp_down)
273-
# Key up
274-
inp_up = INPUT()
275-
inp_up.type = INPUT_KEYBOARD
276-
inp_up._input.ki.wVk = 0
277-
inp_up._input.ki.wScan = code
278-
inp_up._input.ki.dwFlags = KEYEVENTF_UNICODE | KEYEVENTF_KEYUP
279-
inputs.append(inp_up)
271+
# Normalize newlines: \r\n → \n, then standalone \r → \n.
272+
# We'll emit VK_RETURN for every \n below.
273+
text = text.replace("\r\n", "\n").replace("\r", "\n")
274+
275+
# Map control characters to their virtual-key codes.
276+
_CONTROL_VK = {
277+
"\n": 0x0D, # VK_RETURN
278+
"\t": 0x09, # VK_TAB
279+
}
280280

281+
inputs: list[INPUT] = []
282+
for char in text:
283+
vk = _CONTROL_VK.get(char)
284+
if vk is not None:
285+
# Send control character as a normal virtual-key press.
286+
inputs.append(_make_key_input(vk, down=True))
287+
inputs.append(_make_key_input(vk, down=False))
288+
else:
289+
code = ord(char)
290+
# Key down
291+
inp_down = INPUT()
292+
inp_down.type = INPUT_KEYBOARD
293+
inp_down._input.ki.wVk = 0
294+
inp_down._input.ki.wScan = code
295+
inp_down._input.ki.dwFlags = KEYEVENTF_UNICODE
296+
inputs.append(inp_down)
297+
# Key up
298+
inp_up = INPUT()
299+
inp_up.type = INPUT_KEYBOARD
300+
inp_up._input.ki.wVk = 0
301+
inp_up._input.ki.wScan = code
302+
inp_up._input.ki.dwFlags = KEYEVENTF_UNICODE | KEYEVENTF_KEYUP
303+
inputs.append(inp_up)
304+
305+
# Send all events in a single atomic SendInput call.
306+
_flush_inputs(inputs)
307+
308+
309+
def _flush_inputs(inputs: list[INPUT]) -> None:
    """Send a batch of INPUT events via one SendInput call, then pause briefly.

    Args:
        inputs: INPUT structures to inject, in order. An empty list is a
            no-op: nothing is sent and no pause is taken.

    Raises:
        RuntimeError: If SendInput reports fewer injected events than were
            requested (including zero), since the missing events would
            otherwise show up as silently dropped characters.
    """
    if not inputs:
        return

    count = len(inputs)
    arr = (INPUT * count)(*inputs)
    sent = ctypes.windll.user32.SendInput(count, arr, ctypes.sizeof(INPUT))
    if sent != count:
        # NOTE(review): ctypes.get_last_error() only reflects this call's
        # error code when user32 is loaded with use_last_error=True; with
        # the default ctypes.windll loader it may always read 0 -- confirm.
        err = ctypes.get_last_error()
        raise RuntimeError(
            f"SendInput (unicode) failed, sent {sent}/{count} events (error={err})"
        )
    # Brief pause gives the target app time to process the injected events
    # before the caller issues any further input.
    time.sleep(0.01)
289323

290324

291325
# ---------------------------------------------------------------------------
@@ -477,7 +511,25 @@ def _toggle(self, element) -> ActionResult:
477511
)
478512

479513
def _type(self, element, text: str) -> ActionResult:
480-
"""Type text via Unicode SendInput (preserves all special characters)."""
514+
"""Type text into an element.
515+
516+
Prefers ValuePattern.SetValue (instant, lossless) when available.
517+
Falls back to Unicode SendInput for elements that don't expose it.
518+
"""
519+
import comtypes
520+
521+
# Fast path: use ValuePattern to set text directly (no keyboard sim).
522+
try:
523+
pat = _get_pattern(element, UIA_ValuePatternId, _IValue)
524+
if pat:
525+
element.SetFocus()
526+
time.sleep(0.05)
527+
pat.SetValue(text)
528+
return ActionResult(success=True, message=f"Typed: {text}")
529+
except (comtypes.COMError, Exception):
530+
pass # fall through to SendInput
531+
532+
# Fallback: keyboard simulation via Unicode SendInput.
481533
try:
482534
element.SetFocus()
483535
time.sleep(0.05)
@@ -685,7 +737,28 @@ def open_app(self, name: str) -> ActionResult:
685737
error="Could not discover installed applications",
686738
)
687739

740+
# Try matching against display names first.
688741
match = _fuzzy_match(name, list(apps.keys()))
742+
743+
# If no match on display names, try matching against AppIDs.
744+
# This handles localized Windows where display names are
745+
# translated (e.g. "Notatnik" for Notepad on Polish Windows)
746+
# but AppIDs still contain the English name.
747+
if match is None:
748+
appid_to_name: dict[str, str] = {}
749+
for display, appid in apps.items():
750+
# Extract a readable name from the AppID.
751+
# UWP: "Microsoft.WindowsNotepad_8wekyb3d8bbwe!App" -> "WindowsNotepad"
752+
# Path: just use the display name (already tried above).
753+
parts = appid.split(".")
754+
if len(parts) >= 2:
755+
# Take the component after "Microsoft." etc., strip the suffix
756+
raw = parts[-1].split("_")[0].split("!")[0]
757+
appid_to_name[raw.lower()] = display
758+
appid_match = _fuzzy_match(name, list(appid_to_name.keys()))
759+
if appid_match is not None:
760+
match = appid_to_name[appid_match]
761+
689762
if match is None:
690763
return ActionResult(
691764
success=False,
@@ -847,7 +920,11 @@ def callback(hwnd, _lparam):
847920

848921
def _run_powershell(command: str, timeout: int = 10) -> tuple[str, bool]:
849922
"""Run a PowerShell command using base64-encoded input. Returns (output, success)."""
850-
encoded = base64.b64encode(command.encode("utf-16le")).decode("ascii")
923+
# Prepend a UTF-8 output-encoding directive so the stdout bytes are
924+
# valid UTF-8 regardless of the system's default codepage (e.g. cp1250
925+
# on Polish Windows which cannot represent many app names).
926+
full_command = "[Console]::OutputEncoding = [System.Text.Encoding]::UTF8; " + command
927+
encoded = base64.b64encode(full_command.encode("utf-16le")).decode("ascii")
851928
try:
852929
result = subprocess.run(
853930
[
@@ -859,8 +936,9 @@ def _run_powershell(command: str, timeout: int = 10) -> tuple[str, bool]:
859936
encoded,
860937
],
861938
capture_output=True,
862-
text=True,
863939
timeout=timeout,
940+
encoding="utf-8",
941+
errors="replace",
864942
)
865943
return result.stdout or "", result.returncode == 0
866944
except (subprocess.TimeoutExpired, FileNotFoundError, OSError):

0 commit comments

Comments
 (0)