From 184d9669ebdcc61d43a5a2b7b8ce36551c237de3 Mon Sep 17 00:00:00 2001 From: Jacob Fu <141651335+FuJacob@users.noreply.github.com> Date: Thu, 11 Jun 2026 18:58:05 -0700 Subject: [PATCH 1/3] perf(visual): skip re-OCR of unchanged pixels, right-size the OCR input, pin the clipboard preface Three visual-context efficiency cuts. Refocusing a window re-ran the full Vision pass even when the captured pixels were identical; a small pixel-hash cache now reuses the raw extraction while hygiene and bounding still rerun against the live field text, so a hit stays byte-identical to re-OCRing the same pixels. The pre-OCR downscale cap drops from 1600 to 1200 (the Retina capture of the 700pt strip exceeds both caps, and 1200 keeps UI text well above Vision's recognition floor while cutting the Vision workload ~44%). And the clipboard relevance verdict, which was re-evaluated against the live prefix on every request, is now pinned per field session once accepted: the clipboard section precedes the typed prefix in the prompt, so each verdict flip rewrote the prompt head and collapsed the engine's reusable KV common prefix into a full re-prefill. --- .../SuggestionCoordinator+Prediction.swift | 55 +++++++++++---- .../Coordinators/SuggestionCoordinator.swift | 15 ++++ Cotabby/Models/VisualContextModels.swift | 8 ++- .../Visual/ScreenshotContextGenerator.swift | 70 ++++++++++++++++++- .../ScreenshotContextGeneratorTests.swift | 44 ++++++++++++ 5 files changed, 174 insertions(+), 18 deletions(-) diff --git a/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift b/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift index 68380313..fcfae5e9 100644 --- a/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift +++ b/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift @@ -96,21 +96,7 @@ extension SuggestionCoordinator { let visualContextSummary = permissionManager.screenRecordingGranted ? 
visualContextCoordinator.excerpt(for: context) : nil - let rawClipboard = settingsSnapshot.isClipboardContextEnabled - ? clipboardContextProvider.currentContext() - : nil - // Same bounded window the downstream distiller sees, so the relevance gate and the - // per-line filter can't disagree about what "shares tokens with the prefix" means. - let truncatedPrefix = SuggestionRequestFactory.truncatedPromptPrefix( - from: rawContext.precedingText, - configuration: configuration, - engine: settingsSnapshot.selectedEngine - ) - let clipboardContext = clipboardRelevanceFilter.filter( - clipboard: rawClipboard, - pasteboardChangeCount: clipboardContextProvider.currentChangeCount, - precedingText: truncatedPrefix - ) + let clipboardContext = pinnedClipboardContext(rawContext: rawContext) let requestBuildResult = SuggestionRequestFactory.buildRequest( context: context, settings: settingsSnapshot, @@ -164,6 +150,45 @@ extension SuggestionCoordinator { } } + /// Resolves the clipboard prompt section under the pinning policy documented on + /// `clipboardPrefaceMemo`: an accepted (non-nil) verdict is reused for the rest of the field + /// session so the prompt head stays stable and the engine's KV common prefix survives; a nil + /// verdict re-evaluates per request because it adds nothing to the prompt and the clipboard + /// may only become relevant once more text is typed. A new copy or a field switch always + /// re-evaluates. + private func pinnedClipboardContext(rawContext: FocusedInputSnapshot) -> String? 
{ + guard settingsSnapshot.isClipboardContextEnabled else { + return nil + } + + let changeCount = clipboardContextProvider.currentChangeCount + if let memo = clipboardPrefaceMemo, + memo.focusSequence == rawContext.focusChangeSequence, + memo.changeCount == changeCount, + memo.value != nil { + return memo.value + } + + // Same bounded window the downstream distiller sees, so the relevance gate and the + // per-line filter can't disagree about what "shares tokens with the prefix" means. + let truncatedPrefix = SuggestionRequestFactory.truncatedPromptPrefix( + from: rawContext.precedingText, + configuration: configuration, + engine: settingsSnapshot.selectedEngine + ) + let value = clipboardRelevanceFilter.filter( + clipboard: clipboardContextProvider.currentContext(), + pasteboardChangeCount: changeCount, + precedingText: truncatedPrefix + ) + clipboardPrefaceMemo = ClipboardPrefaceMemo( + focusSequence: rawContext.focusChangeSequence, + changeCount: changeCount, + value: value + ) + return value + } + /// Runs the typo gate for the current word. Returns `true` when it handled the cycle by suppressing, /// offering, or applying a correction; `false` proceeds with a normal continuation. Kept separate /// so `generateFromCurrentFocus` stays within the project's cyclomatic-complexity budget. diff --git a/Cotabby/App/Coordinators/SuggestionCoordinator.swift b/Cotabby/App/Coordinators/SuggestionCoordinator.swift index 14980d95..6475bae3 100644 --- a/Cotabby/App/Coordinators/SuggestionCoordinator.swift +++ b/Cotabby/App/Coordinators/SuggestionCoordinator.swift @@ -78,6 +78,21 @@ final class SuggestionCoordinator: ObservableObject { // barrier task that the next generation must cross before it can ask the runtime for output. var cacheResetSequence: UInt64 = 0 var pendingCacheReset: (sequence: UInt64, task: Task)? + /// One accepted clipboard-relevance verdict per (field session, pasteboard state). 
The verdict + /// used to be re-evaluated against the live prefix on every request, and because the clipboard + /// section precedes the typed prefix in the prompt, every flip rewrote the prompt HEAD and + /// collapsed the engine's reusable common prefix back to zero (a full re-prefill). A pinned + /// non-nil verdict keeps the prompt head stable for the field session; a nil verdict keeps + /// re-evaluating because adding nothing to the prompt cannot destabilize the head, and the + /// clipboard may only become relevant once more text is typed. A new copy (change count) or a + /// field switch (focus sequence) always re-evaluates. See `pinnedClipboardContext`. + struct ClipboardPrefaceMemo { + let focusSequence: UInt64 + let changeCount: Int + let value: String? + } + + var clipboardPrefaceMemo: ClipboardPrefaceMemo? /// Monotonic cancellation token for the "wait until the host publishes typed text to AX" loop. /// /// Keystrokes can arrive faster than Chromium publishes contenteditable updates. Without this diff --git a/Cotabby/Models/VisualContextModels.swift b/Cotabby/Models/VisualContextModels.swift index c346caa3..bd80e917 100644 --- a/Cotabby/Models/VisualContextModels.swift +++ b/Cotabby/Models/VisualContextModels.swift @@ -20,8 +20,12 @@ struct VisualContextConfiguration: Equatable, Sendable { static let `default` = VisualContextConfiguration( // Capture a wider field-centered area so OCR can see nearby labels and conversation turns. snapshotDimension: 700, - // Vision's accurate mode benefits from more pixels, especially on dense document UIs. - maxImageDimension: 1600, + // Vision's accurate mode benefits from more pixels, but OCR cost scales with pixel area + // and a Retina capture of the 700pt strip arrives well above this cap either way. 1200 + // keeps typical 11-13pt UI text comfortably above Vision's recognition floor (the strip + // is ~1000pt wide, so this is ~1.2 px/pt) while cutting the Vision workload ~44% versus + // the previous 1600 cap. 
+ maxImageDimension: 1200, minRecognizedCharacterCount: 12, // The summarizer needs enough raw OCR to recover task, filenames, and nearby messages. maxRecognizedCharacters: 5000, diff --git a/Cotabby/Services/Visual/ScreenshotContextGenerator.swift b/Cotabby/Services/Visual/ScreenshotContextGenerator.swift index fe0f42cf..70ef7101 100644 --- a/Cotabby/Services/Visual/ScreenshotContextGenerator.swift +++ b/Cotabby/Services/Visual/ScreenshotContextGenerator.swift @@ -34,6 +34,14 @@ final class ScreenshotContextGenerator { private let textExtractor: any ScreenTextExtracting private let configuration: VisualContextConfiguration + /// Recent OCR extractions keyed by a pixel hash of the captured crop, so refocusing a window + /// whose content has not changed skips the Vision pass (the dominant cost of this pipeline). + /// Only the raw extraction is cached: hygiene and bounding still rerun against the live field + /// text below, so a cache hit stays byte-identical to re-OCRing identical pixels. Bounded to a + /// few entries so alt-tabbing between two or three windows keeps hitting. + private var extractionCache: [(hash: UInt64, extracted: ExtractedScreenText)] = [] + private static let extractionCacheLimit = 4 + init( screenshotService: (any WindowScreenshotCapturing)? = nil, textExtractor: (any ScreenTextExtracting)? 
= nil, @@ -60,6 +68,11 @@ final class ScreenshotContextGenerator { await onStatusChange?(.extractingText) + let pixelHash = Self.pixelHash(of: screenshot.image) + if let pixelHash, let cached = cachedExtraction(for: pixelHash) { + return try finishedExcerpt(from: cached, context: context, image: screenshot.image) + } + let extracted: ExtractedScreenText do { extracted = try await textExtractor.extractText(from: screenshot.image) @@ -89,6 +102,19 @@ final class ScreenshotContextGenerator { throw ScreenshotContextGenerationError.failed(error.localizedDescription) } + storeExtraction(extracted, for: pixelHash) + return try finishedExcerpt(from: extracted, context: context, image: screenshot.image) + } + + /// Hygiene, normalization, bounding, and the meaningful-signal gate, shared by the fresh and + /// cache-hit paths so a hit stays byte-identical to re-OCRing the same pixels. The field-text + /// stripping in particular must rerun per call: the cached extraction may have been taken when + /// the user's own typed text differed. + private func finishedExcerpt( + from extracted: ExtractedScreenText, + context: FocusedInputSnapshot, + image: CGImage + ) throws -> VisualContextExcerpt { // Filter OCR corruption (garbled / symbol-noise / digit-substituted lines) and strip any // line that merely echoes the user's own field text, then sanitize for prompt-injection // safety. No model summarization: a base model conditions fine on cleaned raw context, and @@ -102,7 +128,7 @@ final class ScreenshotContextGenerator { if CotabbyDebugOptions.isEnabled { saveDebugScreenshot( - screenshot.image, + image, text: extracted.text, name: sanitizedDebugName(from: context.applicationName) ) @@ -122,6 +148,48 @@ final class ScreenshotContextGenerator { return VisualContextExcerpt(text: finalContextText) } + // MARK: - Extraction cache + + /// FNV-1a over a strided sample of the image bytes, mixed with the dimensions. 
Sampling at a + /// sparse fixed stride keeps the hash sub-millisecond on Retina crops while still touching every row; + /// any real content change moves enough antialiased pixels that a stride collision is + /// vanishingly unlikely, and the worst case of one is reusing OCR text for a window whose + /// pixels barely changed. `nil` (no readable backing data) simply disables caching. + private static func pixelHash(of image: CGImage) -> UInt64? { + guard let data = image.dataProvider?.data, + let bytes = CFDataGetBytePtr(data) else { + return nil + } + + let length = CFDataGetLength(data) + let prime: UInt64 = 0x0000_0100_0000_01B3 + var hash: UInt64 = 0xcbf2_9ce4_8422_2325 + var index = 0 + while index < length { + hash = (hash ^ UInt64(bytes[index])) &* prime + index += 16 + } + hash = (hash ^ UInt64(image.width)) &* prime + hash = (hash ^ UInt64(image.height)) &* prime + return hash + } + + private func cachedExtraction(for hash: UInt64) -> ExtractedScreenText? { + extractionCache.first(where: { $0.hash == hash })?.extracted + } + + private func storeExtraction(_ extracted: ExtractedScreenText, for hash: UInt64?) { + guard let hash else { + return + } + + extractionCache.removeAll { $0.hash == hash } + extractionCache.append((hash, extracted)) + if extractionCache.count > Self.extractionCacheLimit { + extractionCache.removeFirst(extractionCache.count - Self.extractionCacheLimit) + } + } + private func captureScreenshot( for context: FocusedInputSnapshot, onStatusChange: (@Sendable (VisualContextStatus) async -> Void)? 
diff --git a/CotabbyTests/ScreenshotContextGeneratorTests.swift b/CotabbyTests/ScreenshotContextGeneratorTests.swift index 1fdc6faf..0ea125cc 100644 --- a/CotabbyTests/ScreenshotContextGeneratorTests.swift +++ b/CotabbyTests/ScreenshotContextGeneratorTests.swift @@ -80,6 +80,34 @@ final class ScreenshotContextGeneratorTests: XCTestCase { ) } + func test_generateContext_reusesExtractionForIdenticalPixels() async throws { + let line = "GeneralPaneView.swift should say Screen Recording is required for autocomplete context" + let extractor = CountingTextExtractor( + extracted: ExtractedScreenText( + text: line, + lineCount: 1, + lines: [OCRTextHygiene.OCRLine(text: line, confidence: 0.9)] + ) + ) + let generator = ScreenshotContextGenerator( + screenshotService: StubScreenshotCapture( + screenshot: CapturedWindowScreenshot(image: makeImage(), windowTitle: nil) + ), + textExtractor: extractor, + configuration: .default + ) + + let first = try await generator.generateContext(for: makeSnapshot()) + let second = try await generator.generateContext(for: makeSnapshot()) + + XCTAssertEqual( + extractor.extractionCount, + 1, + "Re-capturing pixel-identical content must reuse the extraction instead of re-running Vision." + ) + XCTAssertEqual(first.text, second.text, "A cache hit must produce the same excerpt as a fresh OCR.") + } + func test_generateContext_dropsLowConfidenceOCRLines() async throws { // A clean, plausible sentence at low confidence must be dropped even though no other hygiene // filter would catch it, proving real per-line Vision confidence now reaches the hygiene pass. @@ -152,6 +180,22 @@ private struct StubScreenshotCapture: WindowScreenshotCapturing { } } +/// Counts Vision-pass invocations so the pixel-hash extraction cache can be asserted on. 
+@MainActor +private final class CountingTextExtractor: ScreenTextExtracting { + private let extracted: ExtractedScreenText + private(set) var extractionCount = 0 + + init(extracted: ExtractedScreenText) { + self.extracted = extracted + } + + func extractText(from image: CGImage) async throws -> ExtractedScreenText { + extractionCount += 1 + return extracted + } +} + private struct StubTextExtractor: ScreenTextExtracting { enum Result { case success(ExtractedScreenText) From 0dc803a0ca53cfaf1164634d29e668a3eb6a6177 Mon Sep 17 00:00:00 2001 From: Jacob Fu <141651335+FuJacob@users.noreply.github.com> Date: Thu, 11 Jun 2026 19:08:06 -0700 Subject: [PATCH 2/3] fix(tests): align the default-config expectation with the 1200px OCR input cap --- CotabbyTests/PermissionAndContextModelTests.swift | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CotabbyTests/PermissionAndContextModelTests.swift b/CotabbyTests/PermissionAndContextModelTests.swift index bd2a8019..ee308f5b 100644 --- a/CotabbyTests/PermissionAndContextModelTests.swift +++ b/CotabbyTests/PermissionAndContextModelTests.swift @@ -101,7 +101,8 @@ final class VisualContextModelTests: XCTestCase { func test_defaultConfiguration_hasExpectedValues() { let config = VisualContextConfiguration.default XCTAssertEqual(config.snapshotDimension, 700) - XCTAssertEqual(config.maxImageDimension, 1600) + // 1200 is the measured-tradeoff OCR input cap; see the rationale on the default config. 
+ XCTAssertEqual(config.maxImageDimension, 1200) XCTAssertEqual(config.minRecognizedCharacterCount, 12) XCTAssertEqual(config.maxRecognizedCharacters, 5000) XCTAssertEqual(config.maxSummaryCharacters, 1500) From de46342342a6f76538c2ef6339b28e96591011c2 Mon Sep 17 00:00:00 2001 From: Jacob Fu <141651335+FuJacob@users.noreply.github.com> Date: Thu, 11 Jun 2026 19:22:21 -0700 Subject: [PATCH 3/3] review: stride the pixel hash coprime with the pixel size so all channels are sampled --- Cotabby/Services/Visual/ScreenshotContextGenerator.swift | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Cotabby/Services/Visual/ScreenshotContextGenerator.swift b/Cotabby/Services/Visual/ScreenshotContextGenerator.swift index 70ef7101..0ca338b1 100644 --- a/Cotabby/Services/Visual/ScreenshotContextGenerator.swift +++ b/Cotabby/Services/Visual/ScreenshotContextGenerator.swift @@ -165,9 +165,12 @@ final class ScreenshotContextGenerator { let prime: UInt64 = 0x0000_0100_0000_01B3 var hash: UInt64 = 0xcbf2_9ce4_8422_2325 var index = 0 + // 17, not 16: with 4-byte pixels a multiple-of-4 stride lands on the same color channel + // forever, so a chroma-only change (e.g. a theme toggle with unchanged luminance) could + // hash identically. A stride coprime with the pixel size cycles through all four channels. while index < length { hash = (hash ^ UInt64(bytes[index])) &* prime - index += 16 + index += 17 } hash = (hash ^ UInt64(image.width)) &* prime hash = (hash ^ UInt64(image.height)) &* prime