diff --git a/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift b/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift index 68380313..fcfae5e9 100644 --- a/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift +++ b/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift @@ -96,21 +96,7 @@ extension SuggestionCoordinator { let visualContextSummary = permissionManager.screenRecordingGranted ? visualContextCoordinator.excerpt(for: context) : nil - let rawClipboard = settingsSnapshot.isClipboardContextEnabled - ? clipboardContextProvider.currentContext() - : nil - // Same bounded window the downstream distiller sees, so the relevance gate and the - // per-line filter can't disagree about what "shares tokens with the prefix" means. - let truncatedPrefix = SuggestionRequestFactory.truncatedPromptPrefix( - from: rawContext.precedingText, - configuration: configuration, - engine: settingsSnapshot.selectedEngine - ) - let clipboardContext = clipboardRelevanceFilter.filter( - clipboard: rawClipboard, - pasteboardChangeCount: clipboardContextProvider.currentChangeCount, - precedingText: truncatedPrefix - ) + let clipboardContext = pinnedClipboardContext(rawContext: rawContext) let requestBuildResult = SuggestionRequestFactory.buildRequest( context: context, settings: settingsSnapshot, @@ -164,6 +150,45 @@ extension SuggestionCoordinator { } } + /// Resolves the clipboard prompt section under the pinning policy documented on + /// `clipboardPrefaceMemo`: an accepted (non-nil) verdict is reused for the rest of the field + /// session so the prompt head stays stable and the engine's KV common prefix survives; a nil + /// verdict re-evaluates per request because it adds nothing to the prompt and the clipboard + /// may only become relevant once more text is typed. A new copy or a field switch always + /// re-evaluates. + private func pinnedClipboardContext(rawContext: FocusedInputSnapshot) -> String? 
{ + guard settingsSnapshot.isClipboardContextEnabled else { + return nil + } + + let changeCount = clipboardContextProvider.currentChangeCount + if let memo = clipboardPrefaceMemo, + memo.focusSequence == rawContext.focusChangeSequence, + memo.changeCount == changeCount, + memo.value != nil { + return memo.value + } + + // Same bounded window the downstream distiller sees, so the relevance gate and the + // per-line filter can't disagree about what "shares tokens with the prefix" means. + let truncatedPrefix = SuggestionRequestFactory.truncatedPromptPrefix( + from: rawContext.precedingText, + configuration: configuration, + engine: settingsSnapshot.selectedEngine + ) + let value = clipboardRelevanceFilter.filter( + clipboard: clipboardContextProvider.currentContext(), + pasteboardChangeCount: changeCount, + precedingText: truncatedPrefix + ) + clipboardPrefaceMemo = ClipboardPrefaceMemo( + focusSequence: rawContext.focusChangeSequence, + changeCount: changeCount, + value: value + ) + return value + } + /// Runs the typo gate for the current word. Returns `true` when it handled the cycle by suppressing, /// offering, or applying a correction; `false` proceeds with a normal continuation. Kept separate /// so `generateFromCurrentFocus` stays within the project's cyclomatic-complexity budget. diff --git a/Cotabby/App/Coordinators/SuggestionCoordinator.swift b/Cotabby/App/Coordinators/SuggestionCoordinator.swift index 14980d95..6475bae3 100644 --- a/Cotabby/App/Coordinators/SuggestionCoordinator.swift +++ b/Cotabby/App/Coordinators/SuggestionCoordinator.swift @@ -78,6 +78,21 @@ final class SuggestionCoordinator: ObservableObject { // barrier task that the next generation must cross before it can ask the runtime for output. var cacheResetSequence: UInt64 = 0 var pendingCacheReset: (sequence: UInt64, task: Task)? + /// One accepted clipboard-relevance verdict per (field session, pasteboard state). 
The verdict + /// used to be re-evaluated against the live prefix on every request, and because the clipboard + /// section precedes the typed prefix in the prompt, every flip rewrote the prompt HEAD and + /// collapsed the engine's reusable common prefix back to zero (a full re-prefill). A pinned + /// non-nil verdict keeps the prompt head stable for the field session; a nil verdict keeps + /// re-evaluating because adding nothing to the prompt cannot destabilize the head, and the + /// clipboard may only become relevant once more text is typed. A new copy (change count) or a + /// field switch (focus sequence) always re-evaluates. See `pinnedClipboardContext`. + struct ClipboardPrefaceMemo { + let focusSequence: UInt64 + let changeCount: Int + let value: String? + } + + var clipboardPrefaceMemo: ClipboardPrefaceMemo? /// Monotonic cancellation token for the "wait until the host publishes typed text to AX" loop. /// /// Keystrokes can arrive faster than Chromium publishes contenteditable updates. Without this diff --git a/Cotabby/Models/VisualContextModels.swift b/Cotabby/Models/VisualContextModels.swift index c346caa3..bd80e917 100644 --- a/Cotabby/Models/VisualContextModels.swift +++ b/Cotabby/Models/VisualContextModels.swift @@ -20,8 +20,12 @@ struct VisualContextConfiguration: Equatable, Sendable { static let `default` = VisualContextConfiguration( // Capture a wider field-centered area so OCR can see nearby labels and conversation turns. snapshotDimension: 700, - // Vision's accurate mode benefits from more pixels, especially on dense document UIs. - maxImageDimension: 1600, + // Vision's accurate mode benefits from more pixels, but OCR cost scales with pixel area + // and a Retina capture of the 700pt strip arrives well above this cap either way. 1200 + // keeps typical 11-13pt UI text comfortably above Vision's recognition floor (the strip + // is ~1000pt wide, so this is ~1.2 px/pt) while cutting the Vision workload ~44% versus + // the previous 1600 cap. 
+ maxImageDimension: 1200, minRecognizedCharacterCount: 12, // The summarizer needs enough raw OCR to recover task, filenames, and nearby messages. maxRecognizedCharacters: 5000, diff --git a/Cotabby/Services/Visual/ScreenshotContextGenerator.swift b/Cotabby/Services/Visual/ScreenshotContextGenerator.swift index fe0f42cf..0ca338b1 100644 --- a/Cotabby/Services/Visual/ScreenshotContextGenerator.swift +++ b/Cotabby/Services/Visual/ScreenshotContextGenerator.swift @@ -34,6 +34,14 @@ final class ScreenshotContextGenerator { private let textExtractor: any ScreenTextExtracting private let configuration: VisualContextConfiguration + /// Recent OCR extractions keyed by a pixel hash of the captured crop, so refocusing a window + /// whose content has not changed skips the Vision pass (the dominant cost of this pipeline). + /// Only the raw extraction is cached: hygiene and bounding still rerun against the live field + /// text below, so a cache hit stays byte-identical to re-OCRing identical pixels. Bounded to a + /// few entries so alt-tabbing between two or three windows keeps hitting. + private var extractionCache: [(hash: UInt64, extracted: ExtractedScreenText)] = [] + private static let extractionCacheLimit = 4 + init( screenshotService: (any WindowScreenshotCapturing)? = nil, textExtractor: (any ScreenTextExtracting)? 
= nil, @@ -60,6 +68,11 @@ final class ScreenshotContextGenerator { await onStatusChange?(.extractingText) + let pixelHash = Self.pixelHash(of: screenshot.image) + if let pixelHash, let cached = cachedExtraction(for: pixelHash) { + return try finishedExcerpt(from: cached, context: context, image: screenshot.image) + } + let extracted: ExtractedScreenText do { extracted = try await textExtractor.extractText(from: screenshot.image) @@ -89,6 +102,19 @@ final class ScreenshotContextGenerator { throw ScreenshotContextGenerationError.failed(error.localizedDescription) } + storeExtraction(extracted, for: pixelHash) + return try finishedExcerpt(from: extracted, context: context, image: screenshot.image) + } + + /// Hygiene, normalization, bounding, and the meaningful-signal gate, shared by the fresh and + /// cache-hit paths so a hit stays byte-identical to re-OCRing the same pixels. The field-text + /// stripping in particular must rerun per call: the cached extraction may have been taken when + /// the user's own typed text differed. + private func finishedExcerpt( + from extracted: ExtractedScreenText, + context: FocusedInputSnapshot, + image: CGImage + ) throws -> VisualContextExcerpt { // Filter OCR corruption (garbled / symbol-noise / digit-substituted lines) and strip any // line that merely echoes the user's own field text, then sanitize for prompt-injection // safety. No model summarization: a base model conditions fine on cleaned raw context, and @@ -102,7 +128,7 @@ final class ScreenshotContextGenerator { if CotabbyDebugOptions.isEnabled { saveDebugScreenshot( - screenshot.image, + image, text: extracted.text, name: sanitizedDebugName(from: context.applicationName) ) @@ -122,6 +148,51 @@ final class ScreenshotContextGenerator { return VisualContextExcerpt(text: finalContextText) } + // MARK: - Extraction cache + + /// FNV-1a over a strided sample of the image bytes, mixed with the dimensions. 
Sampling every + /// 17th byte keeps the hash sub-millisecond on Retina crops while still touching every row; + /// any real content change moves enough antialiased pixels that a stride collision is + /// vanishingly unlikely, and the worst case of one is reusing OCR text for a window whose + /// pixels barely changed. `nil` (no readable backing data) simply disables caching. + private static func pixelHash(of image: CGImage) -> UInt64? { + guard let data = image.dataProvider?.data, + let bytes = CFDataGetBytePtr(data) else { + return nil + } + + let length = CFDataGetLength(data) + let prime: UInt64 = 0x0000_0100_0000_01B3 + var hash: UInt64 = 0xcbf2_9ce4_8422_2325 + var index = 0 + // 17, not 16: with 4-byte pixels a multiple-of-4 stride lands on the same color channel + // forever, so a chroma-only change (e.g. a theme toggle with unchanged luminance) could + // hash identically. A stride coprime with the pixel size cycles through all four channels. + while index < length { + hash = (hash ^ UInt64(bytes[index])) &* prime + index += 17 + } + hash = (hash ^ UInt64(image.width)) &* prime + hash = (hash ^ UInt64(image.height)) &* prime + return hash + } + + private func cachedExtraction(for hash: UInt64) -> ExtractedScreenText? { + extractionCache.first(where: { $0.hash == hash })?.extracted + } + + private func storeExtraction(_ extracted: ExtractedScreenText, for hash: UInt64?) { + guard let hash else { + return + } + + extractionCache.removeAll { $0.hash == hash } + extractionCache.append((hash, extracted)) + if extractionCache.count > Self.extractionCacheLimit { + extractionCache.removeFirst(extractionCache.count - Self.extractionCacheLimit) + } + } + private func captureScreenshot( for context: FocusedInputSnapshot, onStatusChange: (@Sendable (VisualContextStatus) async -> Void)? 
diff --git a/CotabbyTests/PermissionAndContextModelTests.swift b/CotabbyTests/PermissionAndContextModelTests.swift index bd2a8019..ee308f5b 100644 --- a/CotabbyTests/PermissionAndContextModelTests.swift +++ b/CotabbyTests/PermissionAndContextModelTests.swift @@ -101,7 +101,8 @@ final class VisualContextModelTests: XCTestCase { func test_defaultConfiguration_hasExpectedValues() { let config = VisualContextConfiguration.default XCTAssertEqual(config.snapshotDimension, 700) - XCTAssertEqual(config.maxImageDimension, 1600) + // 1200 is the measured-tradeoff OCR input cap; see the rationale on the default config. + XCTAssertEqual(config.maxImageDimension, 1200) XCTAssertEqual(config.minRecognizedCharacterCount, 12) XCTAssertEqual(config.maxRecognizedCharacters, 5000) XCTAssertEqual(config.maxSummaryCharacters, 1500) diff --git a/CotabbyTests/ScreenshotContextGeneratorTests.swift b/CotabbyTests/ScreenshotContextGeneratorTests.swift index 1fdc6faf..0ea125cc 100644 --- a/CotabbyTests/ScreenshotContextGeneratorTests.swift +++ b/CotabbyTests/ScreenshotContextGeneratorTests.swift @@ -80,6 +80,34 @@ final class ScreenshotContextGeneratorTests: XCTestCase { ) } + func test_generateContext_reusesExtractionForIdenticalPixels() async throws { + let line = "GeneralPaneView.swift should say Screen Recording is required for autocomplete context" + let extractor = CountingTextExtractor( + extracted: ExtractedScreenText( + text: line, + lineCount: 1, + lines: [OCRTextHygiene.OCRLine(text: line, confidence: 0.9)] + ) + ) + let generator = ScreenshotContextGenerator( + screenshotService: StubScreenshotCapture( + screenshot: CapturedWindowScreenshot(image: makeImage(), windowTitle: nil) + ), + textExtractor: extractor, + configuration: .default + ) + + let first = try await generator.generateContext(for: makeSnapshot()) + let second = try await generator.generateContext(for: makeSnapshot()) + + XCTAssertEqual( + extractor.extractionCount, + 1, + "Re-capturing pixel-identical content must 
reuse the extraction instead of re-running Vision." + ) + XCTAssertEqual(first.text, second.text, "A cache hit must produce the same excerpt as a fresh OCR.") + } + func test_generateContext_dropsLowConfidenceOCRLines() async throws { // A clean, plausible sentence at low confidence must be dropped even though no other hygiene // filter would catch it, proving real per-line Vision confidence now reaches the hygiene pass. @@ -152,6 +180,22 @@ private struct StubScreenshotCapture: WindowScreenshotCapturing { } } +/// Counts Vision-pass invocations so the pixel-hash extraction cache can be asserted on. +@MainActor +private final class CountingTextExtractor: ScreenTextExtracting { + private let extracted: ExtractedScreenText + private(set) var extractionCount = 0 + + init(extracted: ExtractedScreenText) { + self.extracted = extracted + } + + func extractText(from image: CGImage) async throws -> ExtractedScreenText { + extractionCount += 1 + return extracted + } +} + private struct StubTextExtractor: ScreenTextExtracting { enum Result { case success(ExtractedScreenText)