FuJacob · FuJacob · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026
diff --git a/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift b/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift
@@ -96,21 +96,7 @@ extension SuggestionCoordinator {
         let visualContextSummary = permissionManager.screenRecordingGranted
             ? visualContextCoordinator.excerpt(for: context)
             : nil
-        let rawClipboard = settingsSnapshot.isClipboardContextEnabled
-            ? clipboardContextProvider.currentContext()
-            : nil
-        // Same bounded window the downstream distiller sees, so the relevance gate and the
-        // per-line filter can't disagree about what "shares tokens with the prefix" means.
-        let truncatedPrefix = SuggestionRequestFactory.truncatedPromptPrefix(
-            from: rawContext.precedingText,
-            configuration: configuration,
-            engine: settingsSnapshot.selectedEngine
-        )
-        let clipboardContext = clipboardRelevanceFilter.filter(
-            clipboard: rawClipboard,
-            pasteboardChangeCount: clipboardContextProvider.currentChangeCount,
-            precedingText: truncatedPrefix
-        )
+        let clipboardContext = pinnedClipboardContext(rawContext: rawContext)
         let requestBuildResult = SuggestionRequestFactory.buildRequest(
             context: context,
             settings: settingsSnapshot,
@@ -164,6 +150,45 @@ extension SuggestionCoordinator {
         }
     }
 
+    /// Resolves the clipboard prompt section under the pinning policy documented on
+    /// `ClipboardPrefaceMemo`: an accepted (non-nil) verdict is reused for the rest of the field
+    /// session so the prompt head stays stable and the engine's KV common prefix survives; a nil
+    /// verdict re-evaluates per request because it adds nothing to the prompt and the clipboard
+    /// may only become relevant once more text is typed. A new copy or a field switch always
+    /// re-evaluates.
+    private func pinnedClipboardContext(rawContext: FocusedInputSnapshot) -> String? {
+        guard settingsSnapshot.isClipboardContextEnabled else { return nil }
+
+        let focusSequence = rawContext.focusChangeSequence
+        let pasteboardState = clipboardContextProvider.currentChangeCount
+        // Reuse only an accepted verdict that belongs to this exact field session and pasteboard
+        // state; a nil verdict falls through so it is evaluated again against the current prefix.
+        if let memo = clipboardPrefaceMemo,
+           memo.focusSequence == focusSequence,
+           memo.changeCount == pasteboardState,
+           let pinned = memo.value {
+            return pinned
+        }
+
+        // The relevance gate must see the same bounded prefix window as the downstream distiller,
+        // otherwise the two could disagree about which tokens the clipboard shares with the prefix.
+        let boundedPrefix = SuggestionRequestFactory.truncatedPromptPrefix(
+            from: rawContext.precedingText,
+            configuration: configuration,
+            engine: settingsSnapshot.selectedEngine
+        )
+        let verdict = clipboardRelevanceFilter.filter(
+            clipboard: clipboardContextProvider.currentContext(),
+            pasteboardChangeCount: pasteboardState,
+            precedingText: boundedPrefix
+        )
+        // Remember every outcome; the reuse check above only ever honors a non-nil one.
+        clipboardPrefaceMemo = ClipboardPrefaceMemo(
+            focusSequence: focusSequence, changeCount: pasteboardState, value: verdict
+        )
+        return verdict
+    }
+
     /// Runs the typo gate for the current word. Returns `true` when it handled the cycle by suppressing,
     /// offering, or applying a correction; `false` proceeds with a normal continuation. Kept separate
     /// so `generateFromCurrentFocus` stays within the project's cyclomatic-complexity budget.

diff --git a/Cotabby/App/Coordinators/SuggestionCoordinator.swift b/Cotabby/App/Coordinators/SuggestionCoordinator.swift
@@ -78,6 +78,21 @@ final class SuggestionCoordinator: ObservableObject {
     // barrier task that the next generation must cross before it can ask the runtime for output.
     var cacheResetSequence: UInt64 = 0
     var pendingCacheReset: (sequence: UInt64, task: Task<Void, Never>)?
+    /// One accepted clipboard-relevance verdict per (field session, pasteboard state). The verdict
+    /// used to be re-evaluated against the live prefix on every request, and because the clipboard
+    /// section precedes the typed prefix in the prompt, every flip rewrote the prompt HEAD and
+    /// collapsed the engine's reusable common prefix back to zero (a full re-prefill). A pinned
+    /// non-nil verdict keeps the prompt head stable for the field session; a nil verdict keeps
+    /// re-evaluating because adding nothing to the prompt cannot destabilize the head, and the
+    /// clipboard may only become relevant once more text is typed. A new copy (change count) or a
+    /// field switch (focus sequence) always re-evaluates. See `pinnedClipboardContext`.
+    struct ClipboardPrefaceMemo {
+        let focusSequence: UInt64 // focus-change sequence of the field the verdict was computed in
+        let changeCount: Int // pasteboard change count observed when the verdict was computed
+        let value: String? // relevance-filter output; only a non-nil value is ever reused
+    }
+
+    var clipboardPrefaceMemo: ClipboardPrefaceMemo?
     /// Monotonic cancellation token for the "wait until the host publishes typed text to AX" loop.
     ///
     /// Keystrokes can arrive faster than Chromium publishes contenteditable updates. Without this

diff --git a/Cotabby/Models/VisualContextModels.swift b/Cotabby/Models/VisualContextModels.swift
@@ -20,8 +20,12 @@ struct VisualContextConfiguration: Equatable, Sendable {
     static let `default` = VisualContextConfiguration(
         // Capture a wider field-centered area so OCR can see nearby labels and conversation turns.
         snapshotDimension: 700,
-        // Vision's accurate mode benefits from more pixels, especially on dense document UIs.
-        maxImageDimension: 1600,
+        // Vision's accurate mode benefits from more pixels, but OCR cost scales with pixel area
+        // and a Retina capture of the 700pt strip arrives well above this cap either way. 1200
+        // keeps typical 11-13pt UI text comfortably above Vision's recognition floor (the strip
+        // is ~1000pt wide, so this is ~1.2 px/pt) while cutting the Vision workload ~44% versus
+        // the previous 1600 cap.
+        maxImageDimension: 1200,
         minRecognizedCharacterCount: 12,
         // The summarizer needs enough raw OCR to recover task, filenames, and nearby messages.
         maxRecognizedCharacters: 5000,

diff --git a/Cotabby/Services/Visual/ScreenshotContextGenerator.swift b/Cotabby/Services/Visual/ScreenshotContextGenerator.swift
@@ -34,6 +34,14 @@ final class ScreenshotContextGenerator {
     private let textExtractor: any ScreenTextExtracting
     private let configuration: VisualContextConfiguration
 
+    /// Recent OCR extractions keyed by a pixel hash of the captured crop, so refocusing a window
+    /// whose content has not changed skips the Vision pass (the dominant cost of this pipeline).
+    /// Only the raw extraction is cached: hygiene and bounding still rerun against the live field
+    /// text below, so a cache hit stays byte-identical to re-OCRing identical pixels. Bounded to a
+    /// few entries so alt-tabbing between two or three windows keeps hitting.
+    private var extractionCache: [(hash: UInt64, extracted: ExtractedScreenText)] = []
+    private static let extractionCacheLimit = 4
+
     init(
         screenshotService: (any WindowScreenshotCapturing)? = nil,
         textExtractor: (any ScreenTextExtracting)? = nil,
@@ -60,6 +68,11 @@ final class ScreenshotContextGenerator {
 
         await onStatusChange?(.extractingText)
 
+        let pixelHash = Self.pixelHash(of: screenshot.image)
+        if let pixelHash, let cached = cachedExtraction(for: pixelHash) {
+            return try finishedExcerpt(from: cached, context: context, image: screenshot.image)
+        }
+
         let extracted: ExtractedScreenText
         do {
             extracted = try await textExtractor.extractText(from: screenshot.image)
@@ -89,6 +102,19 @@ final class ScreenshotContextGenerator {
             throw ScreenshotContextGenerationError.failed(error.localizedDescription)
         }
 
+        storeExtraction(extracted, for: pixelHash)
+        return try finishedExcerpt(from: extracted, context: context, image: screenshot.image)
+    }
+
+    /// Hygiene, normalization, bounding, and the meaningful-signal gate, shared by the fresh and
+    /// cache-hit paths so a hit stays byte-identical to re-OCRing the same pixels. The field-text
+    /// stripping in particular must rerun per call: the cached extraction may have been taken when
+    /// the user's own typed text differed.
+    private func finishedExcerpt(
+        from extracted: ExtractedScreenText,
+        context: FocusedInputSnapshot,
+        image: CGImage
+    ) throws -> VisualContextExcerpt {
         // Filter OCR corruption (garbled / symbol-noise / digit-substituted lines) and strip any
         // line that merely echoes the user's own field text, then sanitize for prompt-injection
         // safety. No model summarization: a base model conditions fine on cleaned raw context, and
@@ -102,7 +128,7 @@ final class ScreenshotContextGenerator {
 
         if CotabbyDebugOptions.isEnabled {
             saveDebugScreenshot(
-                screenshot.image,
+                image,
                 text: extracted.text,
                 name: sanitizedDebugName(from: context.applicationName)
             )
@@ -122,6 +148,51 @@ final class ScreenshotContextGenerator {
         return VisualContextExcerpt(text: finalContextText)
     }
 
+    // MARK: - Extraction cache
+
+    /// FNV-1a over a strided sample of the image bytes, mixed with the dimensions. Sampling every
+    /// 17th byte keeps the hash cheap on Retina crops while still touching every row wider than
+    /// the stride; any real content change moves enough antialiased pixels that a stride
+    /// collision is unlikely, and the worst case of one is reusing OCR text for a window
+    /// whose pixels barely changed.
+    ///
+    /// - Returns: The sample hash, or `nil` when the image has no readable backing data, which
+    ///   simply disables caching for that capture.
+    private static func pixelHash(of image: CGImage) -> UInt64? {
+        guard let data = image.dataProvider?.data,
+              let bytes = CFDataGetBytePtr(data) else {
+            return nil
+        }
+
+        let prime: UInt64 = 0x0000_0100_0000_01B3
+        var hash: UInt64 = 0xcbf2_9ce4_8422_2325
+        // 17, not 16: with 4-byte pixels a multiple-of-4 stride lands on the same color channel
+        // forever, so a chroma-only change (e.g. a theme toggle with unchanged luminance) could
+        // hash identically. A stride coprime with the pixel size cycles through all four channels.
+        for index in stride(from: 0, to: CFDataGetLength(data), by: 17) {
+            hash = (hash ^ UInt64(bytes[index])) &* prime
+        }
+        hash = (hash ^ UInt64(image.width)) &* prime
+        hash = (hash ^ UInt64(image.height)) &* prime
+        return hash
+    }
+
+    /// Returns the extraction cached for `hash`, marking it most recently used so eviction is LRU.
+    private func cachedExtraction(for hash: UInt64) -> ExtractedScreenText? {
+        guard let index = extractionCache.firstIndex(where: { $0.hash == hash }) else { return nil }
+        let entry = extractionCache.remove(at: index)
+        extractionCache.append(entry)
+        return entry.extracted
+    }
+
+    /// Stores `extracted` as the newest entry; a `nil` hash leaves the cache untouched.
+    private func storeExtraction(_ extracted: ExtractedScreenText, for hash: UInt64?) {
+        guard let hash else { return }
+        extractionCache.removeAll { $0.hash == hash }
+        extractionCache.append((hash, extracted))
+        extractionCache.removeFirst(max(0, extractionCache.count - Self.extractionCacheLimit))
+    }
+
     private func captureScreenshot(
         for context: FocusedInputSnapshot,
         onStatusChange: (@Sendable (VisualContextStatus) async -> Void)?

diff --git a/CotabbyTests/PermissionAndContextModelTests.swift b/CotabbyTests/PermissionAndContextModelTests.swift
@@ -101,7 +101,8 @@ final class VisualContextModelTests: XCTestCase {
     func test_defaultConfiguration_hasExpectedValues() {
         let config = VisualContextConfiguration.default
         XCTAssertEqual(config.snapshotDimension, 700)
-        XCTAssertEqual(config.maxImageDimension, 1600)
+        // 1200 is the measured-tradeoff OCR input cap; see the rationale on the default config.
+        XCTAssertEqual(config.maxImageDimension, 1200)
         XCTAssertEqual(config.minRecognizedCharacterCount, 12)
         XCTAssertEqual(config.maxRecognizedCharacters, 5000)
         XCTAssertEqual(config.maxSummaryCharacters, 1500)

diff --git a/CotabbyTests/ScreenshotContextGeneratorTests.swift b/CotabbyTests/ScreenshotContextGeneratorTests.swift
@@ -80,6 +80,34 @@ final class ScreenshotContextGeneratorTests: XCTestCase {
         )
     }
 
+    func test_generateContext_reusesExtractionForIdenticalPixels() async throws {
+        let ocrLine = "GeneralPaneView.swift should say Screen Recording is required for autocomplete context"
+        let cannedExtraction = ExtractedScreenText(
+            text: ocrLine,
+            lineCount: 1,
+            lines: [OCRTextHygiene.OCRLine(text: ocrLine, confidence: 0.9)]
+        )
+        let extractor = CountingTextExtractor(extracted: cannedExtraction)
+        let capture = StubScreenshotCapture(
+            screenshot: CapturedWindowScreenshot(image: makeImage(), windowTitle: nil)
+        )
+        let generator = ScreenshotContextGenerator(
+            screenshotService: capture,
+            textExtractor: extractor,
+            configuration: .default
+        )
+
+        // Generate twice through the same generator and stubbed capture.
+        let fresh = try await generator.generateContext(for: makeSnapshot())
+        let cached = try await generator.generateContext(for: makeSnapshot())
+
+        XCTAssertEqual(
+            extractor.extractionCount, 1,
+            "Re-capturing pixel-identical content must reuse the extraction instead of re-running Vision."
+        )
+        XCTAssertEqual(fresh.text, cached.text, "A cache hit must produce the same excerpt as a fresh OCR.")
+    }
+
     func test_generateContext_dropsLowConfidenceOCRLines() async throws {
         // A clean, plausible sentence at low confidence must be dropped even though no other hygiene
         // filter would catch it, proving real per-line Vision confidence now reaches the hygiene pass.
@@ -152,6 +180,22 @@ private struct StubScreenshotCapture: WindowScreenshotCapturing {
     }
 }
 
+/// Test double returning one canned extraction; counts `extractText` calls so the pixel-hash cache can be asserted on.
+@MainActor
+private final class CountingTextExtractor: ScreenTextExtracting {
+    private let extracted: ExtractedScreenText // canned result handed back on every call
+    private(set) var extractionCount = 0 // incremented once per `extractText(from:)` call
+
+    init(extracted: ExtractedScreenText) {
+        self.extracted = extracted
+    }
+
+    func extractText(from image: CGImage) async throws -> ExtractedScreenText {
+        extractionCount += 1 // the image is never inspected; only the call is recorded
+        return extracted
+    }
+}
+
 private struct StubTextExtractor: ScreenTextExtracting {
     enum Result {
         case success(ExtractedScreenText)