Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 40 additions & 15 deletions Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift
Original file line number Diff line number Diff line change
Expand Up @@ -96,21 +96,7 @@ extension SuggestionCoordinator {
let visualContextSummary = permissionManager.screenRecordingGranted
? visualContextCoordinator.excerpt(for: context)
: nil
let rawClipboard = settingsSnapshot.isClipboardContextEnabled
? clipboardContextProvider.currentContext()
: nil
// Same bounded window the downstream distiller sees, so the relevance gate and the
// per-line filter can't disagree about what "shares tokens with the prefix" means.
let truncatedPrefix = SuggestionRequestFactory.truncatedPromptPrefix(
from: rawContext.precedingText,
configuration: configuration,
engine: settingsSnapshot.selectedEngine
)
let clipboardContext = clipboardRelevanceFilter.filter(
clipboard: rawClipboard,
pasteboardChangeCount: clipboardContextProvider.currentChangeCount,
precedingText: truncatedPrefix
)
let clipboardContext = pinnedClipboardContext(rawContext: rawContext)
let requestBuildResult = SuggestionRequestFactory.buildRequest(
context: context,
settings: settingsSnapshot,
Expand Down Expand Up @@ -164,6 +150,45 @@ extension SuggestionCoordinator {
}
}

/// Returns the clipboard section to include in the prompt, or `nil` when there is none.
///
/// Pinning policy:
/// - Clipboard context disabled in settings: returns `nil` and leaves the memo untouched.
/// - The memo matches the current focus sequence and pasteboard change count and holds a
///   non-nil value: that value is returned as-is, without re-running the relevance filter,
///   so the head of the prompt does not change between requests in the same field session.
/// - Anything else (no memo yet, a field switch, a new copy, or a memo whose value is nil):
///   the relevance filter runs against the current prefix and its result, nil or not,
///   replaces the memo.
private func pinnedClipboardContext(rawContext: FocusedInputSnapshot) -> String? {
    if !settingsSnapshot.isClipboardContextEnabled {
        return nil
    }

    let pasteboardChangeCount = clipboardContextProvider.currentChangeCount
    let focusSequence = rawContext.focusChangeSequence

    // Reuse only an accepted verdict; a nil verdict falls through and is re-evaluated.
    if let pinned = clipboardPrefaceMemo,
       pinned.focusSequence == focusSequence,
       pinned.changeCount == pasteboardChangeCount,
       let pinnedValue = pinned.value {
        return pinnedValue
    }

    // Gate relevance on the same bounded prefix window that the request factory produces,
    // so this check and the downstream per-line filtering look at identical text.
    let boundedPrefix = SuggestionRequestFactory.truncatedPromptPrefix(
        from: rawContext.precedingText,
        configuration: configuration,
        engine: settingsSnapshot.selectedEngine
    )
    let verdict = clipboardRelevanceFilter.filter(
        clipboard: clipboardContextProvider.currentContext(),
        pasteboardChangeCount: pasteboardChangeCount,
        precedingText: boundedPrefix
    )

    // Remember the verdict together with the state it was computed for.
    clipboardPrefaceMemo = ClipboardPrefaceMemo(
        focusSequence: focusSequence,
        changeCount: pasteboardChangeCount,
        value: verdict
    )
    return verdict
}

/// Runs the typo gate for the current word. Returns `true` when it handled the cycle by suppressing,
/// offering, or applying a correction; `false` proceeds with a normal continuation. Kept separate
/// so `generateFromCurrentFocus` stays within the project's cyclomatic-complexity budget.
Expand Down
15 changes: 15 additions & 0 deletions Cotabby/App/Coordinators/SuggestionCoordinator.swift
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,21 @@ final class SuggestionCoordinator: ObservableObject {
// barrier task that the next generation must cross before it can ask the runtime for output.
var cacheResetSequence: UInt64 = 0
var pendingCacheReset: (sequence: UInt64, task: Task<Void, Never>)?
/// One accepted clipboard-relevance verdict per (field session, pasteboard state). The verdict
/// used to be re-evaluated against the live prefix on every request, and because the clipboard
/// section precedes the typed prefix in the prompt, every flip rewrote the prompt HEAD and
/// collapsed the engine's reusable common prefix back to zero (a full re-prefill). A pinned
/// non-nil verdict keeps the prompt head stable for the field session; a nil verdict keeps
/// re-evaluating because adding nothing to the prompt cannot destabilize the head, and the
/// clipboard may only become relevant once more text is typed. A new copy (change count) or a
/// field switch (focus sequence) always re-evaluates. See `pinnedClipboardContext`.
struct ClipboardPrefaceMemo {
// Focus-change sequence of the input snapshot the verdict was computed for; a different
// value on a later request means the memo no longer applies.
let focusSequence: UInt64
// Pasteboard change count read when the verdict was computed; a different value on a later
// request means the memo no longer applies.
let changeCount: Int
// Result of the clipboard relevance filter. `nil` means nothing was accepted, and a nil
// value is never reused: the filter runs again on the next request.
let value: String?
}

var clipboardPrefaceMemo: ClipboardPrefaceMemo?
/// Monotonic cancellation token for the "wait until the host publishes typed text to AX" loop.
///
/// Keystrokes can arrive faster than Chromium publishes contenteditable updates. Without this
Expand Down
8 changes: 6 additions & 2 deletions Cotabby/Models/VisualContextModels.swift
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,12 @@ struct VisualContextConfiguration: Equatable, Sendable {
static let `default` = VisualContextConfiguration(
// Capture a wider field-centered area so OCR can see nearby labels and conversation turns.
snapshotDimension: 700,
// Vision's accurate mode benefits from more pixels, especially on dense document UIs.
maxImageDimension: 1600,
// Vision's accurate mode benefits from more pixels, but OCR cost scales with pixel area
// and a Retina capture of the 700pt strip arrives well above this cap either way. 1200
// keeps typical 11-13pt UI text comfortably above Vision's recognition floor (the strip
// is ~1000pt wide, so this is ~1.2 px/pt) while cutting the Vision workload ~44% versus
// the previous 1600 cap.
maxImageDimension: 1200,
minRecognizedCharacterCount: 12,
// The summarizer needs enough raw OCR to recover task, filenames, and nearby messages.
maxRecognizedCharacters: 5000,
Expand Down
73 changes: 72 additions & 1 deletion Cotabby/Services/Visual/ScreenshotContextGenerator.swift
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,14 @@ final class ScreenshotContextGenerator {
private let textExtractor: any ScreenTextExtracting
private let configuration: VisualContextConfiguration

/// Recent OCR extractions keyed by a pixel hash of the captured crop, so refocusing a window
/// whose content has not changed skips the Vision pass (the dominant cost of this pipeline).
/// Only the raw extraction is cached: hygiene and bounding still rerun against the live field
/// text below, so a cache hit stays byte-identical to re-OCRing identical pixels. Bounded to a
/// few entries so alt-tabbing between two or three windows keeps hitting.
private var extractionCache: [(hash: UInt64, extracted: ExtractedScreenText)] = []
private static let extractionCacheLimit = 4

init(
screenshotService: (any WindowScreenshotCapturing)? = nil,
textExtractor: (any ScreenTextExtracting)? = nil,
Expand All @@ -60,6 +68,11 @@ final class ScreenshotContextGenerator {

await onStatusChange?(.extractingText)

let pixelHash = Self.pixelHash(of: screenshot.image)
if let pixelHash, let cached = cachedExtraction(for: pixelHash) {
return try finishedExcerpt(from: cached, context: context, image: screenshot.image)
}

let extracted: ExtractedScreenText
do {
extracted = try await textExtractor.extractText(from: screenshot.image)
Expand Down Expand Up @@ -89,6 +102,19 @@ final class ScreenshotContextGenerator {
throw ScreenshotContextGenerationError.failed(error.localizedDescription)
}

storeExtraction(extracted, for: pixelHash)
return try finishedExcerpt(from: extracted, context: context, image: screenshot.image)
}

/// Hygiene, normalization, bounding, and the meaningful-signal gate, shared by the fresh and
/// cache-hit paths so a hit stays byte-identical to re-OCRing the same pixels. The field-text
/// stripping in particular must rerun per call: the cached extraction may have been taken when
/// the user's own typed text differed.
private func finishedExcerpt(
from extracted: ExtractedScreenText,
context: FocusedInputSnapshot,
image: CGImage
) throws -> VisualContextExcerpt {
// Filter OCR corruption (garbled / symbol-noise / digit-substituted lines) and strip any
// line that merely echoes the user's own field text, then sanitize for prompt-injection
// safety. No model summarization: a base model conditions fine on cleaned raw context, and
Expand All @@ -102,7 +128,7 @@ final class ScreenshotContextGenerator {

if CotabbyDebugOptions.isEnabled {
saveDebugScreenshot(
screenshot.image,
image,
text: extracted.text,
name: sanitizedDebugName(from: context.applicationName)
)
Expand All @@ -122,6 +148,51 @@ final class ScreenshotContextGenerator {
return VisualContextExcerpt(text: finalContextText)
}

// MARK: - Extraction cache

/// Computes a 64-bit FNV-1a hash over a strided sample of the image's backing bytes, then
/// mixes in the pixel width and height.
///
/// Only every 17th byte is read, so the cost is a small fraction of a full pass over the
/// buffer. The trade-off is that two images differing only in unsampled bytes hash the same;
/// the consequence of such a collision is that OCR text from the earlier image is reused.
///
/// - Parameter image: The captured crop to fingerprint.
/// - Returns: The hash, or `nil` when the image has no data provider or its bytes cannot be
///   read. Callers treat `nil` as "do not cache".
private static func pixelHash(of image: CGImage) -> UInt64? {
    guard let data = image.dataProvider?.data,
          let bytes = CFDataGetBytePtr(data) else {
        return nil
    }

    // 17, not 16: with 4-byte pixels a multiple-of-4 stride lands on the same color channel
    // forever, so a chroma-only change (e.g. a theme toggle with unchanged luminance) could
    // hash identically. A stride coprime with the pixel size cycles through all four channels.
    let sampleStride = 17
    let length = CFDataGetLength(data)

    // Standard 64-bit FNV-1a parameters (prime and offset basis).
    let prime: UInt64 = 0x0000_0100_0000_01B3
    var hash: UInt64 = 0xcbf2_9ce4_8422_2325

    for index in stride(from: 0, to: length, by: sampleStride) {
        hash = (hash ^ UInt64(bytes[index])) &* prime
    }

    // Fold in the dimensions so identically-sampled buffers of different shapes still differ.
    hash = (hash ^ UInt64(image.width)) &* prime
    hash = (hash ^ UInt64(image.height)) &* prime
    return hash
}

/// Returns the extraction stored under `hash`, or `nil` when no cache entry has that hash.
/// A lookup does not reorder or otherwise modify the cache.
private func cachedExtraction(for hash: UInt64) -> ExtractedScreenText? {
    for entry in extractionCache where entry.hash == hash {
        return entry.extracted
    }
    return nil
}

/// Stores `extracted` as the newest cache entry for `hash`, replacing any entry that already
/// uses that hash, then drops the oldest entries until the cache is back within
/// `extractionCacheLimit`. A `nil` hash means the image could not be fingerprinted, so nothing
/// is stored.
private func storeExtraction(_ extracted: ExtractedScreenText, for hash: UInt64?) {
    guard let key = hash else { return }

    // Remove-then-append keeps one entry per hash and moves a re-stored hash to the newest slot.
    extractionCache.removeAll(where: { $0.hash == key })
    extractionCache.append((hash: key, extracted: extracted))

    // Oldest entries sit at the front of the array.
    let overflow = extractionCache.count - Self.extractionCacheLimit
    if overflow > 0 {
        extractionCache.removeFirst(overflow)
    }
}

private func captureScreenshot(
for context: FocusedInputSnapshot,
onStatusChange: (@Sendable (VisualContextStatus) async -> Void)?
Expand Down
3 changes: 2 additions & 1 deletion CotabbyTests/PermissionAndContextModelTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,8 @@ final class VisualContextModelTests: XCTestCase {
func test_defaultConfiguration_hasExpectedValues() {
let config = VisualContextConfiguration.default
XCTAssertEqual(config.snapshotDimension, 700)
XCTAssertEqual(config.maxImageDimension, 1600)
// 1200 is the measured-tradeoff OCR input cap; see the rationale on the default config.
XCTAssertEqual(config.maxImageDimension, 1200)
XCTAssertEqual(config.minRecognizedCharacterCount, 12)
XCTAssertEqual(config.maxRecognizedCharacters, 5000)
XCTAssertEqual(config.maxSummaryCharacters, 1500)
Expand Down
44 changes: 44 additions & 0 deletions CotabbyTests/ScreenshotContextGeneratorTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,34 @@ final class ScreenshotContextGeneratorTests: XCTestCase {
)
}

/// Generating context twice from the same captured image must run the text extractor once and
/// yield the same excerpt text both times.
func test_generateContext_reusesExtractionForIdenticalPixels() async throws {
    // Arrange: one canned OCR line, a counting extractor, and a capture stub built around a
    // single image so both generations see the same pixels.
    let ocrLine = "GeneralPaneView.swift should say Screen Recording is required for autocomplete context"
    let cannedExtraction = ExtractedScreenText(
        text: ocrLine,
        lineCount: 1,
        lines: [OCRTextHygiene.OCRLine(text: ocrLine, confidence: 0.9)]
    )
    let countingExtractor = CountingTextExtractor(extracted: cannedExtraction)
    let captureStub = StubScreenshotCapture(
        screenshot: CapturedWindowScreenshot(image: makeImage(), windowTitle: nil)
    )
    let generator = ScreenshotContextGenerator(
        screenshotService: captureStub,
        textExtractor: countingExtractor,
        configuration: .default
    )

    // Act
    let freshExcerpt = try await generator.generateContext(for: makeSnapshot())
    let repeatedExcerpt = try await generator.generateContext(for: makeSnapshot())

    // Assert
    XCTAssertEqual(
        countingExtractor.extractionCount,
        1,
        "Re-capturing pixel-identical content must reuse the extraction instead of re-running Vision."
    )
    XCTAssertEqual(
        freshExcerpt.text,
        repeatedExcerpt.text,
        "A cache hit must produce the same excerpt as a fresh OCR."
    )
}

func test_generateContext_dropsLowConfidenceOCRLines() async throws {
// A clean, plausible sentence at low confidence must be dropped even though no other hygiene
// filter would catch it, proving real per-line Vision confidence now reaches the hygiene pass.
Expand Down Expand Up @@ -152,6 +180,22 @@ private struct StubScreenshotCapture: WindowScreenshotCapturing {
}
}

/// Test double for `ScreenTextExtracting` that always returns the extraction it was created
/// with and tallies how many times `extractText(from:)` has been called, so tests can assert
/// whether the extraction step was skipped.
@MainActor
private final class CountingTextExtractor: ScreenTextExtracting {
    /// Number of `extractText(from:)` calls made so far.
    private(set) var extractionCount = 0

    /// The fixed result returned from every call.
    private let cannedExtraction: ExtractedScreenText

    init(extracted: ExtractedScreenText) {
        cannedExtraction = extracted
    }

    /// Records the call and returns the canned extraction; the image itself is ignored.
    func extractText(from image: CGImage) async throws -> ExtractedScreenText {
        extractionCount += 1
        return cannedExtraction
    }
}

private struct StubTextExtractor: ScreenTextExtracting {
enum Result {
case success(ExtractedScreenText)
Expand Down