diff --git a/Cotabby.xcodeproj/project.pbxproj b/Cotabby.xcodeproj/project.pbxproj index cfd245bd..02a03f32 100644 --- a/Cotabby.xcodeproj/project.pbxproj +++ b/Cotabby.xcodeproj/project.pbxproj @@ -175,6 +175,7 @@ 3E78D03ABA7141D344AB8285 /* he.txt in Resources */ = {isa = PBXBuildFile; fileRef = C9C000E46A1E404932F89C81 /* he.txt */; }; 3EF0A298B5590571B1C37282 /* FieldStyleCache.swift in Sources */ = {isa = PBXBuildFile; fileRef = B7FBF2B766E728F25899B64E /* FieldStyleCache.swift */; }; 3F5630CFB7BA40B900E832A1 /* OCRTextHygieneTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5EED3CD2BC7B48DF35DEE562 /* OCRTextHygieneTests.swift */; }; + 3F87586426B5EF16B41CE62F /* LlamaSuggestionEngineStreamingTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = DDC034BBCBAC5E7989D4C85B /* LlamaSuggestionEngineStreamingTests.swift */; }; 3F8CBCBCC45E377DF9ADB216 /* MacroTriggerStateMachineTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 22BE47D1DBF6C23151458836 /* MacroTriggerStateMachineTests.swift */; }; 3FCEF50FDD9EE01AE3711083 /* AXTreeDumpWriter.swift in Sources */ = {isa = PBXBuildFile; fileRef = B27492B04B627DA53BDAD938 /* AXTreeDumpWriter.swift */; }; 3FF6B7DE34A01C4AB7FA54E3 /* MacroTriggerStateMachine.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1C201A65A6B040F90C528A3B /* MacroTriggerStateMachine.swift */; }; @@ -405,10 +406,12 @@ 9CEBD6AF4405F1BBE0E3D16C /* MidWordContinuationPolicy.swift in Sources */ = {isa = PBXBuildFile; fileRef = 357C18383B047F24A531BDCD /* MidWordContinuationPolicy.swift */; }; 9D0F4829D11BCD4DB1290410 /* InsertionStrategySelector.swift in Sources */ = {isa = PBXBuildFile; fileRef = E0D2FEEA4304C86324BAADAB /* InsertionStrategySelector.swift */; }; 9E031B67A275BB3E049EFC2F /* frequency_dictionary_en_82_765.txt in Resources */ = {isa = PBXBuildFile; fileRef = 99FBB636008490B66CF26772 /* frequency_dictionary_en_82_765.txt */; }; + 9E4AED02831829A108A1AA85 /* StreamedGhostTextPolicyTests.swift in Sources */ = 
{isa = PBXBuildFile; fileRef = D1AA6A6F4C3A54B5DA2A0022 /* StreamedGhostTextPolicyTests.swift */; }; 9EB8E3DC796A0C8BFDE8E683 /* AppDelegate.swift in Sources */ = {isa = PBXBuildFile; fileRef = A3E8E86A14090BC7BD13BA76 /* AppDelegate.swift */; }; 9F2FDCABCC941CBECAA3B4AB /* CotabbyInference in Frameworks */ = {isa = PBXBuildFile; productRef = 48A46AD6B613CF06072603E4 /* CotabbyInference */; }; 9F6F88ED74ECA3E23A8E3CC0 /* SecureFieldDetector.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1827565F4FAD3E4E61CA65C3 /* SecureFieldDetector.swift */; }; A0657CE0488F69F0BD559CBC /* SuggestionCoordinator+Acceptance.swift in Sources */ = {isa = PBXBuildFile; fileRef = 72B13136DF7318F3E96DF0D3 /* SuggestionCoordinator+Acceptance.swift */; }; + A0A2BD916B2CB22BAF32A62E /* StreamedGhostTextPolicy.swift in Sources */ = {isa = PBXBuildFile; fileRef = 299BD7B741DA4AAE6A061BAD /* StreamedGhostTextPolicy.swift */; }; A0BB87E3665EF6C209034798 /* GhostSuggestionLayoutTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5AD3F4F9FBE82007E4E15F58 /* GhostSuggestionLayoutTests.swift */; }; A147C5EC3F2214A670F7556E /* FocusPollBackoffTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 273B4DC844F79B4BE2C8910F /* FocusPollBackoffTests.swift */; }; A1A612C90221E0FE1195754A /* SettingsCategory.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5D0AEFF86F8210CBE7CFCBAD /* SettingsCategory.swift */; }; @@ -500,6 +503,7 @@ C607A624A0FB697486C56B8E /* PowerSourceMonitor.swift in Sources */ = {isa = PBXBuildFile; fileRef = DB235F0DEA53295DAF8B4FA0 /* PowerSourceMonitor.swift */; }; C618C5595DA9C57C806A3E03 /* SettingsAttentionEvaluatorTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 2BC293F6125E2B14DCF05AD9 /* SettingsAttentionEvaluatorTests.swift */; }; C63F95C324C29940FAC6B973 /* de-100k.txt in Resources */ = {isa = PBXBuildFile; fileRef = 4B8665A5495891F9E3DDA48B /* de-100k.txt */; }; + C6925440737F37F537622F35 /* StreamedGhostTextPolicy.swift in Sources */ = {isa 
= PBXBuildFile; fileRef = 299BD7B741DA4AAE6A061BAD /* StreamedGhostTextPolicy.swift */; }; C6A112B51525F988EA46F725 /* SystemResourceSamplerTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 9255CBCDE66253F521EE0F08 /* SystemResourceSamplerTests.swift */; }; C6A91AD96F52DB72947830C0 /* DownloadableModelCatalogView.swift in Sources */ = {isa = PBXBuildFile; fileRef = BB5C2AE9A7E55495D26AD074 /* DownloadableModelCatalogView.swift */; }; C71B594433F3B411CAE5DE7E /* FocusCapabilityResolverTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = D4F6D5F94B238F7B4BE7C247 /* FocusCapabilityResolverTests.swift */; }; @@ -704,6 +708,7 @@ 292DC9D4D9D5D26AE882E39B /* EmojiCatalogMatcherTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = EmojiCatalogMatcherTests.swift; sourceTree = ""; }; 2930EC34057319130393696B /* KeyCodeLabelsTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = KeyCodeLabelsTests.swift; sourceTree = ""; }; 2960080A726E51198225147A /* InsertionStrategySelectorTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InsertionStrategySelectorTests.swift; sourceTree = ""; }; + 299BD7B741DA4AAE6A061BAD /* StreamedGhostTextPolicy.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = StreamedGhostTextPolicy.swift; sourceTree = ""; }; 29ED42C4BDD0C521101AF95E /* DeviceInfo.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DeviceInfo.swift; sourceTree = ""; }; 2A02336442BB735EE2E8D064 /* SettingsAttentionEvaluator.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SettingsAttentionEvaluator.swift; sourceTree = ""; }; 2B7A28471B8526C2693FFF65 /* AcknowledgementsView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AcknowledgementsView.swift; sourceTree = ""; }; @@ -939,6 +944,7 @@ D0AF9479EF020071CA64CCC1 /* HuggingFaceModelsTests.swift */ = 
{isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = HuggingFaceModelsTests.swift; sourceTree = ""; }; D1123AB515110BD0CBA39490 /* HomePaneView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = HomePaneView.swift; sourceTree = ""; }; D12ABBCE23A946C22894945B /* DecodeStopPolicy.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DecodeStopPolicy.swift; sourceTree = ""; }; + D1AA6A6F4C3A54B5DA2A0022 /* StreamedGhostTextPolicyTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = StreamedGhostTextPolicyTests.swift; sourceTree = ""; }; D2D0FE44138BCA8B2EE05AFE /* TypoCaseTransferTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TypoCaseTransferTests.swift; sourceTree = ""; }; D2F46767D9D1F0D44E239CA8 /* DownloadFileRescuerTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DownloadFileRescuerTests.swift; sourceTree = ""; }; D3A2AC525DC664DB540D4F19 /* ClipboardRelevanceFilter.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ClipboardRelevanceFilter.swift; sourceTree = ""; }; @@ -961,6 +967,7 @@ D9C1C921A1CDA2ADFC39EA01 /* AppsPaneView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AppsPaneView.swift; sourceTree = ""; }; DB0CE9AB1286367BA2E82392 /* SettingsContainerView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SettingsContainerView.swift; sourceTree = ""; }; DB235F0DEA53295DAF8B4FA0 /* PowerSourceMonitor.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PowerSourceMonitor.swift; sourceTree = ""; }; + DDC034BBCBAC5E7989D4C85B /* LlamaSuggestionEngineStreamingTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaSuggestionEngineStreamingTests.swift; sourceTree = ""; }; DDE858CB1E687E3CEB8FDD5B /* 
SuggestionRequestFactory.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SuggestionRequestFactory.swift; sourceTree = ""; }; DDF6A4E9CE93FD53C60E67E3 /* EmojiQueryRun.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = EmojiQueryRun.swift; sourceTree = ""; }; DEB16474A67CE1D210B944C9 /* SuggestionSubsystemContracts.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SuggestionSubsystemContracts.swift; sourceTree = ""; }; @@ -1366,6 +1373,7 @@ 0CA88BB29BC8727878C99E95 /* LlamaPromptCacheHintTrackerTests.swift */, AABCC3FD99B1824A81E665F3 /* LlamaSuggestionEngineCancellationTests.swift */, 26EF16C7439BEB156BD9FB03 /* LlamaSuggestionEnginePrewarmTests.swift */, + DDC034BBCBAC5E7989D4C85B /* LlamaSuggestionEngineStreamingTests.swift */, 9030FAAB468119A0236284A6 /* LLMIOFileHandlerTests.swift */, D8083D44ABCDCFA68A4CD497 /* MacroEngineTests.swift */, 22BE47D1DBF6C23151458836 /* MacroTriggerStateMachineTests.swift */, @@ -1397,6 +1405,7 @@ D562A73C7C680F2AA65F9F7F /* SpellingDictionaryResourceTests.swift */, E0871985CB1F877EC422E18C /* SpellingLanguageResolverTests.swift */, 9B3179B40A81DF121D1221C6 /* StaticTextRunWalkThrottleTests.swift */, + D1AA6A6F4C3A54B5DA2A0022 /* StreamedGhostTextPolicyTests.swift */, C05B0439348261163B37C508 /* SuggestionAvailabilityEvaluatorTests.swift */, EC04832FBD5311352F35241B /* SuggestionCaretLayoutRepairTests.swift */, C375227649689775275AA4B3 /* SuggestionCoordinatorAcceptanceTests.swift */, @@ -1608,6 +1617,7 @@ D4B56C250DDEF3E81F9DCBD7 /* SentenceBoundaryClassifier.swift */, 2A02336442BB735EE2E8D064 /* SettingsAttentionEvaluator.swift */, 0348A7053E5683C68879A71A /* SpellingLanguageResolver.swift */, + 299BD7B741DA4AAE6A061BAD /* StreamedGhostTextPolicy.swift */, 3609CC88A5280B3AA40414DF /* SuggestionAvailabilityEvaluator.swift */, B2F95847D76893C8A5B504B4 /* SuggestionOverlayStabilityGate.swift */, DDE858CB1E687E3CEB8FDD5B /* 
SuggestionRequestFactory.swift */, @@ -1992,6 +2002,7 @@ D6AD25168F108DA8D60E76EF /* SpellingDictionaryPicker.swift in Sources */, 257C2A5D299365C1D98527A8 /* SpellingLanguageResolver.swift in Sources */, 753C5A939E986B1A0FB25664 /* StaticTextRunWalkThrottle.swift in Sources */, + A0A2BD916B2CB22BAF32A62E /* StreamedGhostTextPolicy.swift in Sources */, 333C09921443BDDF21A9753D /* SuggestionAvailabilityEvaluator.swift in Sources */, EC4ED03BE4C7DD0E6319F310 /* SuggestionCoordinator+Acceptance.swift in Sources */, AC4A369EC73115E1F698934D /* SuggestionCoordinator+Input.swift in Sources */, @@ -2215,6 +2226,7 @@ 94F037A3F9D7CE52CC70CA0F /* SpellingDictionaryPicker.swift in Sources */, 1BDEC75125ADFCD67F3C406D /* SpellingLanguageResolver.swift in Sources */, B50EDCA5C4C5FE4FC548AA74 /* StaticTextRunWalkThrottle.swift in Sources */, + C6925440737F37F537622F35 /* StreamedGhostTextPolicy.swift in Sources */, 4F369F5284DDCEABF082E59B /* SuggestionAvailabilityEvaluator.swift in Sources */, A0657CE0488F69F0BD559CBC /* SuggestionCoordinator+Acceptance.swift in Sources */, D2F1DD215989BF32675308C2 /* SuggestionCoordinator+Input.swift in Sources */, @@ -2338,6 +2350,7 @@ E38801433B99E65BD7E45A0E /* LlamaPromptCacheHintTrackerTests.swift in Sources */, BE3CB85508055D159C35020A /* LlamaSuggestionEngineCancellationTests.swift in Sources */, E64AE96DF2A80A368FDE522D /* LlamaSuggestionEnginePrewarmTests.swift in Sources */, + 3F87586426B5EF16B41CE62F /* LlamaSuggestionEngineStreamingTests.swift in Sources */, 8429B116328C392DCA018D95 /* MacroEngineTests.swift in Sources */, 3F8CBCBCC45E377DF9ADB216 /* MacroTriggerStateMachineTests.swift in Sources */, 87806DE08881D11F2608A13D /* MarkerSelectionSynthesizerTests.swift in Sources */, @@ -2368,6 +2381,7 @@ 303652F15C0FE55595669D81 /* SpellingDictionaryResourceTests.swift in Sources */, 66D0D9F605AF462F569A5CFD /* SpellingLanguageResolverTests.swift in Sources */, 96C3128BCB17A05A7C7DEFF7 /* StaticTextRunWalkThrottleTests.swift in 
Sources */, + 9E4AED02831829A108A1AA85 /* StreamedGhostTextPolicyTests.swift in Sources */, 88BCD795A14E1C9308F7BB31 /* SuggestionAvailabilityEvaluatorTests.swift in Sources */, EB9B5E5F7326AB72E0E44C70 /* SuggestionCaretLayoutRepairTests.swift in Sources */, 5B404450B412A6102F514250 /* SuggestionCoordinatorAcceptanceTests.swift in Sources */, diff --git a/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift b/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift index fcfae5e9..f051fb24 100644 --- a/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift +++ b/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift @@ -126,13 +126,26 @@ extension SuggestionCoordinator { /// result (or failure) only while it is still the current work. Extracted from /// `generateFromCurrentFocus` so that function stays within the project's complexity budget. private func dispatchGeneration(request: SuggestionRequest, workID: UInt64) { + // A new generation starts a new stream; the previous request's rendered-partial state + // must not gate the new partials' monotonic checks. `isStreamDrainScheduled` is left + // alone on purpose: an already-enqueued drain block cannot be unscheduled, and it + // self-heals either way — it finds nil and clears the flag, or it finds a partial the + // new generation queued in the meantime and renders it under the same work-id guards. + // Resetting the flag here would instead double-schedule a drain for one partial. 
+ streamRenderedText = nil + pendingStreamPartial = nil workController.replaceGenerationWork(for: workID) { [weak self] in guard let self else { return } do { - let result = try await suggestionEngine.generateSuggestion(for: request) + let result = try await suggestionEngine.generateSuggestion( + for: request, + onPartial: { [weak self] partial in + self?.queueStreamedPartial(partial, workID: workID) + } + ) guard !Task.isCancelled, self.workController.isCurrent(workID) else { return } @@ -189,6 +202,77 @@ extension SuggestionCoordinator { return value } + // MARK: - Streamed partial rendering + + /// Coalesces streamed partials to at most one render per runloop turn. Tokens arrive every + /// 10-50ms from the engine, and rendering each one would stack session updates and overlay + /// layout on the main actor; latest-wins coalescing bounds that work while the authoritative + /// final result still arrives through `apply`. + private func queueStreamedPartial(_ partial: SuggestionResult, workID: UInt64) { + guard workController.isCurrent(workID) else { + return + } + pendingStreamPartial = PendingStreamPartial(result: partial, workID: workID) + guard !isStreamDrainScheduled else { + return + } + isStreamDrainScheduled = true + DispatchQueue.main.async { [weak self] in + self?.drainStreamedPartial() + } + } + + private func drainStreamedPartial() { + isStreamDrainScheduled = false + guard let pending = pendingStreamPartial else { + return + } + pendingStreamPartial = nil + applyStreamedPartial(pending.result, workID: pending.workID) + } + + /// Renders one streamed partial as a real, acceptable session. + /// + /// A real session rather than a cosmetic overlay because acceptance gates on the live session + /// (never on `state`), so the user can Tab into a stream the moment the first words appear; + /// accepting cancels the in-flight work (work id bump), freezing the suggestion at what was + /// streamed. 
Renders are monotonic (`StreamedGhostTextPolicy`) so reordered hops and + /// normalizer rewrites never shrink visible ghost text, and the materialize check stops + /// partials the moment the field text moves on without a keystroke (a keystroke already + /// bumped the work id before this runs). + private func applyStreamedPartial(_ partial: SuggestionResult, workID: UInt64) { + guard workController.isCurrent(workID) else { + return + } + guard StreamedGhostTextPolicy.isRenderableExtension( + candidate: partial.text, + currentlyRendered: streamRenderedText + ) else { + return + } + guard let rawContext = focusModel.snapshot.context else { + return + } + + let liveContext = interactionState.materializeContext(from: rawContext) + guard liveContext.generation == partial.generation else { + return + } + + _ = interactionState.startSession( + fullText: partial.text, + liveContext: liveContext, + latency: partial.latency + ) + streamRenderedText = partial.text + presentOverlay( + text: partial.text, + at: liveContext.caretRect, + context: liveContext, + isRightToLeft: TextDirectionDetector.isRightToLeft(liveContext.precedingText) + ) + } + /// Runs the typo gate for the current word. Returns `true` when it handled the cycle by suppressing, /// offering, or applying a correction; `false` proceeds with a normal continuation. Kept separate /// so `generateFromCurrentFocus` stays within the project's cyclomatic-complexity budget. @@ -761,6 +845,9 @@ extension SuggestionCoordinator { // Drop any pending accepted-tail guard whenever the suggestion state is torn down (user // typed, focus changed, predictions disabled). The final-chunk accept re-sets it afterward. lastAcceptedTail = nil + // Stream bookkeeping follows the session it was rendering for. 
+ streamRenderedText = nil + pendingStreamPartial = nil latestSuggestionPreview = nil latestFullSuggestionPreview = nil latestRemainingSuggestionPreview = nil diff --git a/Cotabby/App/Coordinators/SuggestionCoordinator.swift b/Cotabby/App/Coordinators/SuggestionCoordinator.swift index 6475bae3..61516984 100644 --- a/Cotabby/App/Coordinators/SuggestionCoordinator.swift +++ b/Cotabby/App/Coordinators/SuggestionCoordinator.swift @@ -93,6 +93,20 @@ final class SuggestionCoordinator: ObservableObject { } var clipboardPrefaceMemo: ClipboardPrefaceMemo? + /// Streamed-render bookkeeping. Partial results hop in from the engine while a decode is + /// still running; they are coalesced (latest wins, drained once per runloop turn) so + /// token-rate deliveries cannot stack session and overlay layout work on the main actor, and + /// `streamRenderedText` carries the monotonic-extension state for `StreamedGhostTextPolicy`. + /// The partial and rendered text reset when a new generation dispatches; the drain flag deliberately does not. + struct PendingStreamPartial { + let result: SuggestionResult + let workID: UInt64 + } + + var pendingStreamPartial: PendingStreamPartial? + var isStreamDrainScheduled = false + var streamRenderedText: String? + /// Monotonic cancellation token for the "wait until the host publishes typed text to AX" loop. /// /// Keystrokes can arrive faster than Chromium publishes contenteditable updates. 
Without this diff --git a/Cotabby/Models/SuggestionSubsystemContracts.swift b/Cotabby/Models/SuggestionSubsystemContracts.swift index c47b2329..7b1b0b45 100644 --- a/Cotabby/Models/SuggestionSubsystemContracts.swift +++ b/Cotabby/Models/SuggestionSubsystemContracts.swift @@ -89,6 +89,15 @@ protocol EmojiInputIntercepting: AnyObject { @MainActor protocol SuggestionGenerating: AnyObject { func generateSuggestion(for request: SuggestionRequest) async throws -> SuggestionResult + /// Streaming variant: `onPartial` receives cumulative, already-normalized partial results on + /// the main actor while the engine decodes, so ghost text can render after the first words + /// instead of waiting for the full completion. The returned result remains the authoritative + /// final answer; partials are best-effort hints the renderer may coalesce or drop. Engines + /// that cannot stream rely on the default, which degrades to the single-shot path. + func generateSuggestion( + for request: SuggestionRequest, + onPartial: (@MainActor (SuggestionResult) -> Void)? + ) async throws -> SuggestionResult /// Clears backend-local continuation state when the focused editing context is no longer /// continuous. Stateless engines may implement this as a no-op. func resetCachedGenerationContext() async @@ -102,6 +111,13 @@ protocol SuggestionGenerating: AnyObject { extension SuggestionGenerating { func prewarm(for request: SuggestionRequest) async {} + + func generateSuggestion( + for request: SuggestionRequest, + onPartial: (@MainActor (SuggestionResult) -> Void)? 
+ ) async throws -> SuggestionResult { + try await generateSuggestion(for: request) + } } /// Behavior-shaped view of the llama runtime that `LlamaSuggestionEngine` depends on: run one @@ -112,6 +128,15 @@ extension SuggestionGenerating { @MainActor protocol LlamaRuntimeGenerating: AnyObject { func generate(prompt: String, cachedPrefixBytes: Int?, options: LlamaGenerationOptions) async throws -> String + /// Streaming variant: `onPartialRawText` receives the cumulative raw completion after each + /// sampled token, called from the decode thread (hence `@Sendable`); callers own hopping to + /// their actor. The returned string is still the authoritative final completion. + func generate( + prompt: String, + cachedPrefixBytes: Int?, + options: LlamaGenerationOptions, + onPartialRawText: (@Sendable (String) -> Void)? + ) async throws -> String func resetPromptCache() /// Decodes `prompt` into the native prompt cache without sampling any tokens, so the next /// `generate` whose prompt extends this one only decodes the typed delta. Best-effort warmup: @@ -125,6 +150,18 @@ extension LlamaRuntimeGenerating { func prefill(prompt: String, cachedPrefixBytes: Int?, options: LlamaGenerationOptions) async throws {} } +extension LlamaRuntimeGenerating { + /// Default for fakes that only exercise the single-shot contract: ignore the partial hook. + func generate( + prompt: String, + cachedPrefixBytes: Int?, + options: LlamaGenerationOptions, + onPartialRawText: (@Sendable (String) -> Void)? 
+ ) async throws -> String { + try await generate(prompt: prompt, cachedPrefixBytes: cachedPrefixBytes, options: options) + } +} + @MainActor protocol SuggestionSettingsProviding: AnyObject { var snapshot: SuggestionSettingsSnapshot { get } diff --git a/Cotabby/Services/Runtime/FoundationModelSuggestionEngine.swift b/Cotabby/Services/Runtime/FoundationModelSuggestionEngine.swift index aba81f35..570a6dc9 100644 --- a/Cotabby/Services/Runtime/FoundationModelSuggestionEngine.swift +++ b/Cotabby/Services/Runtime/FoundationModelSuggestionEngine.swift @@ -44,6 +44,16 @@ final class FoundationModelSuggestionEngine { } func generateSuggestion(for request: SuggestionRequest) async throws -> SuggestionResult { + try await generateSuggestion(for: request, onPartial: nil) + } + + /// Streaming variant: Apple's response stream already yields cumulative snapshots; each one is + /// normalized and forwarded so ghost text can render before the stream finishes. The previous + /// implementation deliberately discarded the partials pending coordinator support. + func generateSuggestion( + for request: SuggestionRequest, + onPartial: (@MainActor (SuggestionResult) -> Void)? + ) async throws -> SuggestionResult { availabilityService.refresh() let baseMetadata: Logger.Metadata = [ @@ -98,6 +108,23 @@ final class FoundationModelSuggestionEngine { rawSuggestion = partial.content didReceiveSnapshot = true try Task.checkCancellation() + // This engine is main-actor confined, so partials forward inline (no hop). Empty + // normalizations are withheld; the coordinator's monotonic policy handles the rest. 
+ if let onPartial { + let partialNormalized = SuggestionTextNormalizer.normalizeDetailed( + rawSuggestion, + for: request, + promptEchoCandidates: [prompt] + ).text + if !partialNormalized.isEmpty { + onPartial(SuggestionResult( + generation: request.generation, + rawText: rawSuggestion, + text: partialNormalized, + latency: Date().timeIntervalSince(startTime) + )) + } + } } try Task.checkCancellation() // Apple's documented contract is at least one snapshot on a successful stream, so a diff --git a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift index a18be48b..cee4882a 100644 --- a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift +++ b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift @@ -133,10 +133,13 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { /// Prepares the prompt context, reusing cached KV state when safe, then samples a short completion. /// Holds `autocompleteLock` for the full call to prevent concurrent KV cache mutation. + /// `onPartialRawText` receives the cumulative raw completion after each sampled token, on the + /// calling (detached) thread, so the UI can render ghost text before the decode finishes. func generate( prompt: String, cachedPrefixBytes: Int? = nil, - options: LlamaGenerationOptions + options: LlamaGenerationOptions, + onPartialRawText: ((String) -> Void)? = nil ) throws -> String { let preparation = try preparedPrompt(prompt: prompt, cachedPrefixBytes: cachedPrefixBytes, options: options, kind: "generate") @@ -185,7 +188,11 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { // The KV-trim defer above runs after the decoder returns, restoring prompt-only KV state for // the next request. Token selection is delegated to the engine's built-in sampler. 
- let decode = runEngineSampledDecode(sequenceID: sequenceID, options: options) + let decode = runEngineSampledDecode( + sequenceID: sequenceID, + options: options, + onPartialRawText: onPartialRawText + ) if decode.engineCancelled { // The engine's per-sequence abort flag is set-once; an aborted sequence would refuse // every future decode, so drop it and let the next request build fresh. @@ -351,10 +358,12 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { /// The shipping decoder: delegates token selection to the engine's built-in sampler /// (`sampleNext`), which applies temperature / top-k / top-p / min-p and commits each token. /// `engineCancelled` reports that the native abort flag fired; the sequence must then be - /// discarded because the flag is set-once for a sequence's lifetime. + /// discarded because the flag is set-once for a sequence's lifetime. `onPartialRawText` + /// receives the cumulative raw completion after each sampled token, on the calling thread. private func runEngineSampledDecode( sequenceID: Int32, - options: LlamaGenerationOptions + options: LlamaGenerationOptions, + onPartialRawText: ((String) -> Void)? = nil ) -> (text: String, engineCancelled: Bool) { var generatedText = "" var tokensGenerated = 0 @@ -388,6 +397,9 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { generatedText += piece tokensGenerated += 1 sumLogprob += Double(result.logprob) + // Cumulative text, not the delta: consumers render whole partials, and cumulative + // semantics make late or reordered deliveries harmless downstream. + onPartialRawText?(generatedText) // Stop at the first natural sentence boundary instead of running the full token budget. 
// This keeps completions tight and is latency-positive (fewer tokens), and it adds no diff --git a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift index 6cc22d7f..889522ca 100644 --- a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift +++ b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift @@ -100,6 +100,22 @@ final class LlamaRuntimeManager: ObservableObject { prompt: String, cachedPrefixBytes: Int? = nil, options: LlamaGenerationOptions + ) async throws -> String { + try await generate( + prompt: prompt, + cachedPrefixBytes: cachedPrefixBytes, + options: options, + onPartialRawText: nil + ) + } + + /// Streaming variant: `onPartialRawText` is invoked from the decode thread with the cumulative + /// raw completion after each sampled token; see `LlamaRuntimeGenerating`. + func generate( + prompt: String, + cachedPrefixBytes: Int? = nil, + options: LlamaGenerationOptions, + onPartialRawText: (@Sendable (String) -> Void)? ) async throws -> String { _ = try await preparedRuntime() @@ -113,7 +129,8 @@ final class LlamaRuntimeManager: ObservableObject { try core.generate( prompt: prompt, cachedPrefixBytes: cachedPrefixBytes, - options: options + options: options, + onPartialRawText: onPartialRawText ) } return try await withTaskCancellationHandler { diff --git a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift index c6b9f344..4092bbb9 100644 --- a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift +++ b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift @@ -56,6 +56,17 @@ final class LlamaSuggestionEngine { /// Executes one generation request and packages the raw and normalized result for the coordinator. 
func generateSuggestion(for request: SuggestionRequest) async throws -> SuggestionResult { + try await generateSuggestion(for: request, onPartial: nil) + } + + /// Streaming variant: cumulative raw partials from the decode thread are normalized and + /// forwarded to `onPartial` on the main actor, so the coordinator can paint ghost text while + /// the decode is still running. Empty normalizations are withheld (there is nothing useful to + /// paint), and the returned result remains the authoritative final completion. + func generateSuggestion( + for request: SuggestionRequest, + onPartial: (@MainActor (SuggestionResult) -> Void)? + ) async throws -> SuggestionResult { let baseMetadata: Logger.Metadata = [ "request_id": .string(request.requestID), "engine": .string("llama") @@ -77,11 +88,39 @@ final class LlamaSuggestionEngine { "max_tokens": .stringConvertible(request.maxPredictionTokens) ]) { _, new in new } ) - let rawSuggestion = try await runtimeManager.generate( - prompt: request.prompt, - cachedPrefixBytes: cachedPrefixBytes, - options: Self.makeGenerationOptions(for: request) - ) + let options = Self.makeGenerationOptions(for: request) + let rawSuggestion: String + if let onPartial { + rawSuggestion = try await runtimeManager.generate( + prompt: request.prompt, + cachedPrefixBytes: cachedPrefixBytes, + options: options, + onPartialRawText: { raw in + // Decode-thread callback; normalization and delivery hop to the main + // actor. Hops are independent tasks, so a shorter cumulative can land + // after a longer one — the coordinator's monotonic render policy makes + // that harmless. 
+ Task { @MainActor in + let normalized = SuggestionTextNormalizer.normalizeDetailed(raw, for: request).text + guard !normalized.isEmpty else { + return + } + onPartial(SuggestionResult( + generation: request.generation, + rawText: raw, + text: normalized, + latency: Date().timeIntervalSince(startTime) + )) + } + } + ) + } else { + rawSuggestion = try await runtimeManager.generate( + prompt: request.prompt, + cachedPrefixBytes: cachedPrefixBytes, + options: options + ) + } try Task.checkCancellation() promptCacheHintTracker.recordSuccessfulRequest(request) diff --git a/Cotabby/Services/Runtime/SuggestionEngineRouter.swift b/Cotabby/Services/Runtime/SuggestionEngineRouter.swift index 454e9dc2..65b0cfbe 100644 --- a/Cotabby/Services/Runtime/SuggestionEngineRouter.swift +++ b/Cotabby/Services/Runtime/SuggestionEngineRouter.swift @@ -31,6 +31,13 @@ final class SuggestionEngineRouter { } func generateSuggestion(for request: SuggestionRequest) async throws -> SuggestionResult { + try await generateSuggestion(for: request, onPartial: nil) + } + + func generateSuggestion( + for request: SuggestionRequest, + onPartial: (@MainActor (SuggestionResult) -> Void)? 
+ ) async throws -> SuggestionResult { let metadata: Logger.Metadata = [ "request_id": .string(request.requestID), "engine": .string(engineMetadataLabel(for: suggestionSettings.selectedEngine)) @@ -39,7 +46,7 @@ final class SuggestionEngineRouter { case .appleIntelligence: CotabbyLogger.suggestion.debug("Routing to Apple Intelligence engine", metadata: metadata) do { - let result = try await foundationModelEngine.generateSuggestion(for: request) + let result = try await foundationModelEngine.generateSuggestion(for: request, onPartial: onPartial) recordPerformanceMetric(modelName: "Apple Intelligence", latency: result.latency) return result } catch SuggestionClientError.unsupportedLanguageOrLocale(let message) { @@ -52,12 +59,13 @@ final class SuggestionEngineRouter { ) return try await generateOpenSourceFallback( for: request, - appleFailureMessage: message + appleFailureMessage: message, + onPartial: onPartial ) } case .llamaOpenSource: CotabbyLogger.suggestion.debug("Routing to open-source llama engine", metadata: metadata) - let result = try await llamaEngine.generateSuggestion(for: request) + let result = try await llamaEngine.generateSuggestion(for: request, onPartial: onPartial) recordPerformanceMetric(modelName: llamaModelNameProvider() ?? "Llama", latency: result.latency) return result } @@ -107,10 +115,11 @@ final class SuggestionEngineRouter { /// coordinator backend-agnostic while giving local models a chance to handle that text. private func generateOpenSourceFallback( for request: SuggestionRequest, - appleFailureMessage: String + appleFailureMessage: String, + onPartial: (@MainActor (SuggestionResult) -> Void)? = nil ) async throws -> SuggestionResult { do { - let result = try await llamaEngine.generateSuggestion(for: request) + let result = try await llamaEngine.generateSuggestion(for: request, onPartial: onPartial) recordPerformanceMetric(modelName: llamaModelNameProvider() ?? 
"Llama", latency: result.latency) return result } catch SuggestionClientError.cancelled { diff --git a/Cotabby/Support/StreamedGhostTextPolicy.swift b/Cotabby/Support/StreamedGhostTextPolicy.swift new file mode 100644 index 00000000..e7332106 --- /dev/null +++ b/Cotabby/Support/StreamedGhostTextPolicy.swift @@ -0,0 +1,22 @@ +import Foundation + +/// Decides whether a streamed cumulative partial may replace the currently rendered ghost text. +/// +/// Streamed renders are monotonic by policy: a candidate must strictly extend what is already on +/// screen. Two real hazards motivate this rather than trusting arrival order. Partials hop from +/// the decode thread to the main actor as independent tasks, so a shorter, older cumulative can +/// land after a longer one; and the text normalizer runs on every cumulative snapshot, so its +/// output for a longer raw string is not guaranteed to extend its output for a shorter one (for +/// example when a boundary rule trims a trailing fragment). Dropping non-extensions costs nothing: +/// the next partial or the authoritative final result supersedes it. +enum StreamedGhostTextPolicy { + static func isRenderableExtension(candidate: String, currentlyRendered: String?) 
-> Bool { + guard !candidate.isEmpty else { + return false + } + guard let currentlyRendered, !currentlyRendered.isEmpty else { + return true + } + return candidate.count > currentlyRendered.count && candidate.hasPrefix(currentlyRendered) + } +} diff --git a/CotabbyTests/LlamaSuggestionEngineStreamingTests.swift b/CotabbyTests/LlamaSuggestionEngineStreamingTests.swift new file mode 100644 index 00000000..fc138f70 --- /dev/null +++ b/CotabbyTests/LlamaSuggestionEngineStreamingTests.swift @@ -0,0 +1,128 @@ +import CoreGraphics +import Foundation +import XCTest +@testable import Cotabby + +/// Tests for the llama engine's streaming contract: cumulative raw partials from the runtime are +/// normalized and forwarded to `onPartial` on the main actor, and the final result still goes +/// through the existing single-shot path (tracker recording, normalization, latency). +@MainActor +final class LlamaSuggestionEngineStreamingTests: XCTestCase { + + func test_streamingGeneration_forwardsNormalizedCumulativePartials() async throws { + let runtime = StreamingFakeRuntime() + runtime.partialRawTexts = [" wor", " world ag"] + runtime.finalText = " world again" + let engine = LlamaSuggestionEngine(runtimeManager: runtime) + + var partials: [SuggestionResult] = [] + let result = try await engine.generateSuggestion(for: makeRequest(prompt: "Hello")) { partial in + partials.append(partial) + } + + // Partials hop to the main actor as tasks; drain before asserting. 
+ try await drainUntil { partials.count >= 2 } + + XCTAssertEqual(result.rawText, " world again") + XCTAssertEqual(partials.map(\.rawText), [" wor", " world ag"]) + XCTAssertFalse(partials.contains { $0.text.isEmpty }, "Empty normalizations must be withheld, not forwarded.") + XCTAssertEqual(partials.map(\.generation), [1, 1], "Partials must carry the request generation for stale guards.") + } + + func test_plainGeneration_neverInvokesPartialHook() async throws { + let runtime = StreamingFakeRuntime() + runtime.partialRawTexts = [" wor"] + runtime.finalText = " world" + let engine = LlamaSuggestionEngine(runtimeManager: runtime) + + _ = try await engine.generateSuggestion(for: makeRequest(prompt: "Hello")) + + try await drainUntil { true } + XCTAssertEqual(runtime.streamingCallCount, 0, "The single-shot entry point must use the non-streaming runtime path.") + } + + // MARK: - Helpers + + /// Pumps the main actor until `condition` holds or a bounded number of yields elapse, so the + /// forwarded-partial tasks get a chance to run without arbitrary sleeps. 
+ private func drainUntil(_ condition: () -> Bool) async throws { + for _ in 0..<200 where !condition() { + try await Task.sleep(nanoseconds: 2_000_000) + } + } + + private func makeRequest(prompt: String) -> SuggestionRequest { + let snapshot = FocusedInputSnapshot( + applicationName: "TestApp", + bundleIdentifier: "com.example.TestApp", + processIdentifier: 123, + elementIdentifier: "field", + role: "AXTextField", + subrole: nil, + caretRect: .zero, + inputFrameRect: nil, + caretSource: "test", + caretQuality: .exact, + observedCharWidth: nil, + precedingText: prompt, + trailingText: "", + selection: NSRange(location: prompt.count, length: 0), + isSecure: false + ) + let context = FocusedInputContext(snapshot: snapshot, generation: 1) + + return SuggestionRequest( + context: context, + prefixText: prompt, + prompt: prompt, + generation: context.generation, + maxPredictionTokens: 8, + temperature: 0.1, + topK: 20, + topP: 0.7, + minP: 0.08, + repetitionPenalty: 1.05, + randomSeed: 42, + maxSuffixCharacters: 192, + completionLengthInstruction: "Return only the next few words.", + userName: nil, + customRules: [], + languageInstruction: nil, + clipboardContext: nil, + visualContextSummary: nil, + isMultiLineEnabled: false + ) + } +} + +/// Runtime fake that emits staged cumulative raw partials through the streaming entry point and +/// counts which entry point was used. +@MainActor +private final class StreamingFakeRuntime: LlamaRuntimeGenerating { + var partialRawTexts: [String] = [] + var finalText = "" + private(set) var streamingCallCount = 0 + + func generate( + prompt: String, + cachedPrefixBytes: Int?, + options: LlamaGenerationOptions + ) async throws -> String { + finalText + } + + func generate( + prompt: String, + cachedPrefixBytes: Int?, + options: LlamaGenerationOptions, + onPartialRawText: (@Sendable (String) -> Void)? 
+ ) async throws -> String { + streamingCallCount += 1 + for partial in partialRawTexts { + onPartialRawText?(partial) + } + return finalText + } + + func resetPromptCache() {} +} diff --git a/CotabbyTests/StreamedGhostTextPolicyTests.swift b/CotabbyTests/StreamedGhostTextPolicyTests.swift new file mode 100644 index 00000000..014330c7 --- /dev/null +++ b/CotabbyTests/StreamedGhostTextPolicyTests.swift @@ -0,0 +1,42 @@ +import XCTest +@testable import Cotabby + +/// Tests for the streamed-render monotonicity policy: out-of-order or normalizer-shrunk partials +/// must never replace longer ghost text already on screen. +final class StreamedGhostTextPolicyTests: XCTestCase { + func test_firstNonEmptyPartialRenders() { + XCTAssertTrue(StreamedGhostTextPolicy.isRenderableExtension(candidate: " wor", currentlyRendered: nil)) + XCTAssertTrue(StreamedGhostTextPolicy.isRenderableExtension(candidate: " wor", currentlyRendered: "")) + } + + func test_emptyCandidateNeverRenders() { + XCTAssertFalse(StreamedGhostTextPolicy.isRenderableExtension(candidate: "", currentlyRendered: nil)) + XCTAssertFalse(StreamedGhostTextPolicy.isRenderableExtension(candidate: "", currentlyRendered: " wor")) + } + + func test_strictExtensionRenders() { + XCTAssertTrue( + StreamedGhostTextPolicy.isRenderableExtension(candidate: " world", currentlyRendered: " wor") + ) + } + + func test_staleShorterPartialIsDropped() { + XCTAssertFalse( + StreamedGhostTextPolicy.isRenderableExtension(candidate: " wor", currentlyRendered: " world") + ) + } + + func test_equalTextIsDroppedAsRedundant() { + XCTAssertFalse( + StreamedGhostTextPolicy.isRenderableExtension(candidate: " world", currentlyRendered: " world") + ) + } + + func test_divergentRewriteIsDropped() { + // A normalizer can legally rewrite a fragment rather than extend it; the render must wait + // for the authoritative final result instead of flickering through rewrites. 
+ XCTAssertFalse( + StreamedGhostTextPolicy.isRenderableExtension(candidate: " worse idea", currentlyRendered: " world") + ) + } +}