From de7021b064534235576b909bf17fb62b1ebeb89c Mon Sep 17 00:00:00 2001
From: Jacob Fu <141651335+FuJacob@users.noreply.github.com>
Date: Thu, 11 Jun 2026 19:04:50 -0700
Subject: [PATCH 1/2] perf(stream): render ghost text while the model is still
 decoding

Generation was single-shot end to end: the llama decode loop accumulated
pieces privately and the FM engine consumed Apple's cumulative snapshots
internally, so nothing reached the overlay until the full completion
finished, and perceived latency was prefill plus the entire decode. Both
engines now forward cumulative, normalized partial results through a new
streaming variant of the generation contract, and the coordinator paints
them as real, acceptable sessions: coalesced to one render per runloop
turn, monotonic by policy (reordered hops and normalizer rewrites can
never shrink visible text), guarded by the same work-id and materialize
checks the final apply uses, and frozen at whatever was streamed if the
user Tabs mid-decode. The final result remains authoritative and flows
through the unchanged apply path.
---
 Cotabby.xcodeproj/project.pbxproj             |  14 ++
 .../SuggestionCoordinator+Prediction.swift    |  85 +++++++++++-
 .../Coordinators/SuggestionCoordinator.swift  |  15 ++
 .../Models/SuggestionSubsystemContracts.swift |  37 +++++
 .../FoundationModelSuggestionEngine.swift     |  27 ++++
 .../Services/Runtime/LlamaRuntimeCore.swift   |  20 ++-
 .../Runtime/LlamaRuntimeManager.swift         |  19 ++-
 .../Runtime/LlamaSuggestionEngine.swift       |  49 ++++++-
 .../Runtime/SuggestionEngineRouter.swift      |  19 ++-
 Cotabby/Support/StreamedGhostTextPolicy.swift |  22 +++
 .../LlamaSuggestionEngineStreamingTests.swift | 128 ++++++++++++++++++
 .../StreamedGhostTextPolicyTests.swift        |  42 ++++++
 12 files changed, 461 insertions(+), 16 deletions(-)
 create mode 100644 Cotabby/Support/StreamedGhostTextPolicy.swift
 create mode 100644 CotabbyTests/LlamaSuggestionEngineStreamingTests.swift
 create mode 100644 CotabbyTests/StreamedGhostTextPolicyTests.swift

diff --git a/Cotabby.xcodeproj/project.pbxproj b/Cotabby.xcodeproj/project.pbxproj
index cfd245bd..02a03f32 100644
--- a/Cotabby.xcodeproj/project.pbxproj
+++ b/Cotabby.xcodeproj/project.pbxproj
@@ -175,6 +175,7 @@
 		3E78D03ABA7141D344AB8285 /* he.txt in Resources */ = {isa = PBXBuildFile; fileRef = C9C000E46A1E404932F89C81 /* he.txt */; };
 		3EF0A298B5590571B1C37282 /* FieldStyleCache.swift in Sources */ = {isa = PBXBuildFile; fileRef = B7FBF2B766E728F25899B64E /* FieldStyleCache.swift */; };
 		3F5630CFB7BA40B900E832A1 /* OCRTextHygieneTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5EED3CD2BC7B48DF35DEE562 /* OCRTextHygieneTests.swift */; };
+		3F87586426B5EF16B41CE62F /* LlamaSuggestionEngineStreamingTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = DDC034BBCBAC5E7989D4C85B /* LlamaSuggestionEngineStreamingTests.swift */; };
 		3F8CBCBCC45E377DF9ADB216 /* MacroTriggerStateMachineTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 22BE47D1DBF6C23151458836 /* MacroTriggerStateMachineTests.swift */; };
 		3FCEF50FDD9EE01AE3711083 /* AXTreeDumpWriter.swift in Sources */ = {isa = PBXBuildFile; fileRef = B27492B04B627DA53BDAD938 /* AXTreeDumpWriter.swift */; };
 		3FF6B7DE34A01C4AB7FA54E3 /* MacroTriggerStateMachine.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1C201A65A6B040F90C528A3B /* MacroTriggerStateMachine.swift */; };
@@ -405,10 +406,12 @@
 		9CEBD6AF4405F1BBE0E3D16C /* MidWordContinuationPolicy.swift in Sources */ = {isa = PBXBuildFile; fileRef = 357C18383B047F24A531BDCD /* MidWordContinuationPolicy.swift */; };
 		9D0F4829D11BCD4DB1290410 /* InsertionStrategySelector.swift in Sources */ = {isa = PBXBuildFile; fileRef = E0D2FEEA4304C86324BAADAB /* InsertionStrategySelector.swift */; };
 		9E031B67A275BB3E049EFC2F /* frequency_dictionary_en_82_765.txt in Resources */ = {isa = PBXBuildFile; fileRef = 99FBB636008490B66CF26772 /* frequency_dictionary_en_82_765.txt */; };
+		9E4AED02831829A108A1AA85 /* StreamedGhostTextPolicyTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = D1AA6A6F4C3A54B5DA2A0022 /* StreamedGhostTextPolicyTests.swift */; };
 		9EB8E3DC796A0C8BFDE8E683 /* AppDelegate.swift in Sources */ = {isa = PBXBuildFile; fileRef = A3E8E86A14090BC7BD13BA76 /* AppDelegate.swift */; };
 		9F2FDCABCC941CBECAA3B4AB /* CotabbyInference in Frameworks */ = {isa = PBXBuildFile; productRef = 48A46AD6B613CF06072603E4 /* CotabbyInference */; };
 		9F6F88ED74ECA3E23A8E3CC0 /* SecureFieldDetector.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1827565F4FAD3E4E61CA65C3 /* SecureFieldDetector.swift */; };
 		A0657CE0488F69F0BD559CBC /* SuggestionCoordinator+Acceptance.swift in Sources */ = {isa = PBXBuildFile; fileRef = 72B13136DF7318F3E96DF0D3 /* SuggestionCoordinator+Acceptance.swift */; };
+		A0A2BD916B2CB22BAF32A62E /* StreamedGhostTextPolicy.swift in Sources */ = {isa = PBXBuildFile; fileRef = 299BD7B741DA4AAE6A061BAD /* StreamedGhostTextPolicy.swift */; };
 		A0BB87E3665EF6C209034798 /* GhostSuggestionLayoutTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5AD3F4F9FBE82007E4E15F58 /* GhostSuggestionLayoutTests.swift */; };
 		A147C5EC3F2214A670F7556E /* FocusPollBackoffTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 273B4DC844F79B4BE2C8910F /* FocusPollBackoffTests.swift */; };
 		A1A612C90221E0FE1195754A /* SettingsCategory.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5D0AEFF86F8210CBE7CFCBAD /* SettingsCategory.swift */; };
@@ -500,6 +503,7 @@
 		C607A624A0FB697486C56B8E /* PowerSourceMonitor.swift in Sources */ = {isa = PBXBuildFile; fileRef = DB235F0DEA53295DAF8B4FA0 /* PowerSourceMonitor.swift */; };
 		C618C5595DA9C57C806A3E03 /* SettingsAttentionEvaluatorTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 2BC293F6125E2B14DCF05AD9 /* SettingsAttentionEvaluatorTests.swift */; };
 		C63F95C324C29940FAC6B973 /* de-100k.txt in Resources */ = {isa = PBXBuildFile; fileRef = 4B8665A5495891F9E3DDA48B /* de-100k.txt */; };
+		C6925440737F37F537622F35 /* StreamedGhostTextPolicy.swift in Sources */ = {isa = PBXBuildFile; fileRef = 299BD7B741DA4AAE6A061BAD /* StreamedGhostTextPolicy.swift */; };
 		C6A112B51525F988EA46F725 /* SystemResourceSamplerTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 9255CBCDE66253F521EE0F08 /* SystemResourceSamplerTests.swift */; };
 		C6A91AD96F52DB72947830C0 /* DownloadableModelCatalogView.swift in Sources */ = {isa = PBXBuildFile; fileRef = BB5C2AE9A7E55495D26AD074 /* DownloadableModelCatalogView.swift */; };
 		C71B594433F3B411CAE5DE7E /* FocusCapabilityResolverTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = D4F6D5F94B238F7B4BE7C247 /* FocusCapabilityResolverTests.swift */; };
@@ -704,6 +708,7 @@
 		292DC9D4D9D5D26AE882E39B /* EmojiCatalogMatcherTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = EmojiCatalogMatcherTests.swift; sourceTree = "<group>"; };
 		2930EC34057319130393696B /* KeyCodeLabelsTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = KeyCodeLabelsTests.swift; sourceTree = "<group>"; };
 		2960080A726E51198225147A /* InsertionStrategySelectorTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InsertionStrategySelectorTests.swift; sourceTree = "<group>"; };
+		299BD7B741DA4AAE6A061BAD /* StreamedGhostTextPolicy.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = StreamedGhostTextPolicy.swift; sourceTree = "<group>"; };
 		29ED42C4BDD0C521101AF95E /* DeviceInfo.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DeviceInfo.swift; sourceTree = "<group>"; };
 		2A02336442BB735EE2E8D064 /* SettingsAttentionEvaluator.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SettingsAttentionEvaluator.swift; sourceTree = "<group>"; };
 		2B7A28471B8526C2693FFF65 /* AcknowledgementsView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AcknowledgementsView.swift; sourceTree = "<group>"; };
@@ -939,6 +944,7 @@
 		D0AF9479EF020071CA64CCC1 /* HuggingFaceModelsTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = HuggingFaceModelsTests.swift; sourceTree = "<group>"; };
 		D1123AB515110BD0CBA39490 /* HomePaneView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = HomePaneView.swift; sourceTree = "<group>"; };
 		D12ABBCE23A946C22894945B /* DecodeStopPolicy.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DecodeStopPolicy.swift; sourceTree = "<group>"; };
+		D1AA6A6F4C3A54B5DA2A0022 /* StreamedGhostTextPolicyTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = StreamedGhostTextPolicyTests.swift; sourceTree = "<group>"; };
 		D2D0FE44138BCA8B2EE05AFE /* TypoCaseTransferTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TypoCaseTransferTests.swift; sourceTree = "<group>"; };
 		D2F46767D9D1F0D44E239CA8 /* DownloadFileRescuerTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DownloadFileRescuerTests.swift; sourceTree = "<group>"; };
 		D3A2AC525DC664DB540D4F19 /* ClipboardRelevanceFilter.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ClipboardRelevanceFilter.swift; sourceTree = "<group>"; };
@@ -961,6 +967,7 @@
 		D9C1C921A1CDA2ADFC39EA01 /* AppsPaneView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AppsPaneView.swift; sourceTree = "<group>"; };
 		DB0CE9AB1286367BA2E82392 /* SettingsContainerView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SettingsContainerView.swift; sourceTree = "<group>"; };
 		DB235F0DEA53295DAF8B4FA0 /* PowerSourceMonitor.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PowerSourceMonitor.swift; sourceTree = "<group>"; };
+		DDC034BBCBAC5E7989D4C85B /* LlamaSuggestionEngineStreamingTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaSuggestionEngineStreamingTests.swift; sourceTree = "<group>"; };
 		DDE858CB1E687E3CEB8FDD5B /* SuggestionRequestFactory.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SuggestionRequestFactory.swift; sourceTree = "<group>"; };
 		DDF6A4E9CE93FD53C60E67E3 /* EmojiQueryRun.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = EmojiQueryRun.swift; sourceTree = "<group>"; };
 		DEB16474A67CE1D210B944C9 /* SuggestionSubsystemContracts.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SuggestionSubsystemContracts.swift; sourceTree = "<group>"; };
@@ -1366,6 +1373,7 @@
 				0CA88BB29BC8727878C99E95 /* LlamaPromptCacheHintTrackerTests.swift */,
 				AABCC3FD99B1824A81E665F3 /* LlamaSuggestionEngineCancellationTests.swift */,
 				26EF16C7439BEB156BD9FB03 /* LlamaSuggestionEnginePrewarmTests.swift */,
+				DDC034BBCBAC5E7989D4C85B /* LlamaSuggestionEngineStreamingTests.swift */,
 				9030FAAB468119A0236284A6 /* LLMIOFileHandlerTests.swift */,
 				D8083D44ABCDCFA68A4CD497 /* MacroEngineTests.swift */,
 				22BE47D1DBF6C23151458836 /* MacroTriggerStateMachineTests.swift */,
@@ -1397,6 +1405,7 @@
 				D562A73C7C680F2AA65F9F7F /* SpellingDictionaryResourceTests.swift */,
 				E0871985CB1F877EC422E18C /* SpellingLanguageResolverTests.swift */,
 				9B3179B40A81DF121D1221C6 /* StaticTextRunWalkThrottleTests.swift */,
+				D1AA6A6F4C3A54B5DA2A0022 /* StreamedGhostTextPolicyTests.swift */,
 				C05B0439348261163B37C508 /* SuggestionAvailabilityEvaluatorTests.swift */,
 				EC04832FBD5311352F35241B /* SuggestionCaretLayoutRepairTests.swift */,
 				C375227649689775275AA4B3 /* SuggestionCoordinatorAcceptanceTests.swift */,
@@ -1608,6 +1617,7 @@
 				D4B56C250DDEF3E81F9DCBD7 /* SentenceBoundaryClassifier.swift */,
 				2A02336442BB735EE2E8D064 /* SettingsAttentionEvaluator.swift */,
 				0348A7053E5683C68879A71A /* SpellingLanguageResolver.swift */,
+				299BD7B741DA4AAE6A061BAD /* StreamedGhostTextPolicy.swift */,
 				3609CC88A5280B3AA40414DF /* SuggestionAvailabilityEvaluator.swift */,
 				B2F95847D76893C8A5B504B4 /* SuggestionOverlayStabilityGate.swift */,
 				DDE858CB1E687E3CEB8FDD5B /* SuggestionRequestFactory.swift */,
@@ -1992,6 +2002,7 @@
 				D6AD25168F108DA8D60E76EF /* SpellingDictionaryPicker.swift in Sources */,
 				257C2A5D299365C1D98527A8 /* SpellingLanguageResolver.swift in Sources */,
 				753C5A939E986B1A0FB25664 /* StaticTextRunWalkThrottle.swift in Sources */,
+				A0A2BD916B2CB22BAF32A62E /* StreamedGhostTextPolicy.swift in Sources */,
 				333C09921443BDDF21A9753D /* SuggestionAvailabilityEvaluator.swift in Sources */,
 				EC4ED03BE4C7DD0E6319F310 /* SuggestionCoordinator+Acceptance.swift in Sources */,
 				AC4A369EC73115E1F698934D /* SuggestionCoordinator+Input.swift in Sources */,
@@ -2215,6 +2226,7 @@
 				94F037A3F9D7CE52CC70CA0F /* SpellingDictionaryPicker.swift in Sources */,
 				1BDEC75125ADFCD67F3C406D /* SpellingLanguageResolver.swift in Sources */,
 				B50EDCA5C4C5FE4FC548AA74 /* StaticTextRunWalkThrottle.swift in Sources */,
+				C6925440737F37F537622F35 /* StreamedGhostTextPolicy.swift in Sources */,
 				4F369F5284DDCEABF082E59B /* SuggestionAvailabilityEvaluator.swift in Sources */,
 				A0657CE0488F69F0BD559CBC /* SuggestionCoordinator+Acceptance.swift in Sources */,
 				D2F1DD215989BF32675308C2 /* SuggestionCoordinator+Input.swift in Sources */,
@@ -2338,6 +2350,7 @@
 				E38801433B99E65BD7E45A0E /* LlamaPromptCacheHintTrackerTests.swift in Sources */,
 				BE3CB85508055D159C35020A /* LlamaSuggestionEngineCancellationTests.swift in Sources */,
 				E64AE96DF2A80A368FDE522D /* LlamaSuggestionEnginePrewarmTests.swift in Sources */,
+				3F87586426B5EF16B41CE62F /* LlamaSuggestionEngineStreamingTests.swift in Sources */,
 				8429B116328C392DCA018D95 /* MacroEngineTests.swift in Sources */,
 				3F8CBCBCC45E377DF9ADB216 /* MacroTriggerStateMachineTests.swift in Sources */,
 				87806DE08881D11F2608A13D /* MarkerSelectionSynthesizerTests.swift in Sources */,
@@ -2368,6 +2381,7 @@
 				303652F15C0FE55595669D81 /* SpellingDictionaryResourceTests.swift in Sources */,
 				66D0D9F605AF462F569A5CFD /* SpellingLanguageResolverTests.swift in Sources */,
 				96C3128BCB17A05A7C7DEFF7 /* StaticTextRunWalkThrottleTests.swift in Sources */,
+				9E4AED02831829A108A1AA85 /* StreamedGhostTextPolicyTests.swift in Sources */,
 				88BCD795A14E1C9308F7BB31 /* SuggestionAvailabilityEvaluatorTests.swift in Sources */,
 				EB9B5E5F7326AB72E0E44C70 /* SuggestionCaretLayoutRepairTests.swift in Sources */,
 				5B404450B412A6102F514250 /* SuggestionCoordinatorAcceptanceTests.swift in Sources */,
diff --git a/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift b/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift
index fcfae5e9..dc0a0f75 100644
--- a/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift
+++ b/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift
@@ -126,13 +126,22 @@ extension SuggestionCoordinator {
     /// result (or failure) only while it is still the current work. Extracted from
     /// `generateFromCurrentFocus` so that function stays within the project's complexity budget.
     private func dispatchGeneration(request: SuggestionRequest, workID: UInt64) {
+        // A new generation starts a new stream; the previous request's rendered-partial state
+        // must not gate the new partials' monotonic checks.
+        streamRenderedText = nil
+        pendingStreamPartial = nil
         workController.replaceGenerationWork(for: workID) { [weak self] in
             guard let self else {
                 return
             }
 
             do {
-                let result = try await suggestionEngine.generateSuggestion(for: request)
+                let result = try await suggestionEngine.generateSuggestion(
+                    for: request,
+                    onPartial: { [weak self] partial in
+                        self?.queueStreamedPartial(partial, workID: workID)
+                    }
+                )
                 guard !Task.isCancelled, self.workController.isCurrent(workID) else {
                     return
                 }
@@ -189,6 +198,77 @@ extension SuggestionCoordinator {
         return value
     }
 
+    // MARK: - Streamed partial rendering
+
+    /// Coalesces streamed partials to at most one render per runloop turn. Tokens arrive every
+    /// 10-50ms from the engine, and rendering each one would stack session updates and overlay
+    /// layout on the main actor; latest-wins coalescing bounds that work while the authoritative
+    /// final result still arrives through `apply`.
+    func queueStreamedPartial(_ partial: SuggestionResult, workID: UInt64) {
+        guard workController.isCurrent(workID) else {
+            return
+        }
+        pendingStreamPartial = PendingStreamPartial(result: partial, workID: workID)
+        guard !isStreamDrainScheduled else {
+            return
+        }
+        isStreamDrainScheduled = true
+        DispatchQueue.main.async { [weak self] in
+            self?.drainStreamedPartial()
+        }
+    }
+
+    private func drainStreamedPartial() {
+        isStreamDrainScheduled = false
+        guard let pending = pendingStreamPartial else {
+            return
+        }
+        pendingStreamPartial = nil
+        applyStreamedPartial(pending.result, workID: pending.workID)
+    }
+
+    /// Renders one streamed partial as a real, acceptable session.
+    ///
+    /// A real session rather than a cosmetic overlay because acceptance gates on the live session
+    /// (never on `state`), so the user can Tab into a stream the moment the first words appear;
+    /// accepting cancels the in-flight work (work id bump), freezing the suggestion at what was
+    /// streamed. Renders are monotonic (`StreamedGhostTextPolicy`) so reordered hops and
+    /// normalizer rewrites never shrink visible ghost text, and the materialize check stops
+    /// partials the moment the field text moves on without a keystroke (a keystroke already
+    /// bumped the work id before this runs).
+    private func applyStreamedPartial(_ partial: SuggestionResult, workID: UInt64) {
+        guard workController.isCurrent(workID) else {
+            return
+        }
+        guard StreamedGhostTextPolicy.isRenderableExtension(
+            candidate: partial.text,
+            currentlyRendered: streamRenderedText
+        ) else {
+            return
+        }
+        guard let rawContext = focusModel.snapshot.context else {
+            return
+        }
+
+        let liveContext = interactionState.materializeContext(from: rawContext)
+        guard liveContext.generation == partial.generation else {
+            return
+        }
+
+        _ = interactionState.startSession(
+            fullText: partial.text,
+            liveContext: liveContext,
+            latency: partial.latency
+        )
+        streamRenderedText = partial.text
+        presentOverlay(
+            text: partial.text,
+            at: liveContext.caretRect,
+            context: liveContext,
+            isRightToLeft: TextDirectionDetector.isRightToLeft(liveContext.precedingText)
+        )
+    }
+
     /// Runs the typo gate for the current word. Returns `true` when it handled the cycle by suppressing,
     /// offering, or applying a correction; `false` proceeds with a normal continuation. Kept separate
     /// so `generateFromCurrentFocus` stays within the project's cyclomatic-complexity budget.
@@ -761,6 +841,9 @@ extension SuggestionCoordinator {
         // Drop any pending accepted-tail guard whenever the suggestion state is torn down (user
         // typed, focus changed, predictions disabled). The final-chunk accept re-sets it afterward.
         lastAcceptedTail = nil
+        // Stream bookkeeping follows the session it was rendering for.
+        streamRenderedText = nil
+        pendingStreamPartial = nil
         latestSuggestionPreview = nil
         latestFullSuggestionPreview = nil
         latestRemainingSuggestionPreview = nil
diff --git a/Cotabby/App/Coordinators/SuggestionCoordinator.swift b/Cotabby/App/Coordinators/SuggestionCoordinator.swift
index 6475bae3..5668d688 100644
--- a/Cotabby/App/Coordinators/SuggestionCoordinator.swift
+++ b/Cotabby/App/Coordinators/SuggestionCoordinator.swift
@@ -93,6 +93,21 @@ final class SuggestionCoordinator: ObservableObject {
     }
 
     var clipboardPrefaceMemo: ClipboardPrefaceMemo?
+    /// Streamed-render bookkeeping. Partial results hop in from the engine while a decode is
+    /// still running; they are coalesced (latest wins, drained once per runloop turn) so
+    /// token-rate deliveries cannot stack session and overlay layout work on the main actor, and
+    /// `streamRenderedText` carries the monotonic-extension state for `StreamedGhostTextPolicy`.
+    /// Scoped to the work id: pending partial and rendered text reset on dispatch and teardown.
+    struct PendingStreamPartial {
+        let result: SuggestionResult
+        let workID: UInt64
+    }
+
+    var pendingStreamPartial: PendingStreamPartial?
+    var isStreamDrainScheduled = false
+    var streamRenderedText: String?
+
+
     /// Monotonic cancellation token for the "wait until the host publishes typed text to AX" loop.
     ///
     /// Keystrokes can arrive faster than Chromium publishes contenteditable updates. Without this
diff --git a/Cotabby/Models/SuggestionSubsystemContracts.swift b/Cotabby/Models/SuggestionSubsystemContracts.swift
index c47b2329..7b1b0b45 100644
--- a/Cotabby/Models/SuggestionSubsystemContracts.swift
+++ b/Cotabby/Models/SuggestionSubsystemContracts.swift
@@ -89,6 +89,15 @@ protocol EmojiInputIntercepting: AnyObject {
 @MainActor
 protocol SuggestionGenerating: AnyObject {
     func generateSuggestion(for request: SuggestionRequest) async throws -> SuggestionResult
+    /// Streaming variant: `onPartial` receives cumulative, already-normalized partial results on
+    /// the main actor while the engine decodes, so ghost text can render after the first words
+    /// instead of waiting for the full completion. The returned result remains the authoritative
+    /// final answer; partials are best-effort hints the renderer may coalesce or drop. Engines
+    /// that cannot stream rely on the default, which degrades to the single-shot path.
+    func generateSuggestion(
+        for request: SuggestionRequest,
+        onPartial: (@MainActor (SuggestionResult) -> Void)?
+    ) async throws -> SuggestionResult
     /// Clears backend-local continuation state when the focused editing context is no longer
     /// continuous. Stateless engines may implement this as a no-op.
     func resetCachedGenerationContext() async
@@ -102,6 +111,13 @@ protocol SuggestionGenerating: AnyObject {
 
 extension SuggestionGenerating {
     func prewarm(for request: SuggestionRequest) async {}
+
+    func generateSuggestion(
+        for request: SuggestionRequest,
+        onPartial: (@MainActor (SuggestionResult) -> Void)?
+    ) async throws -> SuggestionResult {
+        try await generateSuggestion(for: request)
+    }
 }
 
 /// Behavior-shaped view of the llama runtime that `LlamaSuggestionEngine` depends on: run one
@@ -112,6 +128,15 @@ extension SuggestionGenerating {
 @MainActor
 protocol LlamaRuntimeGenerating: AnyObject {
     func generate(prompt: String, cachedPrefixBytes: Int?, options: LlamaGenerationOptions) async throws -> String
+    /// Streaming variant: `onPartialRawText` receives the cumulative raw completion after each
+    /// sampled token, called from the decode thread (hence `@Sendable`); callers own hopping to
+    /// their actor. The returned string is still the authoritative final completion.
+    func generate(
+        prompt: String,
+        cachedPrefixBytes: Int?,
+        options: LlamaGenerationOptions,
+        onPartialRawText: (@Sendable (String) -> Void)?
+    ) async throws -> String
     func resetPromptCache()
     /// Decodes `prompt` into the native prompt cache without sampling any tokens, so the next
     /// `generate` whose prompt extends this one only decodes the typed delta. Best-effort warmup:
@@ -125,6 +150,18 @@ extension LlamaRuntimeGenerating {
     func prefill(prompt: String, cachedPrefixBytes: Int?, options: LlamaGenerationOptions) async throws {}
 }
 
+extension LlamaRuntimeGenerating {
+    /// Default for fakes that only exercise the single-shot contract: ignore the partial hook.
+    func generate(
+        prompt: String,
+        cachedPrefixBytes: Int?,
+        options: LlamaGenerationOptions,
+        onPartialRawText: (@Sendable (String) -> Void)?
+    ) async throws -> String {
+        try await generate(prompt: prompt, cachedPrefixBytes: cachedPrefixBytes, options: options)
+    }
+}
+
 @MainActor
 protocol SuggestionSettingsProviding: AnyObject {
     var snapshot: SuggestionSettingsSnapshot { get }
diff --git a/Cotabby/Services/Runtime/FoundationModelSuggestionEngine.swift b/Cotabby/Services/Runtime/FoundationModelSuggestionEngine.swift
index aba81f35..570a6dc9 100644
--- a/Cotabby/Services/Runtime/FoundationModelSuggestionEngine.swift
+++ b/Cotabby/Services/Runtime/FoundationModelSuggestionEngine.swift
@@ -44,6 +44,16 @@ final class FoundationModelSuggestionEngine {
     }
 
     func generateSuggestion(for request: SuggestionRequest) async throws -> SuggestionResult {
+        try await generateSuggestion(for: request, onPartial: nil)
+    }
+
+    /// Streaming variant: Apple's response stream already yields cumulative snapshots; each one is
+    /// normalized and forwarded so ghost text can render before the stream finishes. The previous
+    /// implementation deliberately discarded the partials pending coordinator support.
+    func generateSuggestion(
+        for request: SuggestionRequest,
+        onPartial: (@MainActor (SuggestionResult) -> Void)?
+    ) async throws -> SuggestionResult {
         availabilityService.refresh()
 
         let baseMetadata: Logger.Metadata = [
@@ -98,6 +108,23 @@ final class FoundationModelSuggestionEngine {
                 rawSuggestion = partial.content
                 didReceiveSnapshot = true
                 try Task.checkCancellation()
+                // This engine is main-actor confined, so partials forward inline (no hop). Empty
+                // normalizations are withheld; the coordinator's monotonic policy handles the rest.
+                if let onPartial {
+                    let partialNormalized = SuggestionTextNormalizer.normalizeDetailed(
+                        rawSuggestion,
+                        for: request,
+                        promptEchoCandidates: [prompt]
+                    ).text
+                    if !partialNormalized.isEmpty {
+                        onPartial(SuggestionResult(
+                            generation: request.generation,
+                            rawText: rawSuggestion,
+                            text: partialNormalized,
+                            latency: Date().timeIntervalSince(startTime)
+                        ))
+                    }
+                }
             }
             try Task.checkCancellation()
             // Apple's documented contract is at least one snapshot on a successful stream, so a
diff --git a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
index a18be48b..cee4882a 100644
--- a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
+++ b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
@@ -133,10 +133,13 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
 
     /// Prepares the prompt context, reusing cached KV state when safe, then samples a short completion.
     /// Holds `autocompleteLock` for the full call to prevent concurrent KV cache mutation.
+    /// `onPartialRawText` receives the cumulative raw completion after each sampled token, on the
+    /// calling (detached) thread, so the UI can render ghost text before the decode finishes.
     func generate(
         prompt: String,
         cachedPrefixBytes: Int? = nil,
-        options: LlamaGenerationOptions
+        options: LlamaGenerationOptions,
+        onPartialRawText: ((String) -> Void)? = nil
     ) throws -> String {
         let preparation = try preparedPrompt(prompt: prompt, cachedPrefixBytes: cachedPrefixBytes, options: options, kind: "generate")
 
@@ -185,7 +188,11 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
 
         // The KV-trim defer above runs after the decoder returns, restoring prompt-only KV state for
         // the next request. Token selection is delegated to the engine's built-in sampler.
-        let decode = runEngineSampledDecode(sequenceID: sequenceID, options: options)
+        let decode = runEngineSampledDecode(
+            sequenceID: sequenceID,
+            options: options,
+            onPartialRawText: onPartialRawText
+        )
         if decode.engineCancelled {
             // The engine's per-sequence abort flag is set-once; an aborted sequence would refuse
             // every future decode, so drop it and let the next request build fresh.
@@ -351,10 +358,12 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
     /// The shipping decoder: delegates token selection to the engine's built-in sampler
     /// (`sampleNext`), which applies temperature / top-k / top-p / min-p and commits each token.
     /// `engineCancelled` reports that the native abort flag fired; the sequence must then be
-    /// discarded because the flag is set-once for a sequence's lifetime.
+    /// discarded because the flag is set-once for a sequence's lifetime. `onPartialRawText`
+    /// receives the cumulative raw completion after each sampled token, on the calling thread.
     private func runEngineSampledDecode(
         sequenceID: Int32,
-        options: LlamaGenerationOptions
+        options: LlamaGenerationOptions,
+        onPartialRawText: ((String) -> Void)? = nil
     ) -> (text: String, engineCancelled: Bool) {
         var generatedText = ""
         var tokensGenerated = 0
@@ -388,6 +397,9 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
             generatedText += piece
             tokensGenerated += 1
             sumLogprob += Double(result.logprob)
+            // Cumulative text, not the delta: consumers render whole partials, and cumulative
+            // semantics make late or reordered deliveries harmless downstream.
+            onPartialRawText?(generatedText)
 
             // Stop at the first natural sentence boundary instead of running the full token budget.
             // This keeps completions tight and is latency-positive (fewer tokens), and it adds no
diff --git a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift
index 6cc22d7f..889522ca 100644
--- a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift
+++ b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift
@@ -100,6 +100,22 @@ final class LlamaRuntimeManager: ObservableObject {
         prompt: String,
         cachedPrefixBytes: Int? = nil,
         options: LlamaGenerationOptions
+    ) async throws -> String {
+        try await generate(
+            prompt: prompt,
+            cachedPrefixBytes: cachedPrefixBytes,
+            options: options,
+            onPartialRawText: nil
+        )
+    }
+
+    /// Streaming variant: `onPartialRawText` is invoked from the decode thread with the cumulative
+    /// raw completion after each sampled token; see `LlamaRuntimeGenerating`.
+    func generate(
+        prompt: String,
+        cachedPrefixBytes: Int? = nil,
+        options: LlamaGenerationOptions,
+        onPartialRawText: (@Sendable (String) -> Void)?
     ) async throws -> String {
         _ = try await preparedRuntime()
 
@@ -113,7 +129,8 @@ final class LlamaRuntimeManager: ObservableObject {
                 try core.generate(
                     prompt: prompt,
                     cachedPrefixBytes: cachedPrefixBytes,
-                    options: options
+                    options: options,
+                    onPartialRawText: onPartialRawText
                 )
             }
             return try await withTaskCancellationHandler {
diff --git a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift
index c6b9f344..4092bbb9 100644
--- a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift
+++ b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift
@@ -56,6 +56,17 @@ final class LlamaSuggestionEngine {
 
     /// Executes one generation request and packages the raw and normalized result for the coordinator.
     func generateSuggestion(for request: SuggestionRequest) async throws -> SuggestionResult {
+        try await generateSuggestion(for: request, onPartial: nil)
+    }
+
+    /// Streaming variant: cumulative raw partials from the decode thread are normalized and
+    /// forwarded to `onPartial` on the main actor, so the coordinator can paint ghost text while
+    /// the decode is still running. Empty normalizations are withheld (there is nothing useful to
+    /// paint), and the returned result remains the authoritative final completion.
+    func generateSuggestion(
+        for request: SuggestionRequest,
+        onPartial: (@MainActor (SuggestionResult) -> Void)?
+    ) async throws -> SuggestionResult {
         let baseMetadata: Logger.Metadata = [
             "request_id": .string(request.requestID),
             "engine": .string("llama")
@@ -77,11 +88,39 @@ final class LlamaSuggestionEngine {
                     "max_tokens": .stringConvertible(request.maxPredictionTokens)
                 ]) { _, new in new }
             )
-            let rawSuggestion = try await runtimeManager.generate(
-                prompt: request.prompt,
-                cachedPrefixBytes: cachedPrefixBytes,
-                options: Self.makeGenerationOptions(for: request)
-            )
+            let options = Self.makeGenerationOptions(for: request)
+            let rawSuggestion: String
+            if let onPartial {
+                rawSuggestion = try await runtimeManager.generate(
+                    prompt: request.prompt,
+                    cachedPrefixBytes: cachedPrefixBytes,
+                    options: options,
+                    onPartialRawText: { raw in
+                        // Decode-thread callback; normalization and delivery hop to the main
+                        // actor. Hops are independent tasks, so a shorter cumulative can land
+                        // after a longer one — the coordinator's monotonic render policy makes
+                        // that harmless.
+                        Task { @MainActor in
+                            let normalized = SuggestionTextNormalizer.normalizeDetailed(raw, for: request).text
+                            guard !normalized.isEmpty else {
+                                return
+                            }
+                            onPartial(SuggestionResult(
+                                generation: request.generation,
+                                rawText: raw,
+                                text: normalized,
+                                latency: Date().timeIntervalSince(startTime)
+                            ))
+                        }
+                    }
+                )
+            } else {
+                rawSuggestion = try await runtimeManager.generate(
+                    prompt: request.prompt,
+                    cachedPrefixBytes: cachedPrefixBytes,
+                    options: options
+                )
+            }
             try Task.checkCancellation()
 
             promptCacheHintTracker.recordSuccessfulRequest(request)
diff --git a/Cotabby/Services/Runtime/SuggestionEngineRouter.swift b/Cotabby/Services/Runtime/SuggestionEngineRouter.swift
index 454e9dc2..65b0cfbe 100644
--- a/Cotabby/Services/Runtime/SuggestionEngineRouter.swift
+++ b/Cotabby/Services/Runtime/SuggestionEngineRouter.swift
@@ -31,6 +31,13 @@ final class SuggestionEngineRouter {
     }
 
     func generateSuggestion(for request: SuggestionRequest) async throws -> SuggestionResult {
+        try await generateSuggestion(for: request, onPartial: nil)
+    }
+
+    func generateSuggestion(
+        for request: SuggestionRequest,
+        onPartial: (@MainActor (SuggestionResult) -> Void)?
+    ) async throws -> SuggestionResult {
         let metadata: Logger.Metadata = [
             "request_id": .string(request.requestID),
             "engine": .string(engineMetadataLabel(for: suggestionSettings.selectedEngine))
@@ -39,7 +46,7 @@ final class SuggestionEngineRouter {
         case .appleIntelligence:
             CotabbyLogger.suggestion.debug("Routing to Apple Intelligence engine", metadata: metadata)
             do {
-                let result = try await foundationModelEngine.generateSuggestion(for: request)
+                let result = try await foundationModelEngine.generateSuggestion(for: request, onPartial: onPartial)
                 recordPerformanceMetric(modelName: "Apple Intelligence", latency: result.latency)
                 return result
             } catch SuggestionClientError.unsupportedLanguageOrLocale(let message) {
@@ -52,12 +59,13 @@ final class SuggestionEngineRouter {
                 )
                 return try await generateOpenSourceFallback(
                     for: request,
-                    appleFailureMessage: message
+                    appleFailureMessage: message,
+                    onPartial: onPartial
                 )
             }
         case .llamaOpenSource:
             CotabbyLogger.suggestion.debug("Routing to open-source llama engine", metadata: metadata)
-            let result = try await llamaEngine.generateSuggestion(for: request)
+            let result = try await llamaEngine.generateSuggestion(for: request, onPartial: onPartial)
             recordPerformanceMetric(modelName: llamaModelNameProvider() ?? "Llama", latency: result.latency)
             return result
         }
@@ -107,10 +115,11 @@ final class SuggestionEngineRouter {
     /// coordinator backend-agnostic while giving local models a chance to handle that text.
     private func generateOpenSourceFallback(
         for request: SuggestionRequest,
-        appleFailureMessage: String
+        appleFailureMessage: String,
+        onPartial: (@MainActor (SuggestionResult) -> Void)? = nil
     ) async throws -> SuggestionResult {
         do {
-            let result = try await llamaEngine.generateSuggestion(for: request)
+            let result = try await llamaEngine.generateSuggestion(for: request, onPartial: onPartial)
             recordPerformanceMetric(modelName: llamaModelNameProvider() ?? "Llama", latency: result.latency)
             return result
         } catch SuggestionClientError.cancelled {
diff --git a/Cotabby/Support/StreamedGhostTextPolicy.swift b/Cotabby/Support/StreamedGhostTextPolicy.swift
new file mode 100644
index 00000000..e7332106
--- /dev/null
+++ b/Cotabby/Support/StreamedGhostTextPolicy.swift
@@ -0,0 +1,22 @@
+import Foundation
+
+/// Decides whether a streamed cumulative partial may replace the currently rendered ghost text.
+///
+/// Streamed renders are monotonic by policy: a candidate must strictly extend what is already on
+/// screen. Two real hazards motivate this rather than trusting arrival order. Partials hop from
+/// the decode thread to the main actor as independent tasks, so a shorter, older cumulative can
+/// land after a longer one; and the text normalizer runs on every cumulative snapshot, so its
+/// output for a longer raw string is not guaranteed to extend its output for a shorter one (for
+/// example when a boundary rule trims a trailing fragment). Dropping non-extensions costs nothing:
+/// the next partial or the authoritative final result supersedes it.
+enum StreamedGhostTextPolicy {
+    static func isRenderableExtension(candidate: String, currentlyRendered: String?) -> Bool {
+        // An empty candidate has nothing to paint, whatever is already on screen.
+        if candidate.isEmpty { return false }
+        // Nothing rendered yet (nil or empty): any non-empty candidate may render.
+        let rendered = currentlyRendered ?? ""
+        if rendered.isEmpty { return true }
+        // Otherwise require a strictly longer candidate that starts with the rendered text.
+        return candidate.hasPrefix(rendered) && candidate.count > rendered.count
+    }
+}
diff --git a/CotabbyTests/LlamaSuggestionEngineStreamingTests.swift b/CotabbyTests/LlamaSuggestionEngineStreamingTests.swift
new file mode 100644
index 00000000..fc138f70
--- /dev/null
+++ b/CotabbyTests/LlamaSuggestionEngineStreamingTests.swift
@@ -0,0 +1,128 @@
+import CoreGraphics
+import Foundation
+import XCTest
+@testable import Cotabby
+
+/// Tests for the llama engine's streaming contract: cumulative raw partials from the runtime are
+/// normalized and forwarded to `onPartial` on the main actor, and the final result still goes
+/// through the existing single-shot path (tracker recording, normalization, latency).
+@MainActor
+final class LlamaSuggestionEngineStreamingTests: XCTestCase {
+
+    func test_streamingGeneration_forwardsNormalizedCumulativePartials() async throws {
+        let runtime = StreamingFakeRuntime()
+        runtime.partialRawTexts = [" wor", " world ag"]
+        runtime.finalText = " world again"
+        let engine = LlamaSuggestionEngine(runtimeManager: runtime)
+
+        var partials: [SuggestionResult] = []
+        let result = try await engine.generateSuggestion(for: makeRequest(prompt: "Hello")) { partial in
+            partials.append(partial)
+        }
+
+        // Partials hop to the main actor as tasks; drain before asserting.
+        try await drainUntil { partials.count >= 2 }
+
+        XCTAssertEqual(result.rawText, " world again")
+        XCTAssertEqual(partials.map(\.rawText), [" wor", " world ag"])
+        XCTAssertFalse(partials.contains { $0.text.isEmpty }, "Empty normalizations must be withheld, not forwarded.")
+        XCTAssertEqual(partials.map(\.generation), [1, 1], "Partials must carry the request generation for stale guards.")
+    }
+
+    func test_plainGeneration_neverInvokesPartialHook() async throws {
+        let runtime = StreamingFakeRuntime()
+        runtime.partialRawTexts = [" wor"]
+        runtime.finalText = " world"
+        let engine = LlamaSuggestionEngine(runtimeManager: runtime)
+
+        _ = try await engine.generateSuggestion(for: makeRequest(prompt: "Hello"))
+
+        // No drain here: without a hook the engine spawns no partial tasks, so nothing is pending.
+        XCTAssertEqual(runtime.streamingCallCount, 0, "The single-shot entry point must use the non-streaming runtime path.")
+    }
+
+    // MARK: - Helpers
+
+    /// Polls `condition` up to 200 times, sleeping 2 ms after each failed check, so the
+    /// forwarded-partial tasks get main-actor time to run; returns normally even on timeout.
+    private func drainUntil(_ condition: () -> Bool) async throws {
+        for _ in 0..<200 where !condition() {
+            try await Task.sleep(nanoseconds: 2_000_000)
+        }
+    }
+
+    private func makeRequest(prompt: String) -> SuggestionRequest {
+        let snapshot = FocusedInputSnapshot(
+            applicationName: "TestApp",
+            bundleIdentifier: "com.example.TestApp",
+            processIdentifier: 123,
+            elementIdentifier: "field",
+            role: "AXTextField",
+            subrole: nil,
+            caretRect: .zero,
+            inputFrameRect: nil,
+            caretSource: "test",
+            caretQuality: .exact,
+            observedCharWidth: nil,
+            precedingText: prompt,
+            trailingText: "",
+            selection: NSRange(location: prompt.count, length: 0),
+            isSecure: false
+        )
+        let context = FocusedInputContext(snapshot: snapshot, generation: 1)
+
+        return SuggestionRequest(
+            context: context,
+            prefixText: prompt,
+            prompt: prompt,
+            generation: context.generation,
+            maxPredictionTokens: 8,
+            temperature: 0.1,
+            topK: 20,
+            topP: 0.7,
+            minP: 0.08,
+            repetitionPenalty: 1.05,
+            randomSeed: 42,
+            maxSuffixCharacters: 192,
+            completionLengthInstruction: "Return only the next few words.",
+            userName: nil,
+            customRules: [],
+            languageInstruction: nil,
+            clipboardContext: nil,
+            visualContextSummary: nil,
+            isMultiLineEnabled: false
+        )
+    }
+}
+
+/// Test double for the llama runtime: the streaming entry point replays the staged cumulative
+/// raw partials in order and bumps a call counter; the plain entry point only returns the text.
+@MainActor
+private final class StreamingFakeRuntime: LlamaRuntimeGenerating {
+    var partialRawTexts: [String] = []
+    var finalText = ""
+    private(set) var streamingCallCount = 0
+
+    func generate(
+        prompt: String,
+        cachedPrefixBytes: Int?,
+        options: LlamaGenerationOptions
+    ) async throws -> String {
+        return finalText
+    }
+
+    func generate(
+        prompt: String,
+        cachedPrefixBytes: Int?,
+        options: LlamaGenerationOptions,
+        onPartialRawText: (@Sendable (String) -> Void)?
+    ) async throws -> String {
+        streamingCallCount += 1
+        // Replay every staged partial synchronously, in staging order, before the final text
+        // is returned; a nil hook turns each delivery into a no-op.
+        partialRawTexts.forEach { onPartialRawText?($0) }
+        return finalText
+    }
+
+    func resetPromptCache() {}
+}
diff --git a/CotabbyTests/StreamedGhostTextPolicyTests.swift b/CotabbyTests/StreamedGhostTextPolicyTests.swift
new file mode 100644
index 00000000..014330c7
--- /dev/null
+++ b/CotabbyTests/StreamedGhostTextPolicyTests.swift
@@ -0,0 +1,42 @@
+import XCTest
+@testable import Cotabby
+
+/// Tests for the streamed-render monotonicity policy: out-of-order or normalizer-shrunk partials
+/// must never replace longer ghost text already on screen.
+final class StreamedGhostTextPolicyTests: XCTestCase {
+    func test_firstNonEmptyPartialRenders() {
+        for shown in [nil, ""] as [String?] {
+            XCTAssertTrue(StreamedGhostTextPolicy.isRenderableExtension(candidate: " wor", currentlyRendered: shown))
+        }
+    }
+
+    func test_emptyCandidateNeverRenders() {
+        for shown in [nil, " wor"] as [String?] {
+            XCTAssertFalse(StreamedGhostTextPolicy.isRenderableExtension(candidate: "", currentlyRendered: shown))
+        }
+    }
+
+    func test_strictExtensionRenders() {
+        let verdict = StreamedGhostTextPolicy.isRenderableExtension(candidate: " world", currentlyRendered: " wor")
+        XCTAssertTrue(verdict)
+    }
+
+    func test_staleShorterPartialIsDropped() {
+        let verdict = StreamedGhostTextPolicy.isRenderableExtension(candidate: " wor", currentlyRendered: " world")
+        XCTAssertFalse(verdict)
+    }
+
+    func test_equalTextIsDroppedAsRedundant() {
+        // Identical text is redundant: the policy demands a strictly longer candidate.
+        let verdict = StreamedGhostTextPolicy.isRenderableExtension(candidate: " world", currentlyRendered: " world")
+        XCTAssertFalse(verdict)
+    }
+
+    func test_divergentRewriteIsDropped() {
+        // A normalizer can legally rewrite a fragment rather than extend it; the render must wait
+        // for the authoritative final result instead of flickering through rewrites.
+        let verdict = StreamedGhostTextPolicy.isRenderableExtension(
+            candidate: " worse idea", currentlyRendered: " world")
+        XCTAssertFalse(verdict)
+    }
+}

From 99a9e2624629bed2f379918efdf4260b9a3d263c Mon Sep 17 00:00:00 2001
From: Jacob Fu <141651335+FuJacob@users.noreply.github.com>
Date: Thu, 11 Jun 2026 19:21:46 -0700
Subject: [PATCH 2/2] review: make queueStreamedPartial private; document why
 the drain flag survives a dispatch reset

---
 .../Coordinators/SuggestionCoordinator+Prediction.swift   | 8 ++++++--
 Cotabby/App/Coordinators/SuggestionCoordinator.swift      | 1 -
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift b/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift
index dc0a0f75..f051fb24 100644
--- a/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift
+++ b/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift
@@ -127,7 +127,11 @@ extension SuggestionCoordinator {
     /// `generateFromCurrentFocus` so that function stays within the project's complexity budget.
     private func dispatchGeneration(request: SuggestionRequest, workID: UInt64) {
         // A new generation starts a new stream; the previous request's rendered-partial state
-        // must not gate the new partials' monotonic checks.
+        // must not gate the new partials' monotonic checks. `isStreamDrainScheduled` is left
+        // alone on purpose: an already-enqueued drain block cannot be unscheduled, and it
+        // self-heals either way — it finds nil and clears the flag, or it finds a partial the
+        // new generation queued in the meantime and renders it under the same work-id guards.
+        // Resetting the flag here would instead double-schedule a drain for one partial.
         streamRenderedText = nil
         pendingStreamPartial = nil
         workController.replaceGenerationWork(for: workID) { [weak self] in
@@ -204,7 +208,7 @@ extension SuggestionCoordinator {
     /// 10-50ms from the engine, and rendering each one would stack session updates and overlay
     /// layout on the main actor; latest-wins coalescing bounds that work while the authoritative
     /// final result still arrives through `apply`.
-    func queueStreamedPartial(_ partial: SuggestionResult, workID: UInt64) {
+    private func queueStreamedPartial(_ partial: SuggestionResult, workID: UInt64) {
         guard workController.isCurrent(workID) else {
             return
         }
diff --git a/Cotabby/App/Coordinators/SuggestionCoordinator.swift b/Cotabby/App/Coordinators/SuggestionCoordinator.swift
index 5668d688..61516984 100644
--- a/Cotabby/App/Coordinators/SuggestionCoordinator.swift
+++ b/Cotabby/App/Coordinators/SuggestionCoordinator.swift
@@ -107,7 +107,6 @@ final class SuggestionCoordinator: ObservableObject {
     var isStreamDrainScheduled = false
     var streamRenderedText: String?
 
-
     /// Monotonic cancellation token for the "wait until the host publishes typed text to AX" loop.
     ///
     /// Keystrokes can arrive faster than Chromium publishes contenteditable updates. Without this