diff --git a/Cotabby.xcodeproj/project.pbxproj b/Cotabby.xcodeproj/project.pbxproj
index cfd245bd..02a03f32 100644
--- a/Cotabby.xcodeproj/project.pbxproj
+++ b/Cotabby.xcodeproj/project.pbxproj
@@ -175,6 +175,7 @@
 		3E78D03ABA7141D344AB8285 /* he.txt in Resources */ = {isa = PBXBuildFile; fileRef = C9C000E46A1E404932F89C81 /* he.txt */; };
 		3EF0A298B5590571B1C37282 /* FieldStyleCache.swift in Sources */ = {isa = PBXBuildFile; fileRef = B7FBF2B766E728F25899B64E /* FieldStyleCache.swift */; };
 		3F5630CFB7BA40B900E832A1 /* OCRTextHygieneTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5EED3CD2BC7B48DF35DEE562 /* OCRTextHygieneTests.swift */; };
+		3F87586426B5EF16B41CE62F /* LlamaSuggestionEngineStreamingTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = DDC034BBCBAC5E7989D4C85B /* LlamaSuggestionEngineStreamingTests.swift */; };
 		3F8CBCBCC45E377DF9ADB216 /* MacroTriggerStateMachineTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 22BE47D1DBF6C23151458836 /* MacroTriggerStateMachineTests.swift */; };
 		3FCEF50FDD9EE01AE3711083 /* AXTreeDumpWriter.swift in Sources */ = {isa = PBXBuildFile; fileRef = B27492B04B627DA53BDAD938 /* AXTreeDumpWriter.swift */; };
 		3FF6B7DE34A01C4AB7FA54E3 /* MacroTriggerStateMachine.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1C201A65A6B040F90C528A3B /* MacroTriggerStateMachine.swift */; };
@@ -405,10 +406,12 @@
 		9CEBD6AF4405F1BBE0E3D16C /* MidWordContinuationPolicy.swift in Sources */ = {isa = PBXBuildFile; fileRef = 357C18383B047F24A531BDCD /* MidWordContinuationPolicy.swift */; };
 		9D0F4829D11BCD4DB1290410 /* InsertionStrategySelector.swift in Sources */ = {isa = PBXBuildFile; fileRef = E0D2FEEA4304C86324BAADAB /* InsertionStrategySelector.swift */; };
 		9E031B67A275BB3E049EFC2F /* frequency_dictionary_en_82_765.txt in Resources */ = {isa = PBXBuildFile; fileRef = 99FBB636008490B66CF26772 /* frequency_dictionary_en_82_765.txt */; };
+		9E4AED02831829A108A1AA85 /* StreamedGhostTextPolicyTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = D1AA6A6F4C3A54B5DA2A0022 /* StreamedGhostTextPolicyTests.swift */; };
 		9EB8E3DC796A0C8BFDE8E683 /* AppDelegate.swift in Sources */ = {isa = PBXBuildFile; fileRef = A3E8E86A14090BC7BD13BA76 /* AppDelegate.swift */; };
 		9F2FDCABCC941CBECAA3B4AB /* CotabbyInference in Frameworks */ = {isa = PBXBuildFile; productRef = 48A46AD6B613CF06072603E4 /* CotabbyInference */; };
 		9F6F88ED74ECA3E23A8E3CC0 /* SecureFieldDetector.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1827565F4FAD3E4E61CA65C3 /* SecureFieldDetector.swift */; };
 		A0657CE0488F69F0BD559CBC /* SuggestionCoordinator+Acceptance.swift in Sources */ = {isa = PBXBuildFile; fileRef = 72B13136DF7318F3E96DF0D3 /* SuggestionCoordinator+Acceptance.swift */; };
+		A0A2BD916B2CB22BAF32A62E /* StreamedGhostTextPolicy.swift in Sources */ = {isa = PBXBuildFile; fileRef = 299BD7B741DA4AAE6A061BAD /* StreamedGhostTextPolicy.swift */; };
 		A0BB87E3665EF6C209034798 /* GhostSuggestionLayoutTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5AD3F4F9FBE82007E4E15F58 /* GhostSuggestionLayoutTests.swift */; };
 		A147C5EC3F2214A670F7556E /* FocusPollBackoffTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 273B4DC844F79B4BE2C8910F /* FocusPollBackoffTests.swift */; };
 		A1A612C90221E0FE1195754A /* SettingsCategory.swift in Sources */ = {isa = PBXBuildFile; fileRef = 5D0AEFF86F8210CBE7CFCBAD /* SettingsCategory.swift */; };
@@ -500,6 +503,7 @@
 		C607A624A0FB697486C56B8E /* PowerSourceMonitor.swift in Sources */ = {isa = PBXBuildFile; fileRef = DB235F0DEA53295DAF8B4FA0 /* PowerSourceMonitor.swift */; };
 		C618C5595DA9C57C806A3E03 /* SettingsAttentionEvaluatorTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 2BC293F6125E2B14DCF05AD9 /* SettingsAttentionEvaluatorTests.swift */; };
 		C63F95C324C29940FAC6B973 /* de-100k.txt in Resources */ = {isa = PBXBuildFile; fileRef = 4B8665A5495891F9E3DDA48B /* de-100k.txt */; };
+		C6925440737F37F537622F35 /* StreamedGhostTextPolicy.swift in Sources */ = {isa = PBXBuildFile; fileRef = 299BD7B741DA4AAE6A061BAD /* StreamedGhostTextPolicy.swift */; };
 		C6A112B51525F988EA46F725 /* SystemResourceSamplerTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 9255CBCDE66253F521EE0F08 /* SystemResourceSamplerTests.swift */; };
 		C6A91AD96F52DB72947830C0 /* DownloadableModelCatalogView.swift in Sources */ = {isa = PBXBuildFile; fileRef = BB5C2AE9A7E55495D26AD074 /* DownloadableModelCatalogView.swift */; };
 		C71B594433F3B411CAE5DE7E /* FocusCapabilityResolverTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = D4F6D5F94B238F7B4BE7C247 /* FocusCapabilityResolverTests.swift */; };
@@ -704,6 +708,7 @@
 		292DC9D4D9D5D26AE882E39B /* EmojiCatalogMatcherTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = EmojiCatalogMatcherTests.swift; sourceTree = "<group>"; };
 		2930EC34057319130393696B /* KeyCodeLabelsTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = KeyCodeLabelsTests.swift; sourceTree = "<group>"; };
 		2960080A726E51198225147A /* InsertionStrategySelectorTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InsertionStrategySelectorTests.swift; sourceTree = "<group>"; };
+		299BD7B741DA4AAE6A061BAD /* StreamedGhostTextPolicy.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = StreamedGhostTextPolicy.swift; sourceTree = "<group>"; };
 		29ED42C4BDD0C521101AF95E /* DeviceInfo.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DeviceInfo.swift; sourceTree = "<group>"; };
 		2A02336442BB735EE2E8D064 /* SettingsAttentionEvaluator.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SettingsAttentionEvaluator.swift; sourceTree = "<group>"; };
 		2B7A28471B8526C2693FFF65 /* AcknowledgementsView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AcknowledgementsView.swift; sourceTree = "<group>"; };
@@ -939,6 +944,7 @@
 		D0AF9479EF020071CA64CCC1 /* HuggingFaceModelsTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = HuggingFaceModelsTests.swift; sourceTree = "<group>"; };
 		D1123AB515110BD0CBA39490 /* HomePaneView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = HomePaneView.swift; sourceTree = "<group>"; };
 		D12ABBCE23A946C22894945B /* DecodeStopPolicy.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DecodeStopPolicy.swift; sourceTree = "<group>"; };
+		D1AA6A6F4C3A54B5DA2A0022 /* StreamedGhostTextPolicyTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = StreamedGhostTextPolicyTests.swift; sourceTree = "<group>"; };
 		D2D0FE44138BCA8B2EE05AFE /* TypoCaseTransferTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TypoCaseTransferTests.swift; sourceTree = "<group>"; };
 		D2F46767D9D1F0D44E239CA8 /* DownloadFileRescuerTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DownloadFileRescuerTests.swift; sourceTree = "<group>"; };
 		D3A2AC525DC664DB540D4F19 /* ClipboardRelevanceFilter.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ClipboardRelevanceFilter.swift; sourceTree = "<group>"; };
@@ -961,6 +967,7 @@
 		D9C1C921A1CDA2ADFC39EA01 /* AppsPaneView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AppsPaneView.swift; sourceTree = "<group>"; };
 		DB0CE9AB1286367BA2E82392 /* SettingsContainerView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SettingsContainerView.swift; sourceTree = "<group>"; };
 		DB235F0DEA53295DAF8B4FA0 /* PowerSourceMonitor.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PowerSourceMonitor.swift; sourceTree = "<group>"; };
+		DDC034BBCBAC5E7989D4C85B /* LlamaSuggestionEngineStreamingTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaSuggestionEngineStreamingTests.swift; sourceTree = "<group>"; };
 		DDE858CB1E687E3CEB8FDD5B /* SuggestionRequestFactory.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SuggestionRequestFactory.swift; sourceTree = "<group>"; };
 		DDF6A4E9CE93FD53C60E67E3 /* EmojiQueryRun.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = EmojiQueryRun.swift; sourceTree = "<group>"; };
 		DEB16474A67CE1D210B944C9 /* SuggestionSubsystemContracts.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SuggestionSubsystemContracts.swift; sourceTree = "<group>"; };
@@ -1366,6 +1373,7 @@
 				0CA88BB29BC8727878C99E95 /* LlamaPromptCacheHintTrackerTests.swift */,
 				AABCC3FD99B1824A81E665F3 /* LlamaSuggestionEngineCancellationTests.swift */,
 				26EF16C7439BEB156BD9FB03 /* LlamaSuggestionEnginePrewarmTests.swift */,
+				DDC034BBCBAC5E7989D4C85B /* LlamaSuggestionEngineStreamingTests.swift */,
 				9030FAAB468119A0236284A6 /* LLMIOFileHandlerTests.swift */,
 				D8083D44ABCDCFA68A4CD497 /* MacroEngineTests.swift */,
 				22BE47D1DBF6C23151458836 /* MacroTriggerStateMachineTests.swift */,
@@ -1397,6 +1405,7 @@
 				D562A73C7C680F2AA65F9F7F /* SpellingDictionaryResourceTests.swift */,
 				E0871985CB1F877EC422E18C /* SpellingLanguageResolverTests.swift */,
 				9B3179B40A81DF121D1221C6 /* StaticTextRunWalkThrottleTests.swift */,
+				D1AA6A6F4C3A54B5DA2A0022 /* StreamedGhostTextPolicyTests.swift */,
 				C05B0439348261163B37C508 /* SuggestionAvailabilityEvaluatorTests.swift */,
 				EC04832FBD5311352F35241B /* SuggestionCaretLayoutRepairTests.swift */,
 				C375227649689775275AA4B3 /* SuggestionCoordinatorAcceptanceTests.swift */,
@@ -1608,6 +1617,7 @@
 				D4B56C250DDEF3E81F9DCBD7 /* SentenceBoundaryClassifier.swift */,
 				2A02336442BB735EE2E8D064 /* SettingsAttentionEvaluator.swift */,
 				0348A7053E5683C68879A71A /* SpellingLanguageResolver.swift */,
+				299BD7B741DA4AAE6A061BAD /* StreamedGhostTextPolicy.swift */,
 				3609CC88A5280B3AA40414DF /* SuggestionAvailabilityEvaluator.swift */,
 				B2F95847D76893C8A5B504B4 /* SuggestionOverlayStabilityGate.swift */,
 				DDE858CB1E687E3CEB8FDD5B /* SuggestionRequestFactory.swift */,
@@ -1992,6 +2002,7 @@
 				D6AD25168F108DA8D60E76EF /* SpellingDictionaryPicker.swift in Sources */,
 				257C2A5D299365C1D98527A8 /* SpellingLanguageResolver.swift in Sources */,
 				753C5A939E986B1A0FB25664 /* StaticTextRunWalkThrottle.swift in Sources */,
+				A0A2BD916B2CB22BAF32A62E /* StreamedGhostTextPolicy.swift in Sources */,
 				333C09921443BDDF21A9753D /* SuggestionAvailabilityEvaluator.swift in Sources */,
 				EC4ED03BE4C7DD0E6319F310 /* SuggestionCoordinator+Acceptance.swift in Sources */,
 				AC4A369EC73115E1F698934D /* SuggestionCoordinator+Input.swift in Sources */,
@@ -2215,6 +2226,7 @@
 				94F037A3F9D7CE52CC70CA0F /* SpellingDictionaryPicker.swift in Sources */,
 				1BDEC75125ADFCD67F3C406D /* SpellingLanguageResolver.swift in Sources */,
 				B50EDCA5C4C5FE4FC548AA74 /* StaticTextRunWalkThrottle.swift in Sources */,
+				C6925440737F37F537622F35 /* StreamedGhostTextPolicy.swift in Sources */,
 				4F369F5284DDCEABF082E59B /* SuggestionAvailabilityEvaluator.swift in Sources */,
 				A0657CE0488F69F0BD559CBC /* SuggestionCoordinator+Acceptance.swift in Sources */,
 				D2F1DD215989BF32675308C2 /* SuggestionCoordinator+Input.swift in Sources */,
@@ -2338,6 +2350,7 @@
 				E38801433B99E65BD7E45A0E /* LlamaPromptCacheHintTrackerTests.swift in Sources */,
 				BE3CB85508055D159C35020A /* LlamaSuggestionEngineCancellationTests.swift in Sources */,
 				E64AE96DF2A80A368FDE522D /* LlamaSuggestionEnginePrewarmTests.swift in Sources */,
+				3F87586426B5EF16B41CE62F /* LlamaSuggestionEngineStreamingTests.swift in Sources */,
 				8429B116328C392DCA018D95 /* MacroEngineTests.swift in Sources */,
 				3F8CBCBCC45E377DF9ADB216 /* MacroTriggerStateMachineTests.swift in Sources */,
 				87806DE08881D11F2608A13D /* MarkerSelectionSynthesizerTests.swift in Sources */,
@@ -2368,6 +2381,7 @@
 				303652F15C0FE55595669D81 /* SpellingDictionaryResourceTests.swift in Sources */,
 				66D0D9F605AF462F569A5CFD /* SpellingLanguageResolverTests.swift in Sources */,
 				96C3128BCB17A05A7C7DEFF7 /* StaticTextRunWalkThrottleTests.swift in Sources */,
+				9E4AED02831829A108A1AA85 /* StreamedGhostTextPolicyTests.swift in Sources */,
 				88BCD795A14E1C9308F7BB31 /* SuggestionAvailabilityEvaluatorTests.swift in Sources */,
 				EB9B5E5F7326AB72E0E44C70 /* SuggestionCaretLayoutRepairTests.swift in Sources */,
 				5B404450B412A6102F514250 /* SuggestionCoordinatorAcceptanceTests.swift in Sources */,
diff --git a/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift b/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift
index fcfae5e9..f051fb24 100644
--- a/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift
+++ b/Cotabby/App/Coordinators/SuggestionCoordinator+Prediction.swift
@@ -126,13 +126,26 @@ extension SuggestionCoordinator {
     /// result (or failure) only while it is still the current work. Extracted from
     /// `generateFromCurrentFocus` so that function stays within the project's complexity budget.
     private func dispatchGeneration(request: SuggestionRequest, workID: UInt64) {
+        // A new generation starts a new stream; the previous request's rendered-partial state
+        // must not gate the new partials' monotonic checks. `isStreamDrainScheduled` is left
+        // alone on purpose: an already-enqueued drain block cannot be unscheduled, and it
+        // self-heals either way — it finds nil and clears the flag, or it finds a partial the
+        // new generation queued in the meantime and renders it under the same work-id guards.
+        // Resetting the flag here would instead double-schedule a drain for one partial.
+        streamRenderedText = nil
+        pendingStreamPartial = nil
         workController.replaceGenerationWork(for: workID) { [weak self] in
             guard let self else {
                 return
             }
 
             do {
-                let result = try await suggestionEngine.generateSuggestion(for: request)
+                let result = try await suggestionEngine.generateSuggestion(
+                    for: request,
+                    onPartial: { [weak self] partial in
+                        self?.queueStreamedPartial(partial, workID: workID)
+                    }
+                )
                 guard !Task.isCancelled, self.workController.isCurrent(workID) else {
                     return
                 }
@@ -189,6 +202,77 @@ extension SuggestionCoordinator {
         return value
     }
 
+    // MARK: - Streamed partial rendering
+
+    /// Coalesces streamed partials: latest wins, with at most one main-queue drain enqueued.
+    /// Tokens arrive every 10-50ms from the engine, and rendering each one would stack session
+    /// updates and overlay layout on the main actor; coalescing bounds that work while the
+    /// authoritative final result still arrives through `apply`.
+    private func queueStreamedPartial(_ partial: SuggestionResult, workID: UInt64) {
+        guard workController.isCurrent(workID) else { // stale work: drop without touching pending state
+            return
+        }
+        pendingStreamPartial = PendingStreamPartial(result: partial, workID: workID) // latest wins
+        guard !isStreamDrainScheduled else { // the enqueued drain will pick up the value stored above
+            return
+        }
+        isStreamDrainScheduled = true
+        DispatchQueue.main.async { [weak self] in // deferred so a burst collapses into one render
+            self?.drainStreamedPartial()
+        }
+    }
+
+    private func drainStreamedPartial() { // NOTE(review): may fire after the final result landed; confirm that is safe
+        isStreamDrainScheduled = false // this block has fired; the next partial schedules a fresh drain
+        guard let pending = pendingStreamPartial else { // already cleared by a new dispatch or a reset
+            return
+        }
+        pendingStreamPartial = nil // consumed before applying so it is never rendered twice
+        applyStreamedPartial(pending.result, workID: pending.workID)
+    }
+
+    /// Renders one streamed partial as a real, acceptable session.
+    ///
+    /// A real session rather than a cosmetic overlay because acceptance gates on the live session
+    /// (never on `state`), so the user can Tab into a stream the moment the first words appear;
+    /// accepting cancels the in-flight work (work id bump), freezing the suggestion at what was
+    /// streamed. Renders are monotonic (`StreamedGhostTextPolicy`) so reordered hops and
+    /// normalizer rewrites never shrink visible ghost text, and the materialize check stops
+    /// partials the moment the field text moves on without a keystroke (a keystroke already
+    /// bumped the work id before this runs).
+    private func applyStreamedPartial(_ partial: SuggestionResult, workID: UInt64) {
+        guard workController.isCurrent(workID) else { // re-checked: work may be superseded since queueing
+            return
+        }
+        guard StreamedGhostTextPolicy.isRenderableExtension(
+            candidate: partial.text,
+            currentlyRendered: streamRenderedText
+        ) else {
+            return
+        }
+        guard let rawContext = focusModel.snapshot.context else { // no focused context to render into
+            return
+        }
+
+        let liveContext = interactionState.materializeContext(from: rawContext)
+        guard liveContext.generation == partial.generation else { // context no longer matches the request
+            return
+        }
+
+        _ = interactionState.startSession( // NOTE(review): result discarded; confirm that is intended
+            fullText: partial.text,
+            liveContext: liveContext,
+            latency: partial.latency
+        )
+        streamRenderedText = partial.text // baseline for the next monotonic check
+        presentOverlay(
+            text: partial.text,
+            at: liveContext.caretRect,
+            context: liveContext,
+            isRightToLeft: TextDirectionDetector.isRightToLeft(liveContext.precedingText)
+        )
+    }
+
     /// Runs the typo gate for the current word. Returns `true` when it handled the cycle by suppressing,
     /// offering, or applying a correction; `false` proceeds with a normal continuation. Kept separate
     /// so `generateFromCurrentFocus` stays within the project's cyclomatic-complexity budget.
@@ -761,6 +845,9 @@ extension SuggestionCoordinator {
         // Drop any pending accepted-tail guard whenever the suggestion state is torn down (user
         // typed, focus changed, predictions disabled). The final-chunk accept re-sets it afterward.
         lastAcceptedTail = nil
+        // Stream bookkeeping follows the session it was rendering for.
+        streamRenderedText = nil
+        pendingStreamPartial = nil
         latestSuggestionPreview = nil
         latestFullSuggestionPreview = nil
         latestRemainingSuggestionPreview = nil
diff --git a/Cotabby/App/Coordinators/SuggestionCoordinator.swift b/Cotabby/App/Coordinators/SuggestionCoordinator.swift
index 6475bae3..61516984 100644
--- a/Cotabby/App/Coordinators/SuggestionCoordinator.swift
+++ b/Cotabby/App/Coordinators/SuggestionCoordinator.swift
@@ -93,6 +93,20 @@ final class SuggestionCoordinator: ObservableObject {
     }
 
     var clipboardPrefaceMemo: ClipboardPrefaceMemo?
+    /// Streamed-render bookkeeping. Partial results hop in from the engine while a decode is
+    /// still running; they are coalesced (latest wins, one drain enqueued at a time) so
+    /// token-rate deliveries cannot stack session and overlay layout work on the main actor, and
+    /// `streamRenderedText` carries the monotonic-extension state for `StreamedGhostTextPolicy`.
+    /// The pending partial and rendered text reset on every new dispatch; the drain flag does not.
+    struct PendingStreamPartial {
+        let result: SuggestionResult
+        let workID: UInt64 // work id the partial was queued under; re-checked before rendering
+    }
+
+    var pendingStreamPartial: PendingStreamPartial? // latest partial not yet drained, if any
+    var isStreamDrainScheduled = false // true while a drain block is enqueued on the main queue
+    var streamRenderedText: String? // text of the last streamed partial that was rendered
+
     /// Monotonic cancellation token for the "wait until the host publishes typed text to AX" loop.
     ///
     /// Keystrokes can arrive faster than Chromium publishes contenteditable updates. Without this
diff --git a/Cotabby/Models/SuggestionSubsystemContracts.swift b/Cotabby/Models/SuggestionSubsystemContracts.swift
index c47b2329..7b1b0b45 100644
--- a/Cotabby/Models/SuggestionSubsystemContracts.swift
+++ b/Cotabby/Models/SuggestionSubsystemContracts.swift
@@ -89,6 +89,15 @@ protocol EmojiInputIntercepting: AnyObject {
 @MainActor
 protocol SuggestionGenerating: AnyObject {
     func generateSuggestion(for request: SuggestionRequest) async throws -> SuggestionResult
+    /// Streaming variant: `onPartial` receives cumulative, already-normalized partial results on
+    /// the main actor while the engine decodes, so ghost text can render after the first words
+    /// instead of waiting for the full completion. The returned result remains the authoritative
+    /// final answer; partials are best-effort hints the renderer may coalesce or drop. Engines
+    /// that cannot stream rely on the default, which degrades to the single-shot path.
+    func generateSuggestion(
+        for request: SuggestionRequest,
+        onPartial: (@MainActor (SuggestionResult) -> Void)?
+    ) async throws -> SuggestionResult
     /// Clears backend-local continuation state when the focused editing context is no longer
     /// continuous. Stateless engines may implement this as a no-op.
     func resetCachedGenerationContext() async
@@ -102,6 +111,13 @@ protocol SuggestionGenerating: AnyObject {
 
 extension SuggestionGenerating {
     func prewarm(for request: SuggestionRequest) async {}
+
+    func generateSuggestion(
+        for request: SuggestionRequest,
+        onPartial: (@MainActor (SuggestionResult) -> Void)?
+    ) async throws -> SuggestionResult {
+        try await generateSuggestion(for: request) // non-streaming default: onPartial is never invoked
+    }
 }
 
 /// Behavior-shaped view of the llama runtime that `LlamaSuggestionEngine` depends on: run one
@@ -112,6 +128,15 @@ extension SuggestionGenerating {
 @MainActor
 protocol LlamaRuntimeGenerating: AnyObject {
     func generate(prompt: String, cachedPrefixBytes: Int?, options: LlamaGenerationOptions) async throws -> String
+    /// Streaming variant: `onPartialRawText` receives the cumulative raw completion after each
+    /// sampled token, called from the decode thread (hence `@Sendable`); callers own hopping to
+    /// their actor. The returned string is still the authoritative final completion.
+    func generate(
+        prompt: String,
+        cachedPrefixBytes: Int?,
+        options: LlamaGenerationOptions,
+        onPartialRawText: (@Sendable (String) -> Void)?
+    ) async throws -> String
     func resetPromptCache()
     /// Decodes `prompt` into the native prompt cache without sampling any tokens, so the next
     /// `generate` whose prompt extends this one only decodes the typed delta. Best-effort warmup:
@@ -125,6 +150,18 @@ extension LlamaRuntimeGenerating {
     func prefill(prompt: String, cachedPrefixBytes: Int?, options: LlamaGenerationOptions) async throws {}
 }
 
+extension LlamaRuntimeGenerating {
+    /// Default for single-shot-only fakes: forwards to `generate` and never calls `onPartialRawText`.
+    func generate(
+        prompt: String,
+        cachedPrefixBytes: Int?,
+        options: LlamaGenerationOptions,
+        onPartialRawText: (@Sendable (String) -> Void)?
+    ) async throws -> String {
+        try await generate(prompt: prompt, cachedPrefixBytes: cachedPrefixBytes, options: options)
+    }
+}
+
 @MainActor
 protocol SuggestionSettingsProviding: AnyObject {
     var snapshot: SuggestionSettingsSnapshot { get }
diff --git a/Cotabby/Services/Runtime/FoundationModelSuggestionEngine.swift b/Cotabby/Services/Runtime/FoundationModelSuggestionEngine.swift
index aba81f35..570a6dc9 100644
--- a/Cotabby/Services/Runtime/FoundationModelSuggestionEngine.swift
+++ b/Cotabby/Services/Runtime/FoundationModelSuggestionEngine.swift
@@ -44,6 +44,16 @@ final class FoundationModelSuggestionEngine {
     }
 
     func generateSuggestion(for request: SuggestionRequest) async throws -> SuggestionResult {
+        try await generateSuggestion(for: request, onPartial: nil) // single-shot: no partial callback
+    }
+
+    /// Streaming variant: Apple's response stream already yields cumulative snapshots; each one is
+    /// normalized and forwarded so ghost text can render before the stream finishes. The previous
+    /// implementation deliberately discarded the partials pending coordinator support.
+    func generateSuggestion(
+        for request: SuggestionRequest,
+        onPartial: (@MainActor (SuggestionResult) -> Void)?
+    ) async throws -> SuggestionResult {
         availabilityService.refresh()
 
         let baseMetadata: Logger.Metadata = [
@@ -98,6 +108,23 @@ final class FoundationModelSuggestionEngine {
                 rawSuggestion = partial.content
                 didReceiveSnapshot = true
                 try Task.checkCancellation()
+                // This engine is main-actor confined, so partials forward inline (no hop). Empty
+                // normalizations are withheld; the coordinator's monotonic policy handles the rest.
+                if let onPartial {
+                    let partialNormalized = SuggestionTextNormalizer.normalizeDetailed(
+                        rawSuggestion,
+                        for: request,
+                        promptEchoCandidates: [prompt]
+                    ).text
+                    if !partialNormalized.isEmpty {
+                        onPartial(SuggestionResult(
+                            generation: request.generation,
+                            rawText: rawSuggestion,
+                            text: partialNormalized,
+                            latency: Date().timeIntervalSince(startTime)
+                        ))
+                    }
+                }
             }
             try Task.checkCancellation()
             // Apple's documented contract is at least one snapshot on a successful stream, so a
diff --git a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
index a18be48b..cee4882a 100644
--- a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
+++ b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
@@ -133,10 +133,13 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
 
     /// Prepares the prompt context, reusing cached KV state when safe, then samples a short completion.
     /// Holds `autocompleteLock` for the full call to prevent concurrent KV cache mutation.
+    /// `onPartialRawText` receives the cumulative raw completion after each sampled token, on the
+    /// calling (detached) thread, so the UI can render ghost text before the decode finishes.
     func generate(
         prompt: String,
         cachedPrefixBytes: Int? = nil,
-        options: LlamaGenerationOptions
+        options: LlamaGenerationOptions,
+        onPartialRawText: ((String) -> Void)? = nil
     ) throws -> String {
         let preparation = try preparedPrompt(prompt: prompt, cachedPrefixBytes: cachedPrefixBytes, options: options, kind: "generate")
 
@@ -185,7 +188,11 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
 
         // The KV-trim defer above runs after the decoder returns, restoring prompt-only KV state for
         // the next request. Token selection is delegated to the engine's built-in sampler.
-        let decode = runEngineSampledDecode(sequenceID: sequenceID, options: options)
+        let decode = runEngineSampledDecode(
+            sequenceID: sequenceID,
+            options: options,
+            onPartialRawText: onPartialRawText
+        )
         if decode.engineCancelled {
             // The engine's per-sequence abort flag is set-once; an aborted sequence would refuse
             // every future decode, so drop it and let the next request build fresh.
@@ -351,10 +358,12 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
     /// The shipping decoder: delegates token selection to the engine's built-in sampler
     /// (`sampleNext`), which applies temperature / top-k / top-p / min-p and commits each token.
     /// `engineCancelled` reports that the native abort flag fired; the sequence must then be
-    /// discarded because the flag is set-once for a sequence's lifetime.
+    /// discarded because the flag is set-once for a sequence's lifetime. `onPartialRawText`
+    /// receives the cumulative raw completion after each sampled token, on the calling thread.
     private func runEngineSampledDecode(
         sequenceID: Int32,
-        options: LlamaGenerationOptions
+        options: LlamaGenerationOptions,
+        onPartialRawText: ((String) -> Void)? = nil
     ) -> (text: String, engineCancelled: Bool) {
         var generatedText = ""
         var tokensGenerated = 0
@@ -388,6 +397,9 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
             generatedText += piece
             tokensGenerated += 1
             sumLogprob += Double(result.logprob)
+            // Cumulative text, not the delta: consumers render whole partials, and cumulative
+            // semantics make late or reordered deliveries harmless downstream.
+            onPartialRawText?(generatedText)
 
             // Stop at the first natural sentence boundary instead of running the full token budget.
             // This keeps completions tight and is latency-positive (fewer tokens), and it adds no
diff --git a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift
index 6cc22d7f..889522ca 100644
--- a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift
+++ b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift
@@ -100,6 +100,22 @@ final class LlamaRuntimeManager: ObservableObject {
         prompt: String,
         cachedPrefixBytes: Int? = nil,
         options: LlamaGenerationOptions
+    ) async throws -> String {
+        try await generate(
+            prompt: prompt,
+            cachedPrefixBytes: cachedPrefixBytes,
+            options: options,
+            onPartialRawText: nil
+        )
+    }
+
+    /// Streaming variant: `onPartialRawText` is invoked from the decode thread with the cumulative
+    /// raw completion after each sampled token; see `LlamaRuntimeGenerating`.
+    func generate(
+        prompt: String,
+        cachedPrefixBytes: Int? = nil,
+        options: LlamaGenerationOptions,
+        onPartialRawText: (@Sendable (String) -> Void)?
     ) async throws -> String {
         _ = try await preparedRuntime()
 
@@ -113,7 +129,8 @@ final class LlamaRuntimeManager: ObservableObject {
                 try core.generate(
                     prompt: prompt,
                     cachedPrefixBytes: cachedPrefixBytes,
-                    options: options
+                    options: options,
+                    onPartialRawText: onPartialRawText
                 )
             }
             return try await withTaskCancellationHandler {
diff --git a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift
index c6b9f344..4092bbb9 100644
--- a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift
+++ b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift
@@ -56,6 +56,17 @@ final class LlamaSuggestionEngine {
 
     /// Executes one generation request and packages the raw and normalized result for the coordinator.
     func generateSuggestion(for request: SuggestionRequest) async throws -> SuggestionResult {
+        try await generateSuggestion(for: request, onPartial: nil) // single-shot: no partial callback
+    }
+
+    /// Streaming variant: cumulative raw partials from the decode thread are normalized and
+    /// forwarded to `onPartial` on the main actor, so the coordinator can paint ghost text while
+    /// the decode is still running. Empty normalizations are withheld (there is nothing useful to
+    /// paint), and the returned result remains the authoritative final completion.
+    func generateSuggestion(
+        for request: SuggestionRequest,
+        onPartial: (@MainActor (SuggestionResult) -> Void)?
+    ) async throws -> SuggestionResult {
         let baseMetadata: Logger.Metadata = [
             "request_id": .string(request.requestID),
             "engine": .string("llama")
@@ -77,11 +88,39 @@ final class LlamaSuggestionEngine {
                     "max_tokens": .stringConvertible(request.maxPredictionTokens)
                 ]) { _, new in new }
             )
-            let rawSuggestion = try await runtimeManager.generate(
-                prompt: request.prompt,
-                cachedPrefixBytes: cachedPrefixBytes,
-                options: Self.makeGenerationOptions(for: request)
-            )
+            let options = Self.makeGenerationOptions(for: request)
+            let rawSuggestion: String
+            if let onPartial {
+                rawSuggestion = try await runtimeManager.generate(
+                    prompt: request.prompt,
+                    cachedPrefixBytes: cachedPrefixBytes,
+                    options: options,
+                    onPartialRawText: { raw in
+                        // Decode-thread callback; normalization and delivery hop to the main
+                        // actor. Hops are independent tasks, so a shorter cumulative can land
+                        // after a longer one — the coordinator's monotonic render policy makes
+                        // that harmless.
+                        Task { @MainActor in
+                            let normalized = SuggestionTextNormalizer.normalizeDetailed(raw, for: request).text
+                            guard !normalized.isEmpty else {
+                                return
+                            }
+                            onPartial(SuggestionResult(
+                                generation: request.generation,
+                                rawText: raw,
+                                text: normalized,
+                                latency: Date().timeIntervalSince(startTime)
+                            ))
+                        }
+                    }
+                )
+            } else {
+                rawSuggestion = try await runtimeManager.generate(
+                    prompt: request.prompt,
+                    cachedPrefixBytes: cachedPrefixBytes,
+                    options: options
+                )
+            }
             try Task.checkCancellation()
 
             promptCacheHintTracker.recordSuccessfulRequest(request)
diff --git a/Cotabby/Services/Runtime/SuggestionEngineRouter.swift b/Cotabby/Services/Runtime/SuggestionEngineRouter.swift
index 454e9dc2..65b0cfbe 100644
--- a/Cotabby/Services/Runtime/SuggestionEngineRouter.swift
+++ b/Cotabby/Services/Runtime/SuggestionEngineRouter.swift
@@ -31,6 +31,13 @@ final class SuggestionEngineRouter {
     }
 
     func generateSuggestion(for request: SuggestionRequest) async throws -> SuggestionResult {
+        try await generateSuggestion(for: request, onPartial: nil) // Same routing, without a partial hook.
+    }
+
+    func generateSuggestion(
+        for request: SuggestionRequest,
+        onPartial: (@MainActor (SuggestionResult) -> Void)?
+    ) async throws -> SuggestionResult {
         let metadata: Logger.Metadata = [
             "request_id": .string(request.requestID),
             "engine": .string(engineMetadataLabel(for: suggestionSettings.selectedEngine))
@@ -39,7 +46,7 @@ final class SuggestionEngineRouter {
         case .appleIntelligence:
             CotabbyLogger.suggestion.debug("Routing to Apple Intelligence engine", metadata: metadata)
             do {
-                let result = try await foundationModelEngine.generateSuggestion(for: request)
+                let result = try await foundationModelEngine.generateSuggestion(for: request, onPartial: onPartial)
                 recordPerformanceMetric(modelName: "Apple Intelligence", latency: result.latency)
                 return result
             } catch SuggestionClientError.unsupportedLanguageOrLocale(let message) {
@@ -52,12 +59,13 @@ final class SuggestionEngineRouter {
                 )
                 return try await generateOpenSourceFallback(
                     for: request,
-                    appleFailureMessage: message
+                    appleFailureMessage: message,
+                    onPartial: onPartial
                 )
             }
         case .llamaOpenSource:
             CotabbyLogger.suggestion.debug("Routing to open-source llama engine", metadata: metadata)
-            let result = try await llamaEngine.generateSuggestion(for: request)
+            let result = try await llamaEngine.generateSuggestion(for: request, onPartial: onPartial)
             recordPerformanceMetric(modelName: llamaModelNameProvider() ?? "Llama", latency: result.latency)
             return result
         }
@@ -107,10 +115,11 @@ final class SuggestionEngineRouter {
     /// coordinator backend-agnostic while giving local models a chance to handle that text.
     private func generateOpenSourceFallback(
         for request: SuggestionRequest,
-        appleFailureMessage: String
+        appleFailureMessage: String,
+        onPartial: (@MainActor (SuggestionResult) -> Void)? = nil
     ) async throws -> SuggestionResult {
         do {
-            let result = try await llamaEngine.generateSuggestion(for: request)
+            let result = try await llamaEngine.generateSuggestion(for: request, onPartial: onPartial)
             recordPerformanceMetric(modelName: llamaModelNameProvider() ?? "Llama", latency: result.latency)
             return result
         } catch SuggestionClientError.cancelled {
diff --git a/Cotabby/Support/StreamedGhostTextPolicy.swift b/Cotabby/Support/StreamedGhostTextPolicy.swift
new file mode 100644
index 00000000..e7332106
--- /dev/null
+++ b/Cotabby/Support/StreamedGhostTextPolicy.swift
@@ -0,0 +1,22 @@
+import Foundation
+
+/// Decides whether a streamed cumulative partial may replace the currently rendered ghost text.
+///
+/// Streamed renders are monotonic by policy: a candidate must strictly extend what is already on
+/// screen. Two real hazards motivate this rather than trusting arrival order. Partials hop from
+/// the decode thread to the main actor as independent tasks, so a shorter, older cumulative can
+/// land after a longer one; and the text normalizer runs on every cumulative snapshot, so its
+/// output for a longer raw string is not guaranteed to extend its output for a shorter one (for
+/// example when a boundary rule trims a trailing fragment). Dropping non-extensions costs nothing:
+/// the next partial or the authoritative final result supersedes it.
+enum StreamedGhostTextPolicy {
+    /// True when `candidate` is non-empty and strictly extends `currentlyRendered` (nil counts as empty).
+    static func isRenderableExtension(candidate: String, currentlyRendered: String?) -> Bool {
+        let onScreen = currentlyRendered ?? ""
+        switch (candidate.isEmpty, onScreen.isEmpty) {
+        case (true, _): return false
+        case (false, true): return true
+        case (false, false): return candidate.count > onScreen.count && candidate.hasPrefix(onScreen)
+        }
+    }
+}
diff --git a/CotabbyTests/LlamaSuggestionEngineStreamingTests.swift b/CotabbyTests/LlamaSuggestionEngineStreamingTests.swift
new file mode 100644
index 00000000..fc138f70
--- /dev/null
+++ b/CotabbyTests/LlamaSuggestionEngineStreamingTests.swift
@@ -0,0 +1,128 @@
+import CoreGraphics
+import Foundation
+import XCTest
+@testable import Cotabby
+
+/// Tests for the llama engine's streaming contract: cumulative raw partials from the runtime are
+/// normalized and forwarded to `onPartial` on the main actor, and the final result still goes
+/// through the existing single-shot path (tracker recording, normalization, latency).
+@MainActor
+final class LlamaSuggestionEngineStreamingTests: XCTestCase {
+
+    /// Staged raw partials must reach `onPartial` in order, non-empty, and tagged with generation 1.
+    func test_streamingGeneration_forwardsNormalizedCumulativePartials() async throws {
+        let fakeRuntime = StreamingFakeRuntime()
+        fakeRuntime.partialRawTexts = [" wor", " world ag"]
+        fakeRuntime.finalText = " world again"
+        let engine = LlamaSuggestionEngine(runtimeManager: fakeRuntime)
+        var received: [SuggestionResult] = []
+        let finalResult = try await engine.generateSuggestion(
+            for: makeRequest(prompt: "Hello"),
+            onPartial: { received.append($0) }
+        )
+        // Each partial is delivered by its own main-actor task; wait for both before asserting.
+        try await drainUntil { received.count >= 2 }
+
+        XCTAssertEqual(finalResult.rawText, " world again")
+        XCTAssertEqual(received.map(\.rawText), [" wor", " world ag"])
+        XCTAssertFalse(received.contains { $0.text.isEmpty }, "Empty normalizations must be withheld, not forwarded.")
+        XCTAssertEqual(received.map(\.generation), [1, 1], "Partials must carry the request generation for stale guards.")
+    }
+
+    /// The single-shot entry point must reach the runtime through its non-streaming `generate`.
+    func test_plainGeneration_neverInvokesPartialHook() async throws {
+        let runtime = StreamingFakeRuntime()
+        runtime.partialRawTexts = [" wor"]
+        runtime.finalText = " world"
+        let engine = LlamaSuggestionEngine(runtimeManager: runtime)
+
+        _ = try await engine.generateSuggestion(for: makeRequest(prompt: "Hello"))
+        // The fake bumps its counter synchronously inside `generate`, so no main-actor drain is needed.
+        XCTAssertEqual(runtime.streamingCallCount, 0, "The single-shot entry point must use the non-streaming runtime path.")
+    }
+
+    // MARK: - Helpers
+
+    /// Polls `condition` up to 200 times, sleeping 2 ms after every miss (about 0.4 s at most), so
+    /// partial-forwarding tasks queued on the main actor can run; a timeout returns without failing.
+    private func drainUntil(_ condition: () -> Bool) async throws {
+        for _ in 0..<200 {
+            if !condition() { try await Task.sleep(nanoseconds: 2_000_000) }
+        }
+    }
+
+    /// Builds a single-line, non-secure `AXTextField` request (generation 1) using `prompt` as prefix and prompt.
+    private func makeRequest(prompt: String) -> SuggestionRequest {
+        let fieldSnapshot = FocusedInputSnapshot(
+            applicationName: "TestApp",
+            bundleIdentifier: "com.example.TestApp",
+            processIdentifier: 123,
+            elementIdentifier: "field",
+            role: "AXTextField",
+            subrole: nil,
+            caretRect: .zero,
+            inputFrameRect: nil,
+            caretSource: "test",
+            caretQuality: .exact,
+            observedCharWidth: nil,
+            precedingText: prompt,
+            trailingText: "",
+            selection: NSRange(location: prompt.count, length: 0),
+            isSecure: false
+        )
+        let inputContext = FocusedInputContext(snapshot: fieldSnapshot, generation: 1)
+        return SuggestionRequest(
+            context: inputContext,
+            prefixText: prompt,
+            prompt: prompt,
+            generation: inputContext.generation,
+            maxPredictionTokens: 8,
+            temperature: 0.1,
+            topK: 20,
+            topP: 0.7,
+            minP: 0.08,
+            repetitionPenalty: 1.05,
+            randomSeed: 42,
+            maxSuffixCharacters: 192,
+            completionLengthInstruction: "Return only the next few words.",
+            userName: nil,
+            customRules: [],
+            languageInstruction: nil,
+            clipboardContext: nil,
+            visualContextSummary: nil,
+            isMultiLineEnabled: false
+        )
+    }
+}
+
+/// Test double for `LlamaRuntimeGenerating`: the streaming entry point replays the staged
+/// cumulative partials in order and counts its invocations; both entry points return `finalText`.
+@MainActor
+private final class StreamingFakeRuntime: LlamaRuntimeGenerating {
+    var partialRawTexts: [String] = []
+    var finalText = ""
+    private(set) var streamingCallCount = 0
+
+    /// Non-streaming entry point: returns `finalText` and leaves the call counter untouched.
+    func generate(
+        prompt: String,
+        cachedPrefixBytes: Int?,
+        options: LlamaGenerationOptions
+    ) async throws -> String {
+        return finalText
+    }
+
+    /// Streaming entry point: bumps the counter, replays each staged partial, returns `finalText`.
+    func generate(
+        prompt: String,
+        cachedPrefixBytes: Int?,
+        options: LlamaGenerationOptions,
+        onPartialRawText: (@Sendable (String) -> Void)?
+    ) async throws -> String {
+        streamingCallCount += 1
+        partialRawTexts.forEach { onPartialRawText?($0) }
+        return finalText
+    }
+
+    func resetPromptCache() {}
+}
diff --git a/CotabbyTests/StreamedGhostTextPolicyTests.swift b/CotabbyTests/StreamedGhostTextPolicyTests.swift
new file mode 100644
index 00000000..014330c7
--- /dev/null
+++ b/CotabbyTests/StreamedGhostTextPolicyTests.swift
@@ -0,0 +1,42 @@
+import XCTest
+@testable import Cotabby
+
+/// Pins the monotonic streamed-render rule: ghost text already on screen may only be replaced by
+/// a strict extension, so reordered or normalizer-rewritten partials are dropped.
+final class StreamedGhostTextPolicyTests: XCTestCase {
+    /// Shorthand for the policy decision under test.
+    private func renders(_ candidate: String, over rendered: String?) -> Bool {
+        StreamedGhostTextPolicy.isRenderableExtension(candidate: candidate, currentlyRendered: rendered)
+    }
+
+    func test_firstNonEmptyPartialRenders() {
+        XCTAssertTrue(renders(" wor", over: nil))
+        XCTAssertTrue(renders(" wor", over: ""))
+    }
+
+    func test_emptyCandidateNeverRenders() {
+        XCTAssertFalse(renders("", over: nil))
+        XCTAssertFalse(renders("", over: " wor"))
+    }
+
+    func test_strictExtensionRenders() {
+        // Longer text that starts with what is already shown.
+        XCTAssertTrue(renders(" world", over: " wor"))
+    }
+
+    func test_staleShorterPartialIsDropped() {
+        // An older, shorter cumulative arriving after a longer one was rendered.
+        XCTAssertFalse(renders(" wor", over: " world"))
+    }
+
+    func test_equalTextIsDroppedAsRedundant() {
+        // Identical text would repaint the same ghost text.
+        XCTAssertFalse(renders(" world", over: " world"))
+    }
+
+    func test_divergentRewriteIsDropped() {
+        // The normalizer may rewrite a fragment instead of extending it; such a partial is
+        // dropped so the render waits for the authoritative final result rather than flickering.
+        XCTAssertFalse(renders(" worse idea", over: " world"))
+    }
+}