diff --git a/Cotabby.xcodeproj/project.pbxproj b/Cotabby.xcodeproj/project.pbxproj
index a2988d19..cfd245bd 100644
--- a/Cotabby.xcodeproj/project.pbxproj
+++ b/Cotabby.xcodeproj/project.pbxproj
@@ -568,6 +568,7 @@
 		E51FA12B690428CA431328FC /* WritingPaneView.swift in Sources */ = {isa = PBXBuildFile; fileRef = D48B95B6665109B6C6A63B42 /* WritingPaneView.swift */; };
 		E54F5F03E16859D5A1E3437A /* MacroController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4638C74239D1DE2DC4D87975 /* MacroController.swift */; };
 		E5CB34ED76BAE87E8A858112 /* WebContentFieldDetectorTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 210F9AD332273FE2EB3A9A01 /* WebContentFieldDetectorTests.swift */; };
+		E64AE96DF2A80A368FDE522D /* LlamaSuggestionEnginePrewarmTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 26EF16C7439BEB156BD9FB03 /* LlamaSuggestionEnginePrewarmTests.swift */; };
 		E6EE3C13FA31F261CD734C69 /* DownloadOutcomeClassifier.swift in Sources */ = {isa = PBXBuildFile; fileRef = 3DE1975F3B5F4A70478DBF41 /* DownloadOutcomeClassifier.swift */; };
 		E853B9C7AF93FA595DC417B2 /* EmojiVariantResolver.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1A8414BEB7E34F57607E37FE /* EmojiVariantResolver.swift */; };
 		E912D4617AE1376061DF1F00 /* LanguageSupportTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4793D4EA5D36D7E5CC216C27 /* LanguageSupportTests.swift */; };
@@ -696,6 +697,7 @@
 		24F613F0E2F7046E6532A09C /* OnboardingTemplateFeatureList.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = OnboardingTemplateFeatureList.swift; sourceTree = "<group>"; };
 		262BE2F1E97389FE8D7A5FB9 /* Cotabby.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = Cotabby.app; sourceTree = BUILT_PRODUCTS_DIR; };
 		264CA64B2AB1611F82E5B760 /* WelcomeView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = WelcomeView.swift; sourceTree = "<group>"; };
+		26EF16C7439BEB156BD9FB03 /* LlamaSuggestionEnginePrewarmTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaSuggestionEnginePrewarmTests.swift; sourceTree = "<group>"; };
 		273B4DC844F79B4BE2C8910F /* FocusPollBackoffTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FocusPollBackoffTests.swift; sourceTree = "<group>"; };
 		27A5D63F390E9B7A7FE343FE /* SystemResourceSampler.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SystemResourceSampler.swift; sourceTree = "<group>"; };
 		28B7EB84781C0ED57844585E /* OnboardingTemplateTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = OnboardingTemplateTests.swift; sourceTree = "<group>"; };
@@ -1363,6 +1365,7 @@
 				4793D4EA5D36D7E5CC216C27 /* LanguageSupportTests.swift */,
 				0CA88BB29BC8727878C99E95 /* LlamaPromptCacheHintTrackerTests.swift */,
 				AABCC3FD99B1824A81E665F3 /* LlamaSuggestionEngineCancellationTests.swift */,
+				26EF16C7439BEB156BD9FB03 /* LlamaSuggestionEnginePrewarmTests.swift */,
 				9030FAAB468119A0236284A6 /* LLMIOFileHandlerTests.swift */,
 				D8083D44ABCDCFA68A4CD497 /* MacroEngineTests.swift */,
 				22BE47D1DBF6C23151458836 /* MacroTriggerStateMachineTests.swift */,
@@ -2334,6 +2337,7 @@
 				E912D4617AE1376061DF1F00 /* LanguageSupportTests.swift in Sources */,
 				E38801433B99E65BD7E45A0E /* LlamaPromptCacheHintTrackerTests.swift in Sources */,
 				BE3CB85508055D159C35020A /* LlamaSuggestionEngineCancellationTests.swift in Sources */,
+				E64AE96DF2A80A368FDE522D /* LlamaSuggestionEnginePrewarmTests.swift in Sources */,
 				8429B116328C392DCA018D95 /* MacroEngineTests.swift in Sources */,
 				3F8CBCBCC45E377DF9ADB216 /* MacroTriggerStateMachineTests.swift in Sources */,
 				87806DE08881D11F2608A13D /* MarkerSelectionSynthesizerTests.swift in Sources */,
diff --git a/Cotabby/Models/SuggestionSubsystemContracts.swift b/Cotabby/Models/SuggestionSubsystemContracts.swift
index f50b4d72..c47b2329 100644
--- a/Cotabby/Models/SuggestionSubsystemContracts.swift
+++ b/Cotabby/Models/SuggestionSubsystemContracts.swift
@@ -93,10 +93,10 @@ protocol SuggestionGenerating: AnyObject {
     /// continuous. Stateless engines may implement this as a no-op.
     func resetCachedGenerationContext() async
     /// Best-effort warmup hook the coordinator calls after focus arrives on an editable surface.
-    /// Engines that benefit from prefix caching or weight loading (Apple Foundation Models) use it
-    /// to prime the next request; engines that do not (llama already keeps its KV cache hot) can
-    /// rely on the default no-op extension. Failures are intentionally swallowed by implementations
-    /// because prewarming is opportunistic.
+    /// Apple Foundation Models primes its session here, and the llama engine prefills the new
+    /// field's prompt KV (a focus change destroys the previous field's native sequence, so without
+    /// this the first suggestion in every field pays the full cold prompt decode). Failures are
+    /// intentionally swallowed by implementations because prewarming is opportunistic.
     func prewarm(for request: SuggestionRequest) async
 }
 
@@ -113,6 +113,16 @@ extension SuggestionGenerating {
 protocol LlamaRuntimeGenerating: AnyObject {
     func generate(prompt: String, cachedPrefixBytes: Int?, options: LlamaGenerationOptions) async throws -> String
     func resetPromptCache()
+    /// Decodes `prompt` into the native prompt cache without sampling any tokens, so the next
+    /// `generate` whose prompt extends this one only decodes the typed delta. Best-effort warmup:
+    /// callers treat failures as "no cache primed", never as a user-facing error.
+    func prefill(prompt: String, cachedPrefixBytes: Int?, options: LlamaGenerationOptions) async throws
+}
+
+extension LlamaRuntimeGenerating {
+    /// Default no-op so test fakes that only exercise the generate/cancel contract keep compiling.
+    /// The production manager replaces it with a real prefill; signatures must match exactly to do so.
+    func prefill(prompt: String, cachedPrefixBytes: Int?, options: LlamaGenerationOptions) async throws {}
+}
 
 @MainActor
diff --git a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
index 5384b9cf..a18be48b 100644
--- a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
+++ b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift
@@ -34,6 +34,24 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
     private var autocompletePromptTokens: [Int32] = []
     private var autocompleteSamplingFingerprint: SamplingFingerprint?
 
+    /// The sequence the in-flight autocomplete operation is decoding into, published for
+    /// `abortInFlightGeneration` to target from the canceller's thread. Guarded by its own lock
+    /// because the abort fires while `autocompleteLock` is held by the very work being aborted.
+    private let abortTargetLock = NSLock()
+    private var abortTargetSequenceID: Int32 = -1
+
+    /// One loud line per model load when the engine rejects partial KV trims (llama.cpp cannot
+    /// drop mid-sequence ranges on hybrid/recurrent or SWA caches). Without this signal the
+    /// prefix-reuse fast path degrades silently to a full prompt re-prefill on every request.
+    private var loggedTrimRejectionForCurrentModel = false
+
+    /// True once the loaded model has rejected a partial KV trim (hybrid/recurrent and SWA caches
+    /// reject them unconditionally). On such models prefix reuse can never succeed, so prewarm
+    /// prefills are pure double work: the warmed sequence cannot be trimmed back to prompt-only
+    /// state, and the following generate's reuse trim is rejected too, forcing a second full
+    /// decode of the same prompt. Guarded by `autocompleteLock`; reset on model load.
+    private var modelRejectsPartialTrims = false
+
     /// Coordinates model lifecycle with in-flight operations. `generate()` and `summarize()`
     /// increment the active count on entry and decrement on exit. `shutdown()` sets the
     /// shutting-down flag and blocks until all active operations finish before unloading.
@@ -95,6 +113,8 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
             backendName: "CotabbyInferenceEngine (llama.cpp in-process)"
         )
         self.preparedRuntime = result
+        loggedTrimRejectionForCurrentModel = false
+        modelRejectsPartialTrims = false
         CotabbyLogger.runtime.info(
             "Model loaded",
             metadata: [
@@ -118,9 +138,73 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
         cachedPrefixBytes: Int? = nil,
         options: LlamaGenerationOptions
     ) throws -> String {
-        guard let preparedRuntime else {
-            throw LlamaRuntimeError.unavailable("The llama model is not loaded.")
+        let preparation = try preparedPrompt(prompt: prompt, cachedPrefixBytes: cachedPrefixBytes, options: options, kind: "generate")
+
+        lifecycleCondition.lock()
+        guard !isShuttingDown else {
+            lifecycleCondition.unlock()
+            throw LlamaRuntimeError.unavailable("The runtime is shutting down.")
         }
+        activeOperationCount += 1
+        lifecycleCondition.unlock()
+
+        defer {
+            lifecycleCondition.lock()
+            activeOperationCount -= 1
+            lifecycleCondition.broadcast()
+            lifecycleCondition.unlock()
+        }
+
+        autocompleteLock.lock()
+        defer { autocompleteLock.unlock() }
+        // Registered before `obtainAutocompleteSequence` because that call publishes the abort
+        // target ahead of its prompt decode; every exit (including a cancelled prefill throwing)
+        // must clear it so a late abort can never flag a recycled sequence slot.
+        defer { clearAbortTarget() }
+
+        let sequenceID = try obtainAutocompleteSequence(
+            promptTokens: preparation.promptTokens,
+            promptBytes: preparation.promptBytes,
+            fingerprint: preparation.fingerprint,
+            cachedPrefixBytes: preparation.cachedPrefixBytes,
+            options: options
+        )
+
+        // Token selection is delegated to the engine's built-in sampler. The decode does not
+        // throw, so the bookkeeping below runs on every exit without needing a `defer`.
+        let decode = runEngineSampledDecode(sequenceID: sequenceID, options: options)
+        autocompletePromptBytes = preparation.promptBytes
+        autocompletePromptTokens = preparation.promptTokens
+        autocompleteSamplingFingerprint = preparation.fingerprint
+
+        if decode.engineCancelled {
+            // Set-once abort flag: an aborted sequence refuses every future decode, so drop it
+            // and let the next request build fresh. It is deliberately not trimmed: a failed trim
+            // on a discarded sequence must not set `modelRejectsPartialTrims` and disable `prefill`.
+            // NOTE(review): if the decode loop can also stop on task cancellation after the abort
+            // flag was raised, that sequence would be kept here while flagged; confirm.
+            engine.destroySequence(sequenceID)
+            autocompleteSequenceID = -1
+        } else if !engine.trimKV(sequenceID, Int32(preparation.promptTokens.count)) {
+            // The trim drops sampled tokens so KV retains only the prompt for the next request.
+            // Rejected, it leaves them in KV while the tracker records prompt-only state; that
+            // mismatch self-heals (the next reuse trim is rejected too and rebuilds fresh), but
+            // it also proves this model can never reuse, so remember that for `prefill`.
+            modelRejectsPartialTrims = true
+        }
+        return decode.text
+    }
+
+    /// Decodes `prompt` into the autocomplete KV cache without sampling, so the next `generate`
+    /// whose prompt extends this one only pays for the typed delta. This is the llama half of
+    /// prewarm-on-focus: a focus change destroys the previous field's sequence, and without a
+    /// prefill the first suggestion in every field pays the full cold prompt decode.
+    func prefill(
+        prompt: String,
+        cachedPrefixBytes: Int? = nil,
+        options: LlamaGenerationOptions
+    ) throws {
+        let preparation = try preparedPrompt(prompt: prompt, cachedPrefixBytes: cachedPrefixBytes, options: options, kind: "prefill")
 
         lifecycleCondition.lock()
         guard !isShuttingDown else {
@@ -137,6 +221,88 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
             lifecycleCondition.unlock()
         }
 
+        autocompleteLock.lock()
+        defer { autocompleteLock.unlock() }
+        // Same exit guarantee as `generate`: see the comment there.
+        defer { clearAbortTarget() }
+
+        // On models that reject partial trims (the hybrid/SWA catalog families), a warmed
+        // sequence can never be reused, so prefilling would only double the cold decode the
+        // first real request pays anyway. The flag is learned from the first rejected trim
+        // after model load; until then one speculative prefill may still run and be discarded.
+        guard !modelRejectsPartialTrims else {
+            CotabbyLogger.runtime.debug("Prefill skipped: the loaded model rejects partial KV trims")
+            return
+        }
+
+        // A superseding generation cancels the warmup task before contending on the lock above.
+        // The engine-level abort only reaches a decode that already published its target, so close
+        // the window where the cancel landed while this prefill was still tokenizing or queued.
+        guard !Task.isCancelled else {
+            throw CancellationError()
+        }
+
+        let sequenceID = try obtainAutocompleteSequence(
+            promptTokens: preparation.promptTokens,
+            promptBytes: preparation.promptBytes,
+            fingerprint: preparation.fingerprint,
+            cachedPrefixBytes: preparation.cachedPrefixBytes,
+            options: options
+        )
+
+        // `decodePrompt` samples one seed token beyond the prompt, so the trim is what restores
+        // prompt-only KV. If it is rejected, the warmed sequence still carries the seed and can
+        // never be trimmed by the following generate either: drop it instead of recording tracker
+        // facts the KV does not match, and remember that warming this model is pointless.
+        if engine.trimKV(sequenceID, Int32(preparation.promptTokens.count)) {
+            autocompletePromptBytes = preparation.promptBytes
+            autocompletePromptTokens = preparation.promptTokens
+            autocompleteSamplingFingerprint = preparation.fingerprint
+        } else {
+            modelRejectsPartialTrims = true
+            engine.destroySequence(sequenceID)
+            autocompleteSequenceID = -1
+            logTrimRejectionIfNeeded(reusableTokenCount: preparation.promptTokens.count)
+        }
+    }
+
+    /// Cancels the native work of the in-flight autocomplete operation, if one has published an
+    /// abort target; a no-op otherwise. Needed because task cancellation is only polled between
+    /// sampled tokens, so a superseded prompt decode would otherwise run to completion. Callable
+    /// from any thread: the engine flag is atomic and its sequence lookup is mutex-guarded.
+    func abortInFlightGeneration() {
+        // Snapshot the target under its lock; the engine call itself runs after the unlock.
+        abortTargetLock.lock()
+        let publishedSequenceID = abortTargetSequenceID
+        abortTargetLock.unlock()
+        if publishedSequenceID >= 0 {
+            engine.cancelSequence(publishedSequenceID)
+        }
+    }
+
+    /// Publishes the sequence an in-flight decode targets so `abortInFlightGeneration` can see it.
+    private func setAbortTarget(_ sequenceID: Int32) {
+        abortTargetLock.lock()
+        defer { abortTargetLock.unlock() }
+        abortTargetSequenceID = sequenceID
+    }
+
+    /// Withdraws the published target; -1 is the "nothing in flight" sentinel.
+    private func clearAbortTarget() {
+        setAbortTarget(-1)
+    }
+
+    /// Shared tokenize/truncate/log front half of `generate` and `prefill`.
+    private func preparedPrompt(
+        prompt: String,
+        cachedPrefixBytes: Int?,
+        options: LlamaGenerationOptions,
+        kind: String
+    ) throws -> PreparedPrompt {
+        guard let preparedRuntime else {
+            throw LlamaRuntimeError.unavailable("The llama model is not loaded.")
+        }
+
         let promptBytes = Array(prompt.utf8)
         let allPromptTokens = tokenize(prompt)
         guard !allPromptTokens.isEmpty else {
@@ -149,7 +315,7 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
         CotabbyLogger.runtime.debug(
             "Decode start",
             metadata: [
-                "kind": .string("generate"),
+                "kind": .string(kind),
                 "prompt_tokens": .stringConvertible(allPromptTokens.count),
                 "max_tokens": .stringConvertible(options.maxPredictionTokens),
                 "cached_prefix_bytes": .string(cachedPrefixBytes.map(String.init) ?? "none")
@@ -157,51 +323,44 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
         )
 
         let maxPromptTokens = max(1, preparedRuntime.contextWindowTokens - options.maxPredictionTokens)
-        var promptTokens: [Int32]
-        var adjustedCachedPrefixBytes: Int?
         if allPromptTokens.count > maxPromptTokens {
-            promptTokens = Array(allPromptTokens.suffix(maxPromptTokens))
-            adjustedCachedPrefixBytes = nil
-        } else {
-            promptTokens = allPromptTokens
-            adjustedCachedPrefixBytes = cachedPrefixBytes
+            return PreparedPrompt(
+                promptBytes: promptBytes,
+                promptTokens: Array(allPromptTokens.suffix(maxPromptTokens)),
+                cachedPrefixBytes: nil,
+                fingerprint: SamplingFingerprint(options: options)
+            )
         }
-
-        let fingerprint = SamplingFingerprint(options: options)
-
-        autocompleteLock.lock()
-        defer { autocompleteLock.unlock() }
-
-        let sequenceID = try obtainAutocompleteSequence(
-            promptTokens: promptTokens,
+        return PreparedPrompt(
             promptBytes: promptBytes,
-            fingerprint: fingerprint,
-            cachedPrefixBytes: adjustedCachedPrefixBytes,
-            options: options
+            promptTokens: allPromptTokens,
+            cachedPrefixBytes: cachedPrefixBytes,
+            fingerprint: SamplingFingerprint(options: options)
         )
+    }
 
-        defer {
-            // Trim sampled tokens so KV retains only the prompt for the next request.
-            _ = engine.trimKV(sequenceID, Int32(promptTokens.count))
-            autocompletePromptBytes = promptBytes
-            autocompletePromptTokens = promptTokens
-            autocompleteSamplingFingerprint = fingerprint
-        }
-
-        // The KV-trim defer above runs after the decoder returns, restoring prompt-only KV state for
-        // the next request. Token selection is delegated to the engine's built-in sampler.
-        return runEngineSampledDecode(sequenceID: sequenceID, options: options)
+    private struct PreparedPrompt { // Output of `preparedPrompt`, consumed by `generate` and `prefill`.
+        let promptBytes: [UInt8] // UTF-8 bytes of the whole prompt string, before any truncation
+        let promptTokens: [Int32] // Prompt tokens; only the trailing ones when over the context budget
+        let cachedPrefixBytes: Int? // Caller's prefix hint, forced to nil when the tokens were truncated
+        let fingerprint: SamplingFingerprint // Built from the generation options of this request
+    }
 
     // MARK: - Decoders
 
     /// The shipping decoder: delegates token selection to the engine's built-in sampler
     /// (`sampleNext`), which applies temperature / top-k / top-p / min-p and commits each token.
-    private func runEngineSampledDecode(sequenceID: Int32, options: LlamaGenerationOptions) -> String {
+    /// `engineCancelled` reports that the native abort flag fired; the sequence must then be
+    /// discarded because the flag is set-once for a sequence's lifetime.
+    private func runEngineSampledDecode(
+        sequenceID: Int32,
+        options: LlamaGenerationOptions
+    ) -> (text: String, engineCancelled: Bool) {
         var generatedText = ""
         var tokensGenerated = 0
         var sumLogprob = 0.0
         var stopReason = "budget_exhausted"
+        var engineCancelled = false
 
         for _ in 0 ..< options.maxPredictionTokens {
             // Cooperative cancellation: when the wrapping Task is cancelled (caller hit a new
@@ -217,6 +376,7 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
 
             if result.was_cancelled {
                 stopReason = "engine_cancelled"
+                engineCancelled = true
                 break
             }
             if result.is_eos {
@@ -255,9 +415,9 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
         )
 
         if Self.shouldSuppress(sumLogprob: sumLogprob, tokensGenerated: tokensGenerated, options: options) {
-            return ""
+            return ("", engineCancelled)
         }
-        return generatedText
+        return (generatedText, engineCancelled)
     }
 
     /// Low-confidence gate for the sampled decoder: drop completions the model itself was unsure
@@ -373,37 +533,59 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
                     newPromptTokenCount: promptTokens.count
                 )
 
-                if reusableTokenCount > 0,
-                   engine.trimKV(autocompleteSequenceID, Int32(reusableTokenCount)) {
-
-                    let remaining = Array(promptTokens[reusableTokenCount...])
-                    if !remaining.isEmpty {
-                        // Seed for the reuse path is sampled at the end of this decodePrompt; apply
-                        // the word-continuation constraint to it just like the fresh path does.
-                        engine.setForceWordContinuation(autocompleteSequenceID, options.forceWordContinuation)
-                        // Per-token log-probabilities cost two O(vocab) passes each in the engine;
-                        // only compute them when the confidence gate would actually read them.
-                        // Re-assert per request: the floor is not part of the sampling fingerprint,
-                        // so a reused sequence must not carry a stale flag.
-                        engine.setComputeLogprob(
-                            autocompleteSequenceID,
-                            options.confidenceFloor > -.infinity
-                        )
-                        var mutableRemaining = remaining
-                        let status = engine.decodePrompt(
-                            autocompleteSequenceID,
-                            &mutableRemaining,
-                            Int32(mutableRemaining.count),
-                            Int32(reusableTokenCount)
-                        )
-                        if status != .ok {
-                            // Reuse failed mid-decode; fall through to fresh build.
-                            engine.destroySequence(autocompleteSequenceID)
-                            autocompleteSequenceID = -1
-                            return try buildFreshSequence(promptTokens: promptTokens, options: options)
+                if reusableTokenCount > 0 {
+                    if engine.trimKV(autocompleteSequenceID, Int32(reusableTokenCount)) {
+                        let remaining = Array(promptTokens[reusableTokenCount...])
+                        if !remaining.isEmpty {
+                            // Seed for the reuse path is sampled at the end of this decodePrompt;
+                            // apply the word-continuation constraint to it like the fresh path does.
+                            engine.setForceWordContinuation(
+                                autocompleteSequenceID,
+                                options.forceWordContinuation
+                            )
+                            // Per-token log-probabilities cost two O(vocab) passes each in the
+                            // engine; only compute them when the confidence gate would actually
+                            // read them. Re-assert per request: the floor is not part of the
+                            // sampling fingerprint, so a reused sequence must not carry a stale flag.
+                            engine.setComputeLogprob(
+                                autocompleteSequenceID,
+                                options.confidenceFloor > -.infinity
+                            )
+                            setAbortTarget(autocompleteSequenceID)
+                            var mutableRemaining = remaining
+                            let status = engine.decodePrompt(
+                                autocompleteSequenceID,
+                                &mutableRemaining,
+                                Int32(mutableRemaining.count),
+                                Int32(reusableTokenCount)
+                            )
+                            if status == .cancelled {
+                                // The caller's request was superseded mid-prefill. Do NOT rebuild
+                                // fresh here: that would decode the full stale prompt right after
+                                // its cancellation. The aborted sequence is unusable (set-once
+                                // flag, partially decoded KV), so drop it and surface the cancel.
+                                engine.destroySequence(autocompleteSequenceID)
+                                autocompleteSequenceID = -1
+                                throw CancellationError()
+                            }
+                            if status != .ok {
+                                // Reuse failed mid-decode; fall through to fresh build.
+                                engine.destroySequence(autocompleteSequenceID)
+                                autocompleteSequenceID = -1
+                                return try buildFreshSequence(promptTokens: promptTokens, options: options)
+                            }
                         }
+                        CotabbyLogger.runtime.debug(
+                            "KV prefix reused",
+                            metadata: [
+                                "reused_tokens": .stringConvertible(reusableTokenCount),
+                                "decoded_delta_tokens": .stringConvertible(promptTokens.count - reusableTokenCount)
+                            ]
+                        )
+                        return autocompleteSequenceID
                     }
-                    return autocompleteSequenceID
+
+                    logTrimRejectionIfNeeded(reusableTokenCount: reusableTokenCount)
                 }
             }
         }
@@ -433,10 +615,16 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
         // would be summed and then discarded.
         engine.setComputeLogprob(seqID, options.confidenceFloor > -.infinity)
 
+        setAbortTarget(seqID)
         var tokens = promptTokens
         let status = engine.decodePrompt(seqID, &tokens, Int32(tokens.count), 0)
         guard status == .ok else {
             engine.destroySequence(seqID)
+            if status == .cancelled {
+                // Superseded mid-prefill; the abort exists precisely so the next request does not
+                // wait out the rest of this decode. Quiet cancellation, no runtime error.
+                throw CancellationError()
+            }
             throw LlamaRuntimeError.generationFailed("Prompt decoding failed.")
         }
 
@@ -444,6 +632,31 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable {
         return seqID
     }
 
+    /// Records that the engine rejected a partial KV trim and logs it: one info line per model
+    /// load, then a debug line per event. Besides logging, this sets `modelRejectsPartialTrims`,
+    /// which `prefill` reads to stop warming a model that can never reuse its prompt KV.
+    /// llama.cpp rejects partial sequence removal on hybrid (recurrent) and SWA caches, and the
+    /// fallback is decoding the entire prompt on every request instead of a few delta tokens.
+    private func logTrimRejectionIfNeeded(reusableTokenCount: Int) {
+        modelRejectsPartialTrims = true
+        guard !loggedTrimRejectionForCurrentModel else {
+            CotabbyLogger.runtime.debug(
+                "KV prefix trim rejected; rebuilding sequence",
+                metadata: ["rejected_reusable_tokens": .stringConvertible(reusableTokenCount)]
+            )
+            return
+        }
+
+        loggedTrimRejectionForCurrentModel = true
+        CotabbyLogger.runtime.info(
+            "KV prefix reuse unavailable: the engine rejected a partial trim, so every request re-decodes its full prompt",
+            metadata: [
+                "model": .string(preparedRuntime?.resolvedRuntime.modelDisplayName ?? "unknown"),
+                "rejected_reusable_tokens": .stringConvertible(reusableTokenCount)
+            ]
+        )
+    }
+
     // MARK: - Private: helpers
 
     private func tokenize(_ text: String) -> [Int32] {
diff --git a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift
index be4d9917..6cc22d7f 100644
--- a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift
+++ b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift
@@ -129,6 +129,11 @@ final class LlamaRuntimeManager: ObservableObject {
                 return partial
             } onCancel: {
                 task.cancel()
+                // Task cancellation is only polled between sampled tokens, so an in-flight prompt
+                // prefill would otherwise run to completion while holding the autocomplete lock,
+                // making the superseding request wait out the whole stale decode. The engine-level
+                // abort interrupts the decode at its next batch chunk.
+                core.abortInFlightGeneration()
             }
         } catch is CancellationError {
             CotabbyLogger.runtime.debug("Generation cancelled")
@@ -150,6 +155,36 @@ final class LlamaRuntimeManager: ObservableObject {
         core.resetPromptCache()
     }
 
+    /// Decodes `prompt` into the native prompt cache without sampling (the llama half of
+    /// prewarm-on-focus). Best-effort by contract, but this method itself rethrows cancellation
+    /// and failures without logging or touching `diagnostics.lastError`: swallowing them is the
+    /// caller's job, since a missed warmup only means the next generate pays the cold prefill.
+    func prefill(
+        prompt: String,
+        cachedPrefixBytes: Int? = nil,
+        options: LlamaGenerationOptions
+    ) async throws {
+        _ = try await preparedRuntime() // presumably ensures a model is loaded first; confirm
+
+        let core = self.core
+        let task = Task.detached {
+            try core.prefill(
+                prompt: prompt,
+                cachedPrefixBytes: cachedPrefixBytes,
+                options: options
+            )
+        }
+        try await withTaskCancellationHandler {
+            try await task.value
+            try Task.checkCancellation() // report a cancel that landed after the work finished
+        } onCancel: {
+            task.cancel()
+            // A prefill is superseded the moment a real generation arrives; abort its native
+            // decode so the generation does not queue behind a warmup for a stale prompt.
+            core.abortInFlightGeneration()
+        }
+    }
+
     /// Cancels any retained prepared runtime and releases backend resources.
     /// Shutdown runs on a detached thread so it does not block the main actor.
     func stop() {
diff --git a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift
index 45a2195c..c6b9f344 100644
--- a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift
+++ b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift
@@ -12,11 +12,48 @@ import Logging
 final class LlamaSuggestionEngine {
     private let runtimeManager: LlamaRuntimeGenerating
     private var promptCacheHintTracker = LlamaPromptCacheHintTracker()
+    /// The focus-time warmup in flight, if any. A real generation cancels it on entry so it never
+    /// queues behind a warmup for a prompt the user has already typed past.
+    private var inflightPrewarmTask: Task<Void, Never>?
 
     init(runtimeManager: LlamaRuntimeGenerating) {
         self.runtimeManager = runtimeManager
     }
 
+    /// Prefills the prompt KV for the field the user just focused, so the first real suggestion
+    /// there only decodes the typed delta instead of the whole cold prompt.
+    ///
+    /// The protocol default used to be a no-op here on the assumption that llama "keeps its KV
+    /// cache hot", but a focus change resets the cached generation context and destroys the native
+    /// sequence, so the first request in every field paid a full prefill. Best-effort by design:
+    /// failures are swallowed (a missed warmup costs nothing the cold path would not have paid)
+    /// and the tracker only records the prompt after the native decode actually succeeded.
+    func prewarm(for request: SuggestionRequest) async {
+        inflightPrewarmTask?.cancel()
+        let cachedPrefixBytes = promptCacheHintTracker.cachedPrefixBytes(for: request)
+        let options = Self.makeGenerationOptions(for: request)
+        let task = Task { [weak self, runtimeManager] in
+            do {
+                try await runtimeManager.prefill(
+                    prompt: request.prompt,
+                    cachedPrefixBytes: cachedPrefixBytes,
+                    options: options
+                )
+                guard !Task.isCancelled else {
+                    return
+                }
+                self?.promptCacheHintTracker.recordSuccessfulRequest(request)
+            } catch {
+                CotabbyLogger.suggestion.debug(
+                    "Llama prewarm skipped: \(error.localizedDescription)",
+                    metadata: ["request_id": .string(request.requestID), "engine": .string("llama")]
+                )
+            }
+        }
+        inflightPrewarmTask = task
+        await task.value
+    }
+
     /// Executes one generation request and packages the raw and normalized result for the coordinator.
     func generateSuggestion(for request: SuggestionRequest) async throws -> SuggestionResult {
         let baseMetadata: Logger.Metadata = [
@@ -24,6 +61,11 @@ final class LlamaSuggestionEngine {
             "engine": .string("llama")
         ]
         do {
+            // A still-running focus warmup must not make this request wait behind it on the
+            // runtime's autocomplete lock; cancelling it aborts its native decode at the next chunk.
+            inflightPrewarmTask?.cancel()
+            inflightPrewarmTask = nil
+
             let startTime = Date()
             let cachedPrefixBytes = promptCacheHintTracker.cachedPrefixBytes(for: request)
             let hintDesc = cachedPrefixBytes.map(String.init) ?? "none"
@@ -38,20 +80,7 @@ final class LlamaSuggestionEngine {
             let rawSuggestion = try await runtimeManager.generate(
                 prompt: request.prompt,
                 cachedPrefixBytes: cachedPrefixBytes,
-                options: LlamaGenerationOptions(
-                    maxPredictionTokens: request.maxPredictionTokens,
-                    temperature: request.temperature,
-                    topK: request.topK,
-                    topP: request.topP,
-                    minP: request.minP,
-                    repetitionPenalty: request.repetitionPenalty,
-                    seed: request.randomSeed,
-                    singleLine: !request.isMultiLineEnabled,
-                    forceWordContinuation: MidWordContinuationPolicy.shouldForceContinuation(
-                        precedingText: request.context.precedingText,
-                        trailingText: request.context.trailingText
-                    )
-                )
+                options: Self.makeGenerationOptions(for: request)
             )
             try Task.checkCancellation()
 
@@ -143,9 +172,31 @@ final class LlamaSuggestionEngine {
     /// stale reuse; awaiting the runtime reset keeps native KV invalidation ordered before the next
     /// generation request that crosses this engine boundary.
     func resetCachedGenerationContext() async {
+        // The editing context moved on, so a warmup for the previous field's prompt is stale.
+        inflightPrewarmTask?.cancel()
+        inflightPrewarmTask = nil
         promptCacheHintTracker.reset()
         runtimeManager.resetPromptCache()
     }
+
+    /// One shared mapping from a request to engine options so prewarm prefills decode under the
+    /// exact sampling fingerprint the following generation will validate its KV reuse against.
+    private static func makeGenerationOptions(for request: SuggestionRequest) -> LlamaGenerationOptions {
+        LlamaGenerationOptions(
+            maxPredictionTokens: request.maxPredictionTokens,
+            temperature: request.temperature,
+            topK: request.topK,
+            topP: request.topP,
+            minP: request.minP,
+            repetitionPenalty: request.repetitionPenalty,
+            seed: request.randomSeed,
+            singleLine: !request.isMultiLineEnabled,
+            forceWordContinuation: MidWordContinuationPolicy.shouldForceContinuation(
+                precedingText: request.context.precedingText,
+                trailingText: request.context.trailingText
+            )
+        )
+    }
 }
 
 extension LlamaSuggestionEngine: SuggestionGenerating {}
diff --git a/CotabbyTests/LlamaSuggestionEnginePrewarmTests.swift b/CotabbyTests/LlamaSuggestionEnginePrewarmTests.swift
new file mode 100644
index 00000000..f66c4f57
--- /dev/null
+++ b/CotabbyTests/LlamaSuggestionEnginePrewarmTests.swift
@@ -0,0 +1,130 @@
+import CoreGraphics
+import Foundation
+import XCTest
+@testable import Cotabby
+
+/// Tests for the llama half of prewarm-on-focus: a focus change used to leave the llama engine's
+/// `prewarm` as the protocol no-op while the focus reset destroyed the native sequence, so the
+/// first suggestion in every field paid the full cold prompt decode. These pin the new contract:
+/// prewarm prefills through the runtime and primes the reuse hint only when the prefill succeeded.
+@MainActor
+final class LlamaSuggestionEnginePrewarmTests: XCTestCase {
+
+    func test_prewarm_prefillsAndPrimesTheReuseHint() async throws {
+        let runtime = RecordingPrewarmRuntime()
+        let engine = LlamaSuggestionEngine(runtimeManager: runtime)
+        let request = makeRequest(prompt: "hello wor")
+
+        await engine.prewarm(for: request)
+
+        XCTAssertEqual(runtime.prefillPrompts, ["hello wor"])
+
+        _ = try await engine.generateSuggestion(for: request)
+        XCTAssertEqual(
+            runtime.generateCachedPrefixBytes,
+            ["hello wor".utf8.count],
+            "A successful prefill should let the next identical-context request advertise full reuse."
+        )
+    }
+
+    func test_failedPrewarm_leavesReuseHintCold() async throws {
+        let runtime = RecordingPrewarmRuntime()
+        runtime.prefillError = LlamaRuntimeError.unavailable("not loaded")
+        let engine = LlamaSuggestionEngine(runtimeManager: runtime)
+        let request = makeRequest(prompt: "hello wor")
+
+        await engine.prewarm(for: request)
+
+        _ = try await engine.generateSuggestion(for: request)
+        XCTAssertEqual(
+            runtime.generateCachedPrefixBytes,
+            [nil],
+            "A failed prefill must not advertise reuse the native cache cannot back."
+        )
+    }
+
+    func test_resetClearsThePrimedHint() async throws {
+        let runtime = RecordingPrewarmRuntime()
+        let engine = LlamaSuggestionEngine(runtimeManager: runtime)
+        let request = makeRequest(prompt: "hello wor")
+
+        await engine.prewarm(for: request)
+        await engine.resetCachedGenerationContext()
+
+        _ = try await engine.generateSuggestion(for: request)
+        XCTAssertEqual(runtime.generateCachedPrefixBytes, [nil])
+    }
+
+    // MARK: - Helpers
+
+    private func makeRequest(prompt: String) -> SuggestionRequest {
+        let snapshot = FocusedInputSnapshot(
+            applicationName: "TestApp",
+            bundleIdentifier: "com.example.TestApp",
+            processIdentifier: 123,
+            elementIdentifier: "field",
+            role: "AXTextField",
+            subrole: nil,
+            caretRect: .zero,
+            inputFrameRect: nil,
+            caretSource: "test",
+            caretQuality: .exact,
+            observedCharWidth: nil,
+            precedingText: prompt,
+            trailingText: "",
+            selection: NSRange(location: prompt.count, length: 0),
+            isSecure: false
+        )
+        let context = FocusedInputContext(snapshot: snapshot, generation: 1)
+
+        return SuggestionRequest(
+            context: context,
+            prefixText: prompt,
+            prompt: prompt,
+            generation: context.generation,
+            maxPredictionTokens: 8,
+            temperature: 0.1,
+            topK: 20,
+            topP: 0.7,
+            minP: 0.08,
+            repetitionPenalty: 1.05,
+            randomSeed: 42,
+            maxSuffixCharacters: 192,
+            completionLengthInstruction: "Return only the next few words.",
+            userName: nil,
+            customRules: [],
+            languageInstruction: nil,
+            clipboardContext: nil,
+            visualContextSummary: nil,
+            isMultiLineEnabled: false
+        )
+    }
+}
+
+/// Records prefill calls and the reuse hints later generations advertise, so the prewarm contract
+/// can be exercised without loading a real model.
+@MainActor
+private final class RecordingPrewarmRuntime: LlamaRuntimeGenerating {
+    var prefillError: Error?
+    var generateResult: Result<String, Error> = .success("ok")
+    private(set) var prefillPrompts: [String] = []
+    private(set) var generateCachedPrefixBytes: [Int?] = []
+
+    func generate(
+        prompt: String,
+        cachedPrefixBytes: Int?,
+        options: LlamaGenerationOptions
+    ) async throws -> String {
+        generateCachedPrefixBytes.append(cachedPrefixBytes)
+        return try generateResult.get()
+    }
+
+    func resetPromptCache() {}
+
+    func prefill(prompt: String, cachedPrefixBytes: Int?, options: LlamaGenerationOptions) async throws {
+        if let prefillError {
+            throw prefillError
+        }
+        prefillPrompts.append(prompt)
+    }
+}