From 711bca68709a4b2784d154dfa5880098479d0b1c Mon Sep 17 00:00:00 2001 From: Jacob Fu <141651335+FuJacob@users.noreply.github.com> Date: Thu, 11 Jun 2026 18:14:18 -0700 Subject: [PATCH 1/2] perf(runtime): llama prewarm-on-focus, mid-prefill abort, and KV-reuse visibility Three llama-path gaps. The engine's prewarm hook was the protocol no-op while a focus change destroys the native sequence, so the first suggestion in every field paid the full cold prompt decode; prewarm now prefills the new field's prompt KV (no sampling) and primes the reuse hint only after the native decode succeeded. Prompt prefill was uninterruptible: Swift cancellation is polled between sampled tokens, and the engine's per-sequence abort flag was never set by the app, so a superseding request waited out the entire stale decode while it held the autocomplete lock; the manager's cancellation handlers now fire engine.cancelSequence through an abort-target handshake, with cancelled prefills surfacing as quiet CancellationError and aborted sequences destroyed (the native flag is set-once). And trimKV rejection, which llama.cpp returns for partial removals on hybrid/SWA caches and which silently degrades every request to a full re-prefill, now logs once per model load at info plus per-event reuse stats at debug. 
--- Cotabby.xcodeproj/project.pbxproj | 4 + .../Models/SuggestionSubsystemContracts.swift | 18 +- .../Services/Runtime/LlamaRuntimeCore.swift | 310 ++++++++++++++---- .../Runtime/LlamaRuntimeManager.swift | 35 ++ .../Runtime/LlamaSuggestionEngine.swift | 79 ++++- .../LlamaSuggestionEnginePrewarmTests.swift | 130 ++++++++ 6 files changed, 494 insertions(+), 82 deletions(-) create mode 100644 CotabbyTests/LlamaSuggestionEnginePrewarmTests.swift diff --git a/Cotabby.xcodeproj/project.pbxproj b/Cotabby.xcodeproj/project.pbxproj index a2988d19..cfd245bd 100644 --- a/Cotabby.xcodeproj/project.pbxproj +++ b/Cotabby.xcodeproj/project.pbxproj @@ -568,6 +568,7 @@ E51FA12B690428CA431328FC /* WritingPaneView.swift in Sources */ = {isa = PBXBuildFile; fileRef = D48B95B6665109B6C6A63B42 /* WritingPaneView.swift */; }; E54F5F03E16859D5A1E3437A /* MacroController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4638C74239D1DE2DC4D87975 /* MacroController.swift */; }; E5CB34ED76BAE87E8A858112 /* WebContentFieldDetectorTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 210F9AD332273FE2EB3A9A01 /* WebContentFieldDetectorTests.swift */; }; + E64AE96DF2A80A368FDE522D /* LlamaSuggestionEnginePrewarmTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 26EF16C7439BEB156BD9FB03 /* LlamaSuggestionEnginePrewarmTests.swift */; }; E6EE3C13FA31F261CD734C69 /* DownloadOutcomeClassifier.swift in Sources */ = {isa = PBXBuildFile; fileRef = 3DE1975F3B5F4A70478DBF41 /* DownloadOutcomeClassifier.swift */; }; E853B9C7AF93FA595DC417B2 /* EmojiVariantResolver.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1A8414BEB7E34F57607E37FE /* EmojiVariantResolver.swift */; }; E912D4617AE1376061DF1F00 /* LanguageSupportTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4793D4EA5D36D7E5CC216C27 /* LanguageSupportTests.swift */; }; @@ -696,6 +697,7 @@ 24F613F0E2F7046E6532A09C /* OnboardingTemplateFeatureList.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; 
path = OnboardingTemplateFeatureList.swift; sourceTree = "<group>"; }; 262BE2F1E97389FE8D7A5FB9 /* Cotabby.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = Cotabby.app; sourceTree = BUILT_PRODUCTS_DIR; }; 264CA64B2AB1611F82E5B760 /* WelcomeView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = WelcomeView.swift; sourceTree = "<group>"; }; + 26EF16C7439BEB156BD9FB03 /* LlamaSuggestionEnginePrewarmTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaSuggestionEnginePrewarmTests.swift; sourceTree = "<group>"; }; 273B4DC844F79B4BE2C8910F /* FocusPollBackoffTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FocusPollBackoffTests.swift; sourceTree = "<group>"; }; 27A5D63F390E9B7A7FE343FE /* SystemResourceSampler.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SystemResourceSampler.swift; sourceTree = "<group>"; }; 28B7EB84781C0ED57844585E /* OnboardingTemplateTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = OnboardingTemplateTests.swift; sourceTree = "<group>"; }; @@ -1363,6 +1365,7 @@ 4793D4EA5D36D7E5CC216C27 /* LanguageSupportTests.swift */, 0CA88BB29BC8727878C99E95 /* LlamaPromptCacheHintTrackerTests.swift */, AABCC3FD99B1824A81E665F3 /* LlamaSuggestionEngineCancellationTests.swift */, + 26EF16C7439BEB156BD9FB03 /* LlamaSuggestionEnginePrewarmTests.swift */, 9030FAAB468119A0236284A6 /* LLMIOFileHandlerTests.swift */, D8083D44ABCDCFA68A4CD497 /* MacroEngineTests.swift */, 22BE47D1DBF6C23151458836 /* MacroTriggerStateMachineTests.swift */, @@ -2334,6 +2337,7 @@ E912D4617AE1376061DF1F00 /* LanguageSupportTests.swift in Sources */, E38801433B99E65BD7E45A0E /* LlamaPromptCacheHintTrackerTests.swift in Sources */, BE3CB85508055D159C35020A /* LlamaSuggestionEngineCancellationTests.swift in Sources */, + E64AE96DF2A80A368FDE522D /* LlamaSuggestionEnginePrewarmTests.swift in Sources */, 
8429B116328C392DCA018D95 /* MacroEngineTests.swift in Sources */, 3F8CBCBCC45E377DF9ADB216 /* MacroTriggerStateMachineTests.swift in Sources */, 87806DE08881D11F2608A13D /* MarkerSelectionSynthesizerTests.swift in Sources */, diff --git a/Cotabby/Models/SuggestionSubsystemContracts.swift b/Cotabby/Models/SuggestionSubsystemContracts.swift index f50b4d72..c47b2329 100644 --- a/Cotabby/Models/SuggestionSubsystemContracts.swift +++ b/Cotabby/Models/SuggestionSubsystemContracts.swift @@ -93,10 +93,10 @@ protocol SuggestionGenerating: AnyObject { /// continuous. Stateless engines may implement this as a no-op. func resetCachedGenerationContext() async /// Best-effort warmup hook the coordinator calls after focus arrives on an editable surface. - /// Engines that benefit from prefix caching or weight loading (Apple Foundation Models) use it - /// to prime the next request; engines that do not (llama already keeps its KV cache hot) can - /// rely on the default no-op extension. Failures are intentionally swallowed by implementations - /// because prewarming is opportunistic. + /// Apple Foundation Models primes its session here, and the llama engine prefills the new + /// field's prompt KV (a focus change destroys the previous field's native sequence, so without + /// this the first suggestion in every field pays the full cold prompt decode). Failures are + /// intentionally swallowed by implementations because prewarming is opportunistic. func prewarm(for request: SuggestionRequest) async } @@ -113,6 +113,16 @@ extension SuggestionGenerating { protocol LlamaRuntimeGenerating: AnyObject { func generate(prompt: String, cachedPrefixBytes: Int?, options: LlamaGenerationOptions) async throws -> String func resetPromptCache() + /// Decodes `prompt` into the native prompt cache without sampling any tokens, so the next + /// `generate` whose prompt extends this one only decodes the typed delta. 
Best-effort warmup: + /// callers treat failures as "no cache primed", never as a user-facing error. + func prefill(prompt: String, cachedPrefixBytes: Int?, options: LlamaGenerationOptions) async throws +} + +extension LlamaRuntimeGenerating { + /// Default no-op so test fakes that only exercise the generate/cancel contract keep compiling; + /// the production manager overrides this with a real prompt prefill. + func prefill(prompt: String, cachedPrefixBytes: Int?, options: LlamaGenerationOptions) async throws {} } @MainActor diff --git a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift index 5384b9cf..784457d7 100644 --- a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift +++ b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift @@ -34,6 +34,17 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { private var autocompletePromptTokens: [Int32] = [] private var autocompleteSamplingFingerprint: SamplingFingerprint? + /// The sequence the in-flight autocomplete operation is decoding into, published for + /// `abortInFlightGeneration` to target from the canceller's thread. Guarded by its own lock + /// because the abort fires while `autocompleteLock` is held by the very work being aborted. + private let abortTargetLock = NSLock() + private var abortTargetSequenceID: Int32 = -1 + + /// One loud line per model load when the engine rejects partial KV trims (llama.cpp cannot + /// drop mid-sequence ranges on hybrid/recurrent or SWA caches). Without this signal the + /// prefix-reuse fast path degrades silently to a full prompt re-prefill on every request. + private var loggedTrimRejectionForCurrentModel = false + /// Coordinates model lifecycle with in-flight operations. `generate()` and `summarize()` /// increment the active count on entry and decrement on exit. `shutdown()` sets the /// shutting-down flag and blocks until all active operations finish before unloading. 
@@ -95,6 +106,7 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { backendName: "CotabbyInferenceEngine (llama.cpp in-process)" ) self.preparedRuntime = result + loggedTrimRejectionForCurrentModel = false CotabbyLogger.runtime.info( "Model loaded", metadata: [ @@ -118,10 +130,69 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { cachedPrefixBytes: Int? = nil, options: LlamaGenerationOptions ) throws -> String { - guard let preparedRuntime else { - throw LlamaRuntimeError.unavailable("The llama model is not loaded.") + let preparation = try preparedPrompt(prompt: prompt, cachedPrefixBytes: cachedPrefixBytes, options: options, kind: "generate") + + lifecycleCondition.lock() + guard !isShuttingDown else { + lifecycleCondition.unlock() + throw LlamaRuntimeError.unavailable("The runtime is shutting down.") + } + activeOperationCount += 1 + lifecycleCondition.unlock() + + defer { + lifecycleCondition.lock() + activeOperationCount -= 1 + lifecycleCondition.broadcast() + lifecycleCondition.unlock() } + autocompleteLock.lock() + defer { autocompleteLock.unlock() } + // Registered before `obtainAutocompleteSequence` because that call publishes the abort + // target ahead of its prompt decode; every exit (including a cancelled prefill throwing) + // must clear it so a late abort can never flag a recycled sequence slot. + defer { clearAbortTarget() } + + let sequenceID = try obtainAutocompleteSequence( + promptTokens: preparation.promptTokens, + promptBytes: preparation.promptBytes, + fingerprint: preparation.fingerprint, + cachedPrefixBytes: preparation.cachedPrefixBytes, + options: options + ) + + defer { + // Trim sampled tokens so KV retains only the prompt for the next request. 
+ _ = engine.trimKV(sequenceID, Int32(preparation.promptTokens.count)) + autocompletePromptBytes = preparation.promptBytes + autocompletePromptTokens = preparation.promptTokens + autocompleteSamplingFingerprint = preparation.fingerprint + } + + // The KV-trim defer above runs after the decoder returns, restoring prompt-only KV state for + // the next request. Token selection is delegated to the engine's built-in sampler. + let decode = runEngineSampledDecode(sequenceID: sequenceID, options: options) + if decode.engineCancelled { + // The engine's per-sequence abort flag is set-once; an aborted sequence would refuse + // every future decode, so drop it and let the next request build fresh. + engine.destroySequence(sequenceID) + autocompleteSequenceID = -1 + } + return decode.text + } + + /// Decodes `prompt` into the autocomplete KV cache without sampling, so the next `generate` + /// whose prompt extends this one only pays for the typed delta. This is the llama half of + /// prewarm-on-focus: a focus change destroys the previous field's sequence, and without a + /// prefill the first suggestion in every field pays the full cold prompt decode. + func prefill( + prompt: String, + cachedPrefixBytes: Int? = nil, + options: LlamaGenerationOptions + ) throws { + let preparation = try preparedPrompt(prompt: prompt, cachedPrefixBytes: cachedPrefixBytes, options: options, kind: "prefill") + lifecycleCondition.lock() guard !isShuttingDown else { lifecycleCondition.unlock() @@ -137,6 +208,71 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { lifecycleCondition.unlock() } + autocompleteLock.lock() + defer { autocompleteLock.unlock() } + // Same exit guarantee as `generate`: see the comment there. + defer { clearAbortTarget() } + + // A superseding generation cancels the warmup task before contending on the lock above. 
+ // The engine-level abort only reaches a decode that already published its target, so close + // the window where the cancel landed while this prefill was still tokenizing or queued. + guard !Task.isCancelled else { + throw CancellationError() + } + + let sequenceID = try obtainAutocompleteSequence( + promptTokens: preparation.promptTokens, + promptBytes: preparation.promptBytes, + fingerprint: preparation.fingerprint, + cachedPrefixBytes: preparation.cachedPrefixBytes, + options: options + ) + + // No decode loop ran, so KV already holds prompt-only state; record the tracker facts the + // next request validates its reuse against. + _ = engine.trimKV(sequenceID, Int32(preparation.promptTokens.count)) + autocompletePromptBytes = preparation.promptBytes + autocompletePromptTokens = preparation.promptTokens + autocompleteSamplingFingerprint = preparation.fingerprint + } + + /// Aborts the in-flight autocomplete operation's native work mid-prefill. Task cancellation is + /// only polled between sampled tokens, so without this an uninterruptible prompt decode makes + /// the next request wait out the entire stale prefill. Safe from any thread: the engine flag + /// is atomic and its sequence lookup is mutex-guarded; a no-op when nothing is in flight. + func abortInFlightGeneration() { + abortTargetLock.lock() + let target = abortTargetSequenceID + abortTargetLock.unlock() + guard target >= 0 else { + return + } + engine.cancelSequence(target) + } + + private func setAbortTarget(_ sequenceID: Int32) { + abortTargetLock.lock() + abortTargetSequenceID = sequenceID + abortTargetLock.unlock() + } + + private func clearAbortTarget() { + abortTargetLock.lock() + abortTargetSequenceID = -1 + abortTargetLock.unlock() + } + + /// Shared tokenize/truncate/log front half of `generate` and `prefill`. 
+ private func preparedPrompt( + prompt: String, + cachedPrefixBytes: Int?, + options: LlamaGenerationOptions, + kind: String + ) throws -> PreparedPrompt { + guard let preparedRuntime else { + throw LlamaRuntimeError.unavailable("The llama model is not loaded.") + } + let promptBytes = Array(prompt.utf8) let allPromptTokens = tokenize(prompt) guard !allPromptTokens.isEmpty else { @@ -149,7 +285,7 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { CotabbyLogger.runtime.debug( "Decode start", metadata: [ - "kind": .string("generate"), + "kind": .string(kind), "prompt_tokens": .stringConvertible(allPromptTokens.count), "max_tokens": .stringConvertible(options.maxPredictionTokens), "cached_prefix_bytes": .string(cachedPrefixBytes.map(String.init) ?? "none") @@ -157,51 +293,44 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { ) let maxPromptTokens = max(1, preparedRuntime.contextWindowTokens - options.maxPredictionTokens) - var promptTokens: [Int32] - var adjustedCachedPrefixBytes: Int? 
if allPromptTokens.count > maxPromptTokens { - promptTokens = Array(allPromptTokens.suffix(maxPromptTokens)) - adjustedCachedPrefixBytes = nil - } else { - promptTokens = allPromptTokens - adjustedCachedPrefixBytes = cachedPrefixBytes + return PreparedPrompt( + promptBytes: promptBytes, + promptTokens: Array(allPromptTokens.suffix(maxPromptTokens)), + cachedPrefixBytes: nil, + fingerprint: SamplingFingerprint(options: options) + ) } - - let fingerprint = SamplingFingerprint(options: options) - - autocompleteLock.lock() - defer { autocompleteLock.unlock() } - - let sequenceID = try obtainAutocompleteSequence( - promptTokens: promptTokens, + return PreparedPrompt( promptBytes: promptBytes, - fingerprint: fingerprint, - cachedPrefixBytes: adjustedCachedPrefixBytes, - options: options + promptTokens: allPromptTokens, + cachedPrefixBytes: cachedPrefixBytes, + fingerprint: SamplingFingerprint(options: options) ) + } - defer { - // Trim sampled tokens so KV retains only the prompt for the next request. - _ = engine.trimKV(sequenceID, Int32(promptTokens.count)) - autocompletePromptBytes = promptBytes - autocompletePromptTokens = promptTokens - autocompleteSamplingFingerprint = fingerprint - } - - // The KV-trim defer above runs after the decoder returns, restoring prompt-only KV state for - // the next request. Token selection is delegated to the engine's built-in sampler. - return runEngineSampledDecode(sequenceID: sequenceID, options: options) + private struct PreparedPrompt { + let promptBytes: [UInt8] + let promptTokens: [Int32] + let cachedPrefixBytes: Int? + let fingerprint: SamplingFingerprint } // MARK: - Decoders /// The shipping decoder: delegates token selection to the engine's built-in sampler /// (`sampleNext`), which applies temperature / top-k / top-p / min-p and commits each token. 
- private func runEngineSampledDecode(sequenceID: Int32, options: LlamaGenerationOptions) -> String { + /// `engineCancelled` reports that the native abort flag fired; the sequence must then be + /// discarded because the flag is set-once for a sequence's lifetime. + private func runEngineSampledDecode( + sequenceID: Int32, + options: LlamaGenerationOptions + ) -> (text: String, engineCancelled: Bool) { var generatedText = "" var tokensGenerated = 0 var sumLogprob = 0.0 var stopReason = "budget_exhausted" + var engineCancelled = false for _ in 0 ..< options.maxPredictionTokens { // Cooperative cancellation: when the wrapping Task is cancelled (caller hit a new @@ -217,6 +346,7 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { if result.was_cancelled { stopReason = "engine_cancelled" + engineCancelled = true break } if result.is_eos { @@ -255,9 +385,9 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { ) if Self.shouldSuppress(sumLogprob: sumLogprob, tokensGenerated: tokensGenerated, options: options) { - return "" + return ("", engineCancelled) } - return generatedText + return (generatedText, engineCancelled) } /// Low-confidence gate for the sampled decoder: drop completions the model itself was unsure @@ -373,37 +503,59 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { newPromptTokenCount: promptTokens.count ) - if reusableTokenCount > 0, - engine.trimKV(autocompleteSequenceID, Int32(reusableTokenCount)) { - - let remaining = Array(promptTokens[reusableTokenCount...]) - if !remaining.isEmpty { - // Seed for the reuse path is sampled at the end of this decodePrompt; apply - // the word-continuation constraint to it just like the fresh path does. - engine.setForceWordContinuation(autocompleteSequenceID, options.forceWordContinuation) - // Per-token log-probabilities cost two O(vocab) passes each in the engine; - // only compute them when the confidence gate would actually read them. 
- // Re-assert per request: the floor is not part of the sampling fingerprint, - // so a reused sequence must not carry a stale flag. - engine.setComputeLogprob( - autocompleteSequenceID, - options.confidenceFloor > -.infinity - ) - var mutableRemaining = remaining - let status = engine.decodePrompt( - autocompleteSequenceID, - &mutableRemaining, - Int32(mutableRemaining.count), - Int32(reusableTokenCount) - ) - if status != .ok { - // Reuse failed mid-decode; fall through to fresh build. - engine.destroySequence(autocompleteSequenceID) - autocompleteSequenceID = -1 - return try buildFreshSequence(promptTokens: promptTokens, options: options) + if reusableTokenCount > 0 { + if engine.trimKV(autocompleteSequenceID, Int32(reusableTokenCount)) { + let remaining = Array(promptTokens[reusableTokenCount...]) + if !remaining.isEmpty { + // Seed for the reuse path is sampled at the end of this decodePrompt; + // apply the word-continuation constraint to it like the fresh path does. + engine.setForceWordContinuation( + autocompleteSequenceID, + options.forceWordContinuation + ) + // Per-token log-probabilities cost two O(vocab) passes each in the + // engine; only compute them when the confidence gate would actually + // read them. Re-assert per request: the floor is not part of the + // sampling fingerprint, so a reused sequence must not carry a stale flag. + engine.setComputeLogprob( + autocompleteSequenceID, + options.confidenceFloor > -.infinity + ) + setAbortTarget(autocompleteSequenceID) + var mutableRemaining = remaining + let status = engine.decodePrompt( + autocompleteSequenceID, + &mutableRemaining, + Int32(mutableRemaining.count), + Int32(reusableTokenCount) + ) + if status == .cancelled { + // The caller's request was superseded mid-prefill. Do NOT rebuild + // fresh here: that would decode the full stale prompt right after + // its cancellation. The aborted sequence is unusable (set-once + // flag, partially decoded KV), so drop it and surface the cancel. 
+ engine.destroySequence(autocompleteSequenceID) + autocompleteSequenceID = -1 + throw CancellationError() + } + if status != .ok { + // Reuse failed mid-decode; fall through to fresh build. + engine.destroySequence(autocompleteSequenceID) + autocompleteSequenceID = -1 + return try buildFreshSequence(promptTokens: promptTokens, options: options) + } } + CotabbyLogger.runtime.debug( + "KV prefix reused", + metadata: [ + "reused_tokens": .stringConvertible(reusableTokenCount), + "decoded_delta_tokens": .stringConvertible(promptTokens.count - reusableTokenCount) + ] + ) + return autocompleteSequenceID } - return autocompleteSequenceID + + logTrimRejectionIfNeeded(reusableTokenCount: reusableTokenCount) } } } @@ -433,10 +585,16 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { // would be summed and then discarded. engine.setComputeLogprob(seqID, options.confidenceFloor > -.infinity) + setAbortTarget(seqID) var tokens = promptTokens let status = engine.decodePrompt(seqID, &tokens, Int32(tokens.count), 0) guard status == .ok else { engine.destroySequence(seqID) + if status == .cancelled { + // Superseded mid-prefill; the abort exists precisely so the next request does not + // wait out the rest of this decode. Quiet cancellation, no runtime error. + throw CancellationError() + } throw LlamaRuntimeError.generationFailed("Prompt decoding failed.") } @@ -444,6 +602,30 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { return seqID } + /// Surfaces "this model cannot reuse its prompt KV" once per model load at info level, then + /// per-event at debug. llama.cpp rejects partial sequence removal on hybrid (recurrent) and + /// SWA caches — which includes the current catalog families — and the silent fallback is a + /// full prompt re-prefill on every keystroke pause: the difference between decoding a few + /// delta tokens and the entire prompt. 
+ private func logTrimRejectionIfNeeded(reusableTokenCount: Int) { + if !loggedTrimRejectionForCurrentModel { + loggedTrimRejectionForCurrentModel = true + CotabbyLogger.runtime.info( + "KV prefix reuse unavailable: the engine rejected a partial trim, so every request re-decodes its full prompt", + metadata: [ + "model": .string(preparedRuntime?.resolvedRuntime.modelDisplayName ?? "unknown"), + "rejected_reusable_tokens": .stringConvertible(reusableTokenCount) + ] + ) + return + } + + CotabbyLogger.runtime.debug( + "KV prefix trim rejected; rebuilding sequence", + metadata: ["rejected_reusable_tokens": .stringConvertible(reusableTokenCount)] + ) + } + // MARK: - Private: helpers private func tokenize(_ text: String) -> [Int32] { diff --git a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift index be4d9917..6cc22d7f 100644 --- a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift +++ b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift @@ -129,6 +129,11 @@ final class LlamaRuntimeManager: ObservableObject { return partial } onCancel: { task.cancel() + // Task cancellation is only polled between sampled tokens, so an in-flight prompt + // prefill would otherwise run to completion while holding the autocomplete lock, + // making the superseding request wait out the whole stale decode. The engine-level + // abort interrupts the decode at its next batch chunk. + core.abortInFlightGeneration() } } catch is CancellationError { CotabbyLogger.runtime.debug("Generation cancelled") @@ -150,6 +155,36 @@ final class LlamaRuntimeManager: ObservableObject { core.resetPromptCache() } + /// Decodes `prompt` into the native prompt cache without sampling (the llama half of + /// prewarm-on-focus). Best-effort by contract: cancellation is silent and failures only log, + /// because a missed warmup just means the next generate pays the cold prefill it would have + /// paid anyway. 
Errors are deliberately kept out of `diagnostics.lastError`. + func prefill( + prompt: String, + cachedPrefixBytes: Int? = nil, + options: LlamaGenerationOptions + ) async throws { + _ = try await preparedRuntime() + + let core = self.core + let task = Task.detached { + try core.prefill( + prompt: prompt, + cachedPrefixBytes: cachedPrefixBytes, + options: options + ) + } + try await withTaskCancellationHandler { + try await task.value + try Task.checkCancellation() + } onCancel: { + task.cancel() + // A prefill is superseded the moment a real generation arrives; abort its native + // decode so the generation does not queue behind a warmup for a stale prompt. + core.abortInFlightGeneration() + } + } + /// Cancels any retained prepared runtime and releases backend resources. /// Shutdown runs on a detached thread so it does not block the main actor. func stop() { diff --git a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift index 45a2195c..c6b9f344 100644 --- a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift +++ b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift @@ -12,11 +12,48 @@ import Logging final class LlamaSuggestionEngine { private let runtimeManager: LlamaRuntimeGenerating private var promptCacheHintTracker = LlamaPromptCacheHintTracker() + /// The focus-time warmup in flight, if any. A real generation cancels it on entry so it never + /// queues behind a warmup for a prompt the user has already typed past. + private var inflightPrewarmTask: Task<Void, Never>? init(runtimeManager: LlamaRuntimeGenerating) { self.runtimeManager = runtimeManager } + /// Prefills the prompt KV for the field the user just focused, so the first real suggestion + /// there only decodes the typed delta instead of the whole cold prompt. 
+ /// + /// The protocol default used to be a no-op here on the assumption that llama "keeps its KV + /// cache hot", but a focus change resets the cached generation context and destroys the native + /// sequence, so the first request in every field paid a full prefill. Best-effort by design: + /// failures are swallowed (a missed warmup costs nothing the cold path would not have paid) + /// and the tracker only records the prompt after the native decode actually succeeded. + func prewarm(for request: SuggestionRequest) async { + inflightPrewarmTask?.cancel() + let cachedPrefixBytes = promptCacheHintTracker.cachedPrefixBytes(for: request) + let options = Self.makeGenerationOptions(for: request) + let task = Task { [weak self, runtimeManager] in + do { + try await runtimeManager.prefill( + prompt: request.prompt, + cachedPrefixBytes: cachedPrefixBytes, + options: options + ) + guard !Task.isCancelled else { + return + } + self?.promptCacheHintTracker.recordSuccessfulRequest(request) + } catch { + CotabbyLogger.suggestion.debug( + "Llama prewarm skipped: \(error.localizedDescription)", + metadata: ["request_id": .string(request.requestID), "engine": .string("llama")] + ) + } + } + inflightPrewarmTask = task + await task.value + } + /// Executes one generation request and packages the raw and normalized result for the coordinator. func generateSuggestion(for request: SuggestionRequest) async throws -> SuggestionResult { let baseMetadata: Logger.Metadata = [ @@ -24,6 +61,11 @@ final class LlamaSuggestionEngine { "engine": .string("llama") ] do { + // A still-running focus warmup must not make this request wait behind it on the + // runtime's autocomplete lock; cancelling it aborts its native decode mid-chunk. + inflightPrewarmTask?.cancel() + inflightPrewarmTask = nil + let startTime = Date() let cachedPrefixBytes = promptCacheHintTracker.cachedPrefixBytes(for: request) let hintDesc = cachedPrefixBytes.map(String.init) ?? 
"none" @@ -38,20 +80,7 @@ final class LlamaSuggestionEngine { let rawSuggestion = try await runtimeManager.generate( prompt: request.prompt, cachedPrefixBytes: cachedPrefixBytes, - options: LlamaGenerationOptions( - maxPredictionTokens: request.maxPredictionTokens, - temperature: request.temperature, - topK: request.topK, - topP: request.topP, - minP: request.minP, - repetitionPenalty: request.repetitionPenalty, - seed: request.randomSeed, - singleLine: !request.isMultiLineEnabled, - forceWordContinuation: MidWordContinuationPolicy.shouldForceContinuation( - precedingText: request.context.precedingText, - trailingText: request.context.trailingText - ) - ) + options: Self.makeGenerationOptions(for: request) ) try Task.checkCancellation() @@ -143,9 +172,31 @@ final class LlamaSuggestionEngine { /// stale reuse; awaiting the runtime reset keeps native KV invalidation ordered before the next /// generation request that crosses this engine boundary. func resetCachedGenerationContext() async { + // The editing context moved on, so a warmup for the previous field's prompt is stale. + inflightPrewarmTask?.cancel() + inflightPrewarmTask = nil promptCacheHintTracker.reset() runtimeManager.resetPromptCache() } + + /// One shared mapping from a request to engine options so prewarm prefills decode under the + /// exact sampling fingerprint the following generation will validate its KV reuse against. 
+ private static func makeGenerationOptions(for request: SuggestionRequest) -> LlamaGenerationOptions { + LlamaGenerationOptions( + maxPredictionTokens: request.maxPredictionTokens, + temperature: request.temperature, + topK: request.topK, + topP: request.topP, + minP: request.minP, + repetitionPenalty: request.repetitionPenalty, + seed: request.randomSeed, + singleLine: !request.isMultiLineEnabled, + forceWordContinuation: MidWordContinuationPolicy.shouldForceContinuation( + precedingText: request.context.precedingText, + trailingText: request.context.trailingText + ) + ) + } } extension LlamaSuggestionEngine: SuggestionGenerating {} diff --git a/CotabbyTests/LlamaSuggestionEnginePrewarmTests.swift b/CotabbyTests/LlamaSuggestionEnginePrewarmTests.swift new file mode 100644 index 00000000..f66c4f57 --- /dev/null +++ b/CotabbyTests/LlamaSuggestionEnginePrewarmTests.swift @@ -0,0 +1,130 @@ +import CoreGraphics +import Foundation +import XCTest +@testable import Cotabby + +/// Tests for the llama half of prewarm-on-focus: a focus change used to leave the llama engine's +/// `prewarm` as the protocol no-op while the focus reset destroyed the native sequence, so the +/// first suggestion in every field paid the full cold prompt decode. These pin the new contract: +/// prewarm prefills through the runtime and primes the reuse hint only when the prefill succeeded. 
+@MainActor +final class LlamaSuggestionEnginePrewarmTests: XCTestCase { + + func test_prewarm_prefillsAndPrimesTheReuseHint() async throws { + let runtime = RecordingPrewarmRuntime() + let engine = LlamaSuggestionEngine(runtimeManager: runtime) + let request = makeRequest(prompt: "hello wor") + + await engine.prewarm(for: request) + + XCTAssertEqual(runtime.prefillPrompts, ["hello wor"]) + + _ = try await engine.generateSuggestion(for: request) + XCTAssertEqual( + runtime.generateCachedPrefixBytes, + ["hello wor".utf8.count], + "A successful prefill should let the next identical-context request advertise full reuse." + ) + } + + func test_failedPrewarm_leavesReuseHintCold() async throws { + let runtime = RecordingPrewarmRuntime() + runtime.prefillError = LlamaRuntimeError.unavailable("not loaded") + let engine = LlamaSuggestionEngine(runtimeManager: runtime) + let request = makeRequest(prompt: "hello wor") + + await engine.prewarm(for: request) + + _ = try await engine.generateSuggestion(for: request) + XCTAssertEqual( + runtime.generateCachedPrefixBytes, + [nil], + "A failed prefill must not advertise reuse the native cache cannot back." 
+ ) + } + + func test_resetClearsThePrimedHint() async throws { + let runtime = RecordingPrewarmRuntime() + let engine = LlamaSuggestionEngine(runtimeManager: runtime) + let request = makeRequest(prompt: "hello wor") + + await engine.prewarm(for: request) + await engine.resetCachedGenerationContext() + + _ = try await engine.generateSuggestion(for: request) + XCTAssertEqual(runtime.generateCachedPrefixBytes, [nil]) + } + + // MARK: - Helpers + + private func makeRequest(prompt: String) -> SuggestionRequest { + let snapshot = FocusedInputSnapshot( + applicationName: "TestApp", + bundleIdentifier: "com.example.TestApp", + processIdentifier: 123, + elementIdentifier: "field", + role: "AXTextField", + subrole: nil, + caretRect: .zero, + inputFrameRect: nil, + caretSource: "test", + caretQuality: .exact, + observedCharWidth: nil, + precedingText: prompt, + trailingText: "", + selection: NSRange(location: prompt.count, length: 0), + isSecure: false + ) + let context = FocusedInputContext(snapshot: snapshot, generation: 1) + + return SuggestionRequest( + context: context, + prefixText: prompt, + prompt: prompt, + generation: context.generation, + maxPredictionTokens: 8, + temperature: 0.1, + topK: 20, + topP: 0.7, + minP: 0.08, + repetitionPenalty: 1.05, + randomSeed: 42, + maxSuffixCharacters: 192, + completionLengthInstruction: "Return only the next few words.", + userName: nil, + customRules: [], + languageInstruction: nil, + clipboardContext: nil, + visualContextSummary: nil, + isMultiLineEnabled: false + ) + } +} + +/// Records prefill calls and the reuse hints later generations advertise, so the prewarm contract +/// can be exercised without loading a real model. +@MainActor +private final class RecordingPrewarmRuntime: LlamaRuntimeGenerating { + var prefillError: Error? + var generateResult: Result<String, Error> = .success("ok") + private(set) var prefillPrompts: [String] = [] + private(set) var generateCachedPrefixBytes: [Int?] 
= [] + + func generate( + prompt: String, + cachedPrefixBytes: Int?, + options: LlamaGenerationOptions + ) async throws -> String { + generateCachedPrefixBytes.append(cachedPrefixBytes) + return try generateResult.get() + } + + func resetPromptCache() {} + + func prefill(prompt: String, cachedPrefixBytes: Int?, options: LlamaGenerationOptions) async throws { + if let prefillError { + throw prefillError + } + prefillPrompts.append(prompt) + } +} From 00c496881ea91af2fc4e781b50f884e207bc46ec Mon Sep 17 00:00:00 2001 From: Jacob Fu <141651335+FuJacob@users.noreply.github.com> Date: Thu, 11 Jun 2026 19:24:45 -0700 Subject: [PATCH 2/2] review: stop prewarm from double-decoding on models that reject partial KV trims On hybrid/SWA models (the current catalog) trimKV is rejected unconditionally, so a warmed sequence still carries its seed token, the following generate's reuse trim is rejected too, and the prompt gets fully decoded twice. The core now learns the rejection from the first failed trim after model load (generate's restore-trim, the reuse path, or the prefill tail), drops a warmed sequence it cannot trim instead of recording tracker facts the KV does not match, and turns subsequent prefills into no-ops until the next model load. --- .../Services/Runtime/LlamaRuntimeCore.swift | 47 +++++++++++++++---- 1 file changed, 39 insertions(+), 8 deletions(-) diff --git a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift index 784457d7..a18be48b 100644 --- a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift +++ b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift @@ -45,6 +45,13 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { /// prefix-reuse fast path degrades silently to a full prompt re-prefill on every request. private var loggedTrimRejectionForCurrentModel = false + /// True once the loaded model has rejected a partial KV trim (hybrid/recurrent and SWA caches + /// reject them unconditionally). 
On such models prefix reuse can never succeed, so prewarm + /// prefills are pure double work: the warmed sequence cannot be trimmed back to prompt-only + /// state, and the following generate's reuse trim is rejected too, forcing a second full + /// decode of the same prompt. Guarded by `autocompleteLock`; reset on model load. + private var modelRejectsPartialTrims = false + /// Coordinates model lifecycle with in-flight operations. `generate()` and `summarize()` /// increment the active count on entry and decrement on exit. `shutdown()` sets the /// shutting-down flag and blocks until all active operations finish before unloading. @@ -107,6 +114,7 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { ) self.preparedRuntime = result loggedTrimRejectionForCurrentModel = false + modelRejectsPartialTrims = false CotabbyLogger.runtime.info( "Model loaded", metadata: [ @@ -163,8 +171,13 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { ) defer { - // Trim sampled tokens so KV retains only the prompt for the next request. - _ = engine.trimKV(sequenceID, Int32(preparation.promptTokens.count)) + // Trim sampled tokens so KV retains only the prompt for the next request. A rejected + // trim leaves the sampled tokens in KV while the tracker records prompt-only state; + // that mismatch self-heals (the next reuse trim is rejected too and rebuilds fresh), + // but it also proves this model can never reuse, so remember that for `prefill`. + if !engine.trimKV(sequenceID, Int32(preparation.promptTokens.count)) { + modelRejectsPartialTrims = true + } autocompletePromptBytes = preparation.promptBytes autocompletePromptTokens = preparation.promptTokens autocompleteSamplingFingerprint = preparation.fingerprint @@ -213,6 +226,15 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { // Same exit guarantee as `generate`: see the comment there. 
defer { clearAbortTarget() } + // On models that reject partial trims (the hybrid/SWA catalog families), a warmed + // sequence can never be reused, so prefilling would only double the cold decode the + // first real request pays anyway. The flag is learned from the first rejected trim + // after model load; until then one speculative prefill may still run and be discarded. + guard !modelRejectsPartialTrims else { + CotabbyLogger.runtime.debug("Prefill skipped: the loaded model rejects partial KV trims") + return + } + // A superseding generation cancels the warmup task before contending on the lock above. // The engine-level abort only reaches a decode that already published its target, so close // the window where the cancel landed while this prefill was still tokenizing or queued. @@ -228,12 +250,20 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { options: options ) - // No decode loop ran, so KV already holds prompt-only state; record the tracker facts the - // next request validates its reuse against. - _ = engine.trimKV(sequenceID, Int32(preparation.promptTokens.count)) - autocompletePromptBytes = preparation.promptBytes - autocompletePromptTokens = preparation.promptTokens - autocompleteSamplingFingerprint = preparation.fingerprint + // `decodePrompt` samples one seed token beyond the prompt, so the trim is what restores + // prompt-only KV. If it is rejected, the warmed sequence still carries the seed and can + // never be trimmed by the following generate either: drop it instead of recording tracker + // facts the KV does not match, and remember that warming this model is pointless. 
+ if engine.trimKV(sequenceID, Int32(preparation.promptTokens.count)) { + autocompletePromptBytes = preparation.promptBytes + autocompletePromptTokens = preparation.promptTokens + autocompleteSamplingFingerprint = preparation.fingerprint + } else { + modelRejectsPartialTrims = true + engine.destroySequence(sequenceID) + autocompleteSequenceID = -1 + logTrimRejectionIfNeeded(reusableTokenCount: preparation.promptTokens.count) + } } /// Aborts the in-flight autocomplete operation's native work mid-prefill. Task cancellation is @@ -608,6 +638,7 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { /// full prompt re-prefill on every keystroke pause: the difference between decoding a few /// delta tokens and the entire prompt. private func logTrimRejectionIfNeeded(reusableTokenCount: Int) { + modelRejectsPartialTrims = true if !loggedTrimRejectionForCurrentModel { loggedTrimRejectionForCurrentModel = true CotabbyLogger.runtime.info(