diff --git a/Cotabby.xcodeproj/project.pbxproj b/Cotabby.xcodeproj/project.pbxproj index a2988d19..cfd245bd 100644 --- a/Cotabby.xcodeproj/project.pbxproj +++ b/Cotabby.xcodeproj/project.pbxproj @@ -568,6 +568,7 @@ E51FA12B690428CA431328FC /* WritingPaneView.swift in Sources */ = {isa = PBXBuildFile; fileRef = D48B95B6665109B6C6A63B42 /* WritingPaneView.swift */; }; E54F5F03E16859D5A1E3437A /* MacroController.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4638C74239D1DE2DC4D87975 /* MacroController.swift */; }; E5CB34ED76BAE87E8A858112 /* WebContentFieldDetectorTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 210F9AD332273FE2EB3A9A01 /* WebContentFieldDetectorTests.swift */; }; + E64AE96DF2A80A368FDE522D /* LlamaSuggestionEnginePrewarmTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 26EF16C7439BEB156BD9FB03 /* LlamaSuggestionEnginePrewarmTests.swift */; }; E6EE3C13FA31F261CD734C69 /* DownloadOutcomeClassifier.swift in Sources */ = {isa = PBXBuildFile; fileRef = 3DE1975F3B5F4A70478DBF41 /* DownloadOutcomeClassifier.swift */; }; E853B9C7AF93FA595DC417B2 /* EmojiVariantResolver.swift in Sources */ = {isa = PBXBuildFile; fileRef = 1A8414BEB7E34F57607E37FE /* EmojiVariantResolver.swift */; }; E912D4617AE1376061DF1F00 /* LanguageSupportTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 4793D4EA5D36D7E5CC216C27 /* LanguageSupportTests.swift */; }; @@ -696,6 +697,7 @@ 24F613F0E2F7046E6532A09C /* OnboardingTemplateFeatureList.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = OnboardingTemplateFeatureList.swift; sourceTree = "<group>"; }; 262BE2F1E97389FE8D7A5FB9 /* Cotabby.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = Cotabby.app; sourceTree = BUILT_PRODUCTS_DIR; }; 264CA64B2AB1611F82E5B760 /* WelcomeView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = WelcomeView.swift; sourceTree = "<group>"; }; + 26EF16C7439BEB156BD9FB03 /* 
LlamaSuggestionEnginePrewarmTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaSuggestionEnginePrewarmTests.swift; sourceTree = "<group>"; }; 273B4DC844F79B4BE2C8910F /* FocusPollBackoffTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FocusPollBackoffTests.swift; sourceTree = "<group>"; }; 27A5D63F390E9B7A7FE343FE /* SystemResourceSampler.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SystemResourceSampler.swift; sourceTree = "<group>"; }; 28B7EB84781C0ED57844585E /* OnboardingTemplateTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = OnboardingTemplateTests.swift; sourceTree = "<group>"; }; @@ -1363,6 +1365,7 @@ 4793D4EA5D36D7E5CC216C27 /* LanguageSupportTests.swift */, 0CA88BB29BC8727878C99E95 /* LlamaPromptCacheHintTrackerTests.swift */, AABCC3FD99B1824A81E665F3 /* LlamaSuggestionEngineCancellationTests.swift */, + 26EF16C7439BEB156BD9FB03 /* LlamaSuggestionEnginePrewarmTests.swift */, 9030FAAB468119A0236284A6 /* LLMIOFileHandlerTests.swift */, D8083D44ABCDCFA68A4CD497 /* MacroEngineTests.swift */, 22BE47D1DBF6C23151458836 /* MacroTriggerStateMachineTests.swift */, @@ -2334,6 +2337,7 @@ E912D4617AE1376061DF1F00 /* LanguageSupportTests.swift in Sources */, E38801433B99E65BD7E45A0E /* LlamaPromptCacheHintTrackerTests.swift in Sources */, BE3CB85508055D159C35020A /* LlamaSuggestionEngineCancellationTests.swift in Sources */, + E64AE96DF2A80A368FDE522D /* LlamaSuggestionEnginePrewarmTests.swift in Sources */, 8429B116328C392DCA018D95 /* MacroEngineTests.swift in Sources */, 3F8CBCBCC45E377DF9ADB216 /* MacroTriggerStateMachineTests.swift in Sources */, 87806DE08881D11F2608A13D /* MarkerSelectionSynthesizerTests.swift in Sources */, diff --git a/Cotabby/Models/SuggestionSubsystemContracts.swift b/Cotabby/Models/SuggestionSubsystemContracts.swift index f50b4d72..c47b2329 100644 --- a/Cotabby/Models/SuggestionSubsystemContracts.swift +++ 
b/Cotabby/Models/SuggestionSubsystemContracts.swift @@ -93,10 +93,10 @@ protocol SuggestionGenerating: AnyObject { /// continuous. Stateless engines may implement this as a no-op. func resetCachedGenerationContext() async /// Best-effort warmup hook the coordinator calls after focus arrives on an editable surface. - /// Engines that benefit from prefix caching or weight loading (Apple Foundation Models) use it - /// to prime the next request; engines that do not (llama already keeps its KV cache hot) can - /// rely on the default no-op extension. Failures are intentionally swallowed by implementations - /// because prewarming is opportunistic. + /// Apple Foundation Models primes its session here, and the llama engine prefills the new + /// field's prompt KV (a focus change destroys the previous field's native sequence, so without + /// this the first suggestion in every field pays the full cold prompt decode). Failures are + /// intentionally swallowed by implementations because prewarming is opportunistic. func prewarm(for request: SuggestionRequest) async } @@ -113,6 +113,16 @@ extension SuggestionGenerating { protocol LlamaRuntimeGenerating: AnyObject { func generate(prompt: String, cachedPrefixBytes: Int?, options: LlamaGenerationOptions) async throws -> String func resetPromptCache() + /// Decodes `prompt` into the native prompt cache without sampling any tokens, so the next + /// `generate` whose prompt extends this one only decodes the typed delta. Best-effort warmup: + /// callers treat failures as "no cache primed", never as a user-facing error. + func prefill(prompt: String, cachedPrefixBytes: Int?, options: LlamaGenerationOptions) async throws +} + +extension LlamaRuntimeGenerating { + /// Default no-op so test fakes that only exercise the generate/cancel contract keep compiling; + /// the production manager overrides this with a real prompt prefill. 
+ func prefill(prompt: String, cachedPrefixBytes: Int?, options: LlamaGenerationOptions) async throws {} } @MainActor diff --git a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift index 5384b9cf..a18be48b 100644 --- a/Cotabby/Services/Runtime/LlamaRuntimeCore.swift +++ b/Cotabby/Services/Runtime/LlamaRuntimeCore.swift @@ -34,6 +34,24 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { private var autocompletePromptTokens: [Int32] = [] private var autocompleteSamplingFingerprint: SamplingFingerprint? + /// The sequence the in-flight autocomplete operation is decoding into, published for + /// `abortInFlightGeneration` to target from the canceller's thread. Guarded by its own lock + /// because the abort fires while `autocompleteLock` is held by the very work being aborted. + private let abortTargetLock = NSLock() + private var abortTargetSequenceID: Int32 = -1 + + /// One loud line per model load when the engine rejects partial KV trims (llama.cpp cannot + /// drop mid-sequence ranges on hybrid/recurrent or SWA caches). Without this signal the + /// prefix-reuse fast path degrades silently to a full prompt re-prefill on every request. + private var loggedTrimRejectionForCurrentModel = false + + /// True once the loaded model has rejected a partial KV trim (hybrid/recurrent and SWA caches + /// reject them unconditionally). On such models prefix reuse can never succeed, so prewarm + /// prefills are pure double work: the warmed sequence cannot be trimmed back to prompt-only + /// state, and the following generate's reuse trim is rejected too, forcing a second full + /// decode of the same prompt. Guarded by `autocompleteLock`; reset on model load. + private var modelRejectsPartialTrims = false + /// Coordinates model lifecycle with in-flight operations. `generate()` and `summarize()` /// increment the active count on entry and decrement on exit. 
`shutdown()` sets the /// shutting-down flag and blocks until all active operations finish before unloading. @@ -95,6 +113,8 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { backendName: "CotabbyInferenceEngine (llama.cpp in-process)" ) self.preparedRuntime = result + loggedTrimRejectionForCurrentModel = false + modelRejectsPartialTrims = false CotabbyLogger.runtime.info( "Model loaded", metadata: [ @@ -118,9 +138,80 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { cachedPrefixBytes: Int? = nil, options: LlamaGenerationOptions ) throws -> String { - guard let preparedRuntime else { - throw LlamaRuntimeError.unavailable("The llama model is not loaded.") + let preparation = try preparedPrompt(prompt: prompt, cachedPrefixBytes: cachedPrefixBytes, options: options, kind: "generate") + + lifecycleCondition.lock() + guard !isShuttingDown else { + lifecycleCondition.unlock() + throw LlamaRuntimeError.unavailable("The runtime is shutting down.") } + activeOperationCount += 1 + lifecycleCondition.unlock() + + defer { + lifecycleCondition.lock() + activeOperationCount -= 1 + lifecycleCondition.broadcast() + lifecycleCondition.unlock() + } + + autocompleteLock.lock() + defer { autocompleteLock.unlock() } + // Registered before `obtainAutocompleteSequence` because that call publishes the abort + // target ahead of its prompt decode; every exit (including a cancelled prefill throwing) + // must clear it so a late abort can never flag a recycled sequence slot. + defer { clearAbortTarget() } + + let sequenceID = try obtainAutocompleteSequence( + promptTokens: preparation.promptTokens, + promptBytes: preparation.promptBytes, + fingerprint: preparation.fingerprint, + cachedPrefixBytes: preparation.cachedPrefixBytes, + options: options + ) + + // Flipped when an engine-level abort forces the sequence to be destroyed, so the + // defer below never trims (or learns trim rejection from) a dead sequence. + var sequenceDiscarded = false + + defer { + // Trim sampled tokens so KV retains only the prompt for the next request.
A rejected + // trim leaves the sampled tokens in KV while the tracker records prompt-only state; + // that mismatch self-heals (the next reuse trim is rejected too and rebuilds fresh), + // but it also proves this model can never reuse, so remember that for `prefill`. + // A sequence discarded after an engine abort is skipped: assuming the engine reports a + // trim on an unknown slot as a failure, it would wrongly disable prewarm for this model. + if !sequenceDiscarded, !engine.trimKV(sequenceID, Int32(preparation.promptTokens.count)) { + modelRejectsPartialTrims = true + } + autocompletePromptBytes = preparation.promptBytes + autocompletePromptTokens = preparation.promptTokens + autocompleteSamplingFingerprint = preparation.fingerprint + } + + // The KV-trim defer above runs after the decoder returns, restoring prompt-only KV state for + // the next request. Token selection is delegated to the engine's built-in sampler. + let decode = runEngineSampledDecode(sequenceID: sequenceID, options: options) + if decode.engineCancelled { + // The engine's per-sequence abort flag is set-once; an aborted sequence would refuse + // every future decode, so drop it and let the next request build fresh. + engine.destroySequence(sequenceID) + autocompleteSequenceID = -1 + sequenceDiscarded = true + } + return decode.text + } + + /// Decodes `prompt` into the autocomplete KV cache without sampling, so the next `generate` + /// whose prompt extends this one only pays for the typed delta. This is the llama half of + /// prewarm-on-focus: a focus change destroys the previous field's sequence, and without a + /// prefill the first suggestion in every field pays the full cold prompt decode. + func prefill( + prompt: String, + cachedPrefixBytes: Int? 
= nil, + options: LlamaGenerationOptions + ) throws { + let preparation = try preparedPrompt(prompt: prompt, cachedPrefixBytes: cachedPrefixBytes, options: options, kind: "prefill") lifecycleCondition.lock() guard !isShuttingDown else { @@ -137,6 +221,88 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { lifecycleCondition.unlock() } + autocompleteLock.lock() + defer { autocompleteLock.unlock() } + // Same exit guarantee as `generate`: see the comment there. + defer { clearAbortTarget() } + + // On models that reject partial trims (the hybrid/SWA catalog families), a warmed + // sequence can never be reused, so prefilling would only double the cold decode the + // first real request pays anyway. The flag is learned from the first rejected trim + // after model load; until then one speculative prefill may still run and be discarded. + guard !modelRejectsPartialTrims else { + CotabbyLogger.runtime.debug("Prefill skipped: the loaded model rejects partial KV trims") + return + } + + // A superseding generation cancels the warmup task before contending on the lock above. + // The engine-level abort only reaches a decode that already published its target, so close + // the window where the cancel landed while this prefill was still tokenizing or queued. + guard !Task.isCancelled else { + throw CancellationError() + } + + let sequenceID = try obtainAutocompleteSequence( + promptTokens: preparation.promptTokens, + promptBytes: preparation.promptBytes, + fingerprint: preparation.fingerprint, + cachedPrefixBytes: preparation.cachedPrefixBytes, + options: options + ) + + // `decodePrompt` samples one seed token beyond the prompt, so the trim is what restores + // prompt-only KV. If it is rejected, the warmed sequence still carries the seed and can + // never be trimmed by the following generate either: drop it instead of recording tracker + // facts the KV does not match, and remember that warming this model is pointless. 
+ if engine.trimKV(sequenceID, Int32(preparation.promptTokens.count)) { + autocompletePromptBytes = preparation.promptBytes + autocompletePromptTokens = preparation.promptTokens + autocompleteSamplingFingerprint = preparation.fingerprint + } else { + modelRejectsPartialTrims = true + engine.destroySequence(sequenceID) + autocompleteSequenceID = -1 + logTrimRejectionIfNeeded(reusableTokenCount: preparation.promptTokens.count) + } + } + + /// Aborts the in-flight autocomplete operation's native work mid-prefill. Task cancellation is + /// only polled between sampled tokens, so without this an uninterruptible prompt decode makes + /// the next request wait out the entire stale prefill. Safe from any thread: the engine flag + /// is atomic and its sequence lookup is mutex-guarded; a no-op when nothing is in flight. + func abortInFlightGeneration() { + abortTargetLock.lock() + let target = abortTargetSequenceID + abortTargetLock.unlock() + guard target >= 0 else { + return + } + engine.cancelSequence(target) + } + + private func setAbortTarget(_ sequenceID: Int32) { + abortTargetLock.lock() + abortTargetSequenceID = sequenceID + abortTargetLock.unlock() + } + + private func clearAbortTarget() { + abortTargetLock.lock() + abortTargetSequenceID = -1 + abortTargetLock.unlock() + } + + /// Shared tokenize/truncate/log front half of `generate` and `prefill`. 
+ private func preparedPrompt( + prompt: String, + cachedPrefixBytes: Int?, + options: LlamaGenerationOptions, + kind: String + ) throws -> PreparedPrompt { + guard let preparedRuntime else { + throw LlamaRuntimeError.unavailable("The llama model is not loaded.") + } + let promptBytes = Array(prompt.utf8) let allPromptTokens = tokenize(prompt) guard !allPromptTokens.isEmpty else { @@ -149,7 +315,7 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { CotabbyLogger.runtime.debug( "Decode start", metadata: [ - "kind": .string("generate"), + "kind": .string(kind), "prompt_tokens": .stringConvertible(allPromptTokens.count), "max_tokens": .stringConvertible(options.maxPredictionTokens), "cached_prefix_bytes": .string(cachedPrefixBytes.map(String.init) ?? "none") @@ -157,51 +323,44 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { ) let maxPromptTokens = max(1, preparedRuntime.contextWindowTokens - options.maxPredictionTokens) - var promptTokens: [Int32] - var adjustedCachedPrefixBytes: Int? 
if allPromptTokens.count > maxPromptTokens { - promptTokens = Array(allPromptTokens.suffix(maxPromptTokens)) - adjustedCachedPrefixBytes = nil - } else { - promptTokens = allPromptTokens - adjustedCachedPrefixBytes = cachedPrefixBytes + return PreparedPrompt( + promptBytes: promptBytes, + promptTokens: Array(allPromptTokens.suffix(maxPromptTokens)), + cachedPrefixBytes: nil, + fingerprint: SamplingFingerprint(options: options) + ) } - - let fingerprint = SamplingFingerprint(options: options) - - autocompleteLock.lock() - defer { autocompleteLock.unlock() } - - let sequenceID = try obtainAutocompleteSequence( - promptTokens: promptTokens, + return PreparedPrompt( promptBytes: promptBytes, - fingerprint: fingerprint, - cachedPrefixBytes: adjustedCachedPrefixBytes, - options: options + promptTokens: allPromptTokens, + cachedPrefixBytes: cachedPrefixBytes, + fingerprint: SamplingFingerprint(options: options) ) + } - defer { - // Trim sampled tokens so KV retains only the prompt for the next request. - _ = engine.trimKV(sequenceID, Int32(promptTokens.count)) - autocompletePromptBytes = promptBytes - autocompletePromptTokens = promptTokens - autocompleteSamplingFingerprint = fingerprint - } - - // The KV-trim defer above runs after the decoder returns, restoring prompt-only KV state for - // the next request. Token selection is delegated to the engine's built-in sampler. - return runEngineSampledDecode(sequenceID: sequenceID, options: options) + private struct PreparedPrompt { + let promptBytes: [UInt8] + let promptTokens: [Int32] + let cachedPrefixBytes: Int? + let fingerprint: SamplingFingerprint } // MARK: - Decoders /// The shipping decoder: delegates token selection to the engine's built-in sampler /// (`sampleNext`), which applies temperature / top-k / top-p / min-p and commits each token. 
- private func runEngineSampledDecode(sequenceID: Int32, options: LlamaGenerationOptions) -> String { + /// `engineCancelled` reports that the native abort flag fired; the sequence must then be + /// discarded because the flag is set-once for a sequence's lifetime. + private func runEngineSampledDecode( + sequenceID: Int32, + options: LlamaGenerationOptions + ) -> (text: String, engineCancelled: Bool) { var generatedText = "" var tokensGenerated = 0 var sumLogprob = 0.0 var stopReason = "budget_exhausted" + var engineCancelled = false for _ in 0 ..< options.maxPredictionTokens { // Cooperative cancellation: when the wrapping Task is cancelled (caller hit a new @@ -217,6 +376,7 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { if result.was_cancelled { stopReason = "engine_cancelled" + engineCancelled = true break } if result.is_eos { @@ -255,9 +415,9 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { ) if Self.shouldSuppress(sumLogprob: sumLogprob, tokensGenerated: tokensGenerated, options: options) { - return "" + return ("", engineCancelled) } - return generatedText + return (generatedText, engineCancelled) } /// Low-confidence gate for the sampled decoder: drop completions the model itself was unsure @@ -373,37 +533,59 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { newPromptTokenCount: promptTokens.count ) - if reusableTokenCount > 0, - engine.trimKV(autocompleteSequenceID, Int32(reusableTokenCount)) { - - let remaining = Array(promptTokens[reusableTokenCount...]) - if !remaining.isEmpty { - // Seed for the reuse path is sampled at the end of this decodePrompt; apply - // the word-continuation constraint to it just like the fresh path does. - engine.setForceWordContinuation(autocompleteSequenceID, options.forceWordContinuation) - // Per-token log-probabilities cost two O(vocab) passes each in the engine; - // only compute them when the confidence gate would actually read them. 
- // Re-assert per request: the floor is not part of the sampling fingerprint, - // so a reused sequence must not carry a stale flag. - engine.setComputeLogprob( - autocompleteSequenceID, - options.confidenceFloor > -.infinity - ) - var mutableRemaining = remaining - let status = engine.decodePrompt( - autocompleteSequenceID, - &mutableRemaining, - Int32(mutableRemaining.count), - Int32(reusableTokenCount) - ) - if status != .ok { - // Reuse failed mid-decode; fall through to fresh build. - engine.destroySequence(autocompleteSequenceID) - autocompleteSequenceID = -1 - return try buildFreshSequence(promptTokens: promptTokens, options: options) + if reusableTokenCount > 0 { + if engine.trimKV(autocompleteSequenceID, Int32(reusableTokenCount)) { + let remaining = Array(promptTokens[reusableTokenCount...]) + if !remaining.isEmpty { + // Seed for the reuse path is sampled at the end of this decodePrompt; + // apply the word-continuation constraint to it like the fresh path does. + engine.setForceWordContinuation( + autocompleteSequenceID, + options.forceWordContinuation + ) + // Per-token log-probabilities cost two O(vocab) passes each in the + // engine; only compute them when the confidence gate would actually + // read them. Re-assert per request: the floor is not part of the + // sampling fingerprint, so a reused sequence must not carry a stale flag. + engine.setComputeLogprob( + autocompleteSequenceID, + options.confidenceFloor > -.infinity + ) + setAbortTarget(autocompleteSequenceID) + var mutableRemaining = remaining + let status = engine.decodePrompt( + autocompleteSequenceID, + &mutableRemaining, + Int32(mutableRemaining.count), + Int32(reusableTokenCount) + ) + if status == .cancelled { + // The caller's request was superseded mid-prefill. Do NOT rebuild + // fresh here: that would decode the full stale prompt right after + // its cancellation. The aborted sequence is unusable (set-once + // flag, partially decoded KV), so drop it and surface the cancel. 
+ engine.destroySequence(autocompleteSequenceID) + autocompleteSequenceID = -1 + throw CancellationError() + } + if status != .ok { + // Reuse failed mid-decode; fall through to fresh build. + engine.destroySequence(autocompleteSequenceID) + autocompleteSequenceID = -1 + return try buildFreshSequence(promptTokens: promptTokens, options: options) + } } + CotabbyLogger.runtime.debug( + "KV prefix reused", + metadata: [ + "reused_tokens": .stringConvertible(reusableTokenCount), + "decoded_delta_tokens": .stringConvertible(promptTokens.count - reusableTokenCount) + ] + ) + return autocompleteSequenceID } - return autocompleteSequenceID + + logTrimRejectionIfNeeded(reusableTokenCount: reusableTokenCount) } } } @@ -433,10 +615,16 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { // would be summed and then discarded. engine.setComputeLogprob(seqID, options.confidenceFloor > -.infinity) + setAbortTarget(seqID) var tokens = promptTokens let status = engine.decodePrompt(seqID, &tokens, Int32(tokens.count), 0) guard status == .ok else { engine.destroySequence(seqID) + if status == .cancelled { + // Superseded mid-prefill; the abort exists precisely so the next request does not + // wait out the rest of this decode. Quiet cancellation, no runtime error. + throw CancellationError() + } throw LlamaRuntimeError.generationFailed("Prompt decoding failed.") } @@ -444,6 +632,31 @@ nonisolated final class LlamaRuntimeCore: @unchecked Sendable { return seqID } + /// Surfaces "this model cannot reuse its prompt KV" once per model load at info level, then + /// per-event at debug. llama.cpp rejects partial sequence removal on hybrid (recurrent) and + /// SWA caches — which includes the current catalog families — and the silent fallback is a + /// full prompt re-prefill on every keystroke pause: the difference between decoding a few + /// delta tokens and the entire prompt. 
+ private func logTrimRejectionIfNeeded(reusableTokenCount: Int) { + modelRejectsPartialTrims = true + if !loggedTrimRejectionForCurrentModel { + loggedTrimRejectionForCurrentModel = true + CotabbyLogger.runtime.info( + "KV prefix reuse unavailable: the engine rejected a partial trim, so every request re-decodes its full prompt", + metadata: [ + "model": .string(preparedRuntime?.resolvedRuntime.modelDisplayName ?? "unknown"), + "rejected_reusable_tokens": .stringConvertible(reusableTokenCount) + ] + ) + return + } + + CotabbyLogger.runtime.debug( + "KV prefix trim rejected; rebuilding sequence", + metadata: ["rejected_reusable_tokens": .stringConvertible(reusableTokenCount)] + ) + } + // MARK: - Private: helpers private func tokenize(_ text: String) -> [Int32] { diff --git a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift index be4d9917..6cc22d7f 100644 --- a/Cotabby/Services/Runtime/LlamaRuntimeManager.swift +++ b/Cotabby/Services/Runtime/LlamaRuntimeManager.swift @@ -129,6 +129,11 @@ final class LlamaRuntimeManager: ObservableObject { return partial } onCancel: { task.cancel() + // Task cancellation is only polled between sampled tokens, so an in-flight prompt + // prefill would otherwise run to completion while holding the autocomplete lock, + // making the superseding request wait out the whole stale decode. The engine-level + // abort interrupts the decode at its next batch chunk. + core.abortInFlightGeneration() } } catch is CancellationError { CotabbyLogger.runtime.debug("Generation cancelled") @@ -150,6 +155,36 @@ final class LlamaRuntimeManager: ObservableObject { core.resetPromptCache() } + /// Decodes `prompt` into the native prompt cache without sampling (the llama half of + /// prewarm-on-focus). Best-effort by contract: cancellation is silent and failures only log, + /// because a missed warmup just means the next generate pays the cold prefill it would have + /// paid anyway. 
Errors are deliberately kept out of `diagnostics.lastError`. + func prefill( + prompt: String, + cachedPrefixBytes: Int? = nil, + options: LlamaGenerationOptions + ) async throws { + _ = try await preparedRuntime() + + let core = self.core + let task = Task.detached { + try core.prefill( + prompt: prompt, + cachedPrefixBytes: cachedPrefixBytes, + options: options + ) + } + try await withTaskCancellationHandler { + try await task.value + try Task.checkCancellation() + } onCancel: { + task.cancel() + // A prefill is superseded the moment a real generation arrives; abort its native + // decode so the generation does not queue behind a warmup for a stale prompt. + core.abortInFlightGeneration() + } + } + /// Cancels any retained prepared runtime and releases backend resources. /// Shutdown runs on a detached thread so it does not block the main actor. func stop() { diff --git a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift index 45a2195c..c6b9f344 100644 --- a/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift +++ b/Cotabby/Services/Runtime/LlamaSuggestionEngine.swift @@ -12,11 +12,48 @@ import Logging final class LlamaSuggestionEngine { private let runtimeManager: LlamaRuntimeGenerating private var promptCacheHintTracker = LlamaPromptCacheHintTracker() + /// The focus-time warmup in flight, if any. A real generation cancels it on entry so it never + /// queues behind a warmup for a prompt the user has already typed past. + private var inflightPrewarmTask: Task<Void, Never>? init(runtimeManager: LlamaRuntimeGenerating) { self.runtimeManager = runtimeManager } + /// Prefills the prompt KV for the field the user just focused, so the first real suggestion + /// there only decodes the typed delta instead of the whole cold prompt. 
+ /// + /// The protocol default used to be a no-op here on the assumption that llama "keeps its KV + /// cache hot", but a focus change resets the cached generation context and destroys the native + /// sequence, so the first request in every field paid a full prefill. Best-effort by design: + /// failures are swallowed (a missed warmup costs nothing the cold path would not have paid) + /// and the tracker only records the prompt after the native decode actually succeeded. + func prewarm(for request: SuggestionRequest) async { + inflightPrewarmTask?.cancel() + let cachedPrefixBytes = promptCacheHintTracker.cachedPrefixBytes(for: request) + let options = Self.makeGenerationOptions(for: request) + let task = Task { [weak self, runtimeManager] in + do { + try await runtimeManager.prefill( + prompt: request.prompt, + cachedPrefixBytes: cachedPrefixBytes, + options: options + ) + guard !Task.isCancelled else { + return + } + self?.promptCacheHintTracker.recordSuccessfulRequest(request) + } catch { + CotabbyLogger.suggestion.debug( + "Llama prewarm skipped: \(error.localizedDescription)", + metadata: ["request_id": .string(request.requestID), "engine": .string("llama")] + ) + } + } + inflightPrewarmTask = task + await task.value + } + /// Executes one generation request and packages the raw and normalized result for the coordinator. func generateSuggestion(for request: SuggestionRequest) async throws -> SuggestionResult { let baseMetadata: Logger.Metadata = [ @@ -24,6 +61,11 @@ final class LlamaSuggestionEngine { "engine": .string("llama") ] do { + // A still-running focus warmup must not make this request wait behind it on the + // runtime's autocomplete lock; cancelling it aborts its native decode mid-chunk. + inflightPrewarmTask?.cancel() + inflightPrewarmTask = nil + let startTime = Date() let cachedPrefixBytes = promptCacheHintTracker.cachedPrefixBytes(for: request) let hintDesc = cachedPrefixBytes.map(String.init) ?? 
"none" @@ -38,20 +80,7 @@ final class LlamaSuggestionEngine { let rawSuggestion = try await runtimeManager.generate( prompt: request.prompt, cachedPrefixBytes: cachedPrefixBytes, - options: LlamaGenerationOptions( - maxPredictionTokens: request.maxPredictionTokens, - temperature: request.temperature, - topK: request.topK, - topP: request.topP, - minP: request.minP, - repetitionPenalty: request.repetitionPenalty, - seed: request.randomSeed, - singleLine: !request.isMultiLineEnabled, - forceWordContinuation: MidWordContinuationPolicy.shouldForceContinuation( - precedingText: request.context.precedingText, - trailingText: request.context.trailingText - ) - ) + options: Self.makeGenerationOptions(for: request) ) try Task.checkCancellation() @@ -143,9 +172,31 @@ final class LlamaSuggestionEngine { /// stale reuse; awaiting the runtime reset keeps native KV invalidation ordered before the next /// generation request that crosses this engine boundary. func resetCachedGenerationContext() async { + // The editing context moved on, so a warmup for the previous field's prompt is stale. + inflightPrewarmTask?.cancel() + inflightPrewarmTask = nil promptCacheHintTracker.reset() runtimeManager.resetPromptCache() } + + /// One shared mapping from a request to engine options so prewarm prefills decode under the + /// exact sampling fingerprint the following generation will validate its KV reuse against. 
+ private static func makeGenerationOptions(for request: SuggestionRequest) -> LlamaGenerationOptions { + LlamaGenerationOptions( + maxPredictionTokens: request.maxPredictionTokens, + temperature: request.temperature, + topK: request.topK, + topP: request.topP, + minP: request.minP, + repetitionPenalty: request.repetitionPenalty, + seed: request.randomSeed, + singleLine: !request.isMultiLineEnabled, + forceWordContinuation: MidWordContinuationPolicy.shouldForceContinuation( + precedingText: request.context.precedingText, + trailingText: request.context.trailingText + ) + ) + } } extension LlamaSuggestionEngine: SuggestionGenerating {} diff --git a/CotabbyTests/LlamaSuggestionEnginePrewarmTests.swift b/CotabbyTests/LlamaSuggestionEnginePrewarmTests.swift new file mode 100644 index 00000000..f66c4f57 --- /dev/null +++ b/CotabbyTests/LlamaSuggestionEnginePrewarmTests.swift @@ -0,0 +1,130 @@ +import CoreGraphics +import Foundation +import XCTest +@testable import Cotabby + +/// Tests for the llama half of prewarm-on-focus: a focus change used to leave the llama engine's +/// `prewarm` as the protocol no-op while the focus reset destroyed the native sequence, so the +/// first suggestion in every field paid the full cold prompt decode. These pin the new contract: +/// prewarm prefills through the runtime and primes the reuse hint only when the prefill succeeded. 
+@MainActor +final class LlamaSuggestionEnginePrewarmTests: XCTestCase { + + func test_prewarm_prefillsAndPrimesTheReuseHint() async throws { + let runtime = RecordingPrewarmRuntime() + let engine = LlamaSuggestionEngine(runtimeManager: runtime) + let request = makeRequest(prompt: "hello wor") + + await engine.prewarm(for: request) + + XCTAssertEqual(runtime.prefillPrompts, ["hello wor"]) + + _ = try await engine.generateSuggestion(for: request) + XCTAssertEqual( + runtime.generateCachedPrefixBytes, + ["hello wor".utf8.count], + "A successful prefill should let the next identical-context request advertise full reuse." + ) + } + + func test_failedPrewarm_leavesReuseHintCold() async throws { + let runtime = RecordingPrewarmRuntime() + runtime.prefillError = LlamaRuntimeError.unavailable("not loaded") + let engine = LlamaSuggestionEngine(runtimeManager: runtime) + let request = makeRequest(prompt: "hello wor") + + await engine.prewarm(for: request) + + _ = try await engine.generateSuggestion(for: request) + XCTAssertEqual( + runtime.generateCachedPrefixBytes, + [nil], + "A failed prefill must not advertise reuse the native cache cannot back." 
+ ) + } + + func test_resetClearsThePrimedHint() async throws { + let runtime = RecordingPrewarmRuntime() + let engine = LlamaSuggestionEngine(runtimeManager: runtime) + let request = makeRequest(prompt: "hello wor") + + await engine.prewarm(for: request) + await engine.resetCachedGenerationContext() + + _ = try await engine.generateSuggestion(for: request) + XCTAssertEqual(runtime.generateCachedPrefixBytes, [nil]) + } + + // MARK: - Helpers + + private func makeRequest(prompt: String) -> SuggestionRequest { + let snapshot = FocusedInputSnapshot( + applicationName: "TestApp", + bundleIdentifier: "com.example.TestApp", + processIdentifier: 123, + elementIdentifier: "field", + role: "AXTextField", + subrole: nil, + caretRect: .zero, + inputFrameRect: nil, + caretSource: "test", + caretQuality: .exact, + observedCharWidth: nil, + precedingText: prompt, + trailingText: "", + selection: NSRange(location: prompt.count, length: 0), + isSecure: false + ) + let context = FocusedInputContext(snapshot: snapshot, generation: 1) + + return SuggestionRequest( + context: context, + prefixText: prompt, + prompt: prompt, + generation: context.generation, + maxPredictionTokens: 8, + temperature: 0.1, + topK: 20, + topP: 0.7, + minP: 0.08, + repetitionPenalty: 1.05, + randomSeed: 42, + maxSuffixCharacters: 192, + completionLengthInstruction: "Return only the next few words.", + userName: nil, + customRules: [], + languageInstruction: nil, + clipboardContext: nil, + visualContextSummary: nil, + isMultiLineEnabled: false + ) + } +} + +/// Records prefill calls and the reuse hints later generations advertise, so the prewarm contract +/// can be exercised without loading a real model. +@MainActor +private final class RecordingPrewarmRuntime: LlamaRuntimeGenerating { + var prefillError: Error? + var generateResult: Result<String, Error> = .success("ok") + private(set) var prefillPrompts: [String] = [] + private(set) var generateCachedPrefixBytes: [Int?] 
= [] + + func generate( + prompt: String, + cachedPrefixBytes: Int?, + options: LlamaGenerationOptions + ) async throws -> String { + generateCachedPrefixBytes.append(cachedPrefixBytes) + return try generateResult.get() + } + + func resetPromptCache() {} + + func prefill(prompt: String, cachedPrefixBytes: Int?, options: LlamaGenerationOptions) async throws { + if let prefillError { + throw prefillError + } + prefillPrompts.append(prompt) + } +}