From 7fdd5856046efbeb2558a4f08e1a8ec1b9b88e2e Mon Sep 17 00:00:00 2001 From: sukru tikves Date: Thu, 2 Jul 2026 13:12:08 -0700 Subject: [PATCH 1/5] VLM: fix tokenCount for 2D tensors, replace precondition with throw - EmbeddedInput.tokenCount: handle 2D [seq_len, hidden_dim] vs 3D [batch, seq_len, hidden_dim] - scatterMerge: replace precondition(float16) with guard/throw for bfloat16 compatibility --- .../InferenceEngines/CoreAISequentialVLMEngine.swift | 8 ++++---- .../InferenceEngines/EmbeddedInput.swift | 6 +++++- swift/Sources/Tools/llm-runner/LLMRunnerMain.swift | 1 + 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/swift/Sources/CoreAILanguageModels/InferenceEngines/CoreAISequentialVLMEngine.swift b/swift/Sources/CoreAILanguageModels/InferenceEngines/CoreAISequentialVLMEngine.swift index b9d2795..41bdb99 100644 --- a/swift/Sources/CoreAILanguageModels/InferenceEngines/CoreAISequentialVLMEngine.swift +++ b/swift/Sources/CoreAILanguageModels/InferenceEngines/CoreAISequentialVLMEngine.swift @@ -570,10 +570,10 @@ public final class CoreAISequentialVLMEngine: MultimodalInferenceEngine, @unchec } // Copy image embeddings into placeholder positions. - precondition( - imageEmbeddings.scalarType == .float16, - "scatterMerge only supports float16 embeddings; got \(imageEmbeddings.scalarType)" - ) + guard imageEmbeddings.scalarType == .float16 else { + throw InferenceRuntimeError.invalidInputType( + "scatterMerge only supports float16 embeddings; got \(imageEmbeddings.scalarType)") + } imageEmbeddings.view(as: Float16.self).withUnsafePointer { imgPtr, _, _ in var mutableView = merged.mutableView(as: Float16.self) mutableView.withUnsafeMutablePointer { mergedPtr, _, _ in diff --git a/swift/Sources/CoreAILanguageModels/InferenceEngines/EmbeddedInput.swift b/swift/Sources/CoreAILanguageModels/InferenceEngines/EmbeddedInput.swift index bdee65b..f58b5ef 100644 --- a/swift/Sources/CoreAILanguageModels/InferenceEngines/EmbeddedInput.swift +++ b/swift/Sources/CoreAILanguageModels/InferenceEngines/EmbeddedInput.swift @@ -26,7 +26,11 @@ public struct EmbeddedInput: Sendable { /// Number of embedding tokens (seq_len dimension). public var tokenCount: Int { - embeddings.shape.count >= 2 ? embeddings.shape[1] : 0 + switch embeddings.shape.count { + case 3...: embeddings.shape[1] // [batch, seq_len, hidden_dim] + case 2: embeddings.shape[0] // [seq_len, hidden_dim] + default: 0 + } } // TODO: Multi-turn support — allow multiple image regions per input, diff --git a/swift/Sources/Tools/llm-runner/LLMRunnerMain.swift b/swift/Sources/Tools/llm-runner/LLMRunnerMain.swift index cff76fa..5619eab 100644 --- a/swift/Sources/Tools/llm-runner/LLMRunnerMain.swift +++ b/swift/Sources/Tools/llm-runner/LLMRunnerMain.swift @@ -374,6 +374,7 @@ struct LLMRunner: AsyncParsableCommand, Sendable { ) let vlmConfig = VLMModelConfig(base: baseConfig, visionConfig: visionConfig) + // Sequential to avoid runtime errors with concurrent model preparation. let visionModel = try await PreparedModel.prepare(at: visionURL) let embedModel = try await PreparedModel.prepare(at: embedURL) let llmModel = try await PreparedModel.prepare(at: mainURL) From 5cacbca018c38896737d44ca4e0aff49dfca2b77 Mon Sep 17 00:00:00 2001 From: sukru tikves Date: Thu, 2 Jul 2026 16:46:51 -0700 Subject: [PATCH 2/5] Fix dangling @Option for maxTiles (missing var declaration) The @Option attribute for "Maximum tiles" had no associated var, causing a compile error with ArgumentParser. --- swift/Sources/Tools/llm-runner/LLMRunnerMain.swift | 2 ++ 1 file changed, 2 insertions(+) diff --git a/swift/Sources/Tools/llm-runner/LLMRunnerMain.swift b/swift/Sources/Tools/llm-runner/LLMRunnerMain.swift index 5619eab..ab0cf60 100644 --- a/swift/Sources/Tools/llm-runner/LLMRunnerMain.swift +++ b/swift/Sources/Tools/llm-runner/LLMRunnerMain.swift @@ -180,7 +180,9 @@ struct LLMRunner: AsyncParsableCommand, Sendable { var imagePath: String? @Option( + name: .customLong("max-tiles"), help: "Maximum tiles for image splitting (overrides model config). 1 = single crop, no tiling.") + var maxTiles: Int? @Flag(help: "Enable verbose logging") var verbose: Bool = false From 4e87b0039de47ed2100665b282a2c9669e306d4b Mon Sep 17 00:00:00 2001 From: sukru tikves Date: Thu, 2 Jul 2026 18:31:25 -0700 Subject: [PATCH 3/5] Address review: remove premature maxTiles flag, centralize embedding shape handling - Remove --max-tiles CLI option (tiling model not implemented yet) - Move seq_len extraction into EmbeddedInput.seqLen(of:) so scatterMerge and other call sites use one canonical shape resolution path - tokenCount is now a stored property computed once at init --- .../CoreAISequentialVLMEngine.swift | 4 ++-- .../InferenceEngines/EmbeddedInput.swift | 20 +++++++++++++------ .../Tools/llm-runner/LLMRunnerMain.swift | 5 ----- 3 files changed, 16 insertions(+), 13 deletions(-) diff --git a/swift/Sources/CoreAILanguageModels/InferenceEngines/CoreAISequentialVLMEngine.swift b/swift/Sources/CoreAILanguageModels/InferenceEngines/CoreAISequentialVLMEngine.swift index 41bdb99..dd86de8 100644 --- a/swift/Sources/CoreAILanguageModels/InferenceEngines/CoreAISequentialVLMEngine.swift +++ b/swift/Sources/CoreAILanguageModels/InferenceEngines/CoreAISequentialVLMEngine.swift @@ -556,8 +556,8 @@ public final class CoreAISequentialVLMEngine: MultimodalInferenceEngine, @unchec + "expected \(imageTokenCount) from config. Check prompt template.") } - let seqLen = textEmbeddings.shape.count >= 2 ? textEmbeddings.shape[1] : 0 - let imgSeqLen = imageEmbeddings.shape.count >= 2 ? imageEmbeddings.shape[1] : 0 + let seqLen = EmbeddedInput.seqLen(of: textEmbeddings) + let imgSeqLen = EmbeddedInput.seqLen(of: imageEmbeddings) guard imgSeqLen >= imageTokenCount else { throw InferenceRuntimeError.invalidArgument( "scatterMerge: image embeddings have \(imgSeqLen) tokens, need \(imageTokenCount)") diff --git a/swift/Sources/CoreAILanguageModels/InferenceEngines/EmbeddedInput.swift b/swift/Sources/CoreAILanguageModels/InferenceEngines/EmbeddedInput.swift index f58b5ef..ff00f34 100644 --- a/swift/Sources/CoreAILanguageModels/InferenceEngines/EmbeddedInput.swift +++ b/swift/Sources/CoreAILanguageModels/InferenceEngines/EmbeddedInput.swift @@ -12,23 +12,31 @@ import Foundation /// language model. The engine performs scatter-merge: replacing placeholder /// token positions with these embeddings before the first forward pass. public struct EmbeddedInput: Sendable { - /// The embedding tensor, typically shape [1, seq_len, hidden_dim]. + /// The embedding tensor, shape [1, seq_len, hidden_dim] or [seq_len, hidden_dim]. /// Scalar type matches the LLM's expected input (float16, bFloat16, etc.). public let embeddings: NDArray /// Positions in the token sequence where embeddings replace placeholders. public let embeddingPositions: Range + /// The seq_len dimension, regardless of whether embeddings are 2D or 3D. + public let tokenCount: Int + public init(embeddings: NDArray, embeddingPositions: Range) { self.embeddings = embeddings self.embeddingPositions = embeddingPositions + switch embeddings.shape.count { + case 3...: self.tokenCount = embeddings.shape[1] + case 2: self.tokenCount = embeddings.shape[0] + default: self.tokenCount = 0 + } } - /// Number of embedding tokens (seq_len dimension). - public var tokenCount: Int { - switch embeddings.shape.count { - case 3...: embeddings.shape[1] // [batch, seq_len, hidden_dim] - case 2: embeddings.shape[0] // [seq_len, hidden_dim] + /// The seq_len dimension of an NDArray with the same layout conventions. + static func seqLen(of tensor: NDArray) -> Int { + switch tensor.shape.count { + case 3...: tensor.shape[1] + case 2: tensor.shape[0] default: 0 } } diff --git a/swift/Sources/Tools/llm-runner/LLMRunnerMain.swift b/swift/Sources/Tools/llm-runner/LLMRunnerMain.swift index ab0cf60..e3ecf72 100644 --- a/swift/Sources/Tools/llm-runner/LLMRunnerMain.swift +++ b/swift/Sources/Tools/llm-runner/LLMRunnerMain.swift @@ -179,11 +179,6 @@ struct LLMRunner: AsyncParsableCommand, Sendable { @Option(name: .customLong("image"), help: "Path to an image file for vision-language models") var imagePath: String? - @Option( - name: .customLong("max-tiles"), - help: "Maximum tiles for image splitting (overrides model config). 1 = single crop, no tiling.") - var maxTiles: Int? - @Flag(help: "Enable verbose logging") var verbose: Bool = false From 8fde6f10e11aeff4a8b0855d099f46e5a90d0a5c Mon Sep 17 00:00:00 2001 From: sukru tikves Date: Thu, 2 Jul 2026 18:35:26 -0700 Subject: [PATCH 4/5] Simplify EmbeddedInput: assume 3D [batch, seq_len, hidden_dim] layout All current VLM models produce 3D embeddings. Drop the 2D fallback and the seqLen helper -- tokenCount is just shape[1]. --- .../CoreAISequentialVLMEngine.swift | 4 ++-- .../InferenceEngines/EmbeddedInput.swift | 20 +++---------------- 2 files changed, 5 insertions(+), 19 deletions(-) diff --git a/swift/Sources/CoreAILanguageModels/InferenceEngines/CoreAISequentialVLMEngine.swift b/swift/Sources/CoreAILanguageModels/InferenceEngines/CoreAISequentialVLMEngine.swift index dd86de8..58d7e66 100644 --- a/swift/Sources/CoreAILanguageModels/InferenceEngines/CoreAISequentialVLMEngine.swift +++ b/swift/Sources/CoreAILanguageModels/InferenceEngines/CoreAISequentialVLMEngine.swift @@ -556,8 +556,8 @@ public final class CoreAISequentialVLMEngine: MultimodalInferenceEngine, @unchec + "expected \(imageTokenCount) from config. Check prompt template.") } - let seqLen = EmbeddedInput.seqLen(of: textEmbeddings) - let imgSeqLen = EmbeddedInput.seqLen(of: imageEmbeddings) + let seqLen = textEmbeddings.shape[1] + let imgSeqLen = imageEmbeddings.shape[1] guard imgSeqLen >= imageTokenCount else { throw InferenceRuntimeError.invalidArgument( "scatterMerge: image embeddings have \(imgSeqLen) tokens, need \(imageTokenCount)") diff --git a/swift/Sources/CoreAILanguageModels/InferenceEngines/EmbeddedInput.swift b/swift/Sources/CoreAILanguageModels/InferenceEngines/EmbeddedInput.swift index ff00f34..fd701fb 100644 --- a/swift/Sources/CoreAILanguageModels/InferenceEngines/EmbeddedInput.swift +++ b/swift/Sources/CoreAILanguageModels/InferenceEngines/EmbeddedInput.swift @@ -12,34 +12,20 @@ import Foundation /// language model. The engine performs scatter-merge: replacing placeholder /// token positions with these embeddings before the first forward pass. public struct EmbeddedInput: Sendable { - /// The embedding tensor, shape [1, seq_len, hidden_dim] or [seq_len, hidden_dim]. + /// The embedding tensor, shape [batch, seq_len, hidden_dim]. /// Scalar type matches the LLM's expected input (float16, bFloat16, etc.). public let embeddings: NDArray /// Positions in the token sequence where embeddings replace placeholders. public let embeddingPositions: Range - /// The seq_len dimension, regardless of whether embeddings are 2D or 3D. - public let tokenCount: Int - public init(embeddings: NDArray, embeddingPositions: Range) { self.embeddings = embeddings self.embeddingPositions = embeddingPositions - switch embeddings.shape.count { - case 3...: self.tokenCount = embeddings.shape[1] - case 2: self.tokenCount = embeddings.shape[0] - default: self.tokenCount = 0 - } } - /// The seq_len dimension of an NDArray with the same layout conventions. - static func seqLen(of tensor: NDArray) -> Int { - switch tensor.shape.count { - case 3...: tensor.shape[1] - case 2: tensor.shape[0] - default: 0 - } - } + /// Number of embedding tokens (seq_len dimension). + public var tokenCount: Int { embeddings.shape[1] } // TODO: Multi-turn support — allow multiple image regions per input, // persistent across generate() calls (keep in KV cache on reset). From b5aa78db45c5e891a65999bff27adc48ba324b93 Mon Sep 17 00:00:00 2001 From: sukru tikves Date: Thu, 2 Jul 2026 18:36:22 -0700 Subject: [PATCH 5/5] EmbeddedInput: validate exactly 3D shape at init, throw on mismatch --- .../InferenceEngines/CoreAISequentialVLMEngine.swift | 2 +- .../InferenceEngines/EmbeddedInput.swift | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/swift/Sources/CoreAILanguageModels/InferenceEngines/CoreAISequentialVLMEngine.swift b/swift/Sources/CoreAILanguageModels/InferenceEngines/CoreAISequentialVLMEngine.swift index 58d7e66..6ec79ff 100644 --- a/swift/Sources/CoreAILanguageModels/InferenceEngines/CoreAISequentialVLMEngine.swift +++ b/swift/Sources/CoreAILanguageModels/InferenceEngines/CoreAISequentialVLMEngine.swift @@ -381,7 +381,7 @@ public final class CoreAISequentialVLMEngine: MultimodalInferenceEngine, @unchec CLILogger.log("VLM encodeImage complete: \(tokenCount) embedding tokens") - return EmbeddedInput( + return try EmbeddedInput( embeddings: projectedEmbeddings, embeddingPositions: placeholderRange ) diff --git a/swift/Sources/CoreAILanguageModels/InferenceEngines/EmbeddedInput.swift b/swift/Sources/CoreAILanguageModels/InferenceEngines/EmbeddedInput.swift index fd701fb..29e10fb 100644 --- a/swift/Sources/CoreAILanguageModels/InferenceEngines/EmbeddedInput.swift +++ b/swift/Sources/CoreAILanguageModels/InferenceEngines/EmbeddedInput.swift @@ -19,7 +19,12 @@ public struct EmbeddedInput: Sendable { /// Positions in the token sequence where embeddings replace placeholders. public let embeddingPositions: Range - public init(embeddings: NDArray, embeddingPositions: Range) { + public init(embeddings: NDArray, embeddingPositions: Range) throws { + guard embeddings.shape.count == 3 else { + throw InferenceRuntimeError.invalidArgument( + "EmbeddedInput requires 3D embeddings [batch, seq_len, hidden_dim], " + + "got shape with \(embeddings.shape.count) dimensions") + } self.embeddings = embeddings self.embeddingPositions = embeddingPositions }