diff --git a/swift/Sources/CoreAILanguageModels/InferenceEngines/CoreAISequentialVLMEngine.swift b/swift/Sources/CoreAILanguageModels/InferenceEngines/CoreAISequentialVLMEngine.swift index b9d2795..6ec79ff 100644 --- a/swift/Sources/CoreAILanguageModels/InferenceEngines/CoreAISequentialVLMEngine.swift +++ b/swift/Sources/CoreAILanguageModels/InferenceEngines/CoreAISequentialVLMEngine.swift @@ -381,7 +381,7 @@ public final class CoreAISequentialVLMEngine: MultimodalInferenceEngine, @unchec CLILogger.log("VLM encodeImage complete: \(tokenCount) embedding tokens") - return EmbeddedInput( + return try EmbeddedInput( embeddings: projectedEmbeddings, embeddingPositions: placeholderRange ) @@ -556,8 +556,8 @@ public final class CoreAISequentialVLMEngine: MultimodalInferenceEngine, @unchec + "expected \(imageTokenCount) from config. Check prompt template.") } - let seqLen = textEmbeddings.shape.count >= 2 ? textEmbeddings.shape[1] : 0 - let imgSeqLen = imageEmbeddings.shape.count >= 2 ? imageEmbeddings.shape[1] : 0 + let seqLen = textEmbeddings.shape[1] + let imgSeqLen = imageEmbeddings.shape[1] guard imgSeqLen >= imageTokenCount else { throw InferenceRuntimeError.invalidArgument( "scatterMerge: image embeddings have \(imgSeqLen) tokens, need \(imageTokenCount)") @@ -570,10 +570,10 @@ public final class CoreAISequentialVLMEngine: MultimodalInferenceEngine, @unchec } // Copy image embeddings into placeholder positions. - precondition( - imageEmbeddings.scalarType == .float16, - "scatterMerge only supports float16 embeddings; got \(imageEmbeddings.scalarType)" - ) + guard imageEmbeddings.scalarType == .float16 else { + throw InferenceRuntimeError.invalidInputType( + "scatterMerge only supports float16 embeddings; got \(imageEmbeddings.scalarType)") + } imageEmbeddings.view(as: Float16.self).withUnsafePointer { imgPtr, _, _ in var mutableView = merged.mutableView(as: Float16.self) mutableView.withUnsafeMutablePointer { mergedPtr, _, _ in diff --git a/swift/Sources/CoreAILanguageModels/InferenceEngines/EmbeddedInput.swift b/swift/Sources/CoreAILanguageModels/InferenceEngines/EmbeddedInput.swift index bdee65b..29e10fb 100644 --- a/swift/Sources/CoreAILanguageModels/InferenceEngines/EmbeddedInput.swift +++ b/swift/Sources/CoreAILanguageModels/InferenceEngines/EmbeddedInput.swift @@ -12,22 +12,25 @@ import Foundation /// language model. The engine performs scatter-merge: replacing placeholder /// token positions with these embeddings before the first forward pass. public struct EmbeddedInput: Sendable { - /// The embedding tensor, typically shape [1, seq_len, hidden_dim]. + /// The embedding tensor, shape [batch, seq_len, hidden_dim]. /// Scalar type matches the LLM's expected input (float16, bFloat16, etc.). public let embeddings: NDArray /// Positions in the token sequence where embeddings replace placeholders. public let embeddingPositions: Range - public init(embeddings: NDArray, embeddingPositions: Range) { + public init(embeddings: NDArray, embeddingPositions: Range) throws { + guard embeddings.shape.count == 3 else { + throw InferenceRuntimeError.invalidArgument( + "EmbeddedInput requires 3D embeddings [batch, seq_len, hidden_dim], " + + "got shape with \(embeddings.shape.count) dimensions") + } self.embeddings = embeddings self.embeddingPositions = embeddingPositions } /// Number of embedding tokens (seq_len dimension). - public var tokenCount: Int { - embeddings.shape.count >= 2 ? embeddings.shape[1] : 0 - } + public var tokenCount: Int { embeddings.shape[1] } // TODO: Multi-turn support — allow multiple image regions per input, // persistent across generate() calls (keep in KV cache on reset). diff --git a/swift/Sources/Tools/llm-runner/LLMRunnerMain.swift b/swift/Sources/Tools/llm-runner/LLMRunnerMain.swift index cff76fa..e3ecf72 100644 --- a/swift/Sources/Tools/llm-runner/LLMRunnerMain.swift +++ b/swift/Sources/Tools/llm-runner/LLMRunnerMain.swift @@ -179,9 +179,6 @@ struct LLMRunner: AsyncParsableCommand, Sendable { @Option(name: .customLong("image"), help: "Path to an image file for vision-language models") var imagePath: String? - @Option( - help: "Maximum tiles for image splitting (overrides model config). 1 = single crop, no tiling.") - @Flag(help: "Enable verbose logging") var verbose: Bool = false @@ -374,6 +371,7 @@ struct LLMRunner: AsyncParsableCommand, Sendable { ) let vlmConfig = VLMModelConfig(base: baseConfig, visionConfig: visionConfig) + // Sequential to avoid runtime errors with concurrent model preparation. let visionModel = try await PreparedModel.prepare(at: visionURL) let embedModel = try await PreparedModel.prepare(at: embedURL) let llmModel = try await PreparedModel.prepare(at: mainURL)