apple · stikves · Jul 2, 2026 · Jul 2, 2026 · Jul 2, 2026 · Jul 3, 2026
diff --git a/swift/Sources/CoreAILanguageModels/InferenceEngines/CoreAISequentialVLMEngine.swift b/swift/Sources/CoreAILanguageModels/InferenceEngines/CoreAISequentialVLMEngine.swift
@@ -381,7 +381,7 @@ public final class CoreAISequentialVLMEngine: MultimodalInferenceEngine, @unchec
 
         CLILogger.log("VLM encodeImage complete: \(tokenCount) embedding tokens")
 
-        return EmbeddedInput(
+        return try EmbeddedInput(
             embeddings: projectedEmbeddings,
             embeddingPositions: placeholderRange
         )
@@ -556,8 +556,8 @@ public final class CoreAISequentialVLMEngine: MultimodalInferenceEngine, @unchec
                     + "expected \(imageTokenCount) from config. Check prompt template.")
         }
 
-        let seqLen = textEmbeddings.shape.count >= 2 ? textEmbeddings.shape[1] : 0
-        let imgSeqLen = imageEmbeddings.shape.count >= 2 ? imageEmbeddings.shape[1] : 0
+        let seqLen = textEmbeddings.shape[1]
+        let imgSeqLen = imageEmbeddings.shape[1]
         guard imgSeqLen >= imageTokenCount else {
             throw InferenceRuntimeError.invalidArgument(
                 "scatterMerge: image embeddings have \(imgSeqLen) tokens, need \(imageTokenCount)")
@@ -570,10 +570,10 @@ public final class CoreAISequentialVLMEngine: MultimodalInferenceEngine, @unchec
         }
 
         // Copy image embeddings into placeholder positions.
-        precondition(
-            imageEmbeddings.scalarType == .float16,
-            "scatterMerge only supports float16 embeddings; got \(imageEmbeddings.scalarType)"
-        )
+        guard imageEmbeddings.scalarType == .float16 else {
+            throw InferenceRuntimeError.invalidInputType(
+                "scatterMerge only supports float16 embeddings; got \(imageEmbeddings.scalarType)")
+        }
         imageEmbeddings.view(as: Float16.self).withUnsafePointer { imgPtr, _, _ in
             var mutableView = merged.mutableView(as: Float16.self)
             mutableView.withUnsafeMutablePointer { mergedPtr, _, _ in

diff --git a/swift/Sources/CoreAILanguageModels/InferenceEngines/EmbeddedInput.swift b/swift/Sources/CoreAILanguageModels/InferenceEngines/EmbeddedInput.swift
@@ -12,22 +12,25 @@ import Foundation
 /// language model. The engine performs scatter-merge: replacing placeholder
 /// token positions with these embeddings before the first forward pass.
 public struct EmbeddedInput: Sendable {
-    /// The embedding tensor, typically shape [1, seq_len, hidden_dim].
+    /// The embedding tensor; must be 3-D with shape [batch, seq_len, hidden_dim] (enforced by `init`).
     /// Scalar type matches the LLM's expected input (float16, bFloat16, etc.).
     public let embeddings: NDArray
 
     /// Positions in the token sequence where embeddings replace placeholders.
     public let embeddingPositions: Range<Int>
 
-    public init(embeddings: NDArray, embeddingPositions: Range<Int>) {
+    public init(embeddings: NDArray, embeddingPositions: Range<Int>) throws {
+        guard embeddings.shape.count == 3 else {
+            throw InferenceRuntimeError.invalidArgument(
+                "EmbeddedInput requires 3D embeddings [batch, seq_len, hidden_dim], "
+                    + "got shape with \(embeddings.shape.count) dimensions")
+        }
         self.embeddings = embeddings
         self.embeddingPositions = embeddingPositions
     }
 
     /// Number of embedding tokens (seq_len dimension).
-    public var tokenCount: Int {
-        embeddings.shape.count >= 2 ? embeddings.shape[1] : 0
-    }
+    public var tokenCount: Int { embeddings.shape[1] }
 
     // TODO: Multi-turn support — allow multiple image regions per input,
     // persistent across generate() calls (keep in KV cache on reset).

diff --git a/swift/Sources/Tools/llm-runner/LLMRunnerMain.swift b/swift/Sources/Tools/llm-runner/LLMRunnerMain.swift
@@ -179,9 +179,6 @@ struct LLMRunner: AsyncParsableCommand, Sendable {
     @Option(name: .customLong("image"), help: "Path to an image file for vision-language models")
     var imagePath: String?
 
-    @Option(
-        help: "Maximum tiles for image splitting (overrides model config). 1 = single crop, no tiling.")
-
     @Flag(help: "Enable verbose logging")
     var verbose: Bool = false
 
@@ -374,6 +371,7 @@ struct LLMRunner: AsyncParsableCommand, Sendable {
             )
             let vlmConfig = VLMModelConfig(base: baseConfig, visionConfig: visionConfig)
 
+            // Prepare the models sequentially: preparing them concurrently leads to runtime errors.
             let visionModel = try await PreparedModel.prepare(at: visionURL)
             let embedModel = try await PreparedModel.prepare(at: embedURL)
             let llmModel = try await PreparedModel.prepare(at: mainURL)