From 0ac42b4a049664264cb6b4fd1e4110447ea5c2e4 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Thu, 19 Feb 2026 16:18:38 +0100
Subject: [PATCH 01/46] feat: initial implementation of multimodal runner with
 lfm vlm

---
 apps/llm/app.json                             |   3 +-
 apps/llm/app/_layout.tsx                      |  22 +-
 apps/llm/app/index.tsx                        |  22 +-
 apps/llm/app/multimodal_llm/index.tsx         | 389 ++++++++++++++++++
 apps/llm/package.json                         |   2 +
 .../rnexecutorch/RnExecutorchInstaller.cpp    |   6 +
 .../host_objects/ModelHostObject.h            |  24 +-
 .../models/multimodal_llm/MultimodalLLM.cpp   | 197 +++++++++
 .../models/multimodal_llm/MultimodalLLM.h     |  40 ++
 .../common/runner/image.h                     |  88 ++++
 .../common/runner/multimodal_decoder_runner.h |  73 ++++
 .../common/runner/multimodal_input.h          |  74 ++++
 .../common/runner/multimodal_prefiller.cpp    | 179 ++++++++
 .../common/runner/multimodal_prefiller.h      |  48 +++
 .../common/runner/multimodal_runner.cpp       | 149 +++++++
 .../common/runner/multimodal_runner.h         |  68 +++
 .../useMultimodalLLM.ts                       | 153 +++++++
 packages/react-native-executorch/src/index.ts |   3 +
 18 files changed, 1511 insertions(+), 29 deletions(-)
 create mode 100644 apps/llm/app/multimodal_llm/index.tsx
 create mode 100644 packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.cpp
 create mode 100644 packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.h
 create mode 100644 packages/react-native-executorch/common/runner/image.h
 create mode 100644 packages/react-native-executorch/common/runner/multimodal_decoder_runner.h
 create mode 100644 packages/react-native-executorch/common/runner/multimodal_input.h
 create mode 100644 packages/react-native-executorch/common/runner/multimodal_prefiller.cpp
 create mode 100644 packages/react-native-executorch/common/runner/multimodal_prefiller.h
 create mode 100644 packages/react-native-executorch/common/runner/multimodal_runner.cpp
 create mode 100644 packages/react-native-executorch/common/runner/multimodal_runner.h
 create mode 100644 packages/react-native-executorch/src/hooks/natural_language_processing/useMultimodalLLM.ts

diff --git a/apps/llm/app.json b/apps/llm/app.json
index e4d2da0d6..36a042341 100644
--- a/apps/llm/app.json
+++ b/apps/llm/app.json
@@ -55,7 +55,8 @@
       },
       "entitlements": {
         "com.apple.developer.kernel.increased-memory-limit": true
-      }
+      },
+      "appleTeamId": "B357MU264T"
     },
     "android": {
       "adaptiveIcon": {
diff --git a/apps/llm/app/_layout.tsx b/apps/llm/app/_layout.tsx
index 5ece80f1f..523d3aaf7 100644
--- a/apps/llm/app/_layout.tsx
+++ b/apps/llm/app/_layout.tsx
@@ -57,37 +57,45 @@ export default function _layout() {
           headerTitleStyle: { color: ColorPalette.primary },
         }}
       >
-        <Drawer.Screen
+        {/* <Drawer.Screen
           name="llm/index"
           options={{
             drawerLabel: 'LLM',
             title: 'LLM',
             headerTitleStyle: { color: ColorPalette.primary },
           }}
-        />
-        <Drawer.Screen
+        /> */}
+        {/* <Drawer.Screen
           name="llm_tool_calling/index"
           options={{
             drawerLabel: 'LLM Tool Calling',
             title: 'LLM Tool Calling',
             headerTitleStyle: { color: ColorPalette.primary },
           }}
-        />
-        <Drawer.Screen
+        /> */}
+        {/* <Drawer.Screen
           name="llm_structured_output/index"
           options={{
             drawerLabel: 'LLM Structured Output',
             title: 'LLM Structured Output',
             headerTitleStyle: { color: ColorPalette.primary },
           }}
-        />
-        <Drawer.Screen
+        /> */}
+        {/* <Drawer.Screen
           name="voice_chat/index"
           options={{
             drawerLabel: 'Voice Chat',
             title: 'Voice Chat',
             headerTitleStyle: { color: ColorPalette.primary },
           }}
+        /> */}
+        <Drawer.Screen
+          name="multimodal_llm/index"
+          options={{
+            drawerLabel: 'Multimodal LLM (VLM)',
+            title: 'Multimodal LLM',
+            headerTitleStyle: { color: ColorPalette.primary },
+          }}
         />
         <Drawer.Screen
           name="index"
diff --git a/apps/llm/app/index.tsx b/apps/llm/app/index.tsx
index 899746206..7c723a2ba 100644
--- a/apps/llm/app/index.tsx
+++ b/apps/llm/app/index.tsx
@@ -13,27 +13,9 @@ export default function Home() {
       <View style={styles.buttonContainer}>
         <TouchableOpacity
           style={styles.button}
-          onPress={() => router.navigate('llm/')}
+          onPress={() => router.navigate('multimodal_llm/')}
         >
-          <Text style={styles.buttonText}>LLM</Text>
-        </TouchableOpacity>
-        <TouchableOpacity
-          style={styles.button}
-          onPress={() => router.navigate('llm_tool_calling/')}
-        >
-          <Text style={styles.buttonText}>LLM Tool Calling</Text>
-        </TouchableOpacity>
-        <TouchableOpacity
-          style={styles.button}
-          onPress={() => router.navigate('llm_structured_output/')}
-        >
-          <Text style={styles.buttonText}>LLM Structured Output</Text>
-        </TouchableOpacity>
-        <TouchableOpacity
-          style={styles.button}
-          onPress={() => router.navigate('voice_chat/')}
-        >
-          <Text style={styles.buttonText}>Voice Chat</Text>
+          <Text style={styles.buttonText}>Multimodal LLM (VLM)</Text>
         </TouchableOpacity>
       </View>
     </View>
diff --git a/apps/llm/app/multimodal_llm/index.tsx b/apps/llm/app/multimodal_llm/index.tsx
new file mode 100644
index 000000000..3a3b4692b
--- /dev/null
+++ b/apps/llm/app/multimodal_llm/index.tsx
@@ -0,0 +1,389 @@
+import { useContext, useEffect, useRef, useState } from 'react';
+import {
+  Image,
+  Keyboard,
+  KeyboardAvoidingView,
+  Platform,
+  ScrollView,
+  StyleSheet,
+  Text,
+  TextInput,
+  TouchableOpacity,
+  TouchableWithoutFeedback,
+  View,
+} from 'react-native';
+import * as DocumentPicker from 'expo-document-picker';
+import { launchImageLibrary } from 'react-native-image-picker';
+import { useIsFocused } from '@react-navigation/native';
+import { useMultimodalLLM } from 'react-native-executorch';
+import ColorPalette from '../../colors';
+import Spinner from '../../components/Spinner';
+import { GeneratingContext } from '../../context';
+
+export default function MultimodalLLMScreenWrapper() {
+  if (!useIsFocused()) return null; // render nothing while out of focus
+  return <MultimodalLLMScreenOuter />;
+}
+
+// Outer component: collect model + tokenizer URIs before mounting the hook
+function MultimodalLLMScreenOuter() {
+  const [modelUri, setModelUri] = useState<string | null>(null);
+  const [tokenizerUri, setTokenizerUri] = useState<string | null>(null);
+  const [confirmed, setConfirmed] = useState(false);
+  const incomplete = !modelUri || !tokenizerUri;
+
+  // Failures are logged so they never surface as unhandled rejections.
+  const pickFile = async (setter: (uri: string) => void) => {
+    try {
+      const result = await DocumentPicker.getDocumentAsync({
+        copyToCacheDirectory: false,
+        multiple: false,
+      });
+      const uri = result.canceled ? null : result.assets[0]?.uri;
+      if (uri) setter(uri);
+    } catch (e) {
+      console.error('File picker error:', e);
+    }
+  };
+
+  if (!confirmed || !modelUri || !tokenizerUri) {
+    return (
+      <View style={styles.setupContainer}>
+        <Text style={styles.setupTitle}>Select model files</Text>
+        <Text style={styles.setupHint}>
+          Pick the .pte model and tokenizer.json from your device storage.
+        </Text>
+
+        <FilePicker
+          label="Model (.pte)"
+          uri={modelUri}
+          onPick={() => pickFile(setModelUri)}
+        />
+        <FilePicker
+          label="Tokenizer (.json)"
+          uri={tokenizerUri}
+          onPick={() => pickFile(setTokenizerUri)}
+        />
+
+        <TouchableOpacity
+          style={[styles.loadButton, incomplete && styles.loadButtonDisabled]}
+          disabled={incomplete}
+          onPress={() => setConfirmed(true)}
+        >
+          <Text style={styles.loadButtonText}>Load model</Text>
+        </TouchableOpacity>
+      </View>
+    );
+  }
+
+  return (
+    <MultimodalLLMScreen
+      modelSource={modelUri}
+      tokenizerSource={tokenizerUri}
+    />
+  );
+}
+
+// Tappable row showing a label plus the picked file's name, or a hint.
+function FilePicker(props: {
+  label: string;
+  uri: string | null;
+  onPick: () => void;
+}) {
+  const { label, uri, onPick } = props;
+  // Display only the last path segment of the URI.
+  const pickedName = uri ? (uri.split('/').pop() ?? uri) : null;
+  const valueTone = uri
+    ? styles.filePickerValueSet
+    : styles.filePickerValueEmpty;
+  const caption = pickedName ?? 'Tap to pick file';
+  return (
+    <TouchableOpacity style={styles.filePickerRow} onPress={onPick}>
+      <View style={styles.filePickerInfo}>
+        <Text style={styles.filePickerLabel}>{label}</Text>
+        <Text
+          style={[styles.filePickerValue, valueTone]}
+          numberOfLines={1}
+          ellipsizeMode="middle"
+        >
+          {caption}
+        </Text>
+      </View>
+      <Text style={styles.filePickerChevron}>›</Text>
+    </TouchableOpacity>
+  );
+}
+
+function MultimodalLLMScreen({
+  modelSource,
+  tokenizerSource,
+}: {
+  modelSource: string;
+  tokenizerSource: string;
+}) {
+  const [imageUri, setImageUri] = useState<string | null>(null);
+  const [prompt, setPrompt] = useState('');
+  const [isTextInputFocused, setIsTextInputFocused] = useState(false);
+  const scrollViewRef = useRef<ScrollView>(null); // for auto-scroll
+  const { setGlobalGenerating } = useContext(GeneratingContext);
+
+  const vlm = useMultimodalLLM({ model: { modelSource, tokenizerSource } });
+
+  useEffect(() => {
+    setGlobalGenerating(vlm.isGenerating);
+  }, [vlm.isGenerating, setGlobalGenerating]);
+
+  useEffect(() => {
+    if (vlm.error) {
+      console.error('MultimodalLLM error:', vlm.error);
+    }
+  }, [vlm.error]);
+
+  const pickImage = async () => {
+    const result = await launchImageLibrary({ mediaType: 'photo' });
+    if (result.assets && result.assets.length > 0) {
+      const uri = result.assets[0]?.uri;
+      if (uri) {
+        setImageUri(uri);
+      }
+    }
+  };
+
+  const handleGenerate = async () => {
+    if (!imageUri || !prompt.trim() || !vlm.isReady || vlm.isGenerating) return;
+    Keyboard.dismiss(); // hide the keyboard before generating
+    try {
+      await vlm.generate(imageUri, prompt.trim());
+    } catch (e) {
+      console.error('Generation error:', e);
+    }
+  };
+
+  if (!vlm.isReady) {
+    return (
+      <Spinner
+        visible={!vlm.isReady}
+        textContent={
+          vlm.error
+            ? `Error: ${vlm.error.message}`
+            : `Loading model ${(vlm.downloadProgress * 100).toFixed(0)}%`
+        }
+      />
+    );
+  }
+
+  return (
+    <TouchableWithoutFeedback onPress={Keyboard.dismiss}>
+      <KeyboardAvoidingView
+        style={styles.container}
+        collapsable={false}
+        behavior={Platform.OS === 'ios' ? 'padding' : undefined}
+        keyboardVerticalOffset={Platform.OS === 'ios' ? 120 : 40}
+      >
+        <ScrollView
+          ref={scrollViewRef}
+          style={styles.scrollView}
+          contentContainerStyle={styles.scrollContent}
+          onContentSizeChange={() =>
+            scrollViewRef.current?.scrollToEnd({ animated: true })
+          }
+        >
+          {/* Image picker: tap to choose or replace the photo */}
+          <TouchableOpacity style={styles.imagePicker} onPress={pickImage}>
+            {imageUri ? (
+              <Image
+                source={{ uri: imageUri }}
+                style={styles.previewImage}
+                resizeMode="cover"
+              />
+            ) : (
+              <Text style={styles.imagePickerText}>Tap to pick an image</Text>
+            )}
+          </TouchableOpacity>
+
+          {/* Response area: response text, else a placeholder while generating */}
+          {vlm.response ? (
+            <View style={styles.responseContainer}>
+              <Text style={styles.responseLabel}>Response:</Text>
+              <Text style={styles.responseText}>{vlm.response}</Text>
+            </View>
+          ) : vlm.isGenerating ? (
+            <View style={styles.responseContainer}>
+              <Text style={styles.responseLabel}>Generating…</Text>
+            </View>
+          ) : null}
+        </ScrollView>
+
+        {/* Bottom bar: prompt input plus the Ask / Stop button */}
+        <View style={styles.bottomContainer}>
+          <TextInput
+            autoCorrect={false}
+            onFocus={() => setIsTextInputFocused(true)}
+            onBlur={() => setIsTextInputFocused(false)}
+            style={[
+              styles.textInput,
+              {
+                borderColor: isTextInputFocused
+                  ? ColorPalette.blueDark
+                  : ColorPalette.blueLight,
+              },
+            ]}
+            placeholder="Ask about the image…"
+            placeholderTextColor="#C1C6E5"
+            multiline
+            value={prompt}
+            onChangeText={setPrompt}
+          />
+          {vlm.isGenerating ? (
+            <TouchableOpacity
+              style={styles.actionButton}
+              onPress={vlm.interrupt}
+            >
+              <Text style={styles.actionButtonText}>Stop</Text>
+            </TouchableOpacity>
+          ) : (
+            <TouchableOpacity
+              style={[
+                styles.actionButton,
+                (!imageUri || !prompt.trim()) && styles.actionButtonDisabled,
+              ]}
+              onPress={handleGenerate}
+              disabled={!imageUri || !prompt.trim()}
+            >
+              <Text style={styles.actionButtonText}>Ask</Text>
+            </TouchableOpacity>
+          )}
+        </View>
+      </KeyboardAvoidingView>
+    </TouchableWithoutFeedback>
+  );
+}
+
+const styles = StyleSheet.create({
+  // Setup phase: file-selection screen rendered by MultimodalLLMScreenOuter
+  setupContainer: {
+    flex: 1,
+    padding: 24,
+    backgroundColor: '#fff',
+    justifyContent: 'center',
+  },
+  setupTitle: {
+    fontSize: 20,
+    fontFamily: 'medium',
+    color: ColorPalette.primary,
+    marginBottom: 8,
+  },
+  setupHint: {
+    fontSize: 13,
+    fontFamily: 'regular',
+    color: ColorPalette.blueDark,
+    marginBottom: 32,
+    lineHeight: 18,
+  },
+  filePickerRow: {
+    flexDirection: 'row',
+    alignItems: 'center',
+    borderWidth: 1,
+    borderColor: ColorPalette.blueLight,
+    borderRadius: 10,
+    padding: 14,
+    marginBottom: 12,
+    backgroundColor: '#fafbff',
+  },
+  filePickerInfo: { flex: 1 },
+  filePickerLabel: {
+    fontSize: 12,
+    fontFamily: 'medium',
+    color: ColorPalette.blueDark,
+    marginBottom: 2,
+  },
+  filePickerValue: { fontSize: 14, fontFamily: 'regular' },
+  filePickerValueSet: { color: ColorPalette.primary },
+  filePickerValueEmpty: { color: ColorPalette.blueLight },
+  filePickerChevron: {
+    fontSize: 24,
+    color: ColorPalette.blueLight,
+    marginLeft: 8,
+  },
+  loadButton: {
+    marginTop: 16,
+    backgroundColor: ColorPalette.strongPrimary,
+    borderRadius: 10,
+    padding: 14,
+    alignItems: 'center',
+  },
+  loadButtonDisabled: { backgroundColor: ColorPalette.blueLight },
+  loadButtonText: { color: '#fff', fontFamily: 'medium', fontSize: 15 },
+
+  // Chat phase: image, response and input bar rendered by MultimodalLLMScreen
+  container: { flex: 1, backgroundColor: '#fff' },
+  scrollView: { flex: 1 },
+  scrollContent: { padding: 16, paddingBottom: 8 },
+  imagePicker: {
+    width: '100%',
+    height: 220,
+    borderRadius: 12,
+    borderWidth: 1,
+    borderColor: ColorPalette.blueLight,
+    borderStyle: 'dashed',
+    justifyContent: 'center',
+    alignItems: 'center',
+    overflow: 'hidden',
+    marginBottom: 16,
+  },
+  previewImage: { width: '100%', height: '100%' },
+  imagePickerText: {
+    color: ColorPalette.blueLight,
+    fontSize: 16,
+    fontFamily: 'regular',
+  },
+  responseContainer: {
+    backgroundColor: ColorPalette.seaBlueLight,
+    borderRadius: 8,
+    padding: 12,
+    marginBottom: 8,
+  },
+  responseLabel: {
+    fontSize: 12,
+    color: ColorPalette.blueDark,
+    fontFamily: 'medium',
+    marginBottom: 4,
+  },
+  responseText: {
+    fontSize: 14,
+    lineHeight: 20,
+    color: ColorPalette.primary,
+    fontFamily: 'regular',
+  },
+  bottomContainer: {
+    flexDirection: 'row',
+    alignItems: 'center',
+    paddingHorizontal: 16,
+    paddingVertical: 12,
+    borderTopWidth: 1,
+    borderTopColor: ColorPalette.blueLight,
+    backgroundColor: '#fff',
+  },
+  textInput: {
+    flex: 1,
+    borderWidth: 1,
+    borderRadius: 8,
+    fontSize: 14,
+    lineHeight: 19.6,
+    fontFamily: 'regular',
+    color: ColorPalette.primary,
+    padding: 12,
+    maxHeight: 100,
+  },
+  actionButton: {
+    marginLeft: 8,
+    backgroundColor: ColorPalette.strongPrimary,
+    borderRadius: 8,
+    paddingHorizontal: 16,
+    paddingVertical: 12,
+    justifyContent: 'center',
+    alignItems: 'center',
+  },
+  actionButtonDisabled: { backgroundColor: ColorPalette.blueLight },
+  actionButtonText: { color: '#fff', fontFamily: 'medium', fontSize: 14 },
+});
diff --git a/apps/llm/package.json b/apps/llm/package.json
index f58bc8127..d0fbb6401 100644
--- a/apps/llm/package.json
+++ b/apps/llm/package.json
@@ -19,6 +19,7 @@
     "expo-brightness": "~14.0.8",
     "expo-calendar": "~15.0.8",
     "expo-constants": "~18.0.11",
+    "expo-document-picker": "~13.0.3",
     "expo-font": "~14.0.10",
     "expo-linking": "~8.0.10",
     "expo-router": "~6.0.17",
@@ -30,6 +31,7 @@
     "react-native-device-info": "^15.0.2",
     "react-native-executorch": "workspace:*",
     "react-native-gesture-handler": "~2.28.0",
+    "react-native-image-picker": "^7.2.2",
     "react-native-loading-spinner-overlay": "^3.0.1",
     "react-native-markdown-display": "^7.0.2",
     "react-native-reanimated": "~4.1.1",
diff --git a/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp b/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp
index 9d4b419e2..4cb6afef8 100644
--- a/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp
@@ -6,6 +6,7 @@
 #include <rnexecutorch/models/embeddings/image/ImageEmbeddings.h>
 #include <rnexecutorch/models/embeddings/text/TextEmbeddings.h>
 #include <rnexecutorch/models/llm/LLM.h>
+#include <rnexecutorch/models/multimodal_llm/MultimodalLLM.h>
 #include <rnexecutorch/models/object_detection/ObjectDetection.h>
 #include <rnexecutorch/models/ocr/OCR.h>
 #include <rnexecutorch/models/semantic_segmentation/BaseSemanticSegmentation.h>
@@ -88,6 +89,11 @@ void RnExecutorchInstaller::injectJSIBindings(
       RnExecutorchInstaller::loadModel<models::llm::LLM>(
           jsiRuntime, jsCallInvoker, "loadLLM"));
 
+  jsiRuntime->global().setProperty(
+      *jsiRuntime, "loadMultimodalLLM",
+      RnExecutorchInstaller::loadModel<models::multimodal_llm::MultimodalLLM>(
+          jsiRuntime, jsCallInvoker, "loadMultimodalLLM"));
+
   jsiRuntime->global().setProperty(
       *jsiRuntime, "loadOCR",
       RnExecutorchInstaller::loadModel<models::ocr::OCR>(
diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
index 7712b2b9d..2b7cbc2e1 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
@@ -18,6 +18,7 @@
 #include <rnexecutorch/metaprogramming/TypeConcepts.h>
 #include <rnexecutorch/models/BaseModel.h>
 #include <rnexecutorch/models/llm/LLM.h>
+#include <rnexecutorch/models/multimodal_llm/MultimodalLLM.h>
 #include <rnexecutorch/models/ocr/OCR.h>
 #include <rnexecutorch/models/speech_to_text/SpeechToText.h>
 #include <rnexecutorch/models/text_to_image/TextToImage.h>
@@ -32,7 +33,11 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
   explicit ModelHostObject(const std::shared_ptr<Model> &model,
                            std::shared_ptr<react::CallInvoker> callInvoker)
       : model(model), callInvoker(callInvoker) {
-    if constexpr (meta::DerivedFromOrSameAs<Model, models::BaseModel>) {
+    // MultimodalLLM moves module_ into its runner during construction, so
+    // the base class methods that go through module_ (forward, getInputShape)
+    // are unsafe to expose. Its unload is registered separately below.
+    if constexpr (meta::DerivedFromOrSameAs<Model, models::BaseModel> &&
+                  !meta::SameAs<Model, models::multimodal_llm::MultimodalLLM>) {
       addFunctions(
           JSI_EXPORT_FUNCTION(ModelHostObject<Model>, unload, "unload"));
 
@@ -172,6 +177,23 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
           ModelHostObject<Model>, synchronousHostFunction<&Model::streamStop>,
           "streamStop"));
     }
+
+    if constexpr (meta::SameAs<Model, models::multimodal_llm::MultimodalLLM>) {
+      addFunctions(JSI_EXPORT_FUNCTION(
+          ModelHostObject<Model>, synchronousHostFunction<&Model::interrupt>,
+          "interrupt"));
+
+      addFunctions(JSI_EXPORT_FUNCTION(
+          ModelHostObject<Model>,
+          synchronousHostFunction<&Model::setTemperature>, "setTemperature"));
+
+      addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject<Model>,
+                                       synchronousHostFunction<&Model::setTopp>,
+                                       "setTopp"));
+
+      addFunctions(
+          JSI_EXPORT_FUNCTION(ModelHostObject<Model>, unload, "unload"));
+    }
   }
 
   // A generic host function that runs synchronously, works analogously to the
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.cpp
new file mode 100644
index 000000000..7187b3d57
--- /dev/null
+++ b/packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.cpp
@@ -0,0 +1,197 @@
+#include "MultimodalLLM.h"
+
+#include <executorch/extension/module/module.h>
+#include <executorch/extension/tensor/tensor.h>
+#include <filesystem>
+#include <rnexecutorch/Error.h>
+#include <rnexecutorch/data_processing/ImageProcessing.h>
+#include <runner/multimodal_decoder_runner.h>
+#include <runner/multimodal_prefiller.h>
+
+namespace rnexecutorch::models::multimodal_llm {
+namespace llm = ::executorch::extension::llm;
+namespace fs = std::filesystem;
+using namespace facebook;
+using ::executorch::extension::module::Module;
+using ::executorch::runtime::Error;
+
+// LFM2-VL vision encoder expects [1, 3, 512, 512] NCHW float32, values in
+// [0,255]. Normalization and patch unfolding are baked into the exported PTE.
+static constexpr int kImageSize = 512; // used for both width and height
+static constexpr int kImageChannels = 3;
+
+// LFM2-VL chat template: text placed before the image and after the prompt.
+static constexpr const char *kChatPrefix = "<|startoftext|><|im_start|>user\n";
+static constexpr const char *kChatSuffix =
+    "<|im_end|>\n<|im_start|>assistant\n";
+
+// Loads an image as kImageSize x kImageSize RGB, CHW float32 in [0, 255].
+static llm::Image loadImageForLFM2(const std::string &imagePath) {
+  cv::Mat rgb = image_processing::readImage(imagePath);
+  cv::resize(rgb, rgb, cv::Size(kImageSize, kImageSize));
+  cv::cvtColor(rgb, rgb, cv::COLOR_BGR2RGB);
+  constexpr int kPlane = kImageSize * kImageSize;
+  std::vector<float> planar(kImageChannels * kPlane);
+  for (int row = 0, idx = 0; row < kImageSize; ++row) {
+    for (int col = 0; col < kImageSize; ++col, ++idx) {
+      const cv::Vec3b texel = rgb.at<cv::Vec3b>(row, col);
+      for (int ch = 0; ch < kImageChannels; ++ch)
+        planar[ch * kPlane + idx] = static_cast<float>(texel[ch]);
+    }
+  }
+  return llm::Image(std::move(planar), kImageSize, kImageSize, kImageChannels);
+}
+
+MultimodalLLM::MultimodalLLM(const std::string &modelSource,
+                             const std::string &tokenizerSource,
+                             std::shared_ptr<react::CallInvoker> callInvoker)
+    : BaseModel(modelSource, callInvoker, Module::LoadMode::File) {
+  // Build the multimodal runner from parts that share the Module loaded by
+  // BaseModel (one PTE load); module_ is moved into the runner further down.
+  auto tokenizer = std::make_unique<tokenizers::HFTokenizer>();
+  auto tokenizer_status = tokenizer->load(tokenizerSource);
+  if (tokenizer_status != tokenizers::Error::Ok) {
+    throw RnExecutorchError(RnExecutorchErrorCode::TokenizerError,
+                            "Failed to load tokenizer");
+  }
+
+  auto io_manager = std::make_unique<llm::IOManager>(*module_);
+  auto decoder_runner = std::make_unique<llm::MultimodalDecoderRunner>(
+      module_.get(), io_manager.get());
+
+  auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>();
+  // Read EOS ids from PTE constant method if present, default to 7 (<|im_end|>)
+  auto method_names_result = module_->method_names();
+  if (method_names_result.ok()) {
+    if (method_names_result->count(llm::kEosIds)) {
+      auto eos_result = module_->execute(llm::kEosIds);
+      if (eos_result.ok()) {
+        for (const auto &ev : *eos_result) {
+          eos_ids->emplace(static_cast<uint64_t>(ev.toScalar().to<int64_t>()));
+        }
+      }
+    }
+  }
+  if (eos_ids->empty()) {
+    eos_ids->emplace(7); // <|im_end|> fallback
+  }
+
+  auto stats = std::make_unique<llm::Stats>();
+  // Keep a raw pointer before moving into the runner so TextTokenGenerator
+  // can safely reference the same Stats object owned by the runner.
+  llm::Stats *stats_ptr = stats.get();
+  auto token_generator = std::make_unique<llm::TextTokenGenerator>(
+      tokenizer.get(), decoder_runner.get(), /*use_kv_cache=*/true,
+      std::move(eos_ids), stats_ptr);
+
+  auto prefiller = std::make_unique<llm::MultimodalPrefiller>(
+      module_.get(), decoder_runner.get(), tokenizer.get(), io_manager.get());
+
+  // Defaults, replaced below by same-named PTE values when they are exported
+  std::unordered_map<std::string, int64_t> metadata = {
+      {llm::kMaxSeqLen, 2048},
+      {llm::kMaxContextLen, 2048},
+  };
+  if (method_names_result.ok()) {
+    for (auto &pair : metadata) {
+      if (method_names_result->count(pair.first)) {
+        auto val = module_->get(pair.first);
+        if (val.ok()) {
+          pair.second = val->toScalar().to<int64_t>();
+        }
+      }
+    }
+  }
+
+  runner_ = std::make_unique<llm::MultimodalRunner>(
+      std::move(metadata), std::move(tokenizer), std::move(module_),
+      std::move(decoder_runner), std::move(prefiller), std::move(io_manager),
+      std::move(token_generator), std::move(stats));
+
+  auto loadError = runner_->load();
+  if (loadError != Error::Ok) {
+    throw RnExecutorchError(loadError, "Failed to load multimodal runner");
+  }
+
+  memorySizeLowerBound = fs::file_size(fs::path(modelSource)) +
+                         fs::file_size(fs::path(tokenizerSource));
+}
+
+// Runs one image + prompt turn and returns the full response; each token is
+// also passed to `callback` via callInvoker->invokeAsync as it is produced.
+std::string MultimodalLLM::generate(std::string imagePath, std::string prompt,
+                                    std::shared_ptr<jsi::Function> callback) {
+  if (!runner_) {
+    throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
+                            "Runner is not loaded");
+  }
+  llm::Image image = loadImageForLFM2(imagePath);
+  std::vector<llm::MultimodalInput> inputs = {
+      llm::make_text_input(std::string(kChatPrefix)),
+      llm::make_image_input(std::move(image)),
+      llm::make_text_input(prompt + kChatSuffix),
+  };
+
+  std::string output;
+  auto nativeCallback = [this, &callback, &output](const std::string &token) {
+    output += token;
+    if (callback && callInvoker) {
+      callInvoker->invokeAsync([callback, token](jsi::Runtime &runtime) {
+        callback->call(runtime, jsi::String::createFromUtf8(runtime, token));
+      });
+    }
+  };
+
+  auto error = runner_->generate(inputs, temperature_, topp_,
+                                 /*max_new_tokens=*/-1, nativeCallback);
+  // Reset on failure too; previously an error path skipped reset() entirely.
+  runner_->reset();
+  if (error != Error::Ok) {
+    throw RnExecutorchError(error, "Failed to generate text");
+  }
+  return output;
+}
+
+// Forwards to runner_->stop(); throws ModuleNotLoaded when no runner exists.
+void MultimodalLLM::interrupt() {
+  if (runner_ == nullptr)
+    throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
+                            "Can't interrupt a model that's not loaded");
+  runner_->stop();
+}
+
+size_t MultimodalLLM::getGeneratedTokenCount() const noexcept {
+  // 0 when there is no runner, otherwise the runner's generated-token count.
+  return runner_ ? static_cast<size_t>(runner_->stats().num_generated_tokens)
+                 : 0;
+}
+
+size_t MultimodalLLM::getPromptTokenCount() const noexcept {
+  // 0 when there is no runner, otherwise the runner's prompt-token count.
+  return runner_ ? static_cast<size_t>(runner_->stats().num_prompt_tokens)
+                 : 0;
+}
+
+size_t MultimodalLLM::getMemoryLowerBound() const noexcept {
+  return memorySizeLowerBound; // model + tokenizer file sizes, set in the ctor
+}
+
+// Sets the sampling temperature; throws InvalidConfig if negative or NaN.
+void MultimodalLLM::setTemperature(float temperature) {
+  if (!(temperature >= 0.0f)) // negated comparison also rejects NaN
+    throw RnExecutorchError(RnExecutorchErrorCode::InvalidConfig,
+                            "Temperature must be non-negative");
+  temperature_ = temperature;
+}
+
+// Sets the top-p value; throws InvalidConfig when outside [0, 1] or NaN.
+void MultimodalLLM::setTopp(float topp) {
+  if (!(topp >= 0.0f && topp <= 1.0f)) // negated range check also rejects NaN
+    throw RnExecutorchError(RnExecutorchErrorCode::InvalidConfig,
+                            "Top-p must be between 0.0 and 1.0");
+  topp_ = topp;
+}
+
+void MultimodalLLM::unload() noexcept { runner_.reset(); } // drop runner
+
+} // namespace rnexecutorch::models::multimodal_llm
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.h b/packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.h
new file mode 100644
index 000000000..6b9f8698c
--- /dev/null
+++ b/packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.h
@@ -0,0 +1,40 @@
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include <ReactCommon/CallInvoker.h>
+#include <jsi/jsi.h>
+#include <rnexecutorch/models/BaseModel.h>
+#include <runner/multimodal_runner.h>
+
+namespace rnexecutorch {
+namespace models::multimodal_llm {
+using namespace facebook;
+
+class MultimodalLLM : public BaseModel {
+public:
+  explicit MultimodalLLM(const std::string &modelSource,
+                         const std::string &tokenizerSource,
+                         std::shared_ptr<react::CallInvoker> callInvoker);
+
+  std::string generate(std::string imagePath, std::string prompt,
+                       std::shared_ptr<jsi::Function> callback);
+  void interrupt();       // forwards to runner_->stop()
+  void unload() noexcept; // releases runner_
+  size_t getGeneratedTokenCount() const noexcept;
+  size_t getPromptTokenCount() const noexcept;
+  size_t getMemoryLowerBound() const noexcept;
+  void setTemperature(float temperature);
+  void setTopp(float topp);
+
+private:
+  float temperature_ = 0.8f; // forwarded to runner_->generate()
+  float topp_ = 0.9f;        // forwarded to runner_->generate()
+  std::unique_ptr<::executorch::extension::llm::MultimodalRunner> runner_;
+};
+} // namespace models::multimodal_llm
+
+REGISTER_CONSTRUCTOR(models::multimodal_llm::MultimodalLLM, std::string,
+                     std::string, std::shared_ptr<react::CallInvoker>);
+} // namespace rnexecutorch
diff --git a/packages/react-native-executorch/common/runner/image.h b/packages/react-native-executorch/common/runner/image.h
new file mode 100644
index 000000000..86373ca91
--- /dev/null
+++ b/packages/react-native-executorch/common/runner/image.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Ported from executorch/extension/llm/runner/image.h
+
+#pragma once
+
+#include <cstdint>
+#include <variant>
+#include <vector>
+
+#include <executorch/extension/tensor/tensor.h>
+#include <executorch/runtime/platform/log.h>
+
+namespace executorch {
+namespace extension {
+namespace llm {
+
+class Image { // owns pixel data (uint8 or float) together with its dimensions
+public:
+  Image() : width_(0), height_(0), channels_(0) {}
+  // Both data constructors move the caller's buffer in; no copy is made.
+  Image(std::vector<uint8_t> &&data, int32_t width, int32_t height,
+        int32_t channels)
+      : data_(std::move(data)), width_(width), height_(height),
+        channels_(channels) {}
+
+  Image(std::vector<float> &&data, int32_t width, int32_t height,
+        int32_t channels)
+      : data_(std::move(data)), width_(width), height_(height),
+        channels_(channels) {}
+
+  int32_t width() const { return width_; }
+  int32_t height() const { return height_; }
+  int32_t channels() const { return channels_; }
+  // A default-constructed Image holds an empty uint8 vector, so is_uint8().
+  bool is_uint8() const {
+    return std::holds_alternative<std::vector<uint8_t>>(data_);
+  }
+  bool is_float() const {
+    return std::holds_alternative<std::vector<float>>(data_);
+  }
+  // Accessors use std::get and throw std::bad_variant_access on a mismatch.
+  const std::vector<uint8_t> &get_uint8_data() const & {
+    return std::get<std::vector<uint8_t>>(data_);
+  }
+  const std::vector<float> &get_float_data() const & {
+    return std::get<std::vector<float>>(data_);
+  }
+  std::vector<float> &get_float_data() & {
+    return std::get<std::vector<float>>(data_);
+  }
+  // Non-owning {C,H,W} tensor view ({1,C,H,W} with_batch); keep Image alive.
+  ::executorch::runtime::Result<::executorch::extension::TensorPtr>
+  toTensor(bool with_batch = false) const {
+    std::vector<::executorch::aten::SizesType> sizes = {channels(), height(),
+                                                        width()};
+    if (with_batch) {
+      sizes.insert(sizes.begin(), 1);
+    }
+    if (is_float()) {
+      return ::executorch::extension::from_blob(
+          const_cast<float *>(get_float_data().data()), sizes,
+          ::executorch::aten::ScalarType::Float);
+    } else if (is_uint8()) {
+      return ::executorch::extension::from_blob(
+          const_cast<uint8_t *>(get_uint8_data().data()), sizes,
+          ::executorch::aten::ScalarType::Byte);
+    }
+    ET_LOG(Error, "Image data is not initialized."); // fallback if neither alternative matched
+    return ::executorch::runtime::Error::NotSupported;
+  }
+
+private:
+  std::variant<std::vector<uint8_t>, std::vector<float>> data_;
+  int32_t width_;
+  int32_t height_;
+  int32_t channels_;
+};
+
+} // namespace llm
+} // namespace extension
+} // namespace executorch
diff --git a/packages/react-native-executorch/common/runner/multimodal_decoder_runner.h b/packages/react-native-executorch/common/runner/multimodal_decoder_runner.h
new file mode 100644
index 000000000..2eafe3901
--- /dev/null
+++ b/packages/react-native-executorch/common/runner/multimodal_decoder_runner.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Ported from executorch/extension/llm/runner/multimodal_decoder_runner.h
+
+#pragma once
+
+#include "constants.h"
+#include "text_decoder_runner.h"
+
+namespace executorch {
+namespace extension {
+namespace llm {
+
+// TextDecoderRunner for the multi-method PTE layout: token_embedding turns
+// tokens into embeddings and text_decoder turns embeddings into logits.
+class MultimodalDecoderRunner : public TextDecoderRunner {
+public:
+  explicit MultimodalDecoderRunner(Module *module, IOManager *io_manager)
+      : TextDecoderRunner(module, io_manager) {}
+
+  // Step: embed `tokens`, then decode the embedding at `start_pos`.
+  inline ::executorch::runtime::Result<::executorch::aten::Tensor>
+  step(TensorPtr &tokens, int64_t start_pos) override {
+    auto embed_result = module_->execute(kTokenEmbeddingMethod, tokens);
+    if (!embed_result.ok()) {
+      return embed_result.error();
+    }
+    ET_CHECK_OR_RETURN_ERROR(!embed_result->empty(), InvalidState,
+                             "token_embedding returned no outputs");
+    return decode((*embed_result)[0], start_pos);
+  }
+
+  // Decode an embedding EValue to logits; bad outputs yield InvalidState.
+  inline ::executorch::runtime::Result<::executorch::aten::Tensor>
+  decode(const ::executorch::runtime::EValue &embeddings, int64_t start_pos) {
+    auto start_pos_tensor = ::executorch::extension::from_blob(
+        &start_pos, {1}, ::executorch::aten::ScalarType::Long);
+    auto outputs_result =
+        module_->execute(kTextModelMethod, {embeddings, start_pos_tensor});
+    if (!outputs_result.ok()) {
+      return outputs_result.error();
+    }
+    auto &outputs = *outputs_result;
+    ET_CHECK_OR_RETURN_ERROR(outputs.size() == 1 && outputs[0].isTensor(),
+                             InvalidState,
+                             "Expected 1 tensor output from text_decoder");
+    return outputs[0].toTensor();
+  }
+
+  inline ::executorch::runtime::Error load() override {
+    if (is_method_loaded()) {
+      return ::executorch::runtime::Error::Ok;
+    }
+    ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTokenEmbeddingMethod));
+    ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTextModelMethod));
+    return ::executorch::runtime::Error::Ok;
+  }
+
+  inline bool is_method_loaded() override {
+    return module_->is_method_loaded(kTokenEmbeddingMethod) &&
+           module_->is_method_loaded(kTextModelMethod);
+  }
+};
+
+} // namespace llm
+} // namespace extension
+} // namespace executorch
diff --git a/packages/react-native-executorch/common/runner/multimodal_input.h b/packages/react-native-executorch/common/runner/multimodal_input.h
new file mode 100644
index 000000000..4ce588db6
--- /dev/null
+++ b/packages/react-native-executorch/common/runner/multimodal_input.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Ported from executorch/extension/llm/runner/multimodal_input.h
+// Audio support stripped — only text and image are used by LFM2-VL.
+
+#pragma once
+
+#include <runner/image.h>
+#include <string>
+#include <variant>
+#include <vector>
+
+namespace executorch {
+namespace extension {
+namespace llm {
+
+class MultimodalInput { // one prompt segment: text, token ids, or an Image
+public:
+  explicit MultimodalInput(const std::string &text) : data_(text) {}
+  explicit MultimodalInput(std::string &&text) : data_(std::move(text)) {}
+  explicit MultimodalInput(const std::vector<uint64_t> &tokens)
+      : data_(tokens) {}
+  explicit MultimodalInput(std::vector<uint64_t> &&tokens)
+      : data_(std::move(tokens)) {}
+  explicit MultimodalInput(const Image &image) : data_(image) {}
+  explicit MultimodalInput(Image &&image) : data_(std::move(image)) {}
+
+  MultimodalInput(const MultimodalInput &) = default;
+  MultimodalInput &operator=(const MultimodalInput &) = default;
+  MultimodalInput(MultimodalInput &&) noexcept = default;
+  MultimodalInput &operator=(MultimodalInput &&) noexcept = default;
+
+  bool is_text() const noexcept {
+    return std::holds_alternative<std::string>(data_);
+  }
+  bool is_tokens() const noexcept {
+    return std::holds_alternative<std::vector<uint64_t>>(data_);
+  }
+  bool is_image() const noexcept {
+    return std::holds_alternative<Image>(data_);
+  }
+  // Getters use std::get and throw std::bad_variant_access on a mismatch.
+  const std::string &get_text() const & { return std::get<std::string>(data_); }
+  const std::vector<uint64_t> &get_tokens() const & {
+    return std::get<std::vector<uint64_t>>(data_);
+  }
+  const Image &get_image() const & { return std::get<Image>(data_); }
+
+private:
+  std::variant<std::string, std::vector<uint64_t>, Image> data_;
+};
+// Factory helpers. The copying overloads may allocate, so are not noexcept.
+inline MultimodalInput make_text_input(const std::string &text) {
+  return MultimodalInput(text);
+}
+inline MultimodalInput make_text_input(std::string &&text) noexcept {
+  return MultimodalInput(std::move(text));
+}
+inline MultimodalInput make_image_input(const Image &image) {
+  return MultimodalInput(image);
+}
+inline MultimodalInput make_image_input(Image &&image) noexcept {
+  return MultimodalInput(std::move(image));
+}
+
+} // namespace llm
+} // namespace extension
+} // namespace executorch
diff --git a/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp b/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp
new file mode 100644
index 000000000..c39c7cc0f
--- /dev/null
+++ b/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Ported from executorch/extension/llm/runner/multimodal_prefiller.cpp
+// with our token-embedding padding fix and LFM2-VL adaptations.
+
+#include "multimodal_prefiller.h"
+#include "constants.h"
+#include "util.h"
+
+namespace executorch {
+namespace extension {
+namespace llm {
+
+using ::executorch::aten::SizesType;
+using ::executorch::runtime::Error;
+using ::executorch::runtime::EValue;
+using ::executorch::runtime::Result;
+// Stores the given raw pointers; none of them is owned by the prefiller.
+MultimodalPrefiller::MultimodalPrefiller(
+    Module *module, MultimodalDecoderRunner *decoder_runner,
+    tokenizers::HFTokenizer *tokenizer, IOManager *io_manager)
+    : module_(module), decoder_runner_(decoder_runner), tokenizer_(tokenizer),
+      io_manager_(io_manager) {}
+
+// Prefills one input segment (image, text or token ids) into the KV cache.
+// On success advances `start_pos` by the segment length and returns the token
+// predicted after the segment; malformed or oversized input yields an error.
+Result<uint64_t> MultimodalPrefiller::prefill(const MultimodalInput &input,
+                                              int64_t &start_pos) {
+  // Keep backing storage alive for the duration of the prefill call.
+  EValue encoder_output;
+  std::vector<int64_t> padded_tokens_storage;
+  TensorPtr sliced_embed_storage;
+
+  if (input.is_image()) {
+    const Image &image = input.get_image();
+
+    // Query input dtype expected by vision_encoder.
+    auto method_meta_result = module_->method_meta(kVisionEncoderMethod);
+    ET_CHECK_OK_OR_RETURN_ERROR(method_meta_result.error(),
+                                "Failed to get method_meta for %s",
+                                kVisionEncoderMethod);
+    auto &method_meta = *method_meta_result;
+
+    ET_CHECK_OR_RETURN_ERROR(method_meta.num_inputs() > 0, InvalidArgument,
+                             "vision_encoder has no inputs");
+    auto input_meta_result = method_meta.input_tensor_meta(0);
+    ET_CHECK_OK_OR_RETURN_ERROR(input_meta_result.error(),
+                                "Cannot get vision_encoder input meta at 0");
+    auto expected_dtype = input_meta_result->scalar_type();
+
+    ET_CHECK_OR_RETURN_ERROR(
+        expected_dtype == ::executorch::aten::ScalarType::Float &&
+            image.is_float(),
+        InvalidArgument, "vision_encoder expects float32 image data");
+
+    auto expected_dims = input_meta_result->sizes();
+    auto image_tensor_result =
+        image.toTensor(/*with_batch=*/expected_dims.size() == 4);
+    ET_CHECK_OK_OR_RETURN_ERROR(image_tensor_result.error(),
+                                "Failed to convert image to tensor");
+
+    auto image_encoder_result =
+        module_->execute(kVisionEncoderMethod, *image_tensor_result);
+    ET_CHECK_OK_OR_RETURN_ERROR(image_encoder_result.error());
+    encoder_output = (*image_encoder_result)[0];
+  } else if (input.is_text() || input.is_tokens()) {
+    std::vector<uint64_t> tokens;
+    if (input.is_text()) {
+      auto encode_result = tokenizer_->encode(input.get_text());
+      if (!encode_result.ok()) {
+        ET_LOG(Error, "Tokenizer encode error %d",
+               static_cast<uint32_t>(encode_result.error()));
+        return Error::InvalidArgument;
+      }
+      tokens = std::move(*encode_result);
+    } else {
+      tokens = input.get_tokens();
+    }
+
+    const auto actual_seq_len = static_cast<SizesType>(tokens.size());
+
+    // The token_embedding PTE has a fixed MAX_SEQ_LEN input buffer.
+    // Pad with zeros, run embedding, then slice output back to actual length.
+    int64_t max_seq_len = actual_seq_len; // fallback: no padding needed
+    auto max_seq_len_result = module_->get(kMaxSeqLen);
+    if (max_seq_len_result.error() == Error::Ok) {
+      max_seq_len = max_seq_len_result->toScalar().to<int64_t>();
+    }
+    // A longer prompt would make std::copy write past the padded buffer.
+    ET_CHECK_OR_RETURN_ERROR(actual_seq_len <= max_seq_len, InvalidArgument,
+                             "Prompt exceeds the model's max sequence length");
+    padded_tokens_storage.assign(max_seq_len, 0);
+    std::copy(tokens.begin(), tokens.end(), padded_tokens_storage.begin());
+
+    auto text_tensor = ::executorch::extension::from_blob(
+        padded_tokens_storage.data(), {1, static_cast<SizesType>(max_seq_len)},
+        ::executorch::aten::ScalarType::Long);
+
+    auto embed_result = module_->execute(kTokenEmbeddingMethod, text_tensor);
+    ET_CHECK_OK_OR_RETURN_ERROR(embed_result.error());
+
+    auto full_embed = (*embed_result)[0].toTensor();
+    const auto embed_dim = static_cast<SizesType>(full_embed.size(2));
+    sliced_embed_storage = ::executorch::extension::from_blob(
+        full_embed.mutable_data_ptr(), {1, actual_seq_len, embed_dim},
+        full_embed.scalar_type());
+    encoder_output = EValue(*sliced_embed_storage);
+  } else {
+    ET_LOG(Error, "Unsupported MultimodalInput type");
+    return Error::NotSupported;
+  }
+
+  // Run text_decoder for prefill.
+  int64_t seq_len = encoder_output.toTensor().size(1);
+  if (seq_len == 0) {
+    ET_LOG(Error, "Encoder returned empty output");
+    return Error::InvalidState;
+  }
+  std::vector<int64_t> cache_positions;
+  auto cache_pos_result = populate_start_pos_or_cache_position(
+      module_, start_pos, cache_positions, seq_len, kTextModelMethod);
+  ET_CHECK_OK_OR_RETURN_ERROR(cache_pos_result.error());
+  auto prefill_result =
+      module_->execute(kTextModelMethod, {encoder_output, *cache_pos_result});
+  ET_CHECK_OK_OR_RETURN_ERROR(prefill_result.error());
+  auto &prefill_outputs = *prefill_result;
+  ET_CHECK_OR_RETURN_ERROR(!prefill_outputs.empty(), InvalidState,
+                           "text_decoder returned no outputs during prefill");
+
+  auto logits = prefill_outputs[0].toTensor();
+  start_pos += seq_len;
+
+  return static_cast<uint64_t>(decoder_runner_->logits_to_token(logits));
+}
+// Loads token_embedding, text_decoder and, if exported, vision_encoder.
+Error MultimodalPrefiller::load() {
+  if (is_method_loaded()) {
+    return Error::Ok;
+  }
+  ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTokenEmbeddingMethod));
+  ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTextModelMethod));
+
+  auto names_result = module_->method_names();
+  ET_CHECK_OK_OR_RETURN_ERROR(names_result.error(),
+                              "Failed to get method names");
+  const bool has_vision_encoder =
+      names_result->count(kVisionEncoderMethod) != 0;
+  if (has_vision_encoder) {
+    ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kVisionEncoderMethod));
+  }
+  return Error::Ok;
+}
+// Reports whether every method that load() would load is already loaded.
+bool MultimodalPrefiller::is_method_loaded() {
+  auto names_result = module_->method_names();
+  if (!names_result.ok()) {
+    return false;
+  }
+  const bool core_loaded = module_->is_method_loaded(kTokenEmbeddingMethod) &&
+                           module_->is_method_loaded(kTextModelMethod);
+  if (!core_loaded) {
+    return false;
+  }
+  if (names_result->count(kVisionEncoderMethod) == 0) {
+    return true;
+  }
+  return module_->is_method_loaded(kVisionEncoderMethod);
+}
+
+} // namespace llm
+} // namespace extension
+} // namespace executorch
diff --git a/packages/react-native-executorch/common/runner/multimodal_prefiller.h b/packages/react-native-executorch/common/runner/multimodal_prefiller.h
new file mode 100644
index 000000000..ee0f99a5b
--- /dev/null
+++ b/packages/react-native-executorch/common/runner/multimodal_prefiller.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Ported from executorch/extension/llm/runner/multimodal_prefiller.h
+
+#pragma once
+
+#include "multimodal_decoder_runner.h"
+#include "multimodal_input.h"
+#include <executorch/extension/module/module.h>
+#include <pytorch/tokenizers/hf_tokenizer.h>
+
+namespace executorch {
+namespace extension {
+namespace llm {
+
+// Prefills all multimodal inputs (image + text segments) into the KV cache.
+// Implements the same padding logic as the ET repo's multimodal_prefiller.cpp.
+class MultimodalPrefiller {
+public:
+  explicit MultimodalPrefiller(Module *module,
+                               MultimodalDecoderRunner *decoder_runner,
+                               tokenizers::HFTokenizer *tokenizer,
+                               IOManager *io_manager);
+
+  // Prefill one input segment. Updates start_pos in-place.
+  // Returns the first predicted token after this segment.
+  ::executorch::runtime::Result<uint64_t> prefill(const MultimodalInput &input,
+                                                  int64_t &start_pos);
+  // load() loads the methods prefill() runs; is_method_loaded() checks them.
+  ::executorch::runtime::Error load();
+  bool is_method_loaded();
+
+private:
+  Module *module_; // raw pointers as passed to the constructor; not owned here
+  MultimodalDecoderRunner *decoder_runner_;
+  tokenizers::HFTokenizer *tokenizer_;
+  IOManager *io_manager_;
+};
+
+} // namespace llm
+} // namespace extension
+} // namespace executorch
diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.cpp b/packages/react-native-executorch/common/runner/multimodal_runner.cpp
new file mode 100644
index 000000000..842b96c72
--- /dev/null
+++ b/packages/react-native-executorch/common/runner/multimodal_runner.cpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Ported from executorch/extension/llm/runner/multimodal_runner.cpp
+
+#include "multimodal_runner.h"
+#include "constants.h"
+#include "util.h"
+#include <rnexecutorch/Error.h>
+
+namespace executorch {
+namespace extension {
+namespace llm {
+
+using ::executorch::extension::Module;
+using ::executorch::runtime::Error;
+// Takes ownership of every component; the position counter starts at 0.
+MultimodalRunner::MultimodalRunner(
+    std::unordered_map<std::string, int64_t> metadata,
+    std::unique_ptr<tokenizers::HFTokenizer> tokenizer,
+    std::unique_ptr<Module> module,
+    std::unique_ptr<MultimodalDecoderRunner> decoder_runner,
+    std::unique_ptr<MultimodalPrefiller> prefiller,
+    std::unique_ptr<IOManager> io_manager,
+    std::unique_ptr<TextTokenGenerator> token_generator,
+    std::unique_ptr<Stats> stats)
+    : metadata_(std::move(metadata)), tokenizer_(std::move(tokenizer)),
+      module_(std::move(module)), decoder_runner_(std::move(decoder_runner)),
+      prefiller_(std::move(prefiller)), io_manager_(std::move(io_manager)),
+      token_generator_(std::move(token_generator)), stats_(std::move(stats)),
+      pos_(0) {}
+// True when both the prefiller and the token generator report being loaded.
+bool MultimodalRunner::is_loaded() {
+  return prefiller_->is_method_loaded() && token_generator_->is_loaded();
+}
+// Loads the prefiller and the token generator unless both are already loaded.
+Error MultimodalRunner::load() {
+  if (is_loaded()) {
+    return Error::Ok;
+  }
+  ET_CHECK_OK_OR_RETURN_ERROR(prefiller_->load());
+  ET_CHECK_OK_OR_RETURN_ERROR(token_generator_->load());
+  return Error::Ok;
+}
+// Prefills every input, then streams decoded pieces to token_callback.
+Error MultimodalRunner::generate(
+    const std::vector<MultimodalInput> &inputs, float temperature, float topp,
+    int32_t max_new_tokens,
+    std::function<void(const std::string &)> token_callback) {
+  if (inputs.empty()) {
+    ET_LOG(Error, "MultimodalInput vector cannot be empty");
+    return Error::InvalidArgument;
+  }
+
+  if (!is_loaded()) {
+    ET_CHECK_OK_OR_RETURN_ERROR(load());
+  }
+
+  stats_->inference_start_ms = time_in_ms();
+
+  // Prefill all input segments in order.
+  uint64_t prefill_next_token = 0;
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    ET_LOG(Info, "Prefilling input %zu/%zu", i + 1, inputs.size());
+    auto prefill_result = prefiller_->prefill(inputs[i], pos_);
+    if (!prefill_result.ok()) {
+      return prefill_result.error();
+    }
+    prefill_next_token = prefill_result.get();
+  }
+  // pos_ is not reset here: it counts every position since the last reset().
+  stats_->first_token_ms = time_in_ms();
+  stats_->prompt_eval_end_ms = time_in_ms();
+  stats_->num_prompt_tokens = pos_;
+
+  // Decode and emit the first token from prefill.
+  auto decode_result =
+      tokenizer_->decode(prefill_next_token, prefill_next_token);
+  if (!decode_result.ok()) {
+    ET_LOG(Error, "Tokenizer decode error %d",
+           static_cast<uint32_t>(decode_result.error()));
+    return Error::InvalidArgument;
+  }
+  const std::string first_piece = std::move(*decode_result);
+  safe_printf(first_piece.c_str());
+  fflush(stdout);
+  if (token_callback) {
+    token_callback(first_piece);
+  }
+
+  // Resolve max_new_tokens from metadata if caller passed a value <= 0.
+  int64_t context_len = metadata_.count(kMaxContextLen)
+                            ? metadata_.at(kMaxContextLen)
+                        : metadata_.count(kMaxSeqLen) ? metadata_.at(kMaxSeqLen)
+                                                      : 2048;
+  int32_t resolved_max_new = max_new_tokens > 0
+                                 ? max_new_tokens
+                                 : static_cast<int32_t>(context_len - pos_);
+  resolved_max_new = std::max(0, resolved_max_new);
+
+  // Autoregressive decode loop.
+  std::vector<uint64_t> prompt_tokens = {prefill_next_token};
+  auto wrapped_callback = [&](const std::string &piece) {
+    safe_printf(piece.c_str());
+    fflush(stdout);
+    if (token_callback) {
+      token_callback(piece);
+    }
+  };
+  // The first token was already emitted above, hence resolved_max_new - 1.
+  auto generate_result = token_generator_->generate(
+      prompt_tokens, pos_,
+      static_cast<uint64_t>(std::max(0, resolved_max_new - 1)), temperature,
+      topp, wrapped_callback);
+
+  if (!generate_result.ok()) {
+    return generate_result.error();
+  }
+
+  int64_t num_generated = generate_result.get();
+  pos_ += num_generated;
+
+  stats_->inference_end_ms = time_in_ms();
+  stats_->num_generated_tokens = num_generated;
+
+  return Error::Ok;
+}
+// Forwards the stop request to the token generator, if there is one.
+void MultimodalRunner::stop() {
+  if (token_generator_) {
+    token_generator_->stop();
+  }
+}
+// Rewinds the position counter to 0 and resets the stats, if present.
+void MultimodalRunner::reset() {
+  pos_ = 0;
+  if (stats_) {
+    stats_->reset();
+  }
+}
+
+} // namespace llm
+} // namespace extension
+} // namespace executorch
diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.h b/packages/react-native-executorch/common/runner/multimodal_runner.h
new file mode 100644
index 000000000..c8007a67e
--- /dev/null
+++ b/packages/react-native-executorch/common/runner/multimodal_runner.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Ported from executorch/extension/llm/runner/multimodal_runner.h
+
+#pragma once
+
+#include "multimodal_decoder_runner.h"
+#include "multimodal_input.h"
+#include "multimodal_prefiller.h"
+#include "stats.h"
+#include "text_token_generator.h"
+#include <executorch/extension/module/module.h>
+#include <functional>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace executorch {
+namespace extension {
+namespace llm {
+
+class MultimodalRunner { // owns the module, tokenizer and runner components
+public:
+  explicit MultimodalRunner(
+      std::unordered_map<std::string, int64_t> metadata,
+      std::unique_ptr<tokenizers::HFTokenizer> tokenizer,
+      std::unique_ptr<Module> module,
+      std::unique_ptr<MultimodalDecoderRunner> decoder_runner,
+      std::unique_ptr<MultimodalPrefiller> prefiller,
+      std::unique_ptr<IOManager> io_manager,
+      std::unique_ptr<TextTokenGenerator> token_generator,
+      std::unique_ptr<Stats> stats);
+  // load() is also called by generate() when the runner is not loaded yet.
+  bool is_loaded();
+  ::executorch::runtime::Error load();
+  // Each decoded piece is passed to token_callback when one is provided.
+  ::executorch::runtime::Error
+  generate(const std::vector<MultimodalInput> &inputs, float temperature,
+           float topp, int32_t max_new_tokens,
+           std::function<void(const std::string &)> token_callback = {});
+  // stop() forwards to the token generator; reset() rewinds pos_ and stats.
+  void stop();
+  void reset();
+
+  Stats &stats() { return *stats_; }
+
+private:
+  std::unordered_map<std::string, int64_t> metadata_;
+  std::unique_ptr<tokenizers::HFTokenizer> tokenizer_;
+  std::unique_ptr<Module> module_;
+  std::unique_ptr<MultimodalDecoderRunner> decoder_runner_;
+  std::unique_ptr<MultimodalPrefiller> prefiller_;
+  std::unique_ptr<IOManager> io_manager_;
+  std::unique_ptr<TextTokenGenerator> token_generator_;
+  std::unique_ptr<Stats> stats_;
+  int64_t pos_ = 0; // current position; advanced by prefill and by decoding
+};
+
+} // namespace llm
+} // namespace extension
+} // namespace executorch
diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useMultimodalLLM.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useMultimodalLLM.ts
new file mode 100644
index 000000000..0a54239cc
--- /dev/null
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useMultimodalLLM.ts
@@ -0,0 +1,153 @@
+import { useCallback, useEffect, useRef, useState } from 'react';
+import { ResourceSource } from '../../types/common';
+import { ResourceFetcher } from '../../utils/ResourceFetcher';
+import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils';
+import { RnExecutorchErrorCode } from '../../errors/ErrorCodes';
+
+export interface MultimodalLLMProps {
+  model: {
+    modelSource: ResourceSource;
+    tokenizerSource: ResourceSource;
+  };
+  preventLoad?: boolean; // when true, nothing is fetched or loaded
+}
+
+export interface MultimodalLLMType {
+  isReady: boolean; // true once the native module has been created
+  isGenerating: boolean; // true while a generate() call is in flight
+  downloadProgress: number; // progress reported while fetching modelSource
+  response: string; // text streamed so far by the current generate() call
+  error: RnExecutorchError | null; // last fetch/load error, if any
+  generate: (imagePath: string, prompt: string) => Promise<string>;
+  interrupt: () => void; // forwards to the native module's interrupt()
+}
+
+/**
+ * React hook for managing a Multimodal LLM (VLM) instance.
+ * Uses `loadMultimodalLLM` native global, which wraps a multi-method PTE
+ * with vision_encoder, token_embedding, and text_decoder methods.
+ * The native module is unloaded when its sources change or the hook unmounts.
+ *
+ * @category Hooks
+ */
+export const useMultimodalLLM = ({
+  model,
+  preventLoad = false,
+}: MultimodalLLMProps): MultimodalLLMType => {
+  const [nativeModule, setNativeModule] = useState<any>(null);
+  const [isReady, setIsReady] = useState(false);
+  const [isGenerating, setIsGenerating] = useState(false);
+  const [downloadProgress, setDownloadProgress] = useState(0);
+  const [response, setResponse] = useState('');
+  const [error, setError] = useState<RnExecutorchError | null>(null);
+  const tokenBufferRef = useRef('');
+  const rafRef = useRef<ReturnType<typeof requestAnimationFrame> | null>(null);
+  // In-flight native generate() call; unloading waits for it to settle.
+  const pendingRef = useRef<Promise<unknown> | null>(null);
+
+  useEffect(() => {
+    setDownloadProgress(0);
+    setError(null);
+    setIsReady(false);
+    if (preventLoad) return;
+
+    let cancelled = false;
+    let loaded: any = null;
+    (async () => {
+      try {
+        const [modelResults, tokenizerResults] = await Promise.all([
+          ResourceFetcher.fetch(setDownloadProgress, model.modelSource),
+          ResourceFetcher.fetch(undefined, model.tokenizerSource),
+        ]);
+        if (cancelled) return;
+        const modelPath = modelResults?.[0];
+        const tokenizerPath = tokenizerResults?.[0];
+        if (!modelPath || !tokenizerPath) {
+          throw new RnExecutorchError(
+            RnExecutorchErrorCode.DownloadInterrupted,
+            'Download interrupted — not all files were fetched.'
+          );
+        }
+        loaded = global.loadMultimodalLLM(modelPath, tokenizerPath);
+        setNativeModule(loaded);
+        setIsReady(true);
+      } catch (e) {
+        if (!cancelled) {
+          setError(parseUnknownError(e));
+        }
+      }
+    })();
+
+    return () => {
+      cancelled = true;
+      const stale = loaded;
+      if (stale === null) return;
+      setNativeModule(null);
+      // Free native memory, but only after any running generate() settles.
+      const release = () => stale.unload();
+      if (pendingRef.current) stale.interrupt();
+      void (pendingRef.current ?? Promise.resolve()).then(release, release);
+    };
+  }, [model.modelSource, model.tokenizerSource, preventLoad]);
+
+  const generate = useCallback(
+    async (imagePath: string, prompt: string): Promise<string> => {
+      if (!nativeModule) {
+        throw new RnExecutorchError(
+          RnExecutorchErrorCode.ModuleNotLoaded,
+          'Multimodal LLM is not loaded yet.'
+        );
+      }
+      // Moves buffered tokens into `response`; scheduled once per frame.
+      const flush = () => {
+        rafRef.current = null;
+        const buffered = tokenBufferRef.current;
+        tokenBufferRef.current = '';
+        if (buffered) setResponse((prev) => prev + buffered);
+      };
+      if (rafRef.current !== null) cancelAnimationFrame(rafRef.current);
+      rafRef.current = null;
+      tokenBufferRef.current = '';
+      setResponse('');
+      setIsGenerating(true);
+      try {
+        const run: Promise<string> = nativeModule.generate(
+          imagePath,
+          prompt,
+          (token: string) => {
+            tokenBufferRef.current += token;
+            if (rafRef.current === null) {
+              rafRef.current = requestAnimationFrame(flush);
+            }
+          }
+        );
+        pendingRef.current = run;
+        const result = await run;
+        // Flush any remaining buffered tokens after generation completes
+        if (rafRef.current !== null) cancelAnimationFrame(rafRef.current);
+        flush();
+        return result;
+      } catch (e) {
+        throw parseUnknownError(e);
+      } finally {
+        pendingRef.current = null;
+        setIsGenerating(false);
+      }
+    },
+    [nativeModule]
+  );
+
+  const interrupt = useCallback(() => {
+    nativeModule?.interrupt();
+  }, [nativeModule]);
+
+  return {
+    isReady,
+    isGenerating,
+    downloadProgress,
+    response,
+    error,
+    generate,
+    interrupt,
+  };
+};
diff --git a/packages/react-native-executorch/src/index.ts b/packages/react-native-executorch/src/index.ts
index dd7557ca2..e544d9cca 100644
--- a/packages/react-native-executorch/src/index.ts
+++ b/packages/react-native-executorch/src/index.ts
@@ -49,6 +49,7 @@ declare global {
   var loadVAD: (source: string) => any;
   var loadTextEmbeddings: (modelSource: string, tokenizerSource: string) => any;
   var loadLLM: (modelSource: string, tokenizerSource: string) => any;
+  var loadMultimodalLLM: (modelSource: string, tokenizerSource: string) => any;
   var loadTextToImage: (
     tokenizerSource: string,
     encoderSource: string,
@@ -97,6 +98,7 @@ if (
   global.loadImageEmbeddings == null ||
   global.loadVAD == null ||
   global.loadLLM == null ||
+  global.loadMultimodalLLM == null ||
   global.loadSpeechToText == null ||
   global.loadTextToSpeechKokoro == null ||
   global.loadOCR == null ||
@@ -121,6 +123,7 @@ export * from './hooks/computer_vision/useImageEmbeddings';
 export * from './hooks/computer_vision/useTextToImage';
 
 export * from './hooks/natural_language_processing/useLLM';
+export * from './hooks/natural_language_processing/useMultimodalLLM';
 export * from './hooks/natural_language_processing/useSpeechToText';
 export * from './hooks/natural_language_processing/useTextToSpeech';
 export * from './hooks/natural_language_processing/useTextEmbeddings';

From 57d96189bd4fa8490bf1d4b1a9778e73cd0bacc3 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 2 Mar 2026 11:13:13 +0100
Subject: [PATCH 02/46] feat: unified LLM runner for text-only and multimodal
 PTEs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add UnifiedRunner that auto-detects PTE layout at load time
  (forward method → text-only, token_embedding+text_decoder → multimodal)
- Merge MultimodalLLM into LLM using UnifiedRunner
- VLMs now have full feature parity: multi-turn, countTextTokens,
  getMaxContextLength, setCountInterval, setTimeInterval
- Remove Runner, MultimodalRunner, MultimodalLLM classes
- Add sendMessageWithImage to LLMController and useLLM hook
- Remove useMultimodalLLM — callers use useLLM with isMultimodal: true
- Migrate multimodal_llm example app to useLLM

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 apps/llm/app/multimodal_llm/index.tsx         |   8 +-
 .../rnexecutorch/RnExecutorchInstaller.cpp    |   6 -
 .../host_objects/ModelHostObject.h            |  45 +-
 .../common/rnexecutorch/models/llm/LLM.cpp    | 152 +++++--
 .../common/rnexecutorch/models/llm/LLM.h      |  15 +-
 .../models/multimodal_llm/MultimodalLLM.cpp   | 197 ---------
 .../models/multimodal_llm/MultimodalLLM.h     |  40 --
 .../common/runner/multimodal_runner.cpp       | 149 -------
 .../common/runner/multimodal_runner.h         |  68 ---
 .../common/runner/runner.cpp                  | 391 ------------------
 .../common/runner/runner.h                    |  87 ----
 .../common/runner/unified_runner.cpp          | 388 +++++++++++++++++
 .../common/runner/unified_runner.h            | 100 +++++
 .../src/controllers/LLMController.ts          | 134 ++++--
 .../natural_language_processing/useLLM.ts     |  13 +-
 .../useMultimodalLLM.ts                       | 153 -------
 packages/react-native-executorch/src/index.ts |   3 -
 .../react-native-executorch/src/types/llm.ts  |  15 +
 yarn.lock                                     |  11 +
 19 files changed, 793 insertions(+), 1182 deletions(-)
 delete mode 100644 packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.cpp
 delete mode 100644 packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.h
 delete mode 100644 packages/react-native-executorch/common/runner/multimodal_runner.cpp
 delete mode 100644 packages/react-native-executorch/common/runner/multimodal_runner.h
 delete mode 100644 packages/react-native-executorch/common/runner/runner.cpp
 delete mode 100644 packages/react-native-executorch/common/runner/runner.h
 create mode 100644 packages/react-native-executorch/common/runner/unified_runner.cpp
 create mode 100644 packages/react-native-executorch/common/runner/unified_runner.h
 delete mode 100644 packages/react-native-executorch/src/hooks/natural_language_processing/useMultimodalLLM.ts

diff --git a/apps/llm/app/multimodal_llm/index.tsx b/apps/llm/app/multimodal_llm/index.tsx
index 3a3b4692b..990f6cf1a 100644
--- a/apps/llm/app/multimodal_llm/index.tsx
+++ b/apps/llm/app/multimodal_llm/index.tsx
@@ -15,7 +15,7 @@ import {
 import * as DocumentPicker from 'expo-document-picker';
 import { launchImageLibrary } from 'react-native-image-picker';
 import { useIsFocused } from '@react-navigation/native';
-import { useMultimodalLLM } from 'react-native-executorch';
+import { useLLM } from 'react-native-executorch';
 import ColorPalette from '../../colors';
 import Spinner from '../../components/Spinner';
 import { GeneratingContext } from '../../context';
@@ -127,7 +127,9 @@ function MultimodalLLMScreen({
   const scrollViewRef = useRef<ScrollView>(null);
   const { setGlobalGenerating } = useContext(GeneratingContext);
 
-  const vlm = useMultimodalLLM({ model: { modelSource, tokenizerSource } });
+  const vlm = useLLM({
+    model: { modelSource, tokenizerSource, isMultimodal: true },
+  });
 
   useEffect(() => {
     setGlobalGenerating(vlm.isGenerating);
@@ -153,7 +155,7 @@ function MultimodalLLMScreen({
     if (!imageUri || !prompt.trim() || !vlm.isReady || vlm.isGenerating) return;
     Keyboard.dismiss();
     try {
-      await vlm.generate(imageUri, prompt.trim());
+      await vlm.sendMessageWithImage(imageUri, prompt.trim());
     } catch (e) {
       console.error('Generation error:', e);
     }
diff --git a/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp b/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp
index 4cb6afef8..9d4b419e2 100644
--- a/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp
@@ -6,7 +6,6 @@
 #include <rnexecutorch/models/embeddings/image/ImageEmbeddings.h>
 #include <rnexecutorch/models/embeddings/text/TextEmbeddings.h>
 #include <rnexecutorch/models/llm/LLM.h>
-#include <rnexecutorch/models/multimodal_llm/MultimodalLLM.h>
 #include <rnexecutorch/models/object_detection/ObjectDetection.h>
 #include <rnexecutorch/models/ocr/OCR.h>
 #include <rnexecutorch/models/semantic_segmentation/BaseSemanticSegmentation.h>
@@ -89,11 +88,6 @@ void RnExecutorchInstaller::injectJSIBindings(
       RnExecutorchInstaller::loadModel<models::llm::LLM>(
           jsiRuntime, jsCallInvoker, "loadLLM"));
 
-  jsiRuntime->global().setProperty(
-      *jsiRuntime, "loadMultimodalLLM",
-      RnExecutorchInstaller::loadModel<models::multimodal_llm::MultimodalLLM>(
-          jsiRuntime, jsCallInvoker, "loadMultimodalLLM"));
-
   jsiRuntime->global().setProperty(
       *jsiRuntime, "loadOCR",
       RnExecutorchInstaller::loadModel<models::ocr::OCR>(
diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
index 2b7cbc2e1..f41da1e45 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
@@ -18,7 +18,6 @@
 #include <rnexecutorch/metaprogramming/TypeConcepts.h>
 #include <rnexecutorch/models/BaseModel.h>
 #include <rnexecutorch/models/llm/LLM.h>
-#include <rnexecutorch/models/multimodal_llm/MultimodalLLM.h>
 #include <rnexecutorch/models/ocr/OCR.h>
 #include <rnexecutorch/models/speech_to_text/SpeechToText.h>
 #include <rnexecutorch/models/text_to_image/TextToImage.h>
@@ -33,11 +32,7 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
   explicit ModelHostObject(const std::shared_ptr<Model> &model,
                            std::shared_ptr<react::CallInvoker> callInvoker)
       : model(model), callInvoker(callInvoker) {
-    // MultimodalLLM moves module_ into its runner during construction, so
-    // the base class methods that go through module_ (forward, getInputShape)
-    // are unsafe to expose. Its unload is registered separately below.
-    if constexpr (meta::DerivedFromOrSameAs<Model, models::BaseModel> &&
-                  !meta::SameAs<Model, models::multimodal_llm::MultimodalLLM>) {
+    if constexpr (meta::DerivedFromOrSameAs<Model, models::BaseModel>) {
       addFunctions(
           JSI_EXPORT_FUNCTION(ModelHostObject<Model>, unload, "unload"));
 
@@ -50,7 +45,9 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
           "getInputShape"));
     }
 
-    if constexpr (meta::HasGenerate<Model>) {
+    // LLM has overloaded generate — handled explicitly in the LLM block below
+    if constexpr (meta::HasGenerate<Model> &&
+                  !meta::SameAs<Model, models::llm::LLM>) {
       addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject<Model>,
                                        promiseHostFunction<&Model::generate>,
                                        "generate"));
@@ -103,6 +100,12 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
     }
 
     if constexpr (meta::SameAs<Model, models::llm::LLM>) {
+      addFunctions(JSI_EXPORT_FUNCTION(
+          ModelHostObject<Model>,
+          promiseHostFunction<static_cast<std::string (Model::*)(
+              std::string, std::shared_ptr<jsi::Function>)>(&Model::generate)>,
+          "generate"));
+
       addFunctions(JSI_EXPORT_FUNCTION(
           ModelHostObject<Model>, synchronousHostFunction<&Model::interrupt>,
           "interrupt"));
@@ -149,6 +152,17 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
       addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject<Model>,
                                        synchronousHostFunction<&Model::reset>,
                                        "reset"));
+
+      addFunctions(JSI_EXPORT_FUNCTION(
+          ModelHostObject<Model>,
+          promiseHostFunction<static_cast<std::string (Model::*)(
+              std::string, std::string, std::shared_ptr<jsi::Function>)>(
+              &Model::generate)>,
+          "generateWithImage"));
+
+      addFunctions(JSI_EXPORT_FUNCTION(
+          ModelHostObject<Model>, synchronousHostFunction<&Model::isMultimodal>,
+          "isMultimodal"));
     }
 
     if constexpr (meta::SameAs<Model, models::text_to_image::TextToImage>) {
@@ -177,23 +191,6 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
           ModelHostObject<Model>, synchronousHostFunction<&Model::streamStop>,
           "streamStop"));
     }
-
-    if constexpr (meta::SameAs<Model, models::multimodal_llm::MultimodalLLM>) {
-      addFunctions(JSI_EXPORT_FUNCTION(
-          ModelHostObject<Model>, synchronousHostFunction<&Model::interrupt>,
-          "interrupt"));
-
-      addFunctions(JSI_EXPORT_FUNCTION(
-          ModelHostObject<Model>,
-          synchronousHostFunction<&Model::setTemperature>, "setTemperature"));
-
-      addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject<Model>,
-                                       synchronousHostFunction<&Model::setTopp>,
-                                       "setTopp"));
-
-      addFunctions(
-          JSI_EXPORT_FUNCTION(ModelHostObject<Model>, unload, "unload"));
-    }
   }
 
   // A generic host function that runs synchronously, works analogously to the
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
index 4a9d40033..66b151faa 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
@@ -3,22 +3,64 @@
 #include <executorch/extension/tensor/tensor.h>
 #include <filesystem>
 #include <rnexecutorch/Error.h>
+#include <rnexecutorch/data_processing/ImageProcessing.h>
 #include <rnexecutorch/threads/GlobalThreadPool.h>
+#include <runner/image.h>
 
 namespace rnexecutorch::models::llm {
 namespace llm = ::executorch::extension::llm;
 namespace fs = std::filesystem;
 using namespace facebook;
-using executorch::extension::TensorPtr;
 using executorch::extension::module::Module;
 using executorch::runtime::Error;
 
+// LFM2-VL vision encoder expects [1, 3, 512, 512] NCHW float32, values [0,255]
+static constexpr int kImageSize = 512;
+static constexpr int kImageChannels = 3;
+
+// LFM2-VL chat template
+static constexpr const char *kChatPrefix = "<|startoftext|><|im_start|>user\n";
+static constexpr const char *kChatSuffix =
+    "<|im_end|>\n<|im_start|>assistant\n";
+
+static llm::Image loadImageForVLM(const std::string &imagePath) {
+  cv::Mat mat = image_processing::readImage(imagePath);
+  cv::resize(mat, mat, cv::Size(kImageSize, kImageSize));
+  cv::cvtColor(mat, mat, cv::COLOR_BGR2RGB);
+
+  std::vector<float> chw(kImageChannels * kImageSize * kImageSize);
+  const int pixelCount = kImageSize * kImageSize;
+  for (int i = 0; i < pixelCount; ++i) {
+    cv::Vec3b px = mat.at<cv::Vec3b>(i / kImageSize, i % kImageSize);
+    for (int c = 0; c < kImageChannels; ++c) {
+      chw[c * pixelCount + i] = static_cast<float>(px[c]);
+    }
+  }
+  return llm::Image(std::move(chw), kImageSize, kImageSize, kImageChannels);
+}
+
 LLM::LLM(const std::string &modelSource, const std::string &tokenizerSource,
          std::shared_ptr<react::CallInvoker> callInvoker)
-    : BaseModel(modelSource, callInvoker, Module::LoadMode::File),
-      runner(
-          std::make_unique<example::Runner>(module_.get(), tokenizerSource)) {
-  auto loadResult = runner->load();
+    : BaseModel(modelSource, callInvoker, Module::LoadMode::File) {
+
+  // Peek at method names to decide text vs multimodal before constructing
+  // runner
+  auto method_names_result = module_->method_names();
+  multimodal_ = method_names_result.ok() &&
+                method_names_result->count(llm::kTokenEmbeddingMethod) > 0 &&
+                method_names_result->count(llm::kTextModelMethod) > 0;
+
+  if (multimodal_) {
+    // Transfer module_ ownership to the runner (same as old MultimodalLLM)
+    runner_ = std::make_unique<example::UnifiedRunner>(
+        nullptr, std::move(module_), tokenizerSource);
+  } else {
+    // Lend module_ as a raw pointer (same as old LLM)
+    runner_ = std::make_unique<example::UnifiedRunner>(module_.get(), nullptr,
+                                                       tokenizerSource);
+  }
+
+  auto loadResult = runner_->load();
   if (loadResult != Error::Ok) {
     throw RnExecutorchError(loadResult, "Failed to load LLM runner");
   }
@@ -27,17 +69,21 @@ LLM::LLM(const std::string &modelSource, const std::string &tokenizerSource,
                          fs::file_size(fs::path(tokenizerSource));
 }
 
-// TODO: add a way to manipulate the generation config with params
+bool LLM::isMultimodal() const noexcept { return multimodal_; }
+
 std::string LLM::generate(std::string input,
                           std::shared_ptr<jsi::Function> callback) {
-  if (!runner || !runner->is_loaded()) {
+  if (!runner_ || !runner_->is_loaded()) {
     throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
                             "Runner is not loaded");
   }
+  if (multimodal_) {
+    throw RnExecutorchError(
+        RnExecutorchErrorCode::InvalidUserInput,
+        "This is a multimodal model. Call generate(imagePath, prompt, cb).");
+  }
 
   std::string output;
-
-  // Create a native callback that accumulates tokens and optionally invokes JS
   auto nativeCallback = [this, callback, &output](const std::string &token) {
     output += token;
     if (callback && callInvoker) {
@@ -48,51 +94,87 @@ std::string LLM::generate(std::string input,
   };
 
   auto config = llm::GenerationConfig{.echo = false, .warming = false};
-  auto error = runner->generate(input, config, nativeCallback, {});
-  if (error != executorch::runtime::Error::Ok) {
+  auto error = runner_->generate(input, config, nativeCallback, {});
+  if (error != Error::Ok) {
     throw RnExecutorchError(error, "Failed to generate text");
   }
+  return output;
+}
+
+std::string LLM::generate(std::string imagePath, std::string prompt,
+                          std::shared_ptr<jsi::Function> callback) {
+  if (!runner_ || !runner_->is_loaded()) {
+    throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
+                            "Runner is not loaded");
+  }
+  if (!multimodal_) {
+    throw RnExecutorchError(
+        RnExecutorchErrorCode::InvalidUserInput,
+        "This is a text-only model. Call generate(prompt, cb).");
+  }
+
+  llm::Image image = loadImageForVLM(imagePath);
+  std::vector<llm::MultimodalInput> inputs = {
+      llm::make_text_input(std::string(kChatPrefix)),
+      llm::make_image_input(std::move(image)),
+      llm::make_text_input(prompt + kChatSuffix),
+  };
+
+  std::string output;
+  auto nativeCallback = [this, &callback, &output](const std::string &token) {
+    output += token;
+    if (callback && callInvoker) {
+      callInvoker->invokeAsync([callback, token](jsi::Runtime &runtime) {
+        callback->call(runtime, jsi::String::createFromUtf8(runtime, token));
+      });
+    }
+  };
+
+  auto error =
+      runner_->generate(inputs, temperature_, topp_, -1, nativeCallback);
+  if (error != Error::Ok) {
+    throw RnExecutorchError(error, "Failed to generate multimodal response");
+  }
 
+  runner_->reset();
   return output;
 }
 
 void LLM::interrupt() {
-  if (!runner || !runner->is_loaded()) {
+  if (!runner_ || !runner_->is_loaded()) {
     throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
                             "Can't interrupt a model that's not loaded");
   }
-  runner->stop();
+  runner_->stop();
 }
 
 void LLM::reset() {
-  if (!runner || !runner->is_loaded()) {
+  if (!runner_ || !runner_->is_loaded()) {
     throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
                             "Can't reset a model that's not loaded");
   }
-  runner->reset();
+  runner_->reset();
 }
 
 size_t LLM::getGeneratedTokenCount() const noexcept {
-  if (!runner || !runner->is_loaded()) {
+  if (!runner_ || !runner_->is_loaded())
     return 0;
-  }
-  return runner->stats_.num_generated_tokens;
+  return runner_->stats_.num_generated_tokens;
 }
 
 size_t LLM::getPromptTokenCount() const noexcept {
-  if (!runner || !runner->is_loaded()) {
+  if (!runner_ || !runner_->is_loaded())
     return 0;
-  }
-  return runner->stats_.num_prompt_tokens;
+  return runner_->stats_.num_prompt_tokens;
 }
 
 int32_t LLM::countTextTokens(std::string text) const {
-  if (!runner || !runner->is_loaded()) {
+  if (!runner_ || !runner_->is_loaded()) {
     throw RnExecutorchError(
         RnExecutorchErrorCode::ModuleNotLoaded,
         "Can't count tokens from a model that's not loaded");
   }
-  return runner->count_text_tokens(text);
+  return runner_->count_text_tokens(text);
 }
 
 size_t LLM::getMemoryLowerBound() const noexcept {
@@ -100,7 +182,7 @@ size_t LLM::getMemoryLowerBound() const noexcept {
 }
 
 void LLM::setCountInterval(size_t countInterval) {
-  if (!runner || !runner->is_loaded()) {
+  if (!runner_ || !runner_->is_loaded()) {
     throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
                             "Can't configure a model that's not loaded");
   }
@@ -108,11 +190,11 @@ void LLM::setCountInterval(size_t countInterval) {
     throw RnExecutorchError(RnExecutorchErrorCode::InvalidConfig,
                             "Count interval must be greater than 0");
   }
-  runner->set_count_interval(countInterval);
+  runner_->set_count_interval(countInterval);
 }
 
 void LLM::setTimeInterval(size_t timeInterval) {
-  if (!runner || !runner->is_loaded()) {
+  if (!runner_ || !runner_->is_loaded()) {
     throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
                             "Can't configure a model that's not loaded");
   }
@@ -120,11 +202,11 @@ void LLM::setTimeInterval(size_t timeInterval) {
     throw RnExecutorchError(RnExecutorchErrorCode::InvalidConfig,
                             "Time interval must be greater than 0");
   }
-  runner->set_time_interval(timeInterval);
+  runner_->set_time_interval(timeInterval);
 }
 
 void LLM::setTemperature(float temperature) {
-  if (!runner || !runner->is_loaded()) {
+  if (!runner_ || !runner_->is_loaded()) {
     throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
                             "Can't configure a model that's not loaded");
   }
@@ -132,11 +214,12 @@ void LLM::setTemperature(float temperature) {
     throw RnExecutorchError(RnExecutorchErrorCode::InvalidConfig,
                             "Temperature must be non-negative");
   }
-  runner->set_temperature(temperature);
+  temperature_ = temperature;
+  runner_->set_temperature(temperature);
 }
 
 void LLM::setTopp(float topp) {
-  if (!runner || !runner->is_loaded()) {
+  if (!runner_ || !runner_->is_loaded()) {
     throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
                             "Can't configure a model that's not loaded");
   }
@@ -144,18 +227,19 @@ void LLM::setTopp(float topp) {
     throw RnExecutorchError(RnExecutorchErrorCode::InvalidConfig,
                             "Top-p must be between 0.0 and 1.0");
   }
-  runner->set_topp(topp);
+  topp_ = topp;
+  runner_->set_topp(topp);
 }
 
 int32_t LLM::getMaxContextLength() const {
-  if (!runner || !runner->is_loaded()) {
+  if (!runner_ || !runner_->is_loaded()) {
     throw RnExecutorchError(
         RnExecutorchErrorCode::ModuleNotLoaded,
         "Can't get context length from a model that's not loaded");
   }
-  return runner->get_max_context_length();
+  return runner_->get_max_context_length();
 }
 
-void LLM::unload() noexcept { runner.reset(nullptr); }
+void LLM::unload() noexcept { runner_.reset(nullptr); }
 
 } // namespace rnexecutorch::models::llm
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
index 99daaf6f5..3763fe924 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
@@ -6,7 +6,7 @@
 #include <ReactCommon/CallInvoker.h>
 #include <jsi/jsi.h>
 #include <rnexecutorch/models/BaseModel.h>
-#include <runner/runner.h>
+#include <runner/unified_runner.h>
 
 namespace rnexecutorch {
 namespace models::llm {
@@ -18,8 +18,16 @@ class LLM : public BaseModel {
                const std::string &tokenizerSource,
                std::shared_ptr<react::CallInvoker> callInvoker);
 
+  // Text-only generate (existing signature — used by LLMController)
   std::string generate(std::string input,
                        std::shared_ptr<jsi::Function> callback);
+
+  // Multimodal generate (image + text prompt)
+  std::string generate(std::string imagePath, std::string prompt,
+                       std::shared_ptr<jsi::Function> callback);
+
+  bool isMultimodal() const noexcept;
+
   void interrupt();
   void reset();
   void unload() noexcept;
@@ -34,7 +42,10 @@ class LLM : public BaseModel {
   int32_t getMaxContextLength() const;
 
 private:
-  std::unique_ptr<example::Runner> runner;
+  std::unique_ptr<example::UnifiedRunner> runner_;
+  bool multimodal_;
+  float temperature_ = 0.8f;
+  float topp_ = 0.9f;
 };
 } // namespace models::llm
 
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.cpp
deleted file mode 100644
index 7187b3d57..000000000
--- a/packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.cpp
+++ /dev/null
@@ -1,197 +0,0 @@
-#include "MultimodalLLM.h"
-
-#include <executorch/extension/module/module.h>
-#include <executorch/extension/tensor/tensor.h>
-#include <filesystem>
-#include <rnexecutorch/Error.h>
-#include <rnexecutorch/data_processing/ImageProcessing.h>
-#include <runner/multimodal_decoder_runner.h>
-#include <runner/multimodal_prefiller.h>
-
-namespace rnexecutorch::models::multimodal_llm {
-namespace llm = ::executorch::extension::llm;
-namespace fs = std::filesystem;
-using namespace facebook;
-using ::executorch::extension::module::Module;
-using ::executorch::runtime::Error;
-
-// LFM2-VL vision encoder expects [1, 3, 512, 512] NCHW float32, values in
-// [0,255]. Normalization and patch unfolding are baked into the exported PTE.
-static constexpr int kImageSize = 512;
-static constexpr int kImageChannels = 3;
-
-// LFM2-VL chat template
-static constexpr const char *kChatPrefix = "<|startoftext|><|im_start|>user\n";
-static constexpr const char *kChatSuffix =
-    "<|im_end|>\n<|im_start|>assistant\n";
-
-static llm::Image loadImageForLFM2(const std::string &imagePath) {
-  cv::Mat mat = image_processing::readImage(imagePath);
-  cv::resize(mat, mat, cv::Size(kImageSize, kImageSize));
-  cv::cvtColor(mat, mat, cv::COLOR_BGR2RGB);
-
-  // HWC uint8 → CHW float32, values in [0, 255]
-  std::vector<float> chw(kImageChannels * kImageSize * kImageSize);
-  const int pixelCount = kImageSize * kImageSize;
-  for (int i = 0; i < pixelCount; ++i) {
-    cv::Vec3b px = mat.at<cv::Vec3b>(i / kImageSize, i % kImageSize);
-    for (int c = 0; c < kImageChannels; ++c) {
-      chw[c * pixelCount + i] = static_cast<float>(px[c]);
-    }
-  }
-  return llm::Image(std::move(chw), kImageSize, kImageSize, kImageChannels);
-}
-
-MultimodalLLM::MultimodalLLM(const std::string &modelSource,
-                             const std::string &tokenizerSource,
-                             std::shared_ptr<react::CallInvoker> callInvoker)
-    : BaseModel(modelSource, callInvoker, Module::LoadMode::File) {
-  // Build the multimodal runner from parts — all referencing module_ owned by
-  // BaseModel so we don't load the PTE twice.
-  auto tokenizer = std::make_unique<tokenizers::HFTokenizer>();
-  auto tokenizer_status = tokenizer->load(tokenizerSource);
-  if (tokenizer_status != tokenizers::Error::Ok) {
-    throw RnExecutorchError(RnExecutorchErrorCode::TokenizerError,
-                            "Failed to load tokenizer");
-  }
-
-  auto io_manager = std::make_unique<llm::IOManager>(*module_);
-  auto decoder_runner = std::make_unique<llm::MultimodalDecoderRunner>(
-      module_.get(), io_manager.get());
-
-  auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>();
-  // Read EOS ids from PTE constant method if present, default to 7 (<|im_end|>)
-  auto method_names_result = module_->method_names();
-  if (method_names_result.ok()) {
-    if (method_names_result->count(llm::kEosIds)) {
-      auto eos_result = module_->execute(llm::kEosIds);
-      if (eos_result.ok()) {
-        for (const auto &ev : *eos_result) {
-          eos_ids->emplace(static_cast<uint64_t>(ev.toScalar().to<int64_t>()));
-        }
-      }
-    }
-  }
-  if (eos_ids->empty()) {
-    eos_ids->emplace(7); // <|im_end|> fallback
-  }
-
-  auto stats = std::make_unique<llm::Stats>();
-  // Keep a raw pointer before moving into the runner so TextTokenGenerator
-  // can safely reference the same Stats object owned by the runner.
-  llm::Stats *stats_ptr = stats.get();
-  auto token_generator = std::make_unique<llm::TextTokenGenerator>(
-      tokenizer.get(), decoder_runner.get(), /*use_kv_cache=*/true,
-      std::move(eos_ids), stats_ptr);
-
-  auto prefiller = std::make_unique<llm::MultimodalPrefiller>(
-      module_.get(), decoder_runner.get(), tokenizer.get(), io_manager.get());
-
-  // Read metadata from the PTE
-  std::unordered_map<std::string, int64_t> metadata = {
-      {llm::kMaxSeqLen, 2048},
-      {llm::kMaxContextLen, 2048},
-  };
-  if (method_names_result.ok()) {
-    for (auto &pair : metadata) {
-      if (method_names_result->count(pair.first)) {
-        auto val = module_->get(pair.first);
-        if (val.ok()) {
-          pair.second = val->toScalar().to<int64_t>();
-        }
-      }
-    }
-  }
-
-  runner_ = std::make_unique<llm::MultimodalRunner>(
-      std::move(metadata), std::move(tokenizer), std::move(module_),
-      std::move(decoder_runner), std::move(prefiller), std::move(io_manager),
-      std::move(token_generator), std::move(stats));
-
-  auto loadError = runner_->load();
-  if (loadError != Error::Ok) {
-    throw RnExecutorchError(loadError, "Failed to load multimodal runner");
-  }
-
-  memorySizeLowerBound = fs::file_size(fs::path(modelSource)) +
-                         fs::file_size(fs::path(tokenizerSource));
-}
-
-std::string MultimodalLLM::generate(std::string imagePath, std::string prompt,
-                                    std::shared_ptr<jsi::Function> callback) {
-  if (!runner_) {
-    throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
-                            "Runner is not loaded");
-  }
-
-  llm::Image image = loadImageForLFM2(imagePath);
-
-  std::vector<llm::MultimodalInput> inputs = {
-      llm::make_text_input(std::string(kChatPrefix)),
-      llm::make_image_input(std::move(image)),
-      llm::make_text_input(prompt + kChatSuffix),
-  };
-
-  std::string output;
-  auto nativeCallback = [this, &callback, &output](const std::string &token) {
-    output += token;
-    if (callback && callInvoker) {
-      callInvoker->invokeAsync([callback, token](jsi::Runtime &runtime) {
-        callback->call(runtime, jsi::String::createFromUtf8(runtime, token));
-      });
-    }
-  };
-
-  auto error = runner_->generate(inputs, temperature_, topp_,
-                                 /*max_new_tokens=*/-1, nativeCallback);
-  if (error != Error::Ok) {
-    throw RnExecutorchError(error, "Failed to generate text");
-  }
-
-  runner_->reset();
-  return output;
-}
-
-void MultimodalLLM::interrupt() {
-  if (!runner_) {
-    throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
-                            "Can't interrupt a model that's not loaded");
-  }
-  runner_->stop();
-}
-
-size_t MultimodalLLM::getGeneratedTokenCount() const noexcept {
-  if (!runner_)
-    return 0;
-  return static_cast<size_t>(runner_->stats().num_generated_tokens);
-}
-
-size_t MultimodalLLM::getPromptTokenCount() const noexcept {
-  if (!runner_)
-    return 0;
-  return static_cast<size_t>(runner_->stats().num_prompt_tokens);
-}
-
-size_t MultimodalLLM::getMemoryLowerBound() const noexcept {
-  return memorySizeLowerBound;
-}
-
-void MultimodalLLM::setTemperature(float temperature) {
-  if (temperature < 0.0f) {
-    throw RnExecutorchError(RnExecutorchErrorCode::InvalidConfig,
-                            "Temperature must be non-negative");
-  }
-  temperature_ = temperature;
-}
-
-void MultimodalLLM::setTopp(float topp) {
-  if (topp < 0.0f || topp > 1.0f) {
-    throw RnExecutorchError(RnExecutorchErrorCode::InvalidConfig,
-                            "Top-p must be between 0.0 and 1.0");
-  }
-  topp_ = topp;
-}
-
-void MultimodalLLM::unload() noexcept { runner_.reset(nullptr); }
-
-} // namespace rnexecutorch::models::multimodal_llm
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.h b/packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.h
deleted file mode 100644
index 6b9f8698c..000000000
--- a/packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.h
+++ /dev/null
@@ -1,40 +0,0 @@
-#pragma once
-
-#include <memory>
-#include <string>
-
-#include <ReactCommon/CallInvoker.h>
-#include <jsi/jsi.h>
-#include <rnexecutorch/models/BaseModel.h>
-#include <runner/multimodal_runner.h>
-
-namespace rnexecutorch {
-namespace models::multimodal_llm {
-using namespace facebook;
-
-class MultimodalLLM : public BaseModel {
-public:
-  explicit MultimodalLLM(const std::string &modelSource,
-                         const std::string &tokenizerSource,
-                         std::shared_ptr<react::CallInvoker> callInvoker);
-
-  std::string generate(std::string imagePath, std::string prompt,
-                       std::shared_ptr<jsi::Function> callback);
-  void interrupt();
-  void unload() noexcept;
-  size_t getGeneratedTokenCount() const noexcept;
-  size_t getPromptTokenCount() const noexcept;
-  size_t getMemoryLowerBound() const noexcept;
-  void setTemperature(float temperature);
-  void setTopp(float topp);
-
-private:
-  float temperature_ = 0.8f;
-  float topp_ = 0.9f;
-  std::unique_ptr<::executorch::extension::llm::MultimodalRunner> runner_;
-};
-} // namespace models::multimodal_llm
-
-REGISTER_CONSTRUCTOR(models::multimodal_llm::MultimodalLLM, std::string,
-                     std::string, std::shared_ptr<react::CallInvoker>);
-} // namespace rnexecutorch
diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.cpp b/packages/react-native-executorch/common/runner/multimodal_runner.cpp
deleted file mode 100644
index 842b96c72..000000000
--- a/packages/react-native-executorch/common/runner/multimodal_runner.cpp
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-// Ported from executorch/extension/llm/runner/multimodal_runner.cpp
-
-#include "multimodal_runner.h"
-#include "constants.h"
-#include "util.h"
-#include <rnexecutorch/Error.h>
-
-namespace executorch {
-namespace extension {
-namespace llm {
-
-using ::executorch::extension::Module;
-using ::executorch::runtime::Error;
-
-MultimodalRunner::MultimodalRunner(
-    std::unordered_map<std::string, int64_t> metadata,
-    std::unique_ptr<tokenizers::HFTokenizer> tokenizer,
-    std::unique_ptr<Module> module,
-    std::unique_ptr<MultimodalDecoderRunner> decoder_runner,
-    std::unique_ptr<MultimodalPrefiller> prefiller,
-    std::unique_ptr<IOManager> io_manager,
-    std::unique_ptr<TextTokenGenerator> token_generator,
-    std::unique_ptr<Stats> stats)
-    : metadata_(std::move(metadata)), tokenizer_(std::move(tokenizer)),
-      module_(std::move(module)), decoder_runner_(std::move(decoder_runner)),
-      prefiller_(std::move(prefiller)), io_manager_(std::move(io_manager)),
-      token_generator_(std::move(token_generator)), stats_(std::move(stats)),
-      pos_(0) {}
-
-bool MultimodalRunner::is_loaded() {
-  return prefiller_->is_method_loaded() && token_generator_->is_loaded();
-}
-
-Error MultimodalRunner::load() {
-  if (is_loaded()) {
-    return Error::Ok;
-  }
-  ET_CHECK_OK_OR_RETURN_ERROR(prefiller_->load());
-  ET_CHECK_OK_OR_RETURN_ERROR(token_generator_->load());
-  return Error::Ok;
-}
-
-Error MultimodalRunner::generate(
-    const std::vector<MultimodalInput> &inputs, float temperature, float topp,
-    int32_t max_new_tokens,
-    std::function<void(const std::string &)> token_callback) {
-  if (inputs.empty()) {
-    ET_LOG(Error, "MultimodalInput vector cannot be empty");
-    return Error::InvalidArgument;
-  }
-
-  if (!is_loaded()) {
-    ET_CHECK_OK_OR_RETURN_ERROR(load());
-  }
-
-  stats_->inference_start_ms = time_in_ms();
-
-  // Prefill all input segments in order.
-  uint64_t prefill_next_token = 0;
-  for (size_t i = 0; i < inputs.size(); ++i) {
-    ET_LOG(Info, "Prefilling input %zu/%zu", i + 1, inputs.size());
-    auto prefill_result = prefiller_->prefill(inputs[i], pos_);
-    if (!prefill_result.ok()) {
-      return prefill_result.error();
-    }
-    prefill_next_token = prefill_result.get();
-  }
-
-  stats_->first_token_ms = time_in_ms();
-  stats_->prompt_eval_end_ms = time_in_ms();
-  stats_->num_prompt_tokens = pos_;
-
-  // Decode and emit the first token from prefill.
-  auto decode_result =
-      tokenizer_->decode(prefill_next_token, prefill_next_token);
-  if (!decode_result.ok()) {
-    ET_LOG(Error, "Tokenizer decode error %d",
-           static_cast<uint32_t>(decode_result.error()));
-    return Error::InvalidArgument;
-  }
-  const std::string first_piece = std::move(*decode_result);
-  safe_printf(first_piece.c_str());
-  fflush(stdout);
-  if (token_callback) {
-    token_callback(first_piece);
-  }
-
-  // Resolve max_new_tokens from metadata if caller passed -1.
-  int64_t context_len = metadata_.count(kMaxContextLen)
-                            ? metadata_.at(kMaxContextLen)
-                        : metadata_.count(kMaxSeqLen) ? metadata_.at(kMaxSeqLen)
-                                                      : 2048;
-  int32_t resolved_max_new = max_new_tokens > 0
-                                 ? max_new_tokens
-                                 : static_cast<int32_t>(context_len - pos_);
-  resolved_max_new = std::max(0, resolved_max_new);
-
-  // Autoregressive decode loop.
-  std::vector<uint64_t> prompt_tokens = {prefill_next_token};
-  auto wrapped_callback = [&](const std::string &piece) {
-    safe_printf(piece.c_str());
-    fflush(stdout);
-    if (token_callback) {
-      token_callback(piece);
-    }
-  };
-
-  auto generate_result = token_generator_->generate(
-      prompt_tokens, pos_,
-      static_cast<uint64_t>(std::max(0, resolved_max_new - 1)), temperature,
-      topp, wrapped_callback);
-
-  if (!generate_result.ok()) {
-    return generate_result.error();
-  }
-
-  int64_t num_generated = generate_result.get();
-  pos_ += num_generated;
-
-  stats_->inference_end_ms = time_in_ms();
-  stats_->num_generated_tokens = num_generated;
-
-  return Error::Ok;
-}
-
-void MultimodalRunner::stop() {
-  if (token_generator_) {
-    token_generator_->stop();
-  }
-}
-
-void MultimodalRunner::reset() {
-  pos_ = 0;
-  if (stats_) {
-    stats_->reset();
-  }
-}
-
-} // namespace llm
-} // namespace extension
-} // namespace executorch
diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.h b/packages/react-native-executorch/common/runner/multimodal_runner.h
deleted file mode 100644
index c8007a67e..000000000
--- a/packages/react-native-executorch/common/runner/multimodal_runner.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-// Ported from executorch/extension/llm/runner/multimodal_runner.h
-
-#pragma once
-
-#include "multimodal_decoder_runner.h"
-#include "multimodal_input.h"
-#include "multimodal_prefiller.h"
-#include "stats.h"
-#include "text_token_generator.h"
-#include <executorch/extension/module/module.h>
-#include <functional>
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-namespace executorch {
-namespace extension {
-namespace llm {
-
-class MultimodalRunner {
-public:
-  explicit MultimodalRunner(
-      std::unordered_map<std::string, int64_t> metadata,
-      std::unique_ptr<tokenizers::HFTokenizer> tokenizer,
-      std::unique_ptr<Module> module,
-      std::unique_ptr<MultimodalDecoderRunner> decoder_runner,
-      std::unique_ptr<MultimodalPrefiller> prefiller,
-      std::unique_ptr<IOManager> io_manager,
-      std::unique_ptr<TextTokenGenerator> token_generator,
-      std::unique_ptr<Stats> stats);
-
-  bool is_loaded();
-  ::executorch::runtime::Error load();
-
-  ::executorch::runtime::Error
-  generate(const std::vector<MultimodalInput> &inputs, float temperature,
-           float topp, int32_t max_new_tokens,
-           std::function<void(const std::string &)> token_callback = {});
-
-  void stop();
-  void reset();
-
-  Stats &stats() { return *stats_; }
-
-private:
-  std::unordered_map<std::string, int64_t> metadata_;
-  std::unique_ptr<tokenizers::HFTokenizer> tokenizer_;
-  std::unique_ptr<Module> module_;
-  std::unique_ptr<MultimodalDecoderRunner> decoder_runner_;
-  std::unique_ptr<MultimodalPrefiller> prefiller_;
-  std::unique_ptr<IOManager> io_manager_;
-  std::unique_ptr<TextTokenGenerator> token_generator_;
-  std::unique_ptr<Stats> stats_;
-  int64_t pos_ = 0;
-};
-
-} // namespace llm
-} // namespace extension
-} // namespace executorch
diff --git a/packages/react-native-executorch/common/runner/runner.cpp b/packages/react-native-executorch/common/runner/runner.cpp
deleted file mode 100644
index 8e4660ac5..000000000
--- a/packages/react-native-executorch/common/runner/runner.cpp
+++ /dev/null
@@ -1,391 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- * @lint-ignore-every CLANGTIDY facebook-hte-Deprecated
- */
-
-// A simple llama2 runner that includes preprocessing and post processing logic.
-// The module takes in a string as input and emits a string as output.
-
-#include "runner.h"
-#include "constants.h"
-#include "util.h"
-#include <cstdint>
-#include <ctime>
-#include <rnexecutorch/Error.h>
-
-namespace example {
-
-using namespace executorch::extension::llm;
-using ::executorch::extension::Module;
-using ::executorch::runtime::Error;
-using ::executorch::runtime::Result;
-
-Runner::Runner(Module *module, const std::string &tokenizer_path,
-               const llm::GenerationConfig &config)
-    : config_(config), module_(module), tokenizer_path_(tokenizer_path),
-      tokenizer_(std::make_unique<tokenizers::HFTokenizer>()),
-      metadata_({
-          {kEnableDynamicShape, false},
-          {kMaxSeqLen, 128},
-          {kMaxContextLen, 128},
-          {kUseKVCache, true},
-          {kUseSDPAWithKVCache, false},
-      }) {}
-
-bool Runner::is_loaded() const {
-  return module_->is_loaded() && tokenizer_->is_loaded() &&
-         text_decoder_runner_ && text_prefiller_ && text_token_generator_;
-}
-
-Error Runner::load() {
-  if (is_loaded()) {
-    return Error::Ok;
-  }
-
-  auto status = tokenizer_->load(tokenizer_path_);
-
-  if (status != tokenizers::Error::Ok) {
-    throw rnexecutorch::RnExecutorchError(
-        rnexecutorch::RnExecutorchErrorCode::TokenizerError,
-        "Unexpected issue occured while loading tokenizer");
-  };
-
-  ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("forward"));
-
-  ET_LOG(Info, "Reading metadata from model");
-
-  auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>();
-  metadata_[kVocabSize] = tokenizer_->vocab_size();
-
-  // Load model metadata
-  const auto method_names =
-      ET_UNWRAP(module_->method_names(), "Failed reading method names");
-  for (auto &pair : metadata_) {
-    const auto &method_name = pair.first;
-    auto &value = pair.second;
-    if (method_names.count(method_name)) {
-      value = ET_UNWRAP(module_->get(method_name))
-                  .toScalar()
-                  .to<decltype(metadata_)::mapped_type>();
-    } else {
-      ET_LOG(Info, "Method %s not found, using the default value %" PRId64,
-             method_name.c_str(), value);
-    }
-    ET_LOG(Info, "Metadata: %s = %" PRId64, method_name.c_str(), value);
-  }
-
-  // Load EOS token ids
-  if (method_names.count(kEosIds)) {
-    eos_ids->clear();
-    for (const auto &eos_id : ET_UNWRAP(module_->execute(kEosIds))) {
-      auto value = eos_id.toScalar().to<int64_t>();
-      eos_ids->emplace(value);
-      ET_LOG(Info, "eos_id = %" PRId64, value);
-    }
-  }
-
-  // Determine missing config values
-  // If user does not directly specify configuration parameters such as
-  // max_seq_len (i.e. leaves them as default values), they are determined by
-  // reading the exported model's methods.
-  if (config_.max_seq_len < 0)
-    config_.max_seq_len = static_cast<int32_t>(metadata_.at(kMaxSeqLen));
-  if (config_.max_context_length < 0)
-    config_.max_context_length =
-        static_cast<int32_t>(metadata_.at(kMaxContextLen));
-  if (config_.max_new_tokens < 0)
-    config_.max_new_tokens =
-        std::min(config_.max_seq_len, config_.max_context_length);
-  if (config_.enable_dynamic_shape)
-    config_.enable_dynamic_shape =
-        static_cast<bool>(metadata_.at(kEnableDynamicShape));
-  if (config_.enable_kv_cache)
-    config_.enable_kv_cache = static_cast<bool>(metadata_.at(kUseKVCache));
-
-  io_manager_ = std::make_unique<llm::IOManager>(*module_);
-  text_decoder_runner_ = std::make_unique<llm::TextDecoderRunner>(
-      module_, io_manager_.get(), config_.temperature, config_.topp);
-  text_prefiller_ = std::make_unique<llm::TextPrefiller>(
-      text_decoder_runner_.get(), config_.enable_kv_cache,
-      config_.enable_dynamic_shape, config_.max_seq_len);
-
-  text_token_generator_ = std::make_unique<llm::TextTokenGenerator>(
-      tokenizer_.get(), text_decoder_runner_.get(), config_.enable_kv_cache,
-      std::move(eos_ids), &stats_);
-
-  return Error::Ok;
-}
-
-// Don't print with the same priority during warmup
-#define RUNNER_ET_LOG(warmup, format, ...)                                     \
-  if (warmup) {                                                                \
-    ET_LOG(Debug, format, __VA_ARGS__);                                        \
-  } else {                                                                     \
-    ET_LOG(Info, format, __VA_ARGS__);                                         \
-  }
-
-Error Runner::generate(const std::string &prompt,
-                       const llm::GenerationConfig &generation_config,
-                       std::function<void(const std::string &)> token_callback,
-                       std::function<void(const llm::Stats &)> stats_callback) {
-  // Prepare the inputs.
-  // Use ones-initialized inputs.
-  ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null");
-  if (!is_loaded()) {
-    stats_.model_load_start_ms = llm::time_in_ms();
-    ET_CHECK_OK_OR_RETURN_ERROR(load());
-    stats_.model_load_end_ms = llm::time_in_ms();
-  }
-
-  if (generation_config.warming) {
-    ET_LOG(Info, "Doing a warmup run...");
-  }
-
-  RUNNER_ET_LOG(generation_config.warming,
-                "RSS after loading model: %f MiB (0 if unsupported)",
-                llm::get_rss_bytes() / 1024.0 / 1024.0);
-
-  // Wrap the token_callback with print function
-  std::function<void(const std::string &)> wrapped_callback =
-      [token_callback, &generation_config](const std::string &piece) {
-        if (!generation_config.warming) {
-          llm::safe_printf(piece.c_str());
-          fflush(stdout);
-        }
-        if (token_callback) {
-          token_callback(piece);
-        }
-      };
-  // First token time only measures the time it takes to encode the prompt and
-  // return a response token.
-
-  stats_.inference_start_ms = llm::time_in_ms();
-  shouldStop_ = false;
-
-  // Override main config fields with given generation config if specified
-  int32_t max_seq_len = generation_config.max_seq_len >= 0
-                            ? generation_config.max_seq_len
-                            : config_.max_seq_len;
-  int32_t max_context_length = generation_config.max_context_length >= 0
-                                   ? generation_config.max_context_length
-                                   : config_.max_context_length;
-  int32_t new_tokens_limit = generation_config.max_new_tokens >= 0
-                                 ? generation_config.max_new_tokens
-                                 : config_.max_new_tokens;
-  float temperature = generation_config.temperature >= 0.F
-                          ? generation_config.temperature
-                          : config_.temperature;
-  float topp =
-      generation_config.topp >= 0.F ? generation_config.topp : config_.topp;
-
-  int64_t context_len_left = static_cast<int64_t>(max_context_length) - pos_;
-
-  // If the used tokenizer.json has defined post_processor field,
-  // setting any of bos or eos arguments to value other than provided constant
-  // ( which is 0) will result in running the post_processor with
-  // 'add_special_token' flag
-  auto encodeResult =
-      tokenizer_->encode(prompt, numOfAddedBoSTokens, numOfAddedEoSTokens);
-  if (!encodeResult.ok()) {
-    throw rnexecutorch::RnExecutorchError(
-        rnexecutorch::RnExecutorchErrorCode::TokenizerError,
-        "Unexpected issue occured while encoding: " +
-            std::to_string(static_cast<int32_t>(encodeResult.error())));
-  }
-  std::vector<uint64_t> prompt_tokens = encodeResult.get();
-
-  std::vector<uint64_t> prompt_tokens_uint64(prompt_tokens.begin(),
-                                             prompt_tokens.end());
-
-  // encode the (string) prompt into tokens sequence
-  int num_prompt_tokens = prompt_tokens.size();
-
-  ET_CHECK_OR_RETURN_ERROR(num_prompt_tokens >= 1, InvalidArgument,
-                           "Expected at least 1 prompt token");
-  ET_CHECK_OR_RETURN_ERROR(num_prompt_tokens < max_seq_len, InvalidArgument,
-                           "num_prompt_tokens %d >= max_context_len %" PRId32
-                           ", Max seq length exceeded - please increase max "
-                           "seq len value in your export script",
-                           num_prompt_tokens, max_seq_len);
-
-  // Determine max_new_tokens using the GenerationConfig's resolve method,
-  // then subtract pos_ for max_new_tokens.
-  int32_t max_new_tokens = resolve_max_new_tokens(
-      num_prompt_tokens, max_seq_len, static_cast<int32_t>(context_len_left),
-      new_tokens_limit);
-
-  ET_LOG(Info,
-         "Max new tokens resolved: %d, given pos_ %" PRId64
-         ", num_prompt_tokens %zu, max_context_len %" PRId64,
-         max_new_tokens, pos_, prompt_tokens.size(),
-         static_cast<int64_t>(max_context_length));
-  ET_CHECK_OR_RETURN_ERROR(max_new_tokens > 0, InvalidArgument,
-                           "Max new tokens %d is less than or equal to 0",
-                           max_new_tokens);
-
-  // Prefill first
-  // Here feed all tokens to the model and get the next predicted token
-  // after the prompt. After that we will enter generate loop.
-
-  // print prompts
-  if (generation_config.echo) {
-    wrapped_callback(prompt);
-  }
-  auto prefill_res = text_prefiller_->prefill(prompt_tokens_uint64, pos_);
-  stats_.first_token_ms = llm::time_in_ms();
-  stats_.prompt_eval_end_ms = llm::time_in_ms();
-  ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error());
-  uint64_t cur_token = prefill_res.get();
-  auto decodeResult = tokenizer_->decode({cur_token});
-  if (!decodeResult.ok()) {
-    throw rnexecutorch::RnExecutorchError(
-        rnexecutorch::RnExecutorchErrorCode::TokenizerError,
-        "Unexpected issue occured while decoding: " +
-            std::to_string(static_cast<int32_t>(decodeResult.error())));
-  }
-  const std::string cur_decoded = decodeResult.get();
-  RUNNER_ET_LOG(generation_config.warming,
-                "RSS after prompt prefill: %f MiB (0 if unsupported)",
-                llm::get_rss_bytes() / 1024.0 / 1024.0);
-
-  // start the main loop
-  prompt_tokens_uint64.push_back(cur_token);
-  int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate(
-      prompt_tokens_uint64, pos_, max_new_tokens - 1, temperature, topp,
-      wrapped_callback));
-
-  pos_ += num_generated_tokens;
-
-  stats_.inference_end_ms = llm::time_in_ms();
-  if (!generation_config.warming) {
-    printf("\n");
-  }
-  RUNNER_ET_LOG(
-      generation_config.warming,
-      "RSS after finishing text generation: %f MiB (0 if unsupported)",
-      llm::get_rss_bytes() / 1024.0 / 1024.0);
-
-  if (num_generated_tokens == max_new_tokens) {
-    RUNNER_ET_LOG(generation_config.warming, "Max new tokens %i reached!",
-                  max_new_tokens);
-  }
-
-  stats_.num_prompt_tokens = num_prompt_tokens;
-  stats_.num_generated_tokens = num_generated_tokens;
-
-  if (generation_config.warming) {
-    ET_LOG(Info, "Warmup run finished!");
-  } else {
-    // Do not print report during warmup
-#ifndef TEST_BUILD
-    ::executorch::llm::print_report(stats_);
-#endif
-  }
-  if (stats_callback) {
-    stats_callback(stats_);
-  }
-
-  return Error::Ok;
-}
-
-Error Runner::warmup(const std::string &prompt) {
-  // Create a GenerationConfig for warmup
-  llm::GenerationConfig config{.echo = false, .warming = true};
-
-  // Call generate with the warmup config
-  Error err = generate(prompt, config,
-                       /*token_callback=*/nullptr,
-                       /*stats_callbak=*/nullptr);
-
-  // Reset stats after warmup
-  reset();
-
-  return err;
-}
-
-void Runner::stop() {
-  if (is_loaded()) {
-    text_token_generator_->stop();
-  } else {
-    ET_LOG(Error, "Token generator is not loaded, cannot stop");
-  }
-}
-
-void Runner::reset() {
-  stats_.reset();
-  pos_ = 0;
-}
-
-void Runner::set_count_interval(size_t count_interval) {
-  text_token_generator_->set_count_interval(count_interval);
-}
-
-void Runner::set_time_interval(size_t time_interval) {
-  text_token_generator_->set_time_interval(time_interval);
-}
-
-void Runner::set_temperature(float temperature) noexcept {
-  config_.temperature = temperature;
-  if (text_decoder_runner_) {
-    text_decoder_runner_->set_temperature(temperature);
-  }
-}
-
-void Runner::set_topp(float topp) noexcept {
-  config_.topp = topp;
-  if (text_decoder_runner_) {
-    text_decoder_runner_->set_topp(topp);
-  }
-}
-
-int32_t Runner::get_max_context_length() const {
-  if (!is_loaded()) {
-    return metadata_.at(kMaxContextLen);
-  }
-  return config_.max_context_length;
-}
-
-int32_t Runner::count_text_tokens(const std::string &text) const {
-  auto encodeResult =
-      tokenizer_->encode(text, numOfAddedBoSTokens, numOfAddedEoSTokens);
-
-  if (!encodeResult.ok()) {
-    throw rnexecutorch::RnExecutorchError(
-        rnexecutorch::RnExecutorchErrorCode::TokenizerError,
-        "Encoding failed during token count check.");
-  }
-
-  return encodeResult.get().size();
-}
-
-int32_t Runner::resolve_max_new_tokens(int32_t num_prompt_tokens,
-                                       int32_t max_seq_len,
-                                       int32_t max_context_len,
-                                       int32_t max_new_tokens) const {
-  int32_t result;
-
-  if (max_seq_len == -1 && max_new_tokens == -1) {
-    // Both are -1, use max context len minus prompt tokens
-    result = max_context_len - num_prompt_tokens;
-  } else if (max_seq_len == -1 && max_new_tokens != -1) {
-    // Only max_new_tokens is specified
-    result = std::min(max_new_tokens, max_context_len - num_prompt_tokens);
-  } else if (max_seq_len != -1 && max_new_tokens == -1) {
-    // Only seq_len is specified
-    result = std::min(max_seq_len, max_context_len) - num_prompt_tokens;
-  } else {
-    // Both are specified
-    result =
-        std::min(std::min(max_seq_len, max_context_len) - num_prompt_tokens,
-                 max_new_tokens);
-  }
-
-  // Ensure result is not negative
-  return std::max(0, result);
-}
-
-} // namespace example
diff --git a/packages/react-native-executorch/common/runner/runner.h b/packages/react-native-executorch/common/runner/runner.h
deleted file mode 100644
index 03dff39bc..000000000
--- a/packages/react-native-executorch/common/runner/runner.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-// A simple llama2 runner that includes preprocessing and post processing logic.
-// The module takes in a string as input and emits a string as output.
-
-#pragma once
-
-#include "irunner.h"
-#include "stats.h"
-#include "text_decoder_runner.h"
-#include "text_prefiller.h"
-#include "text_token_generator.h"
-#include <cstdint>
-#include <executorch/extension/module/module.h>
-#include <functional>
-#include <memory>
-#include <optional>
-#include <pytorch/tokenizers/hf_tokenizer.h>
-#include <string>
-#include <unordered_map>
-
-namespace example {
-
-namespace llm = ::executorch::extension::llm;
-
-class Runner : public llm::IRunner {
-public:
-  explicit Runner(::executorch::extension::Module *module,
-                  const std::string &tokenizer_path,
-                  const llm::GenerationConfig &config = {
-                      .temperature = 0.8F, .topp = 0.9F}); // The main config
-
-  bool is_loaded() const override;
-  ::executorch::runtime::Error load() override;
-  ::executorch::runtime::Error generate(
-      const std::string &prompt,
-      const llm::GenerationConfig &generation_config =
-          {}, // An extra config which temporarily overrides previous model
-              // settings
-      std::function<void(const std::string &)> token_callback = {},
-      std::function<void(const llm::Stats &)> stats_callback = {}) override;
-  ::executorch::runtime::Error warmup(const std::string &prompt);
-  void set_count_interval(size_t count_interval);
-  void set_time_interval(size_t time_interval);
-  void set_temperature(float temperature) noexcept;
-  void set_topp(float topp) noexcept;
-  int32_t count_text_tokens(const std::string &text) const;
-  int32_t get_max_context_length() const;
-
-  void stop() override;
-  void reset() override;
-
-  llm::Stats stats_;
-
-private:
-  // Helper functions
-  int32_t resolve_max_new_tokens(int32_t num_prompt_tokens, int32_t max_seq_len,
-                                 int32_t max_context_len,
-                                 int32_t max_new_tokens = -1) const;
-
-  // Main config
-  llm::GenerationConfig config_;
-
-  // Flow control
-  bool shouldStop_{false};
-  int64_t pos_ = 0; // The position in KV cache of the input, starting from 0.
-
-  // Main model
-  ::executorch::extension::Module *module_;
-
-  // Subcomponents
-  std::string tokenizer_path_;
-  std::unique_ptr<tokenizers::HFTokenizer> tokenizer_;
-  std::unordered_map<std::string, int64_t> metadata_;
-  std::unique_ptr<llm::IOManager> io_manager_;
-  std::unique_ptr<llm::TextDecoderRunner> text_decoder_runner_;
-  std::unique_ptr<llm::TextPrefiller> text_prefiller_;
-  std::unique_ptr<llm::TextTokenGenerator> text_token_generator_;
-};
-
-} // namespace example
diff --git a/packages/react-native-executorch/common/runner/unified_runner.cpp b/packages/react-native-executorch/common/runner/unified_runner.cpp
new file mode 100644
index 000000000..98955d593
--- /dev/null
+++ b/packages/react-native-executorch/common/runner/unified_runner.cpp
@@ -0,0 +1,388 @@
+// packages/react-native-executorch/common/runner/unified_runner.cpp
+#include "unified_runner.h"
+#include "constants.h"
+#include "util.h"
+#include <cstdint>
+#include <ctime>
+#include <rnexecutorch/Error.h>
+
+namespace example {
+
+using namespace executorch::extension::llm;
+using ::executorch::extension::Module;
+using ::executorch::runtime::Error;
+using ::executorch::runtime::Result;
+// Uses `owned_module` when non-null, otherwise the borrowed `module` pointer.
+UnifiedRunner::UnifiedRunner(Module *module,
+                             std::unique_ptr<Module> owned_module,
+                             const std::string &tokenizer_path,
+                             const llm::GenerationConfig &config)
+    : config_(config), module_(owned_module ? owned_module.get() : module),
+      owned_module_(std::move(owned_module)), tokenizer_path_(tokenizer_path),
+      tokenizer_(std::make_unique<tokenizers::HFTokenizer>()),
+      metadata_({ // defaults; load() overrides keys the module exports
+          {kEnableDynamicShape, false},
+          {kMaxSeqLen, 128},
+          {kMaxContextLen, 128},
+          {kUseKVCache, true},
+          {kUseSDPAWithKVCache, false},
+      }) {} // NOTE(review): module_ must be declared before owned_module_
+// Returns the mode flag that load() derives from the module's method names.
+bool UnifiedRunner::is_multimodal() const noexcept { return multimodal_; }
+// True when every component the active mode relies on is constructed/loaded.
+bool UnifiedRunner::is_loaded() const {
+  if (!multimodal_) {
+    return module_->is_loaded() && tokenizer_->is_loaded() &&
+           text_decoder_runner_ && text_prefiller_ && text_token_generator_;
+  }
+  return mm_prefiller_ && mm_prefiller_->is_method_loaded() &&
+         mm_token_generator_ && mm_token_generator_->is_loaded();
+}
+// Loads tokenizer and metadata, picks the mode, and builds the runner parts.
+Error UnifiedRunner::load() {
+  if (is_loaded()) {
+    return Error::Ok;
+  }
+
+  auto status = tokenizer_->load(tokenizer_path_);
+  if (status != tokenizers::Error::Ok) { // throws instead of returning Error
+    throw rnexecutorch::RnExecutorchError(
+        rnexecutorch::RnExecutorchErrorCode::TokenizerError,
+        "Unexpected issue occurred while loading tokenizer");
+  }
+
+  // Multimodal iff both kTokenEmbeddingMethod and kTextModelMethod exist
+  const auto method_names =
+      ET_UNWRAP(module_->method_names(), "Failed reading method names");
+
+  multimodal_ = method_names.count(kTokenEmbeddingMethod) > 0 &&
+                method_names.count(kTextModelMethod) > 0;
+
+  // Seed vocab size from the tokenizer; exported keys override the defaults
+  metadata_[kVocabSize] = tokenizer_->vocab_size();
+  for (auto &pair : metadata_) {
+    const auto &method_name = pair.first;
+    auto &value = pair.second;
+    if (method_names.count(method_name)) {
+      value = ET_UNWRAP(module_->get(method_name))
+                  .toScalar()
+                  .to<decltype(metadata_)::mapped_type>();
+    }
+    ET_LOG(Info, "Metadata: %s = %" PRId64, method_name.c_str(), value);
+  }
+  // Negative limits fall back to metadata; enabled flags defer to the model.
+  if (config_.max_seq_len < 0)
+    config_.max_seq_len = static_cast<int32_t>(metadata_.at(kMaxSeqLen));
+  if (config_.max_context_length < 0)
+    config_.max_context_length =
+        static_cast<int32_t>(metadata_.at(kMaxContextLen));
+  if (config_.max_new_tokens < 0)
+    config_.max_new_tokens =
+        std::min(config_.max_seq_len, config_.max_context_length);
+  if (config_.enable_dynamic_shape)
+    config_.enable_dynamic_shape =
+        static_cast<bool>(metadata_.at(kEnableDynamicShape));
+  if (config_.enable_kv_cache)
+    config_.enable_kv_cache = static_cast<bool>(metadata_.at(kUseKVCache));
+
+  // Collect EOS ids from the module's kEosIds method, if it exports one
+  auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>();
+  if (method_names.count(kEosIds)) {
+    for (const auto &eos_id : ET_UNWRAP(module_->execute(kEosIds))) {
+      eos_ids->emplace(static_cast<uint64_t>(eos_id.toScalar().to<int64_t>()));
+    }
+  }
+  if (eos_ids->empty()) {
+    eos_ids->emplace(7); // fallback <|im_end|>; NOTE(review): model-specific?
+  }
+
+  io_manager_ = std::make_unique<llm::IOManager>(*module_);
+  llm::Stats *stats_ptr = &stats_;
+
+  if (multimodal_) { // eos_ids ownership moves into this branch's generator
+    mm_decoder_runner_ = std::make_unique<llm::MultimodalDecoderRunner>(
+        module_, io_manager_.get());
+    mm_prefiller_ = std::make_unique<llm::MultimodalPrefiller>(
+        module_, mm_decoder_runner_.get(), tokenizer_.get(), io_manager_.get());
+    mm_token_generator_ = std::make_unique<llm::TextTokenGenerator>(
+        tokenizer_.get(), mm_decoder_runner_.get(), /*use_kv_cache=*/true,
+        std::move(eos_ids), stats_ptr);
+
+    ET_CHECK_OK_OR_RETURN_ERROR(mm_prefiller_->load());
+    ET_CHECK_OK_OR_RETURN_ERROR(mm_token_generator_->load());
+  } else { // text-only: the single "forward" method drives prefill and decode
+    ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("forward"));
+
+    text_decoder_runner_ = std::make_unique<llm::TextDecoderRunner>(
+        module_, io_manager_.get(), config_.temperature, config_.topp);
+    text_prefiller_ = std::make_unique<llm::TextPrefiller>(
+        text_decoder_runner_.get(), config_.enable_kv_cache,
+        config_.enable_dynamic_shape, config_.max_seq_len);
+    text_token_generator_ = std::make_unique<llm::TextTokenGenerator>(
+        tokenizer_.get(), text_decoder_runner_.get(), config_.enable_kv_cache,
+        std::move(eos_ids), stats_ptr);
+  }
+
+  return Error::Ok;
+}
+
+Error UnifiedRunner::generate(
+    const std::string &prompt, const llm::GenerationConfig &generation_config,
+    std::function<void(const std::string &)> token_callback,
+    std::function<void(const llm::Stats &)> stats_callback) {
+  // Text-only path. An empty prompt is reported as an error, not an abort.
+  ET_CHECK_MSG(!multimodal_,
+               "generate(prompt) called on a multimodal runner. Use "
+               "generate(vector<MultimodalInput>) instead.");
+  ET_CHECK_OR_RETURN_ERROR(!prompt.empty(), InvalidArgument,
+                           "Prompt cannot be empty");
+  if (!is_loaded()) {
+    stats_.model_load_start_ms = llm::time_in_ms();
+    ET_CHECK_OK_OR_RETURN_ERROR(load());
+    stats_.model_load_end_ms = llm::time_in_ms();
+  }
+  // Echo pieces to stdout unless warming up, then forward to the caller.
+  std::function<void(const std::string &)> wrapped_callback =
+      [token_callback, &generation_config](const std::string &piece) {
+        if (!generation_config.warming) {
+          llm::safe_printf(piece.c_str());
+          fflush(stdout);
+        }
+        if (token_callback)
+          token_callback(piece);
+      };
+  // Timing starts here (model load, if any, is tracked separately above).
+  stats_.inference_start_ms = llm::time_in_ms();
+  shouldStop_ = false;
+  // Per-call settings win when non-negative; otherwise use config_ defaults.
+  int32_t max_seq_len = generation_config.max_seq_len >= 0
+                            ? generation_config.max_seq_len
+                            : config_.max_seq_len;
+  int32_t max_context_length = generation_config.max_context_length >= 0
+                                   ? generation_config.max_context_length
+                                   : config_.max_context_length;
+  int32_t new_tokens_limit = generation_config.max_new_tokens >= 0
+                                 ? generation_config.max_new_tokens
+                                 : config_.max_new_tokens;
+  float temperature = generation_config.temperature >= 0.F
+                          ? generation_config.temperature
+                          : config_.temperature;
+  float topp =
+      generation_config.topp >= 0.F ? generation_config.topp : config_.topp;
+  // Context budget still available after the pos_ positions already used.
+  int64_t context_len_left = static_cast<int64_t>(max_context_length) - pos_;
+
+  auto encodeResult =
+      tokenizer_->encode(prompt, numOfAddedBoSTokens, numOfAddedEoSTokens);
+  if (!encodeResult.ok()) {
+    throw rnexecutorch::RnExecutorchError(
+        rnexecutorch::RnExecutorchErrorCode::TokenizerError,
+        "Unexpected issue occurred while encoding: " +
+            std::to_string(static_cast<int32_t>(encodeResult.error())));
+  }
+  std::vector<uint64_t> prompt_tokens = encodeResult.get();
+  int num_prompt_tokens = prompt_tokens.size();
+
+  ET_CHECK_OR_RETURN_ERROR(num_prompt_tokens >= 1, InvalidArgument,
+                           "Expected at least 1 prompt token");
+  ET_CHECK_OR_RETURN_ERROR(num_prompt_tokens < max_seq_len, InvalidArgument,
+                           "num_prompt_tokens %d >= max_seq_len %" PRId32,
+                           num_prompt_tokens, max_seq_len);
+
+  int32_t max_new_tokens = resolve_max_new_tokens(
+      num_prompt_tokens, max_seq_len, static_cast<int32_t>(context_len_left),
+      new_tokens_limit);
+
+  ET_CHECK_OR_RETURN_ERROR(max_new_tokens > 0, InvalidArgument,
+                           "Max new tokens %d is <= 0", max_new_tokens);
+  // Optionally replay the prompt itself through the callback.
+  if (generation_config.echo)
+    wrapped_callback(prompt);
+  // Prefill the prompt; its result is the token that seeds generation.
+  auto prefill_res = text_prefiller_->prefill(prompt_tokens, pos_);
+  stats_.first_token_ms = llm::time_in_ms();
+  stats_.prompt_eval_end_ms = llm::time_in_ms();
+  ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error());
+
+  uint64_t cur_token = prefill_res.get();
+  auto decodeResult = tokenizer_->decode({cur_token});
+  if (!decodeResult.ok()) {
+    throw rnexecutorch::RnExecutorchError(
+        rnexecutorch::RnExecutorchErrorCode::TokenizerError,
+        "Unexpected issue occurred while decoding: " +
+            std::to_string(static_cast<int32_t>(decodeResult.error())));
+  }
+  // The decode above only validates cur_token; its text is not emitted here.
+  prompt_tokens.push_back(cur_token);
+  int64_t num_generated = ET_UNWRAP(
+      text_token_generator_->generate(prompt_tokens, pos_, max_new_tokens - 1,
+                                      temperature, topp, wrapped_callback));
+
+  pos_ += num_generated;
+  stats_.inference_end_ms = llm::time_in_ms();
+  stats_.num_prompt_tokens = num_prompt_tokens;
+  stats_.num_generated_tokens = num_generated;
+
+  if (stats_callback)
+    stats_callback(stats_);
+
+  return Error::Ok;
+}
+
+Error UnifiedRunner::generate(
+    const std::vector<llm::MultimodalInput> &inputs, float temperature,
+    float topp, int32_t max_new_tokens,
+    std::function<void(const std::string &)> token_callback) {
+  // Multimodal path: prefill every input in order, then decode text tokens.
+  ET_CHECK_MSG(multimodal_,
+               "generate(MultimodalInput) called on a text-only runner. Use "
+               "generate(string) instead.");
+
+  if (inputs.empty()) {
+    ET_LOG(Error, "MultimodalInput vector cannot be empty");
+    return Error::InvalidArgument;
+  }
+
+  if (!is_loaded())
+    ET_CHECK_OK_OR_RETURN_ERROR(load());
+
+  stats_.inference_start_ms = llm::time_in_ms();
+  // Only the last prefill result is kept; it seeds token generation below.
+  uint64_t prefill_next_token = 0;
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    auto prefill_result = mm_prefiller_->prefill(inputs[i], pos_);
+    if (!prefill_result.ok())
+      return prefill_result.error();
+    prefill_next_token = prefill_result.get();
+  }
+
+  stats_.first_token_ms = llm::time_in_ms();
+  stats_.prompt_eval_end_ms = llm::time_in_ms();
+  stats_.num_prompt_tokens = pos_;
+  // Decode and emit the token produced by prefill before the decode loop.
+  auto decode_result =
+      tokenizer_->decode(prefill_next_token, prefill_next_token);
+  if (!decode_result.ok()) {
+    ET_LOG(Error, "Tokenizer decode error %d",
+           static_cast<int>(decode_result.error()));
+    return Error::InvalidArgument;
+  }
+  const std::string first_piece = std::move(*decode_result);
+  llm::safe_printf(first_piece.c_str());
+  fflush(stdout);
+  if (token_callback)
+    token_callback(first_piece);
+  // Context size: kMaxContextLen, else kMaxSeqLen, else a 2048 default.
+  int64_t context_len = metadata_.count(kMaxContextLen)
+                            ? metadata_.at(kMaxContextLen)
+                        : metadata_.count(kMaxSeqLen) ? metadata_.at(kMaxSeqLen)
+                                                      : 2048;
+  const int64_t ctx_left = std::max<int64_t>(0, context_len - pos_);
+  const int64_t wanted = max_new_tokens > 0 ? max_new_tokens : ctx_left;
+  // Clamp: a request larger than the free context would run past the window.
+  int32_t resolved_max_new = static_cast<int32_t>(std::min(wanted, ctx_left));
+
+  std::vector<uint64_t> seed_tokens = {prefill_next_token};
+  auto wrapped_callback = [&](const std::string &piece) {
+    llm::safe_printf(piece.c_str());
+    fflush(stdout);
+    if (token_callback)
+      token_callback(piece);
+  };
+  // One token was already emitted above, hence the "- 1" on the budget.
+  auto generate_result = mm_token_generator_->generate(
+      seed_tokens, pos_,
+      static_cast<uint64_t>(std::max(0, resolved_max_new - 1)), temperature,
+      topp, wrapped_callback);
+
+  if (!generate_result.ok())
+    return generate_result.error();
+
+  int64_t num_generated = generate_result.get();
+  pos_ += num_generated;
+
+  stats_.inference_end_ms = llm::time_in_ms();
+  stats_.num_generated_tokens = num_generated;
+
+  return Error::Ok;
+}
+
+// Ask the token generator of the active mode to halt; a generator that has
+// not been created yet is simply skipped.
+void UnifiedRunner::stop() {
+  llm::TextTokenGenerator *active =
+      multimodal_ ? mm_token_generator_.get() : text_token_generator_.get();
+  if (active != nullptr) {
+    active->stop();
+  }
+}
+
+void UnifiedRunner::reset() { // back to the initial state
+  pos_ = 0;       // rewind the position cursor
+  stats_.reset(); // drop timings and counters from earlier runs
+}
+
+int32_t UnifiedRunner::count_text_tokens(const std::string &text) const {
+  // Encode with the same BoS/EoS counts generate() uses; only the size matters.
+  auto encoded =
+      tokenizer_->encode(text, numOfAddedBoSTokens, numOfAddedEoSTokens);
+  if (encoded.ok())
+    return static_cast<int32_t>(encoded.get().size());
+  throw rnexecutorch::RnExecutorchError(
+      rnexecutorch::RnExecutorchErrorCode::TokenizerError,
+      "Encoding failed during token count check.");
+}
+
+int32_t UnifiedRunner::get_max_context_length() const {
+  // Before load(): the value stored in metadata_; afterwards: config_'s value.
+  // NOTE(review): at() throws if the key is absent - confirm metadata_ defaults.
+  return is_loaded() ? config_.max_context_length
+                     : static_cast<int32_t>(metadata_.at(kMaxContextLen));
+}
+
+void UnifiedRunner::set_temperature(float temperature) noexcept {
+  if (auto *decoder = text_decoder_runner_.get(); decoder != nullptr)
+    decoder->set_temperature(temperature); // live text decoder, if created
+  config_.temperature = temperature; // fallback for later generate() calls
+}
+
+void UnifiedRunner::set_topp(float topp) noexcept {
+  if (auto *decoder = text_decoder_runner_.get(); decoder != nullptr)
+    decoder->set_topp(topp); // live text decoder, if one has been created
+  config_.topp = topp; // fallback when generation_config.topp is negative
+}
+
+void UnifiedRunner::set_count_interval(size_t count_interval) {
+  // Forward to every token generator that currently exists (either mode).
+  for (auto *gen : {text_token_generator_.get(), mm_token_generator_.get()})
+    if (gen != nullptr)
+      gen->set_count_interval(count_interval);
+}
+
+void UnifiedRunner::set_time_interval(size_t time_interval) {
+  // Forward to every token generator that currently exists (either mode).
+  for (auto *gen : {text_token_generator_.get(), mm_token_generator_.get()})
+    if (gen != nullptr)
+      gen->set_time_interval(time_interval);
+}
+
+int32_t UnifiedRunner::resolve_max_new_tokens(int32_t num_prompt_tokens,
+                                              int32_t max_seq_len,
+                                              int32_t max_context_len,
+                                              int32_t max_new_tokens) const {
+  // -1 means "no limit given" for max_seq_len and for max_new_tokens.
+  // Step 1: the ceiling on total length is the context length, tightened
+  // further by max_seq_len when that one is set.
+  const int32_t ceiling = (max_seq_len == -1)
+                              ? max_context_len
+                              : std::min(max_seq_len, max_context_len);
+  // Step 2: whatever the prompt has not consumed is left for new tokens.
+  const int32_t room = ceiling - num_prompt_tokens;
+  // Step 3: honour an explicit max_new_tokens request if it is smaller.
+  const int32_t budget =
+      (max_new_tokens == -1) ? room : std::min(room, max_new_tokens);
+  // A prompt longer than the ceiling yields 0 rather than a negative count.
+  return std::max(0, budget);
+}
+
+} // namespace example
diff --git a/packages/react-native-executorch/common/runner/unified_runner.h b/packages/react-native-executorch/common/runner/unified_runner.h
new file mode 100644
index 000000000..9f38fb9e5
--- /dev/null
+++ b/packages/react-native-executorch/common/runner/unified_runner.h
@@ -0,0 +1,100 @@
+// packages/react-native-executorch/common/runner/unified_runner.h
+#pragma once
+
+#include "multimodal_decoder_runner.h"
+#include "multimodal_input.h"
+#include "multimodal_prefiller.h"
+#include "stats.h"
+#include "text_decoder_runner.h"
+#include "text_prefiller.h"
+#include "text_token_generator.h"
+#include <cstdint>
+#include <executorch/extension/module/module.h>
+#include <functional>
+#include <memory>
+#include <optional>
+#include <pytorch/tokenizers/hf_tokenizer.h>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace example {
+
+namespace llm = ::executorch::extension::llm;
+
+class UnifiedRunner { // one runner covering text-only and multimodal generation
+public:
+  // module: raw pointer borrowed from BaseModel (text mode uses this)
+  // owned_module: unique_ptr taken for multimodal mode (nullptr in text mode)
+  // tokenizer_path: path to tokenizer JSON
+  // config: generation defaults (temperature 0.8, topp 0.9 unless overridden)
+  explicit UnifiedRunner(
+      ::executorch::extension::Module *module,
+      std::unique_ptr<::executorch::extension::Module> owned_module,
+      const std::string &tokenizer_path,
+      const llm::GenerationConfig &config = {.temperature = 0.8F,
+                                             .topp = 0.9F});
+
+  bool is_multimodal() const noexcept;
+  bool is_loaded() const;
+  ::executorch::runtime::Error load();
+
+  // Text-only generate; mirrors Runner::generate. Text-mode runners only.
+  ::executorch::runtime::Error
+  generate(const std::string &prompt,
+           const llm::GenerationConfig &generation_config = {},
+           std::function<void(const std::string &)> token_callback = {},
+           std::function<void(const llm::Stats &)> stats_callback = {});
+
+  // Multimodal generate; mirrors MultimodalRunner::generate. Multimodal only.
+  ::executorch::runtime::Error
+  generate(const std::vector<llm::MultimodalInput> &inputs, float temperature,
+           float topp, int32_t max_new_tokens,
+           std::function<void(const std::string &)> token_callback = {});
+  // stop(): halt the active token generator. reset(): clear stats_, pos_ = 0.
+  void stop();
+  void reset();
+
+  // Available for both modes (set_temperature/set_topp reach the text decoder only)
+  int32_t count_text_tokens(const std::string &text) const;
+  int32_t get_max_context_length() const;
+  void set_temperature(float temperature) noexcept;
+  void set_topp(float topp) noexcept;
+  void set_count_interval(size_t count_interval);
+  void set_time_interval(size_t time_interval);
+  // Run statistics; filled in by generate() and cleared by reset().
+  llm::Stats stats_;
+
+private:
+  int32_t resolve_max_new_tokens(int32_t num_prompt_tokens, int32_t max_seq_len,
+                                 int32_t max_context_len,
+                                 int32_t max_new_tokens = -1) const;
+  // Mode flag checked by both generate() overloads and by stop().
+  bool multimodal_{false};
+  llm::GenerationConfig config_;
+  bool shouldStop_{false};
+  int64_t pos_{0}; // position cursor; advanced by generate(), zeroed by reset()
+
+  // module access — module_ is always a valid raw pointer
+  // In text mode: points to BaseModel's module_ (borrowed)
+  // In multimodal mode: points to owned_module_.get() (owned)
+  ::executorch::extension::Module *module_;
+  std::unique_ptr<::executorch::extension::Module> owned_module_;
+
+  std::string tokenizer_path_;
+  std::unique_ptr<tokenizers::HFTokenizer> tokenizer_;
+  std::unordered_map<std::string, int64_t> metadata_;
+  std::unique_ptr<llm::IOManager> io_manager_;
+
+  // Text-only subcomponents (null in multimodal mode)
+  std::unique_ptr<llm::TextDecoderRunner> text_decoder_runner_;
+  std::unique_ptr<llm::TextPrefiller> text_prefiller_;
+  std::unique_ptr<llm::TextTokenGenerator> text_token_generator_;
+
+  // Multimodal subcomponents (null in text mode)
+  std::unique_ptr<llm::MultimodalDecoderRunner> mm_decoder_runner_;
+  std::unique_ptr<llm::MultimodalPrefiller> mm_prefiller_;
+  std::unique_ptr<llm::TextTokenGenerator> mm_token_generator_;
+};
+
+} // namespace example
diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts
index 702a00c45..bd000d270 100644
--- a/packages/react-native-executorch/src/controllers/LLMController.ts
+++ b/packages/react-native-executorch/src/controllers/LLMController.ts
@@ -24,6 +24,7 @@ export class LLMController {
   private _isReady = false;
   private _isGenerating = false;
   private _messageHistory: Message[] = [];
+  private isMultimodal_ = false;
 
   // User callbacks
   private tokenCallback: (token: string) => void;
@@ -76,11 +77,13 @@ export class LLMController {
     tokenizerSource,
     tokenizerConfigSource,
     onDownloadProgressCallback,
+    isMultimodal = false,
   }: {
     modelSource: ResourceSource;
     tokenizerSource: ResourceSource;
-    tokenizerConfigSource: ResourceSource;
+    tokenizerConfigSource?: ResourceSource;
     onDownloadProgressCallback?: (downloadProgress: number) => void;
+    isMultimodal?: boolean;
   }) {
     // reset inner state when loading new model
     this.messageHistoryCallback(this.chatConfig.initialMessageHistory);
@@ -88,37 +91,59 @@ export class LLMController {
     this.isReadyCallback(false);
 
     try {
-      const tokenizersPromise = ResourceFetcher.fetch(
-        undefined,
-        tokenizerSource,
-        tokenizerConfigSource
-      );
+      let tokenizerPath: string | undefined;
+      let modelPath: string | undefined;
+
+      if (isMultimodal) {
+        // Multimodal models don't need tokenizer config
+        const [tokenizerResults, modelResult] = await Promise.all([
+          ResourceFetcher.fetch(undefined, tokenizerSource),
+          ResourceFetcher.fetch(onDownloadProgressCallback, modelSource),
+        ]);
+        tokenizerPath = tokenizerResults?.[0];
+        modelPath = modelResult?.[0];
+
+        if (!tokenizerPath || !modelPath) {
+          throw new RnExecutorchError(
+            RnExecutorchErrorCode.DownloadInterrupted,
+            'The download has been interrupted. As a result, not every file was downloaded. Please retry the download.'
+          );
+        }
+      } else {
+        const tokenizersPromise = ResourceFetcher.fetch(
+          undefined,
+          tokenizerSource,
+          tokenizerConfigSource!
+        );
 
-      const modelPromise = ResourceFetcher.fetch(
-        onDownloadProgressCallback,
-        modelSource
-      );
+        const modelPromise = ResourceFetcher.fetch(
+          onDownloadProgressCallback,
+          modelSource
+        );
 
-      const [tokenizersResults, modelResult] = await Promise.all([
-        tokenizersPromise,
-        modelPromise,
-      ]);
+        const [tokenizersResults, modelResult] = await Promise.all([
+          tokenizersPromise,
+          modelPromise,
+        ]);
 
-      const tokenizerPath = tokenizersResults?.[0];
-      const tokenizerConfigPath = tokenizersResults?.[1];
-      const modelPath = modelResult?.[0];
+        tokenizerPath = tokenizersResults?.[0];
+        const tokenizerConfigPath = tokenizersResults?.[1];
+        modelPath = modelResult?.[0];
 
-      if (!tokenizerPath || !tokenizerConfigPath || !modelPath) {
-        throw new RnExecutorchError(
-          RnExecutorchErrorCode.DownloadInterrupted,
-          'The download has been interrupted. As a result, not every file was downloaded. Please retry the download.'
+        if (!tokenizerPath || !tokenizerConfigPath || !modelPath) {
+          throw new RnExecutorchError(
+            RnExecutorchErrorCode.DownloadInterrupted,
+            'The download has been interrupted. As a result, not every file was downloaded. Please retry the download.'
+          );
+        }
+
+        this.tokenizerConfig = JSON.parse(
+          await ResourceFetcher.fs.readAsString(tokenizerConfigPath!)
         );
       }
 
-      this.tokenizerConfig = JSON.parse(
-        await ResourceFetcher.fs.readAsString(tokenizerConfigPath!)
-      );
       this.nativeModule = global.loadLLM(modelPath, tokenizerPath);
+      this.isMultimodal_ = this.nativeModule.isMultimodal();
       this.isReadyCallback(true);
       this.onToken = (data: string) => {
         if (!data) {
@@ -180,6 +205,9 @@ export class LLMController {
   }
 
   private filterSpecialTokens(text: string): string {
+    if (!this.tokenizerConfig) {
+      return text;
+    }
     let filtered = text;
     if (
       SPECIAL_TOKENS.EOS_TOKEN in this.tokenizerConfig &&
@@ -237,6 +265,64 @@ export class LLMController {
     }
   }
 
+  public async generateWithImage( // one image + prompt -> complete response
+    imagePath: string,
+    prompt: string
+  ): Promise<string> {
+    if (!this._isReady) { // guard 1: a model must be loaded
+      throw new RnExecutorchError(
+        RnExecutorchErrorCode.ModuleNotLoaded,
+        'The model is currently not loaded.'
+      );
+    }
+    if (!this.isMultimodal_) { // guard 2: flag set from nativeModule.isMultimodal()
+      throw new RnExecutorchError(
+        RnExecutorchErrorCode.InvalidUserInput,
+        'generateWithImage() requires a multimodal model. Load with isMultimodal: true.'
+      );
+    }
+    if (this._isGenerating) { // guard 3: only one generation at a time
+      throw new RnExecutorchError(
+        RnExecutorchErrorCode.ModelGenerating,
+        'The model is currently generating.'
+      );
+    }
+    try {
+      this.isGeneratingCallback(true);
+      this.nativeModule.reset(); // NOTE(review): presumably clears prior native state — confirm
+      const response = await this.nativeModule.generateWithImage(
+        imagePath,
+        prompt,
+        this.onToken // token callback handed to the native side
+      );
+      return response;
+    } catch (e) {
+      throw parseUnknownError(e); // rethrow in the library's error shape
+    } finally {
+      this.isGeneratingCallback(false); // runs on success and on failure
+    }
+  }
+
+  /**
+   * Chat-style wrapper over generateWithImage(): publishes the user turn,
+   * generates, publishes the assistant turn, and resolves with the reply.
+   */
+  public async sendMessageWithImage(
+    imagePath: string,
+    message: string
+  ): Promise<string> {
+    // Publish the user's turn before generation starts.
+    const userTurn = { content: message, role: 'user' as const };
+    this.messageHistoryCallback([...this._messageHistory, userTurn]);
+
+    const reply = await this.generateWithImage(imagePath, message);
+
+    // _messageHistory is spread again here rather than reused from above.
+    const assistantTurn = { content: reply, role: 'assistant' as const };
+    this.messageHistoryCallback([...this._messageHistory, assistantTurn]);
+    return reply;
+  }
+
   public interrupt() {
     if (!this.nativeModule) {
       throw new RnExecutorchError(
diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
index 5578c1de7..2920e1bb5 100644
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
@@ -51,8 +51,9 @@ export const useLLM = ({ model, preventLoad = false }: LLMProps): LLMType => {
         await controllerInstance.load({
           modelSource: model.modelSource,
           tokenizerSource: model.tokenizerSource,
-          tokenizerConfigSource: model.tokenizerConfigSource!,
+          tokenizerConfigSource: model.tokenizerConfigSource,
           onDownloadProgressCallback: setDownloadProgress,
+          isMultimodal: model.isMultimodal,
         });
       } catch (e) {
         setError(parseUnknownError(e));
@@ -69,6 +70,7 @@ export const useLLM = ({ model, preventLoad = false }: LLMProps): LLMType => {
     model.modelSource,
     model.tokenizerSource,
     model.tokenizerConfigSource,
+    model.isMultimodal,
     preventLoad,
   ]);
 
@@ -124,6 +126,14 @@ export const useLLM = ({ model, preventLoad = false }: LLMProps): LLMType => {
     [controllerInstance]
   );
 
+  const sendMessageWithImage = useCallback(
+    (imagePath: string, message: string) => {
+      setResponse('');
+      return controllerInstance.sendMessageWithImage(imagePath, message);
+    },
+    [controllerInstance]
+  );
+
   return {
     messageHistory,
     response,
@@ -140,5 +150,6 @@ export const useLLM = ({ model, preventLoad = false }: LLMProps): LLMType => {
     sendMessage: sendMessage,
     deleteMessage: deleteMessage,
     interrupt: interrupt,
+    sendMessageWithImage: sendMessageWithImage,
   };
 };
diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useMultimodalLLM.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useMultimodalLLM.ts
deleted file mode 100644
index 0a54239cc..000000000
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useMultimodalLLM.ts
+++ /dev/null
@@ -1,153 +0,0 @@
-import { useCallback, useEffect, useRef, useState } from 'react';
-import { ResourceSource } from '../../types/common';
-import { ResourceFetcher } from '../../utils/ResourceFetcher';
-import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils';
-import { RnExecutorchErrorCode } from '../../errors/ErrorCodes';
-
-export interface MultimodalLLMProps {
-  model: {
-    modelSource: ResourceSource;
-    tokenizerSource: ResourceSource;
-  };
-  preventLoad?: boolean;
-}
-
-export interface MultimodalLLMType {
-  isReady: boolean;
-  isGenerating: boolean;
-  downloadProgress: number;
-  response: string;
-  error: RnExecutorchError | null;
-  generate: (imagePath: string, prompt: string) => Promise<string>;
-  interrupt: () => void;
-}
-
-/**
- * React hook for managing a Multimodal LLM (VLM) instance.
- * Uses `loadMultimodalLLM` native global, which wraps a multi-method PTE
- * with vision_encoder, token_embedding, and text_decoder methods.
- *
- * @category Hooks
- */
-export const useMultimodalLLM = ({
-  model,
-  preventLoad = false,
-}: MultimodalLLMProps): MultimodalLLMType => {
-  const [nativeModule, setNativeModule] = useState<any>(null);
-  const [isReady, setIsReady] = useState(false);
-  const [isGenerating, setIsGenerating] = useState(false);
-  const [downloadProgress, setDownloadProgress] = useState(0);
-  const [response, setResponse] = useState('');
-  const [error, setError] = useState<RnExecutorchError | null>(null);
-
-  useEffect(() => {
-    setDownloadProgress(0);
-    setError(null);
-    setIsReady(false);
-
-    if (preventLoad) return;
-
-    let cancelled = false;
-
-    (async () => {
-      try {
-        const [modelResults, tokenizerResults] = await Promise.all([
-          ResourceFetcher.fetch(setDownloadProgress, model.modelSource),
-          ResourceFetcher.fetch(undefined, model.tokenizerSource),
-        ]);
-
-        if (cancelled) return;
-
-        const modelPath = modelResults?.[0];
-        const tokenizerPath = tokenizerResults?.[0];
-
-        if (!modelPath || !tokenizerPath) {
-          throw new RnExecutorchError(
-            RnExecutorchErrorCode.DownloadInterrupted,
-            'Download interrupted — not all files were fetched.'
-          );
-        }
-
-        const mod = global.loadMultimodalLLM(modelPath, tokenizerPath);
-        setNativeModule(mod);
-        setIsReady(true);
-      } catch (e) {
-        if (!cancelled) {
-          setError(parseUnknownError(e));
-        }
-      }
-    })();
-
-    return () => {
-      cancelled = true;
-    };
-  }, [model.modelSource, model.tokenizerSource, preventLoad]);
-
-  const tokenBufferRef = useRef('');
-  const rafRef = useRef<ReturnType<typeof requestAnimationFrame> | null>(null);
-
-  const generate = useCallback(
-    async (imagePath: string, prompt: string): Promise<string> => {
-      if (!nativeModule) {
-        throw new RnExecutorchError(
-          RnExecutorchErrorCode.ModuleNotLoaded,
-          'Multimodal LLM is not loaded yet.'
-        );
-      }
-      tokenBufferRef.current = '';
-      if (rafRef.current !== null) {
-        cancelAnimationFrame(rafRef.current);
-        rafRef.current = null;
-      }
-      setResponse('');
-      setIsGenerating(true);
-      try {
-        const result: string = await nativeModule.generate(
-          imagePath,
-          prompt,
-          (token: string) => {
-            tokenBufferRef.current += token;
-            if (rafRef.current === null) {
-              rafRef.current = requestAnimationFrame(() => {
-                rafRef.current = null;
-                const buffered = tokenBufferRef.current;
-                tokenBufferRef.current = '';
-                setResponse((prev) => prev + buffered);
-              });
-            }
-          }
-        );
-        // Flush any remaining buffered tokens after generation completes
-        if (rafRef.current !== null) {
-          cancelAnimationFrame(rafRef.current);
-          rafRef.current = null;
-        }
-        if (tokenBufferRef.current) {
-          const remaining = tokenBufferRef.current;
-          tokenBufferRef.current = '';
-          setResponse((prev) => prev + remaining);
-        }
-        return result;
-      } catch (e) {
-        throw parseUnknownError(e);
-      } finally {
-        setIsGenerating(false);
-      }
-    },
-    [nativeModule]
-  );
-
-  const interrupt = useCallback(() => {
-    nativeModule?.interrupt();
-  }, [nativeModule]);
-
-  return {
-    isReady,
-    isGenerating,
-    downloadProgress,
-    response,
-    error,
-    generate,
-    interrupt,
-  };
-};
diff --git a/packages/react-native-executorch/src/index.ts b/packages/react-native-executorch/src/index.ts
index e544d9cca..dd7557ca2 100644
--- a/packages/react-native-executorch/src/index.ts
+++ b/packages/react-native-executorch/src/index.ts
@@ -49,7 +49,6 @@ declare global {
   var loadVAD: (source: string) => any;
   var loadTextEmbeddings: (modelSource: string, tokenizerSource: string) => any;
   var loadLLM: (modelSource: string, tokenizerSource: string) => any;
-  var loadMultimodalLLM: (modelSource: string, tokenizerSource: string) => any;
   var loadTextToImage: (
     tokenizerSource: string,
     encoderSource: string,
@@ -98,7 +97,6 @@ if (
   global.loadImageEmbeddings == null ||
   global.loadVAD == null ||
   global.loadLLM == null ||
-  global.loadMultimodalLLM == null ||
   global.loadSpeechToText == null ||
   global.loadTextToSpeechKokoro == null ||
   global.loadOCR == null ||
@@ -123,7 +121,6 @@ export * from './hooks/computer_vision/useImageEmbeddings';
 export * from './hooks/computer_vision/useTextToImage';
 
 export * from './hooks/natural_language_processing/useLLM';
-export * from './hooks/natural_language_processing/useMultimodalLLM';
 export * from './hooks/natural_language_processing/useSpeechToText';
 export * from './hooks/natural_language_processing/useTextToSpeech';
 export * from './hooks/natural_language_processing/useTextEmbeddings';
diff --git a/packages/react-native-executorch/src/types/llm.ts b/packages/react-native-executorch/src/types/llm.ts
index 25d87e248..71dfcd3f8 100644
--- a/packages/react-native-executorch/src/types/llm.ts
+++ b/packages/react-native-executorch/src/types/llm.ts
@@ -20,6 +20,11 @@ export interface LLMProps {
      * `ResourceSource` pointing to the JSON file which contains the tokenizer config.
      */
     tokenizerConfigSource?: ResourceSource;
+    /**
+     * Set to `true` when loading a vision-language (multimodal) model.
+     * Skips tokenizer config fetching and enables `sendMessageWithImage`.
+     */
+    isMultimodal?: boolean;
   };
   /**
    * Boolean that can prevent automatic model loading (and downloading the data if you load it for the first time) after running the hook.
@@ -123,6 +128,16 @@ export interface LLMType {
    * Function to interrupt the current inference.
    */
   interrupt: () => void;
+
+  /**
+   * Send a user message with an image. Updates messageHistory after model responds.
+   * Only valid for multimodal models (loaded with `isMultimodal: true`).
+   *
+   * @param imagePath - Local path to the image file.
+   * @param message - The text question about the image.
+   * @returns The model's response as a string.
+   */
+  sendMessageWithImage: (imagePath: string, message: string) => Promise<string>;
 }
 
 /**
diff --git a/yarn.lock b/yarn.lock
index f839c07a6..c2f2e609c 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -8721,6 +8721,15 @@ __metadata:
   languageName: node
   linkType: hard
 
+"expo-document-picker@npm:~13.0.3":
+  version: 13.0.3
+  resolution: "expo-document-picker@npm:13.0.3"
+  peerDependencies:
+    expo: "*"
+  checksum: 10/a336310e6327d26f36ac19b5867e2ef453dd59a0e30f7b2854c34bc1f874d967f92ced4e0b5fddc2b193ba1d88059033e6f3b076980c060169b191f4af184f90
+  languageName: node
+  linkType: hard
+
 "expo-file-system@npm:^19.0.20, expo-file-system@npm:~19.0.21":
   version: 19.0.21
   resolution: "expo-file-system@npm:19.0.21"
@@ -11450,6 +11459,7 @@ __metadata:
     expo-brightness: "npm:~14.0.8"
     expo-calendar: "npm:~15.0.8"
     expo-constants: "npm:~18.0.11"
+    expo-document-picker: "npm:~13.0.3"
     expo-font: "npm:~14.0.10"
     expo-linking: "npm:~8.0.10"
     expo-router: "npm:~6.0.17"
@@ -11461,6 +11471,7 @@ __metadata:
     react-native-device-info: "npm:^15.0.2"
     react-native-executorch: "workspace:*"
     react-native-gesture-handler: "npm:~2.28.0"
+    react-native-image-picker: "npm:^7.2.2"
     react-native-loading-spinner-overlay: "npm:^3.0.1"
     react-native-markdown-display: "npm:^7.0.2"
     react-native-reanimated: "npm:~4.1.1"

From bf50ae2ea822e390786c79a2cb599221c62f43ff Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 2 Mar 2026 12:34:47 +0100
Subject: [PATCH 03/46] feat: add conversational VLM demo with
 multimodal/text-only support and fix token generation bugs

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 apps/llm/app/_layout.tsx                      |  16 +-
 apps/llm/app/index.tsx                        |  24 ++
 apps/llm/app/multimodal_llm/index.tsx         | 368 +++++++-----------
 .../common/rnexecutorch/models/llm/LLM.cpp    |   6 -
 .../common/runner/unified_runner.cpp          |  39 +-
 .../common/runner/unified_runner.h            |   4 +-
 .../src/controllers/LLMController.ts          |  13 +-
 7 files changed, 214 insertions(+), 256 deletions(-)

diff --git a/apps/llm/app/_layout.tsx b/apps/llm/app/_layout.tsx
index 523d3aaf7..4ab010693 100644
--- a/apps/llm/app/_layout.tsx
+++ b/apps/llm/app/_layout.tsx
@@ -57,38 +57,38 @@ export default function _layout() {
           headerTitleStyle: { color: ColorPalette.primary },
         }}
       >
-        {/* <Drawer.Screen
+        <Drawer.Screen
           name="llm/index"
           options={{
             drawerLabel: 'LLM',
             title: 'LLM',
             headerTitleStyle: { color: ColorPalette.primary },
           }}
-        /> */}
-        {/* <Drawer.Screen
+        />
+        <Drawer.Screen
           name="llm_tool_calling/index"
           options={{
             drawerLabel: 'LLM Tool Calling',
             title: 'LLM Tool Calling',
             headerTitleStyle: { color: ColorPalette.primary },
           }}
-        /> */}
-        {/* <Drawer.Screen
+        />
+        <Drawer.Screen
           name="llm_structured_output/index"
           options={{
             drawerLabel: 'LLM Structured Output',
             title: 'LLM Structured Output',
             headerTitleStyle: { color: ColorPalette.primary },
           }}
-        /> */}
-        {/* <Drawer.Screen
+        />
+        <Drawer.Screen
           name="voice_chat/index"
           options={{
             drawerLabel: 'Voice Chat',
             title: 'Voice Chat',
             headerTitleStyle: { color: ColorPalette.primary },
           }}
-        /> */}
+        />
         <Drawer.Screen
           name="multimodal_llm/index"
           options={{
diff --git a/apps/llm/app/index.tsx b/apps/llm/app/index.tsx
index 7c723a2ba..c7d5bae22 100644
--- a/apps/llm/app/index.tsx
+++ b/apps/llm/app/index.tsx
@@ -11,6 +11,30 @@ export default function Home() {
       <ExecutorchLogo width={64} height={64} />
       <Text style={styles.headerText}>Select a demo model</Text>
       <View style={styles.buttonContainer}>
+        <TouchableOpacity
+          style={styles.button}
+          onPress={() => router.navigate('llm/')}
+        >
+          <Text style={styles.buttonText}>LLM</Text>
+        </TouchableOpacity>
+        <TouchableOpacity
+          style={styles.button}
+          onPress={() => router.navigate('llm_tool_calling/')}
+        >
+          <Text style={styles.buttonText}>LLM Tool Calling</Text>
+        </TouchableOpacity>
+        <TouchableOpacity
+          style={styles.button}
+          onPress={() => router.navigate('llm_structured_output/')}
+        >
+          <Text style={styles.buttonText}>LLM Structured Output</Text>
+        </TouchableOpacity>
+        <TouchableOpacity
+          style={styles.button}
+          onPress={() => router.navigate('voice_chat/')}
+        >
+          <Text style={styles.buttonText}>Voice Chat</Text>
+        </TouchableOpacity>
         <TouchableOpacity
           style={styles.button}
           onPress={() => router.navigate('multimodal_llm/')}
diff --git a/apps/llm/app/multimodal_llm/index.tsx b/apps/llm/app/multimodal_llm/index.tsx
index 990f6cf1a..f5f402183 100644
--- a/apps/llm/app/multimodal_llm/index.tsx
+++ b/apps/llm/app/multimodal_llm/index.tsx
@@ -4,7 +4,6 @@ import {
   Keyboard,
   KeyboardAvoidingView,
   Platform,
-  ScrollView,
   StyleSheet,
   Text,
   TextInput,
@@ -12,123 +11,42 @@ import {
   TouchableWithoutFeedback,
   View,
 } from 'react-native';
-import * as DocumentPicker from 'expo-document-picker';
 import { launchImageLibrary } from 'react-native-image-picker';
 import { useIsFocused } from '@react-navigation/native';
 import { useLLM } from 'react-native-executorch';
+import SendIcon from '../../assets/icons/send_icon.svg';
+import PauseIcon from '../../assets/icons/pause_icon.svg';
 import ColorPalette from '../../colors';
+import Messages from '../../components/Messages';
 import Spinner from '../../components/Spinner';
 import { GeneratingContext } from '../../context';
 
+const MODEL_SOURCE =
+  'https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/lfm2p5_vl_1.6B_quantized_xnnpack.pte';
+const TOKENIZER_SOURCE =
+  'https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_2.5.json';
+const TOKENIZER_CONFIG_SOURCE =
+  'https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_config_2_5.json';
+
 export default function MultimodalLLMScreenWrapper() {
   const isFocused = useIsFocused();
-  return isFocused ? <MultimodalLLMScreenOuter /> : null;
-}
-
-// Outer component: collect model + tokenizer paths before mounting the hook
-function MultimodalLLMScreenOuter() {
-  const [modelUri, setModelUri] = useState<string | null>(null);
-  const [tokenizerUri, setTokenizerUri] = useState<string | null>(null);
-  const [confirmed, setConfirmed] = useState(false);
-
-  const pickFile = async (setter: (uri: string) => void) => {
-    const result = await DocumentPicker.getDocumentAsync({
-      copyToCacheDirectory: false,
-      multiple: false,
-    });
-    if (result.canceled) return;
-    const asset = result.assets[0];
-    if (asset?.uri) {
-      setter(asset.uri);
-    }
-  };
-
-  if (!confirmed) {
-    return (
-      <View style={styles.setupContainer}>
-        <Text style={styles.setupTitle}>Select model files</Text>
-        <Text style={styles.setupHint}>
-          Pick the .pte model and tokenizer.json from your device storage.
-        </Text>
-
-        <FilePicker
-          label="Model (.pte)"
-          uri={modelUri}
-          onPick={() => pickFile(setModelUri)}
-        />
-        <FilePicker
-          label="Tokenizer (.json)"
-          uri={tokenizerUri}
-          onPick={() => pickFile(setTokenizerUri)}
-        />
-
-        <TouchableOpacity
-          style={[
-            styles.loadButton,
-            (!modelUri || !tokenizerUri) && styles.loadButtonDisabled,
-          ]}
-          disabled={!modelUri || !tokenizerUri}
-          onPress={() => setConfirmed(true)}
-        >
-          <Text style={styles.loadButtonText}>Load model</Text>
-        </TouchableOpacity>
-      </View>
-    );
-  }
-
-  return (
-    <MultimodalLLMScreen
-      modelSource={modelUri!}
-      tokenizerSource={tokenizerUri!}
-    />
-  );
+  return isFocused ? <MultimodalLLMScreen /> : null;
 }
 
-function FilePicker({
-  label,
-  uri,
-  onPick,
-}: {
-  label: string;
-  uri: string | null;
-  onPick: () => void;
-}) {
-  const fileName = uri ? (uri.split('/').pop() ?? uri) : null;
-  return (
-    <TouchableOpacity style={styles.filePickerRow} onPress={onPick}>
-      <View style={styles.filePickerInfo}>
-        <Text style={styles.filePickerLabel}>{label}</Text>
-        <Text
-          style={[
-            styles.filePickerValue,
-            uri ? styles.filePickerValueSet : styles.filePickerValueEmpty,
-          ]}
-          numberOfLines={1}
-          ellipsizeMode="middle"
-        >
-          {fileName ?? 'Tap to pick file'}
-        </Text>
-      </View>
-      <Text style={styles.filePickerChevron}>›</Text>
-    </TouchableOpacity>
-  );
-}
-
-function MultimodalLLMScreen({
-  modelSource,
-  tokenizerSource,
-}: {
-  modelSource: string;
-  tokenizerSource: string;
-}) {
+function MultimodalLLMScreen() {
   const [imageUri, setImageUri] = useState<string | null>(null);
-  const [prompt, setPrompt] = useState('');
+  const [userInput, setUserInput] = useState('');
   const [isTextInputFocused, setIsTextInputFocused] = useState(false);
-  const scrollViewRef = useRef<ScrollView>(null);
+  const textInputRef = useRef<TextInput>(null);
   const { setGlobalGenerating } = useContext(GeneratingContext);
 
   const vlm = useLLM({
-    model: { modelSource, tokenizerSource, isMultimodal: true },
+    model: {
+      modelSource: MODEL_SOURCE,
+      tokenizerSource: TOKENIZER_SOURCE,
+      tokenizerConfigSource: TOKENIZER_CONFIG_SOURCE,
+      isMultimodal: true,
+    },
   });
 
   useEffect(() => {
@@ -136,26 +54,29 @@ function MultimodalLLMScreen({
   }, [vlm.isGenerating, setGlobalGenerating]);
 
   useEffect(() => {
-    if (vlm.error) {
-      console.error('MultimodalLLM error:', vlm.error);
-    }
+    if (vlm.error) console.error('MultimodalLLM error:', vlm.error);
   }, [vlm.error]);
 
   const pickImage = async () => {
     const result = await launchImageLibrary({ mediaType: 'photo' });
     if (result.assets && result.assets.length > 0) {
       const uri = result.assets[0]?.uri;
-      if (uri) {
-        setImageUri(uri);
-      }
+      if (uri) setImageUri(uri);
     }
   };
 
-  const handleGenerate = async () => {
-    if (!imageUri || !prompt.trim() || !vlm.isReady || vlm.isGenerating) return;
+  const sendMessage = async () => {
+    if (!userInput.trim() || vlm.isGenerating) return;
+    const text = userInput.trim();
+    setUserInput('');
+    textInputRef.current?.clear();
     Keyboard.dismiss();
     try {
-      await vlm.sendMessageWithImage(imageUri, prompt.trim());
+      if (imageUri) {
+        await vlm.sendMessageWithImage(imageUri, text);
+      } else {
+        await vlm.sendMessage(text);
+      }
     } catch (e) {
       console.error('Generation error:', e);
     }
@@ -182,79 +103,86 @@ function MultimodalLLMScreen({
         behavior={Platform.OS === 'ios' ? 'padding' : undefined}
         keyboardVerticalOffset={Platform.OS === 'ios' ? 120 : 40}
       >
-        <ScrollView
-          ref={scrollViewRef}
-          style={styles.scrollView}
-          contentContainerStyle={styles.scrollContent}
-          onContentSizeChange={() =>
-            scrollViewRef.current?.scrollToEnd({ animated: true })
-          }
-        >
-          {/* Image picker */}
-          <TouchableOpacity style={styles.imagePicker} onPress={pickImage}>
-            {imageUri ? (
-              <Image
-                source={{ uri: imageUri }}
-                style={styles.previewImage}
-                resizeMode="cover"
+        <View style={styles.container}>
+          {vlm.messageHistory.length ? (
+            <View style={styles.chatContainer}>
+              <Messages
+                chatHistory={vlm.messageHistory}
+                llmResponse={vlm.response}
+                isGenerating={vlm.isGenerating}
+                deleteMessage={vlm.deleteMessage}
               />
-            ) : (
-              <Text style={styles.imagePickerText}>Tap to pick an image</Text>
-            )}
-          </TouchableOpacity>
-
-          {/* Response area */}
-          {vlm.response ? (
-            <View style={styles.responseContainer}>
-              <Text style={styles.responseLabel}>Response:</Text>
-              <Text style={styles.responseText}>{vlm.response}</Text>
             </View>
-          ) : vlm.isGenerating ? (
-            <View style={styles.responseContainer}>
-              <Text style={styles.responseLabel}>Generating…</Text>
+          ) : (
+            <View style={styles.helloMessageContainer}>
+              <Text style={styles.helloText}>Hello! 👋</Text>
+              <Text style={styles.bottomHelloText}>
+                Pick an image and ask me anything about it.
+              </Text>
             </View>
-          ) : null}
-        </ScrollView>
+          )}
 
-        {/* Bottom bar */}
-        <View style={styles.bottomContainer}>
-          <TextInput
-            autoCorrect={false}
-            onFocus={() => setIsTextInputFocused(true)}
-            onBlur={() => setIsTextInputFocused(false)}
-            style={[
-              styles.textInput,
-              {
-                borderColor: isTextInputFocused
-                  ? ColorPalette.blueDark
-                  : ColorPalette.blueLight,
-              },
-            ]}
-            placeholder="Ask about the image…"
-            placeholderTextColor="#C1C6E5"
-            multiline
-            value={prompt}
-            onChangeText={setPrompt}
-          />
-          {vlm.isGenerating ? (
+          {/* Image thumbnail strip */}
+          {imageUri && (
             <TouchableOpacity
-              style={styles.actionButton}
-              onPress={vlm.interrupt}
+              style={styles.imageThumbnailContainer}
+              onPress={pickImage}
             >
-              <Text style={styles.actionButtonText}>Stop</Text>
+              <Image
+                source={{ uri: imageUri }}
+                style={styles.imageThumbnail}
+                resizeMode="cover"
+              />
+              <Text style={styles.imageThumbnailHint}>Tap to change</Text>
             </TouchableOpacity>
-          ) : (
+          )}
+
+          <View style={styles.bottomContainer}>
+            {/* Image picker button */}
             <TouchableOpacity
-              style={[
-                styles.actionButton,
-                (!imageUri || !prompt.trim()) && styles.actionButtonDisabled,
-              ]}
-              onPress={handleGenerate}
-              disabled={!imageUri || !prompt.trim()}
+              style={styles.imageButton}
+              onPress={pickImage}
+              disabled={vlm.isGenerating}
             >
-              <Text style={styles.actionButtonText}>Ask</Text>
+              <Text style={styles.imageButtonText}>📷</Text>
             </TouchableOpacity>
-          )}
+
+            <TextInput
+              autoCorrect={false}
+              ref={textInputRef}
+              onFocus={() => setIsTextInputFocused(true)}
+              onBlur={() => setIsTextInputFocused(false)}
+              style={[
+                styles.textInput,
+                {
+                  borderColor: isTextInputFocused
+                    ? ColorPalette.blueDark
+                    : ColorPalette.blueLight,
+                },
+              ]}
+              placeholder={imageUri ? 'Ask about the image…' : 'Your message'}
+              placeholderTextColor="#C1C6E5"
+              multiline
+              onChangeText={setUserInput}
+            />
+
+            {userInput.trim() && !vlm.isGenerating && (
+              <TouchableOpacity
+                style={styles.sendChatTouchable}
+                onPress={sendMessage}
+              >
+                <SendIcon height={24} width={24} padding={4} margin={8} />
+              </TouchableOpacity>
+            )}
+            {vlm.isGenerating && (
+              <TouchableOpacity
+                style={styles.sendChatTouchable}
+                onPress={vlm.interrupt}
+              >
+                <PauseIcon height={24} width={24} padding={4} margin={8} />
+              </TouchableOpacity>
+            )}
+          </View>
         </View>
       </KeyboardAvoidingView>
     </TouchableWithoutFeedback>
@@ -318,74 +246,76 @@ const styles = StyleSheet.create({
   loadButtonText: { color: '#fff', fontFamily: 'medium', fontSize: 15 },
 
   // Chat phase
-  container: { flex: 1, backgroundColor: '#fff' },
-  scrollView: { flex: 1 },
-  scrollContent: { padding: 16, paddingBottom: 8 },
-  imagePicker: {
+  container: { flex: 1 },
+  chatContainer: { flex: 10, width: '100%' },
+  helloMessageContainer: {
+    flex: 10,
     width: '100%',
-    height: 220,
-    borderRadius: 12,
-    borderWidth: 1,
-    borderColor: ColorPalette.blueLight,
-    borderStyle: 'dashed',
-    justifyContent: 'center',
     alignItems: 'center',
-    overflow: 'hidden',
-    marginBottom: 16,
+    justifyContent: 'center',
   },
-  previewImage: { width: '100%', height: '100%' },
-  imagePickerText: {
-    color: ColorPalette.blueLight,
-    fontSize: 16,
+  helloText: {
+    fontFamily: 'medium',
+    fontSize: 30,
+    color: ColorPalette.primary,
+  },
+  bottomHelloText: {
     fontFamily: 'regular',
+    fontSize: 20,
+    lineHeight: 28,
+    textAlign: 'center',
+    color: ColorPalette.primary,
+    paddingHorizontal: 24,
   },
-  responseContainer: {
-    backgroundColor: ColorPalette.seaBlueLight,
+  imageThumbnailContainer: {
+    flexDirection: 'row',
+    alignItems: 'center',
+    paddingHorizontal: 16,
+    paddingVertical: 6,
+    gap: 8,
+  },
+  imageThumbnail: {
+    width: 48,
+    height: 48,
     borderRadius: 8,
-    padding: 12,
-    marginBottom: 8,
+    borderWidth: 1,
+    borderColor: ColorPalette.blueLight,
   },
-  responseLabel: {
+  imageThumbnailHint: {
     fontSize: 12,
-    color: ColorPalette.blueDark,
-    fontFamily: 'medium',
-    marginBottom: 4,
-  },
-  responseText: {
-    fontSize: 14,
-    lineHeight: 20,
-    color: ColorPalette.primary,
     fontFamily: 'regular',
+    color: ColorPalette.blueDark,
   },
   bottomContainer: {
+    height: 100,
+    width: '100%',
     flexDirection: 'row',
+    justifyContent: 'space-between',
     alignItems: 'center',
     paddingHorizontal: 16,
-    paddingVertical: 12,
-    borderTopWidth: 1,
-    borderTopColor: ColorPalette.blueLight,
-    backgroundColor: '#fff',
   },
+  imageButton: {
+    width: 40,
+    height: 40,
+    justifyContent: 'center',
+    alignItems: 'center',
+    marginRight: 4,
+  },
+  imageButtonText: { fontSize: 22 },
   textInput: {
     flex: 1,
     borderWidth: 1,
     borderRadius: 8,
-    fontSize: 14,
     lineHeight: 19.6,
     fontFamily: 'regular',
+    fontSize: 14,
     color: ColorPalette.primary,
-    padding: 12,
-    maxHeight: 100,
+    padding: 16,
   },
-  actionButton: {
-    marginLeft: 8,
-    backgroundColor: ColorPalette.strongPrimary,
-    borderRadius: 8,
-    paddingHorizontal: 16,
-    paddingVertical: 12,
+  sendChatTouchable: {
+    height: '100%',
+    width: 48,
     justifyContent: 'center',
-    alignItems: 'center',
+    alignItems: 'flex-end',
   },
-  actionButtonDisabled: { backgroundColor: ColorPalette.blueLight },
-  actionButtonText: { color: '#fff', fontFamily: 'medium', fontSize: 14 },
 });
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
index 66b151faa..320e9bebe 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
@@ -77,11 +77,6 @@ std::string LLM::generate(std::string input,
     throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
                             "Runner is not loaded");
   }
-  if (multimodal_) {
-    throw RnExecutorchError(
-        RnExecutorchErrorCode::InvalidUserInput,
-        "This is a multimodal model. Call generate(imagePath, prompt, cb).");
-  }
 
   std::string output;
   auto nativeCallback = [this, callback, &output](const std::string &token) {
@@ -136,7 +131,6 @@ std::string LLM::generate(std::string imagePath, std::string prompt,
     throw RnExecutorchError(error, "Failed to generate multimodal response");
   }
 
-  runner_->reset();
   return output;
 }
 
diff --git a/packages/react-native-executorch/common/runner/unified_runner.cpp b/packages/react-native-executorch/common/runner/unified_runner.cpp
index 98955d593..a136835a3 100644
--- a/packages/react-native-executorch/common/runner/unified_runner.cpp
+++ b/packages/react-native-executorch/common/runner/unified_runner.cpp
@@ -131,11 +131,23 @@ Error UnifiedRunner::generate(
     std::function<void(const std::string &)> token_callback,
     std::function<void(const llm::Stats &)> stats_callback) {
 
-  ET_CHECK_MSG(!multimodal_,
-               "generate(prompt) called on a multimodal runner. Use "
-               "generate(vector<MultimodalInput>) instead.");
   ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null");
 
+  // In multimodal mode, delegate to the multimodal generate path with
+  // text-only input (no image).
+  if (multimodal_) {
+    std::vector<llm::MultimodalInput> text_inputs = {
+        llm::make_text_input(prompt)};
+    float temp =
+        generation_config.temperature >= 0.F
+            ? generation_config.temperature
+            : (config_.temperature >= 0.F ? config_.temperature : 0.8F);
+    float topp = generation_config.topp >= 0.F
+                     ? generation_config.topp
+                     : (config_.topp >= 0.F ? config_.topp : 0.9F);
+    return generate(text_inputs, temp, topp, -1, token_callback);
+  }
+
   if (!is_loaded()) {
     stats_.model_load_start_ms = llm::time_in_ms();
     ET_CHECK_OK_OR_RETURN_ERROR(load());
@@ -248,6 +260,7 @@ Error UnifiedRunner::generate(
 
   stats_.inference_start_ms = llm::time_in_ms();
 
+  int64_t pos_before_prefill = pos_;
   uint64_t prefill_next_token = 0;
   for (size_t i = 0; i < inputs.size(); ++i) {
     auto prefill_result = mm_prefiller_->prefill(inputs[i], pos_);
@@ -260,26 +273,14 @@ Error UnifiedRunner::generate(
   stats_.prompt_eval_end_ms = llm::time_in_ms();
   stats_.num_prompt_tokens = pos_;
 
-  auto decode_result =
-      tokenizer_->decode(prefill_next_token, prefill_next_token);
-  if (!decode_result.ok()) {
-    ET_LOG(Error, "Tokenizer decode error %d",
-           static_cast<uint32_t>(decode_result.error()));
-    return Error::InvalidArgument;
-  }
-  const std::string first_piece = std::move(*decode_result);
-  llm::safe_printf(first_piece.c_str());
-  fflush(stdout);
-  if (token_callback)
-    token_callback(first_piece);
-
   int64_t context_len = metadata_.count(kMaxContextLen)
                             ? metadata_.at(kMaxContextLen)
                         : metadata_.count(kMaxSeqLen) ? metadata_.at(kMaxSeqLen)
                                                       : 2048;
-  int32_t resolved_max_new = max_new_tokens > 0
-                                 ? max_new_tokens
-                                 : static_cast<int32_t>(context_len - pos_);
+  int32_t resolved_max_new =
+      max_new_tokens > 0
+          ? max_new_tokens
+          : static_cast<int32_t>(context_len - pos_before_prefill);
   resolved_max_new = std::max(0, resolved_max_new);
 
   std::vector<uint64_t> seed_tokens = {prefill_next_token};
diff --git a/packages/react-native-executorch/common/runner/unified_runner.h b/packages/react-native-executorch/common/runner/unified_runner.h
index 9f38fb9e5..6f003fcc5 100644
--- a/packages/react-native-executorch/common/runner/unified_runner.h
+++ b/packages/react-native-executorch/common/runner/unified_runner.h
@@ -1,6 +1,7 @@
 // packages/react-native-executorch/common/runner/unified_runner.h
 #pragma once
 
+#include "irunner.h"
 #include "multimodal_decoder_runner.h"
 #include "multimodal_input.h"
 #include "multimodal_prefiller.h"
@@ -32,8 +33,7 @@ class UnifiedRunner {
       ::executorch::extension::Module *module,
       std::unique_ptr<::executorch::extension::Module> owned_module,
       const std::string &tokenizer_path,
-      const llm::GenerationConfig &config = {.temperature = 0.8F,
-                                             .topp = 0.9F});
+      const llm::GenerationConfig &config = llm::GenerationConfig{});
 
   bool is_multimodal() const noexcept;
   bool is_loaded() const;
diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts
index bd000d270..6e024220d 100644
--- a/packages/react-native-executorch/src/controllers/LLMController.ts
+++ b/packages/react-native-executorch/src/controllers/LLMController.ts
@@ -95,9 +95,12 @@ export class LLMController {
       let modelPath: string | undefined;
 
       if (isMultimodal) {
-        // Multimodal models don't need tokenizer config
         const [tokenizerResults, modelResult] = await Promise.all([
-          ResourceFetcher.fetch(undefined, tokenizerSource),
+          ResourceFetcher.fetch(
+            undefined,
+            tokenizerSource,
+            ...(tokenizerConfigSource ? [tokenizerConfigSource] : [])
+          ),
           ResourceFetcher.fetch(onDownloadProgressCallback, modelSource),
         ]);
         tokenizerPath = tokenizerResults?.[0];
@@ -109,6 +112,12 @@ export class LLMController {
             'The download has been interrupted. As a result, not every file was downloaded. Please retry the download.'
           );
         }
+
+        if (tokenizerConfigSource && tokenizerResults?.[1]) {
+          this.tokenizerConfig = JSON.parse(
+            await ResourceFetcher.fs.readAsString(tokenizerResults[1])
+          );
+        }
       } else {
         const tokenizersPromise = ResourceFetcher.fetch(
           undefined,

From 1695f7e26603e95ede1f93453607382003b6b96a Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 2 Mar 2026 13:13:39 +0100
Subject: [PATCH 04/46] fix: default UnifiedRunner temperature to 0.8 and topp
 to 0.9

---
 .../react-native-executorch/common/runner/unified_runner.h     | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/packages/react-native-executorch/common/runner/unified_runner.h b/packages/react-native-executorch/common/runner/unified_runner.h
index 6f003fcc5..ae7789bbe 100644
--- a/packages/react-native-executorch/common/runner/unified_runner.h
+++ b/packages/react-native-executorch/common/runner/unified_runner.h
@@ -33,7 +33,8 @@ class UnifiedRunner {
       ::executorch::extension::Module *module,
       std::unique_ptr<::executorch::extension::Module> owned_module,
       const std::string &tokenizer_path,
-      const llm::GenerationConfig &config = llm::GenerationConfig{});
+      const llm::GenerationConfig &config = {.temperature = 0.8F,
+                                             .topp = 0.9F});
 
   bool is_multimodal() const noexcept;
   bool is_loaded() const;

From b660b0feaf5e3f57fc7de6dd0ce8ca040114ef90 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 2 Mar 2026 13:14:37 +0100
Subject: [PATCH 05/46] feat: add NativeMessage struct and JSI conversion for
 message history

---
 .../host_objects/JsiConversions.h             | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
index df9abbdef..08acf6cff 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
@@ -228,6 +228,36 @@ getValue<std::vector<uint64_t>>(const jsi::Value &val, jsi::Runtime &runtime) {
   return getArrayAsVector<uint64_t>(val, runtime);
 }
 
+struct NativeMessage {
+  std::string role;      // "user" | "assistant" | "system"
+  std::string content;   // message text; may be empty
+  std::string mediaPath; // image path; empty string if no media
+};
+
+// Parses [{role, content, mediaPath?}]; non-string fields throw jsi::JSError.
+template <>
+inline std::vector<NativeMessage>
+getValue<std::vector<NativeMessage>>(const jsi::Value &val,
+                                     jsi::Runtime &runtime) {
+  jsi::Array array = val.asObject(runtime).asArray(runtime);
+  size_t length = array.size(runtime);
+  std::vector<NativeMessage> result;
+  result.reserve(length);
+  for (size_t i = 0; i < length; ++i) {
+    jsi::Object obj = array.getValueAtIndex(runtime, i).asObject(runtime);
+    NativeMessage msg;
+    msg.role = obj.getProperty(runtime, "role").asString(runtime).utf8(runtime);
+    msg.content =
+        obj.getProperty(runtime, "content").asString(runtime).utf8(runtime);
+    auto mediaProp = obj.getProperty(runtime, "mediaPath");
+    if (!mediaProp.isUndefined() && !mediaProp.isNull()) {
+      msg.mediaPath = mediaProp.asString(runtime).utf8(runtime);
+    }
+    result.push_back(std::move(msg));
+  }
+  return result;
+}
+
 // Template specializations for std::span<T> types
 template <>
 inline std::span<float> getValue<std::span<float>>(const jsi::Value &val,

From 4331bded465523401a7910438220a69f1d65340a Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 2 Mar 2026 13:17:14 +0100
Subject: [PATCH 06/46] feat: declare generateMultimodal on LLM and register
 JSI binding

---
 .../rnexecutorch/host_objects/ModelHostObject.h     |  5 +++++
 .../common/rnexecutorch/models/llm/LLM.h            | 13 +++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
index f41da1e45..334f1f833 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
@@ -160,6 +160,11 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
               &Model::generate)>,
           "generateWithImage"));
 
+      addFunctions(
+          JSI_EXPORT_FUNCTION(ModelHostObject<Model>,
+                              promiseHostFunction<&Model::generateMultimodal>,
+                              "generateMultimodal"));
+
       addFunctions(JSI_EXPORT_FUNCTION(
           ModelHostObject<Model>, synchronousHostFunction<&Model::isMultimodal>,
           "isMultimodal"));
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
index 3763fe924..8b5684ad5 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
@@ -2,10 +2,14 @@
 
 #include <memory>
 #include <string>
+#include <unordered_map>
+#include <vector>
 
 #include <ReactCommon/CallInvoker.h>
 #include <jsi/jsi.h>
+#include <rnexecutorch/host_objects/JsiConversions.h>
 #include <rnexecutorch/models/BaseModel.h>
+#include <runner/image.h>
 #include <runner/unified_runner.h>
 
 namespace rnexecutorch {
@@ -26,6 +30,11 @@ class LLM : public BaseModel {
   std::string generate(std::string imagePath, std::string prompt,
                        std::shared_ptr<jsi::Function> callback);
 
+  // Multimodal generate — takes full message history, builds MultimodalInput[]
+  std::string generateMultimodal(
+      std::vector<rnexecutorch::jsi_conversion::NativeMessage> messages,
+      std::shared_ptr<jsi::Function> callback);
+
   bool isMultimodal() const noexcept;
 
   void interrupt();
@@ -46,6 +55,10 @@ class LLM : public BaseModel {
   bool multimodal_;
   float temperature_ = 0.8f;
   float topp_ = 0.9f;
+  std::unordered_map<std::string, executorch::extension::llm::Image>
+      imageCache_;
+  const executorch::extension::llm::Image &
+  getOrLoadImage(const std::string &path);
 };
 } // namespace models::llm
 

From d6530e4242d1afdf1605f38c36791ae846bc091e Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 2 Mar 2026 13:19:33 +0100
Subject: [PATCH 07/46] fix: remove redundant unordered_map and vector includes
 from LLM.h

---
 .../common/rnexecutorch/models/llm/LLM.h                        | 2 --
 1 file changed, 2 deletions(-)

diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
index 8b5684ad5..5c9e1e458 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
@@ -2,8 +2,6 @@
 
 #include <memory>
 #include <string>
-#include <unordered_map>
-#include <vector>
 
 #include <ReactCommon/CallInvoker.h>
 #include <jsi/jsi.h>

From d261a45d1400adb739c55cbb7daf31506e3a12e6 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 2 Mar 2026 13:21:35 +0100
Subject: [PATCH 08/46] feat: implement generateMultimodal with per-turn chat
 template and image cache

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../common/rnexecutorch/models/llm/LLM.cpp    | 86 +++++++++++++++++++
 1 file changed, 86 insertions(+)

diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
index 320e9bebe..441d20dad 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
@@ -22,6 +22,10 @@ static constexpr int kImageChannels = 3;
 static constexpr const char *kChatPrefix = "<|startoftext|><|im_start|>user\n";
 static constexpr const char *kChatSuffix =
     "<|im_end|>\n<|im_start|>assistant\n";
+// Separator inserted after each assistant turn in multi-turn conversations
+static constexpr const char *kAssistantTurnEnd = "<|im_end|>\n";
+// Prefix for subsequent user turns (no BOS token — only first turn has it)
+static constexpr const char *kUserTurnPrefix = "<|im_start|>user\n";
 
 static llm::Image loadImageForVLM(const std::string &imagePath) {
   cv::Mat mat = image_processing::readImage(imagePath);
@@ -39,6 +43,14 @@ static llm::Image loadImageForVLM(const std::string &imagePath) {
   return llm::Image(std::move(chw), kImageSize, kImageSize, kImageChannels);
 }
 
+const llm::Image &LLM::getOrLoadImage(const std::string &path) {
+  // Load each path at most once; repeat lookups return the cached image.
+  auto cached = imageCache_.find(path);
+  if (cached == imageCache_.end())
+    cached = imageCache_.emplace(path, loadImageForVLM(path)).first;
+  return cached->second;
+}
+
 LLM::LLM(const std::string &modelSource, const std::string &tokenizerSource,
          std::shared_ptr<react::CallInvoker> callInvoker)
     : BaseModel(modelSource, callInvoker, Module::LoadMode::File) {
@@ -134,6 +146,79 @@ std::string LLM::generate(std::string imagePath, std::string prompt,
   return output;
 }
 
+std::string LLM::generateMultimodal(
+    std::vector<rnexecutorch::jsi_conversion::NativeMessage> messages,
+    std::shared_ptr<jsi::Function> callback) {
+  if (!runner_ || !runner_->is_loaded()) {
+    throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
+                            "Runner is not loaded");
+  }
+  if (!multimodal_) {
+    throw RnExecutorchError(
+        RnExecutorchErrorCode::InvalidUserInput,
+        "This is a text-only model. Use generate(prompt, cb) instead.");
+  }
+
+  std::vector<llm::MultimodalInput> inputs;
+  bool isFirst = true;
+
+  for (const auto &msg : messages) {
+    if (msg.role == "system") {
+      if (isFirst) {
+        inputs.push_back(llm::make_text_input(msg.content + "\n"));
+      }
+      continue;
+    }
+
+    if (msg.role == "user") {
+      if (isFirst) {
+        inputs.push_back(llm::make_text_input(std::string(kChatPrefix)));
+        isFirst = false;
+      } else {
+        inputs.push_back(llm::make_text_input(std::string(kUserTurnPrefix)));
+      }
+
+      if (!msg.mediaPath.empty()) {
+        const llm::Image &img = getOrLoadImage(msg.mediaPath);
+        inputs.push_back(llm::make_image_input(img));
+      }
+
+      if (!msg.content.empty()) {
+        inputs.push_back(llm::make_text_input(msg.content));
+      }
+
+      inputs.push_back(llm::make_text_input(std::string(kChatSuffix)));
+    } else if (msg.role == "assistant") {
+      inputs.push_back(llm::make_text_input(msg.content + kAssistantTurnEnd));
+      isFirst = false;
+    }
+  }
+
+  if (inputs.empty()) {
+    throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
+                            "No inputs to generate from");
+  }
+
+  std::string output;
+  auto nativeCallback = [this, &callback, &output](const std::string &token) {
+    output += token;
+    if (callback && callInvoker) {
+      callInvoker->invokeAsync([callback, token](jsi::Runtime &runtime) {
+        callback->call(runtime, jsi::String::createFromUtf8(runtime, token));
+      });
+    }
+  };
+
+  runner_->reset();
+  auto error =
+      runner_->generate(inputs, temperature_, topp_, -1, nativeCallback);
+  if (error != Error::Ok) {
+    throw RnExecutorchError(error, "Failed to generate multimodal response");
+  }
+
+  return output;
+}
+
 void LLM::interrupt() {
   if (!runner_ || !runner_->is_loaded()) {
     throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
@@ -148,6 +233,7 @@ void LLM::reset() {
                             "Can't reset a model that's not loaded");
   }
   runner_->reset();
+  imageCache_.clear();
 }
 
 size_t LLM::getGeneratedTokenCount() const noexcept {

From d91a64a60f21929d9bcb982575b1b9cf6cc0c530 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 2 Mar 2026 13:25:00 +0100
Subject: [PATCH 09/46] feat: add mediaPath to Message, remove
 sendMessageWithImage from LLMType

---
 .../react-native-executorch/src/types/llm.ts  | 21 ++++++++-----------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/packages/react-native-executorch/src/types/llm.ts b/packages/react-native-executorch/src/types/llm.ts
index 71dfcd3f8..26843db92 100644
--- a/packages/react-native-executorch/src/types/llm.ts
+++ b/packages/react-native-executorch/src/types/llm.ts
@@ -110,12 +110,14 @@ export interface LLMType {
 
   /**
    * Function to add user message to conversation.
-   * After model responds, `messageHistory` will be updated with both user message and model response.
+   * Pass `mediaPath` for a multimodal message (image, audio, etc.).
+   * After model responds, `messageHistory` will be updated.
    *
    * @param message - The message string to send.
+   * @param mediaPath - Optional local file path to media.
    * @returns The model's response as a `string`.
    */
-  sendMessage: (message: string) => Promise<string>;
+  sendMessage: (message: string, mediaPath?: string) => Promise<string>;
 
   /**
    * Deletes all messages starting with message on `index` position. After deletion `messageHistory` will be updated.
@@ -128,16 +130,6 @@ export interface LLMType {
    * Function to interrupt the current inference.
    */
   interrupt: () => void;
-
-  /**
-   * Send a user message with an image. Updates messageHistory after model responds.
-   * Only valid for multimodal models (loaded with `isMultimodal: true`).
-   *
-   * @param imagePath - Local path to the image file.
-   * @param message - The text question about the image.
-   * @returns The model's response as a string.
-   */
-  sendMessageWithImage: (imagePath: string, message: string) => Promise<string>;
 }
 
 /**
@@ -199,6 +191,11 @@ export type MessageRole = 'user' | 'assistant' | 'system';
 export interface Message {
   role: MessageRole;
   content: string;
+  /**
+   * Optional local file path to media (image, audio, etc.).
+   * Only valid on `user` messages.
+   */
+  mediaPath?: string;
 }
 
 /**

From 49f5af68bdf2a0c47a2b8f7d47f6a5ac6f367d27 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 2 Mar 2026 14:11:16 +0100
Subject: [PATCH 10/46] feat: replace sendMessageWithImage with
 sendMessage(msg, mediaPath?) using generateMultimodal

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 apps/llm/app/multimodal_llm/index.tsx         |   6 +-
 .../src/controllers/LLMController.ts          | 128 ++++++++----------
 .../natural_language_processing/useLLM.ts     |  13 +-
 .../react-native-executorch/src/types/llm.ts  |   2 +-
 4 files changed, 61 insertions(+), 88 deletions(-)

diff --git a/apps/llm/app/multimodal_llm/index.tsx b/apps/llm/app/multimodal_llm/index.tsx
index f5f402183..7a3f85671 100644
--- a/apps/llm/app/multimodal_llm/index.tsx
+++ b/apps/llm/app/multimodal_llm/index.tsx
@@ -72,11 +72,7 @@ function MultimodalLLMScreen() {
     textInputRef.current?.clear();
     Keyboard.dismiss();
     try {
-      if (imageUri) {
-        await vlm.sendMessageWithImage(imageUri, text);
-      } else {
-        await vlm.sendMessage(text);
-      }
+      await vlm.sendMessage(text, imageUri ?? undefined);
     } catch (e) {
       console.error('Generation error:', e);
     }
diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts
index 6e024220d..a47c04b9d 100644
--- a/packages/react-native-executorch/src/controllers/LLMController.ts
+++ b/packages/react-native-executorch/src/controllers/LLMController.ts
@@ -274,10 +274,7 @@ export class LLMController {
     }
   }
 
-  public async generateWithImage(
-    imagePath: string,
-    prompt: string
-  ): Promise<string> {
+  private async generateMultimodal(messages: Message[]): Promise<string> {
     if (!this._isReady) {
       throw new RnExecutorchError(
         RnExecutorchErrorCode.ModuleNotLoaded,
@@ -287,7 +284,7 @@ export class LLMController {
     if (!this.isMultimodal_) {
       throw new RnExecutorchError(
         RnExecutorchErrorCode.InvalidUserInput,
-        'generateWithImage() requires a multimodal model. Load with isMultimodal: true.'
+        'generateMultimodal() requires a multimodal model.'
       );
     }
     if (this._isGenerating) {
@@ -299,9 +296,8 @@ export class LLMController {
     try {
       this.isGeneratingCallback(true);
       this.nativeModule.reset();
-      const response = await this.nativeModule.generateWithImage(
-        imagePath,
-        prompt,
+      const response = await this.nativeModule.generateMultimodal(
+        messages,
         this.onToken
       );
       return response;
@@ -312,26 +308,6 @@ export class LLMController {
     }
   }
 
-  public async sendMessageWithImage(
-    imagePath: string,
-    message: string
-  ): Promise<string> {
-    const updatedHistory = [
-      ...this._messageHistory,
-      { content: message, role: 'user' as const },
-    ];
-    this.messageHistoryCallback(updatedHistory);
-
-    const response = await this.generateWithImage(imagePath, message);
-
-    this.messageHistoryCallback([
-      ...this._messageHistory,
-      { content: response, role: 'assistant' },
-    ]);
-
-    return response;
-  }
-
   public interrupt() {
     if (!this.nativeModule) {
       throw new RnExecutorchError(
@@ -399,36 +375,47 @@ export class LLMController {
     return await this.forward(renderedChat);
   }
 
-  public async sendMessage(message: string): Promise<string> {
-    const updatedHistory = [
-      ...this._messageHistory,
-      { content: message, role: 'user' as const },
-    ];
+  public async sendMessage(
+    message: string,
+    mediaPath?: string
+  ): Promise<string> {
+    const newMessage: Message = {
+      content: message,
+      role: 'user',
+      ...(mediaPath ? { mediaPath } : {}),
+    };
+    const updatedHistory = [...this._messageHistory, newMessage];
     this.messageHistoryCallback(updatedHistory);
 
-    const countTokensCallback = (messages: Message[]) => {
-      const rendered = this.applyChatTemplate(
-        messages,
-        this.tokenizerConfig,
-        this.toolsConfig?.tools,
-        // eslint-disable-next-line camelcase
-        { tools_in_user_message: false, add_generation_prompt: true }
-      );
-      return this.nativeModule.countTextTokens(rendered);
-    };
-    const maxContextLength = this.nativeModule.getMaxContextLength();
-    const messageHistoryWithPrompt =
-      this.chatConfig.contextStrategy.buildContext(
-        this.chatConfig.systemPrompt,
-        updatedHistory,
-        maxContextLength,
-        countTokensCallback
+    let response: string;
+
+    if (mediaPath || this._messageHistory.some((m) => m.mediaPath)) {
+      // Any message in history has media — use multimodal path
+      response = await this.generateMultimodal(updatedHistory);
+    } else {
+      const countTokensCallback = (messages: Message[]) => {
+        const rendered = this.applyChatTemplate(
+          messages,
+          this.tokenizerConfig,
+          this.toolsConfig?.tools,
+          // eslint-disable-next-line camelcase
+          { tools_in_user_message: false, add_generation_prompt: true }
+        );
+        return this.nativeModule.countTextTokens(rendered);
+      };
+      const maxContextLength = this.nativeModule.getMaxContextLength();
+      const messageHistoryWithPrompt =
+        this.chatConfig.contextStrategy.buildContext(
+          this.chatConfig.systemPrompt,
+          updatedHistory,
+          maxContextLength,
+          countTokensCallback
+        );
+      response = await this.generate(
+        messageHistoryWithPrompt,
+        this.toolsConfig?.tools
       );
-
-    const response = await this.generate(
-      messageHistoryWithPrompt,
-      this.toolsConfig?.tools
-    );
+    }
 
     if (!this.toolsConfig || this.toolsConfig.displayToolCalls) {
       this.messageHistoryCallback([
@@ -436,24 +423,23 @@ export class LLMController {
         { content: response, role: 'assistant' },
       ]);
     }
-    if (!this.toolsConfig) {
-      return response;
-    }
 
-    const toolCalls = parseToolCall(response);
-
-    for (const toolCall of toolCalls) {
-      this.toolsConfig
-        .executeToolCallback(toolCall)
-        .then((toolResponse: string | null) => {
-          if (toolResponse) {
-            this.messageHistoryCallback([
-              ...this._messageHistory,
-              { content: toolResponse, role: 'assistant' },
-            ]);
-          }
-        });
+    if (this.toolsConfig) {
+      const toolCalls = parseToolCall(response);
+      for (const toolCall of toolCalls) {
+        this.toolsConfig
+          .executeToolCallback(toolCall)
+          .then((toolResponse: string | null) => {
+            if (toolResponse) {
+              this.messageHistoryCallback([
+                ...this._messageHistory,
+                { content: toolResponse, role: 'assistant' },
+              ]);
+            }
+          });
+      }
     }
+
     return response;
   }
 
diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
index 2920e1bb5..deabbbbb0 100644
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
@@ -94,9 +94,9 @@ export const useLLM = ({ model, preventLoad = false }: LLMProps): LLMType => {
   );
 
   const sendMessage = useCallback(
-    (message: string) => {
+    (message: string, mediaPath?: string) => {
       setResponse('');
-      return controllerInstance.sendMessage(message);
+      return controllerInstance.sendMessage(message, mediaPath);
     },
     [controllerInstance]
   );
@@ -126,14 +126,6 @@ export const useLLM = ({ model, preventLoad = false }: LLMProps): LLMType => {
     [controllerInstance]
   );
 
-  const sendMessageWithImage = useCallback(
-    (imagePath: string, message: string) => {
-      setResponse('');
-      return controllerInstance.sendMessageWithImage(imagePath, message);
-    },
-    [controllerInstance]
-  );
-
   return {
     messageHistory,
     response,
@@ -150,6 +142,5 @@ export const useLLM = ({ model, preventLoad = false }: LLMProps): LLMType => {
     sendMessage: sendMessage,
     deleteMessage: deleteMessage,
     interrupt: interrupt,
-    sendMessageWithImage: sendMessageWithImage,
   };
 };
diff --git a/packages/react-native-executorch/src/types/llm.ts b/packages/react-native-executorch/src/types/llm.ts
index 26843db92..0c648c25d 100644
--- a/packages/react-native-executorch/src/types/llm.ts
+++ b/packages/react-native-executorch/src/types/llm.ts
@@ -22,7 +22,7 @@ export interface LLMProps {
     tokenizerConfigSource?: ResourceSource;
     /**
      * Set to `true` when loading a vision-language (multimodal) model.
-     * Skips tokenizer config fetching and enables `sendMessageWithImage`.
+     * Skips tokenizer config fetching and enables multimodal message handling via `sendMessage`.
      */
     isMultimodal?: boolean;
   };

From d07ce65c6cc0834c2157c1a8cc22c6cc17fe5087 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 2 Mar 2026 15:04:50 +0100
Subject: [PATCH 11/46] fix: use updatedHistory for multimodal routing, remove
 redundant reset before generateMultimodal

---
 .../react-native-executorch/src/controllers/LLMController.ts   | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts
index a47c04b9d..cb854546f 100644
--- a/packages/react-native-executorch/src/controllers/LLMController.ts
+++ b/packages/react-native-executorch/src/controllers/LLMController.ts
@@ -295,7 +295,6 @@ export class LLMController {
     }
     try {
       this.isGeneratingCallback(true);
-      this.nativeModule.reset();
       const response = await this.nativeModule.generateMultimodal(
         messages,
         this.onToken
@@ -389,7 +388,7 @@ export class LLMController {
 
     let response: string;
 
-    if (mediaPath || this._messageHistory.some((m) => m.mediaPath)) {
+    if (updatedHistory.some((m) => m.mediaPath)) {
       // Any message in history has media — use multimodal path
       response = await this.generateMultimodal(updatedHistory);
     } else {

From b29f74c1b2f639e6c80258cc88911c7557d12315 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 2 Mar 2026 15:07:51 +0100
Subject: [PATCH 12/46] fix: skip system messages in generateMultimodal, clear
 imageUri after send

---
 apps/llm/app/multimodal_llm/index.tsx                        | 1 +
 .../common/rnexecutorch/models/llm/LLM.cpp                   | 5 ++---
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/apps/llm/app/multimodal_llm/index.tsx b/apps/llm/app/multimodal_llm/index.tsx
index 7a3f85671..486b4e109 100644
--- a/apps/llm/app/multimodal_llm/index.tsx
+++ b/apps/llm/app/multimodal_llm/index.tsx
@@ -73,6 +73,7 @@ function MultimodalLLMScreen() {
     Keyboard.dismiss();
     try {
       await vlm.sendMessage(text, imageUri ?? undefined);
+      setImageUri(null);
     } catch (e) {
       console.error('Generation error:', e);
     }
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
index 441d20dad..ab6398bc3 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
@@ -164,9 +164,8 @@ std::string LLM::generateMultimodal(
 
   for (const auto &msg : messages) {
     if (msg.role == "system") {
-      if (isFirst) {
-        inputs.push_back(llm::make_text_input(msg.content + "\n"));
-      }
+      // LFM2-VL has no dedicated system turn — skip silently, consistent
+      // with the single-turn generate(imagePath, prompt, cb) path.
       continue;
     }
 

From e1d0f08df3730eae28d703c076c6920f4fe4c94f Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 2 Mar 2026 15:17:23 +0100
Subject: [PATCH 13/46] feat: show image thumbnail in user message bubble when
 mediaPath is set

---
 apps/llm/components/MessageItem.tsx | 50 +++++++++++++++++++++--------
 1 file changed, 36 insertions(+), 14 deletions(-)

diff --git a/apps/llm/components/MessageItem.tsx b/apps/llm/components/MessageItem.tsx
index c4d7d549e..2a235d459 100644
--- a/apps/llm/components/MessageItem.tsx
+++ b/apps/llm/components/MessageItem.tsx
@@ -4,6 +4,7 @@ import {
   StyleSheet,
   TouchableOpacity,
   Text,
+  Image,
   Platform,
 } from 'react-native';
 import MarkdownComponent from './MarkdownComponent';
@@ -17,19 +18,31 @@ interface MessageItemProps {
 }
 
 const MessageItem = memo(({ message, deleteMessage }: MessageItemProps) => {
-  return (
-    <View
-      style={
-        message.role === 'assistant' ? styles.aiMessage : styles.userMessage
-      }
-    >
-      {message.role === 'assistant' && (
+  if (message.role === 'assistant') {
+    return (
+      <View style={styles.aiMessage}>
         <View style={styles.aiMessageIconContainer}>
           <LlamaIcon width={24} height={24} />
         </View>
-      )}
-      <MarkdownComponent text={message.content} />
+        <MarkdownComponent text={message.content} />
+        <CloseButton deleteMessage={deleteMessage} role={message.role} />
+      </View>
+    );
+  }
+
+  return (
+    <View style={styles.userMessageWrapper}>
       <CloseButton deleteMessage={deleteMessage} role={message.role} />
+      <View style={styles.userMessageBubble}>
+        {message.mediaPath && (
+          <Image
+            source={{ uri: message.mediaPath }}
+            style={styles.userMessageImage}
+            resizeMode="cover"
+          />
+        )}
+        <MarkdownComponent text={message.content} />
+      </View>
     </View>
   );
 });
@@ -64,17 +77,26 @@ const styles = StyleSheet.create({
     marginVertical: 8,
     alignItems: 'center',
   },
-  userMessage: {
+  userMessageWrapper: {
     flexDirection: 'row-reverse',
-    paddingHorizontal: 12,
-    paddingVertical: 8,
     marginRight: 8,
     marginVertical: 8,
     maxWidth: '75%',
+    alignSelf: 'flex-end',
+    alignItems: 'flex-start',
+  },
+  userMessageBubble: {
+    flexDirection: 'column',
+    paddingHorizontal: 12,
+    paddingVertical: 8,
     borderRadius: 8,
     backgroundColor: ColorPalette.seaBlueLight,
-    alignSelf: 'flex-end',
-    alignItems: 'center',
+  },
+  userMessageImage: {
+    width: 200,
+    height: 150,
+    borderRadius: 6,
+    marginBottom: 6,
   },
   aiMessageIconContainer: {
     backgroundColor: ColorPalette.seaBlueLight,

From 11cab574e756ff3a6e01a90312a37c3f8d23fbc1 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 2 Mar 2026 15:20:18 +0100
Subject: [PATCH 14/46] fix: use resizeMode contain so full image is always
 visible in message bubble

---
 apps/llm/components/MessageItem.tsx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/apps/llm/components/MessageItem.tsx b/apps/llm/components/MessageItem.tsx
index 2a235d459..58da5074c 100644
--- a/apps/llm/components/MessageItem.tsx
+++ b/apps/llm/components/MessageItem.tsx
@@ -38,7 +38,7 @@ const MessageItem = memo(({ message, deleteMessage }: MessageItemProps) => {
           <Image
             source={{ uri: message.mediaPath }}
             style={styles.userMessageImage}
-            resizeMode="cover"
+            resizeMode="contain"
           />
         )}
         <MarkdownComponent text={message.content} />
@@ -94,7 +94,7 @@ const styles = StyleSheet.create({
   },
   userMessageImage: {
     width: 200,
-    height: 150,
+    height: 200,
     borderRadius: 6,
     marginBottom: 6,
   },

From 9ddd5d75030ee6f0f9d78f5482daf89092cbd576 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 2 Mar 2026 15:32:31 +0100
Subject: [PATCH 15/46] refactor: derive isMultimodal from load param, unify
 load branches, remove tokenizerConfig guard

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../host_objects/ModelHostObject.h            |  4 -
 .../common/rnexecutorch/models/llm/LLM.cpp    |  2 -
 .../common/rnexecutorch/models/llm/LLM.h      |  2 -
 .../src/controllers/LLMController.ts          | 73 +++++--------------
 4 files changed, 18 insertions(+), 63 deletions(-)

diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
index 334f1f833..2c7a3e535 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
@@ -164,10 +164,6 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
           JSI_EXPORT_FUNCTION(ModelHostObject<Model>,
                               promiseHostFunction<&Model::generateMultimodal>,
                               "generateMultimodal"));
-
-      addFunctions(JSI_EXPORT_FUNCTION(
-          ModelHostObject<Model>, synchronousHostFunction<&Model::isMultimodal>,
-          "isMultimodal"));
     }
 
     if constexpr (meta::SameAs<Model, models::text_to_image::TextToImage>) {
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
index ab6398bc3..2210a5cf8 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
@@ -81,8 +81,6 @@ LLM::LLM(const std::string &modelSource, const std::string &tokenizerSource,
                          fs::file_size(fs::path(tokenizerSource));
 }
 
-bool LLM::isMultimodal() const noexcept { return multimodal_; }
-
 std::string LLM::generate(std::string input,
                           std::shared_ptr<jsi::Function> callback) {
   if (!runner_ || !runner_->is_loaded()) {
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
index 5c9e1e458..11f8c5e06 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
@@ -33,8 +33,6 @@ class LLM : public BaseModel {
       std::vector<rnexecutorch::jsi_conversion::NativeMessage> messages,
       std::shared_ptr<jsi::Function> callback);
 
-  bool isMultimodal() const noexcept;
-
   void interrupt();
   void reset();
   void unload() noexcept;
diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts
index cb854546f..e9a113459 100644
--- a/packages/react-native-executorch/src/controllers/LLMController.ts
+++ b/packages/react-native-executorch/src/controllers/LLMController.ts
@@ -91,68 +91,34 @@ export class LLMController {
     this.isReadyCallback(false);
 
     try {
-      let tokenizerPath: string | undefined;
-      let modelPath: string | undefined;
-
-      if (isMultimodal) {
-        const [tokenizerResults, modelResult] = await Promise.all([
-          ResourceFetcher.fetch(
-            undefined,
-            tokenizerSource,
-            ...(tokenizerConfigSource ? [tokenizerConfigSource] : [])
-          ),
-          ResourceFetcher.fetch(onDownloadProgressCallback, modelSource),
-        ]);
-        tokenizerPath = tokenizerResults?.[0];
-        modelPath = modelResult?.[0];
-
-        if (!tokenizerPath || !modelPath) {
-          throw new RnExecutorchError(
-            RnExecutorchErrorCode.DownloadInterrupted,
-            'The download has been interrupted. As a result, not every file was downloaded. Please retry the download.'
-          );
-        }
-
-        if (tokenizerConfigSource && tokenizerResults?.[1]) {
-          this.tokenizerConfig = JSON.parse(
-            await ResourceFetcher.fs.readAsString(tokenizerResults[1])
-          );
-        }
-      } else {
-        const tokenizersPromise = ResourceFetcher.fetch(
+      const [tokenizerResults, modelResult] = await Promise.all([
+        ResourceFetcher.fetch(
           undefined,
           tokenizerSource,
-          tokenizerConfigSource!
-        );
-
-        const modelPromise = ResourceFetcher.fetch(
-          onDownloadProgressCallback,
-          modelSource
-        );
-
-        const [tokenizersResults, modelResult] = await Promise.all([
-          tokenizersPromise,
-          modelPromise,
-        ]);
+          ...(tokenizerConfigSource ? [tokenizerConfigSource] : [])
+        ),
+        ResourceFetcher.fetch(onDownloadProgressCallback, modelSource),
+      ]);
 
-        tokenizerPath = tokenizersResults?.[0];
-        const tokenizerConfigPath = tokenizersResults?.[1];
-        modelPath = modelResult?.[0];
+      const tokenizerPath = tokenizerResults?.[0];
+      const tokenizerConfigPath = tokenizerResults?.[1];
+      const modelPath = modelResult?.[0];
 
-        if (!tokenizerPath || !tokenizerConfigPath || !modelPath) {
-          throw new RnExecutorchError(
-            RnExecutorchErrorCode.DownloadInterrupted,
-            'The download has been interrupted. As a result, not every file was downloaded. Please retry the download.'
-          );
-        }
+      if (!tokenizerPath || !modelPath) {
+        throw new RnExecutorchError(
+          RnExecutorchErrorCode.DownloadInterrupted,
+          'The download has been interrupted. As a result, not every file was downloaded. Please retry the download.'
+        );
+      }
 
+      if (tokenizerConfigPath) {
         this.tokenizerConfig = JSON.parse(
-          await ResourceFetcher.fs.readAsString(tokenizerConfigPath!)
+          await ResourceFetcher.fs.readAsString(tokenizerConfigPath)
         );
       }
 
       this.nativeModule = global.loadLLM(modelPath, tokenizerPath);
-      this.isMultimodal_ = this.nativeModule.isMultimodal();
+      this.isMultimodal_ = isMultimodal;
       this.isReadyCallback(true);
       this.onToken = (data: string) => {
         if (!data) {
@@ -214,9 +180,6 @@ export class LLMController {
   }
 
   private filterSpecialTokens(text: string): string {
-    if (!this.tokenizerConfig) {
-      return text;
-    }
     let filtered = text;
     if (
       SPECIAL_TOKENS.EOS_TOKEN in this.tokenizerConfig &&

From 7d2ce9b3e73f795b905c4d0a6af53e29046b7b14 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 2 Mar 2026 15:37:47 +0100
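Subject: [PATCH 16/46] refactor: remove isMultimodal flag, inline
 generateMultimodal into sendMessage

Behavior changes bundled with this refactor:

* load() now requires a tokenizer config for every model: a missing
  config path raises DownloadInterrupted and the config is always parsed,
  even though `tokenizerConfigSource` is still optional in the parameter
  type.
* The multimodal branch inlined into sendMessage() no longer performs the
  `_isReady`, `_isGenerating`, or multimodal-model checks that the removed
  generateMultimodal() helper did before calling the native module.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>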
---
 apps/llm/app/multimodal_llm/index.tsx         |  1 -
 .../src/controllers/LLMController.ts          | 86 +++++++------------
 .../natural_language_processing/useLLM.ts     |  2 -
 .../react-native-executorch/src/types/llm.ts  |  5 --
 4 files changed, 31 insertions(+), 63 deletions(-)

diff --git a/apps/llm/app/multimodal_llm/index.tsx b/apps/llm/app/multimodal_llm/index.tsx
index 486b4e109..cf9a3f4e6 100644
--- a/apps/llm/app/multimodal_llm/index.tsx
+++ b/apps/llm/app/multimodal_llm/index.tsx
@@ -45,7 +45,6 @@ function MultimodalLLMScreen() {
       modelSource: MODEL_SOURCE,
       tokenizerSource: TOKENIZER_SOURCE,
       tokenizerConfigSource: TOKENIZER_CONFIG_SOURCE,
-      isMultimodal: true,
     },
   });
 
diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts
index e9a113459..43852e09d 100644
--- a/packages/react-native-executorch/src/controllers/LLMController.ts
+++ b/packages/react-native-executorch/src/controllers/LLMController.ts
@@ -24,8 +24,6 @@ export class LLMController {
   private _isReady = false;
   private _isGenerating = false;
   private _messageHistory: Message[] = [];
-  private isMultimodal_ = false;
-
   // User callbacks
   private tokenCallback: (token: string) => void;
   private messageHistoryCallback: (messageHistory: Message[]) => void;
@@ -77,13 +75,11 @@ export class LLMController {
     tokenizerSource,
     tokenizerConfigSource,
     onDownloadProgressCallback,
-    isMultimodal = false,
   }: {
     modelSource: ResourceSource;
     tokenizerSource: ResourceSource;
     tokenizerConfigSource?: ResourceSource;
     onDownloadProgressCallback?: (downloadProgress: number) => void;
-    isMultimodal?: boolean;
   }) {
     // reset inner state when loading new model
     this.messageHistoryCallback(this.chatConfig.initialMessageHistory);
@@ -91,34 +87,37 @@ export class LLMController {
     this.isReadyCallback(false);
 
     try {
-      const [tokenizerResults, modelResult] = await Promise.all([
-        ResourceFetcher.fetch(
-          undefined,
-          tokenizerSource,
-          ...(tokenizerConfigSource ? [tokenizerConfigSource] : [])
-        ),
-        ResourceFetcher.fetch(onDownloadProgressCallback, modelSource),
+      const tokenizersPromise = ResourceFetcher.fetch(
+        undefined,
+        tokenizerSource,
+        ...(tokenizerConfigSource ? [tokenizerConfigSource] : [])
+      );
+
+      const modelPromise = ResourceFetcher.fetch(
+        onDownloadProgressCallback,
+        modelSource
+      );
+
+      const [tokenizersResults, modelResult] = await Promise.all([
+        tokenizersPromise,
+        modelPromise,
       ]);
 
-      const tokenizerPath = tokenizerResults?.[0];
-      const tokenizerConfigPath = tokenizerResults?.[1];
+      const tokenizerPath = tokenizersResults?.[0];
+      const tokenizerConfigPath = tokenizersResults?.[1];
       const modelPath = modelResult?.[0];
 
-      if (!tokenizerPath || !modelPath) {
+      if (!tokenizerPath || !tokenizerConfigPath || !modelPath) {
         throw new RnExecutorchError(
           RnExecutorchErrorCode.DownloadInterrupted,
           'The download has been interrupted. As a result, not every file was downloaded. Please retry the download.'
         );
       }
 
-      if (tokenizerConfigPath) {
-        this.tokenizerConfig = JSON.parse(
-          await ResourceFetcher.fs.readAsString(tokenizerConfigPath)
-        );
-      }
-
+      this.tokenizerConfig = JSON.parse(
+        await ResourceFetcher.fs.readAsString(tokenizerConfigPath!)
+      );
       this.nativeModule = global.loadLLM(modelPath, tokenizerPath);
-      this.isMultimodal_ = isMultimodal;
       this.isReadyCallback(true);
       this.onToken = (data: string) => {
         if (!data) {
@@ -237,39 +236,6 @@ export class LLMController {
     }
   }
 
-  private async generateMultimodal(messages: Message[]): Promise<string> {
-    if (!this._isReady) {
-      throw new RnExecutorchError(
-        RnExecutorchErrorCode.ModuleNotLoaded,
-        'The model is currently not loaded.'
-      );
-    }
-    if (!this.isMultimodal_) {
-      throw new RnExecutorchError(
-        RnExecutorchErrorCode.InvalidUserInput,
-        'generateMultimodal() requires a multimodal model.'
-      );
-    }
-    if (this._isGenerating) {
-      throw new RnExecutorchError(
-        RnExecutorchErrorCode.ModelGenerating,
-        'The model is currently generating.'
-      );
-    }
-    try {
-      this.isGeneratingCallback(true);
-      const response = await this.nativeModule.generateMultimodal(
-        messages,
-        this.onToken
-      );
-      return response;
-    } catch (e) {
-      throw parseUnknownError(e);
-    } finally {
-      this.isGeneratingCallback(false);
-    }
-  }
-
   public interrupt() {
     if (!this.nativeModule) {
       throw new RnExecutorchError(
@@ -353,7 +319,17 @@ export class LLMController {
 
     if (updatedHistory.some((m) => m.mediaPath)) {
       // Any message in history has media — use multimodal path
-      response = await this.generateMultimodal(updatedHistory);
+      try {
+        this.isGeneratingCallback(true);
+        response = await this.nativeModule.generateMultimodal(
+          updatedHistory,
+          this.onToken
+        );
+      } catch (e) {
+        throw parseUnknownError(e);
+      } finally {
+        this.isGeneratingCallback(false);
+      }
     } else {
       const countTokensCallback = (messages: Message[]) => {
         const rendered = this.applyChatTemplate(
diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
index deabbbbb0..502b06f1d 100644
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
@@ -53,7 +53,6 @@ export const useLLM = ({ model, preventLoad = false }: LLMProps): LLMType => {
           tokenizerSource: model.tokenizerSource,
           tokenizerConfigSource: model.tokenizerConfigSource,
           onDownloadProgressCallback: setDownloadProgress,
-          isMultimodal: model.isMultimodal,
         });
       } catch (e) {
         setError(parseUnknownError(e));
@@ -70,7 +69,6 @@ export const useLLM = ({ model, preventLoad = false }: LLMProps): LLMType => {
     model.modelSource,
     model.tokenizerSource,
     model.tokenizerConfigSource,
-    model.isMultimodal,
     preventLoad,
   ]);
 
diff --git a/packages/react-native-executorch/src/types/llm.ts b/packages/react-native-executorch/src/types/llm.ts
index 0c648c25d..df0b3a06d 100644
--- a/packages/react-native-executorch/src/types/llm.ts
+++ b/packages/react-native-executorch/src/types/llm.ts
@@ -20,11 +20,6 @@ export interface LLMProps {
      * `ResourceSource` pointing to the JSON file which contains the tokenizer config.
      */
     tokenizerConfigSource?: ResourceSource;
-    /**
-     * Set to `true` when loading a vision-language (multimodal) model.
-     * Skips tokenizer config fetching and enables multimodal message handling via `sendMessage`.
-     */
-    isMultimodal?: boolean;
   };
   /**
    * Boolean that can prevent automatic model loading (and downloading the data if you load it for the first time) after running the hook.

From 87fa1f0b4773dc01c08405427e8f5c7ad346f8e9 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 2 Mar 2026 15:40:12 +0100
Subject: [PATCH 17/46] fix: make tokenizerConfigSource required throughout
 load pipeline

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../react-native-executorch/src/controllers/LLMController.ts  | 4 ++--
 packages/react-native-executorch/src/types/llm.ts             | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts
index 43852e09d..925ad2420 100644
--- a/packages/react-native-executorch/src/controllers/LLMController.ts
+++ b/packages/react-native-executorch/src/controllers/LLMController.ts
@@ -78,7 +78,7 @@ export class LLMController {
   }: {
     modelSource: ResourceSource;
     tokenizerSource: ResourceSource;
-    tokenizerConfigSource?: ResourceSource;
+    tokenizerConfigSource: ResourceSource;
     onDownloadProgressCallback?: (downloadProgress: number) => void;
   }) {
     // reset inner state when loading new model
@@ -90,7 +90,7 @@ export class LLMController {
       const tokenizersPromise = ResourceFetcher.fetch(
         undefined,
         tokenizerSource,
-        ...(tokenizerConfigSource ? [tokenizerConfigSource] : [])
+        tokenizerConfigSource
       );
 
       const modelPromise = ResourceFetcher.fetch(
diff --git a/packages/react-native-executorch/src/types/llm.ts b/packages/react-native-executorch/src/types/llm.ts
index df0b3a06d..c660e3a7d 100644
--- a/packages/react-native-executorch/src/types/llm.ts
+++ b/packages/react-native-executorch/src/types/llm.ts
@@ -19,7 +19,7 @@ export interface LLMProps {
     /**
      * `ResourceSource` pointing to the JSON file which contains the tokenizer config.
      */
-    tokenizerConfigSource?: ResourceSource;
+    tokenizerConfigSource: ResourceSource;
   };
   /**
    * Boolean that can prevent automatic model loading (and downloading the data if you load it for the first time) after running the hook.

From b39895288116c3e74182858a075b84ac3ffe61f6 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 2 Mar 2026 15:41:59 +0100
Subject: [PATCH 18/46] fix: prepend system prompt to multimodal history before
 generateMultimodal

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../src/controllers/LLMController.ts                        | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts
index 925ad2420..bd76f5e4e 100644
--- a/packages/react-native-executorch/src/controllers/LLMController.ts
+++ b/packages/react-native-executorch/src/controllers/LLMController.ts
@@ -319,10 +319,14 @@ export class LLMController {
 
     if (updatedHistory.some((m) => m.mediaPath)) {
       // Any message in history has media — use multimodal path
+      const historyWithSystemPrompt = [
+        { content: this.chatConfig.systemPrompt, role: 'system' as const },
+        ...updatedHistory,
+      ];
       try {
         this.isGeneratingCallback(true);
         response = await this.nativeModule.generateMultimodal(
-          updatedHistory,
+          historyWithSystemPrompt,
           this.onToken
         );
       } catch (e) {

From a0b80e38ae4937677d87e941b1c02aae15d645ac Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 2 Mar 2026 15:56:55 +0100
Subject: [PATCH 19/46] =?UTF-8?q?refactor:=20unify=20generate=20=E2=80=94?=
 =?UTF-8?q?=20Jinja=20renders=20prompt+<image>=20tokens=20in=20JS,=20C++?=
 =?UTF-8?q?=20splits=20on=20placeholder?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../host_objects/JsiConversions.h             |  30 -----
 .../host_objects/ModelHostObject.h            |  11 +-
 .../common/rnexecutorch/models/llm/LLM.cpp    | 103 +++++-------------
 .../common/rnexecutorch/models/llm/LLM.h      |  15 +--
 .../src/controllers/LLMController.ts          |  95 +++++++++-------
 5 files changed, 92 insertions(+), 162 deletions(-)

diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
index 08acf6cff..df9abbdef 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
@@ -228,36 +228,6 @@ getValue<std::vector<uint64_t>>(const jsi::Value &val, jsi::Runtime &runtime) {
   return getArrayAsVector<uint64_t>(val, runtime);
 }
 
-struct NativeMessage {
-  std::string role; // "user" | "assistant" | "system"
-  std::string content;
-  std::string mediaPath; // empty string if no media
-};
-
-template <>
-inline std::vector<NativeMessage>
-getValue<std::vector<NativeMessage>>(const jsi::Value &val,
-                                     jsi::Runtime &runtime) {
-  jsi::Array array = val.asObject(runtime).asArray(runtime);
-  size_t length = array.size(runtime);
-  std::vector<NativeMessage> result;
-  result.reserve(length);
-  for (size_t i = 0; i < length; ++i) {
-    jsi::Object obj = array.getValueAtIndex(runtime, i).asObject(runtime);
-    NativeMessage msg;
-    msg.role =
-        obj.getProperty(runtime, "role").getString(runtime).utf8(runtime);
-    msg.content =
-        obj.getProperty(runtime, "content").getString(runtime).utf8(runtime);
-    auto mediaProp = obj.getProperty(runtime, "mediaPath");
-    if (!mediaProp.isUndefined() && !mediaProp.isNull()) {
-      msg.mediaPath = mediaProp.getString(runtime).utf8(runtime);
-    }
-    result.push_back(std::move(msg));
-  }
-  return result;
-}
-
 // Template specializations for std::span<T> types
 template <>
 inline std::span<float> getValue<std::span<float>>(const jsi::Value &val,
diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
index 2c7a3e535..a4af6eb8f 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
@@ -156,14 +156,9 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
       addFunctions(JSI_EXPORT_FUNCTION(
           ModelHostObject<Model>,
           promiseHostFunction<static_cast<std::string (Model::*)(
-              std::string, std::string, std::shared_ptr<jsi::Function>)>(
-              &Model::generate)>,
-          "generateWithImage"));
-
-      addFunctions(
-          JSI_EXPORT_FUNCTION(ModelHostObject<Model>,
-                              promiseHostFunction<&Model::generateMultimodal>,
-                              "generateMultimodal"));
+              std::string, std::vector<std::string>,
+              std::shared_ptr<jsi::Function>)>(&Model::generate)>,
+          "generateMultimodal"));
     }
 
     if constexpr (meta::SameAs<Model, models::text_to_image::TextToImage>) {
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
index 2210a5cf8..b26cd3b20 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
@@ -18,15 +18,6 @@ using executorch::runtime::Error;
 static constexpr int kImageSize = 512;
 static constexpr int kImageChannels = 3;
 
-// LFM2-VL chat template
-static constexpr const char *kChatPrefix = "<|startoftext|><|im_start|>user\n";
-static constexpr const char *kChatSuffix =
-    "<|im_end|>\n<|im_start|>assistant\n";
-// Separator inserted after each assistant turn in multi-turn conversations
-static constexpr const char *kAssistantTurnEnd = "<|im_end|>\n";
-// Prefix for subsequent user turns (no BOS token — only first turn has it)
-static constexpr const char *kUserTurnPrefix = "<|im_start|>user\n";
-
 static llm::Image loadImageForVLM(const std::string &imagePath) {
   cv::Mat mat = image_processing::readImage(imagePath);
   cv::resize(mat, mat, cv::Size(kImageSize, kImageSize));
@@ -106,7 +97,8 @@ std::string LLM::generate(std::string input,
   return output;
 }
 
-std::string LLM::generate(std::string imagePath, std::string prompt,
+std::string LLM::generate(std::string prompt,
+                          std::vector<std::string> imagePaths,
                           std::shared_ptr<jsi::Function> callback) {
   if (!runner_ || !runner_->is_loaded()) {
     throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
@@ -118,77 +110,34 @@ std::string LLM::generate(std::string imagePath, std::string prompt,
         "This is a text-only model. Call generate(prompt, cb).");
   }
 
-  llm::Image image = loadImageForVLM(imagePath);
-  std::vector<llm::MultimodalInput> inputs = {
-      llm::make_text_input(std::string(kChatPrefix)),
-      llm::make_image_input(std::move(image)),
-      llm::make_text_input(prompt + kChatSuffix),
-  };
-
-  std::string output;
-  auto nativeCallback = [this, &callback, &output](const std::string &token) {
-    output += token;
-    if (callback && callInvoker) {
-      callInvoker->invokeAsync([callback, token](jsi::Runtime &runtime) {
-        callback->call(runtime, jsi::String::createFromUtf8(runtime, token));
-      });
-    }
-  };
-
-  auto error =
-      runner_->generate(inputs, temperature_, topp_, -1, nativeCallback);
-  if (error != Error::Ok) {
-    throw RnExecutorchError(error, "Failed to generate multimodal response");
-  }
-
-  return output;
-}
-
-std::string LLM::generateMultimodal(
-    std::vector<rnexecutorch::jsi_conversion::NativeMessage> messages,
-    std::shared_ptr<jsi::Function> callback) {
-  if (!runner_ || !runner_->is_loaded()) {
-    throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
-                            "Runner is not loaded");
-  }
-  if (!multimodal_) {
-    throw RnExecutorchError(
-        RnExecutorchErrorCode::InvalidUserInput,
-        "This is a text-only model. Use generate(prompt, cb) instead.");
-  }
+  // Split rendered prompt on "<image>" placeholders and interleave with images.
+  static constexpr const char *kImageToken = "<image>";
+  static constexpr size_t kImageTokenLen = 7; // strlen("<image>")
 
   std::vector<llm::MultimodalInput> inputs;
-  bool isFirst = true;
-
-  for (const auto &msg : messages) {
-    if (msg.role == "system") {
-      // LFM2-VL has no dedicated system turn — skip silently, consistent
-      // with the single-turn generate(imagePath, prompt, cb) path.
-      continue;
-    }
-
-    if (msg.role == "user") {
-      if (isFirst) {
-        inputs.push_back(llm::make_text_input(std::string(kChatPrefix)));
-        isFirst = false;
-      } else {
-        inputs.push_back(llm::make_text_input(std::string(kUserTurnPrefix)));
-      }
-
-      if (!msg.mediaPath.empty()) {
-        const llm::Image &img = getOrLoadImage(msg.mediaPath);
-        inputs.push_back(llm::make_image_input(img));
+  size_t imageIdx = 0;
+  size_t searchPos = 0;
+
+  while (true) {
+    size_t found = prompt.find(kImageToken, searchPos);
+    if (found == std::string::npos) {
+      // Remaining text after last image (or entire prompt if no images)
+      if (searchPos < prompt.size()) {
+        inputs.push_back(llm::make_text_input(prompt.substr(searchPos)));
       }
-
-      if (!msg.content.empty()) {
-        inputs.push_back(llm::make_text_input(msg.content));
-      }
-
-      inputs.push_back(llm::make_text_input(std::string(kChatSuffix)));
-    } else if (msg.role == "assistant") {
-      inputs.push_back(llm::make_text_input(msg.content + kAssistantTurnEnd));
-      isFirst = false;
+      break;
+    }
+    // Text segment before this placeholder
+    if (found > searchPos) {
+      inputs.push_back(
+          llm::make_text_input(prompt.substr(searchPos, found - searchPos)));
+    }
+    // Image at this position
+    if (imageIdx < imagePaths.size()) {
+      const llm::Image &img = getOrLoadImage(imagePaths[imageIdx++]);
+      inputs.push_back(llm::make_image_input(img));
     }
+    searchPos = found + kImageTokenLen;
   }
 
   if (inputs.empty()) {
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
index 11f8c5e06..6e47dbed0 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
@@ -5,7 +5,6 @@
 
 #include <ReactCommon/CallInvoker.h>
 #include <jsi/jsi.h>
-#include <rnexecutorch/host_objects/JsiConversions.h>
 #include <rnexecutorch/models/BaseModel.h>
 #include <runner/image.h>
 #include <runner/unified_runner.h>
@@ -20,19 +19,15 @@ class LLM : public BaseModel {
                const std::string &tokenizerSource,
                std::shared_ptr<react::CallInvoker> callInvoker);
 
-  // Text-only generate (existing signature — used by LLMController)
-  std::string generate(std::string input,
+  // Text-only: pre-rendered prompt string
+  std::string generate(std::string prompt,
                        std::shared_ptr<jsi::Function> callback);
 
-  // Multimodal generate (image + text prompt)
-  std::string generate(std::string imagePath, std::string prompt,
+  // Multimodal: pre-rendered prompt string with <image> placeholders +
+  // ordered list of image paths (one per placeholder)
+  std::string generate(std::string prompt, std::vector<std::string> imagePaths,
                        std::shared_ptr<jsi::Function> callback);
 
-  // Multimodal generate — takes full message history, builds MultimodalInput[]
-  std::string generateMultimodal(
-      std::vector<rnexecutorch::jsi_conversion::NativeMessage> messages,
-      std::shared_ptr<jsi::Function> callback);
-
   void interrupt();
   void reset();
   void unload() noexcept;
diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts
index bd76f5e4e..7062d78cd 100644
--- a/packages/react-native-executorch/src/controllers/LLMController.ts
+++ b/packages/react-native-executorch/src/controllers/LLMController.ts
@@ -211,7 +211,7 @@ export class LLMController {
     this.isGeneratingCallback(false);
   }
 
-  public async forward(input: string): Promise<string> {
+  public async forward(input: string, imagePaths?: string[]): Promise<string> {
     if (!this._isReady) {
       throw new RnExecutorchError(
         RnExecutorchErrorCode.ModuleNotLoaded,
@@ -227,7 +227,14 @@ export class LLMController {
     try {
       this.isGeneratingCallback(true);
       this.nativeModule.reset();
-      const response = await this.nativeModule.generate(input, this.onToken);
+      const response =
+        imagePaths && imagePaths.length > 0
+          ? await this.nativeModule.generateMultimodal(
+              input,
+              imagePaths,
+              this.onToken
+            )
+          : await this.nativeModule.generate(input, this.onToken);
       return this.filterSpecialTokens(response);
     } catch (e) {
       throw parseUnknownError(e);
@@ -317,42 +324,56 @@ export class LLMController {
 
     let response: string;
 
-    if (updatedHistory.some((m) => m.mediaPath)) {
-      // Any message in history has media — use multimodal path
-      const historyWithSystemPrompt = [
-        { content: this.chatConfig.systemPrompt, role: 'system' as const },
-        ...updatedHistory,
-      ];
-      try {
-        this.isGeneratingCallback(true);
-        response = await this.nativeModule.generateMultimodal(
-          historyWithSystemPrompt,
-          this.onToken
-        );
-      } catch (e) {
-        throw parseUnknownError(e);
-      } finally {
-        this.isGeneratingCallback(false);
-      }
+    const isMultimodal = updatedHistory.some((m) => m.mediaPath);
+
+    // For multimodal messages, convert mediaPath into structured content so
+    // the chat template emits <image> placeholders in the right position.
+    const historyForTemplate = isMultimodal
+      ? updatedHistory.map((m) =>
+          m.mediaPath
+            ? {
+                ...m,
+                content: [
+                  { type: 'image' },
+                  { type: 'text', text: m.content },
+                ] as any,
+              }
+            : m
+        )
+      : updatedHistory;
+
+    const countTokensCallback = (messages: Message[]) => {
+      const rendered = this.applyChatTemplate(
+        messages,
+        this.tokenizerConfig,
+        this.toolsConfig?.tools,
+        // eslint-disable-next-line camelcase
+        { tools_in_user_message: false, add_generation_prompt: true }
+      );
+      return this.nativeModule.countTextTokens(rendered);
+    };
+    const maxContextLength = this.nativeModule.getMaxContextLength();
+    const messageHistoryWithPrompt =
+      this.chatConfig.contextStrategy.buildContext(
+        this.chatConfig.systemPrompt,
+        historyForTemplate,
+        maxContextLength,
+        countTokensCallback
+      );
+
+    if (isMultimodal) {
+      const renderedPrompt = this.applyChatTemplate(
+        messageHistoryWithPrompt,
+        this.tokenizerConfig,
+        undefined,
+        // eslint-disable-next-line camelcase
+        { tools_in_user_message: false, add_generation_prompt: true }
+      );
+      const imagePaths = updatedHistory
+        .filter((m) => m.mediaPath)
+        .map((m) => m.mediaPath!);
+      response = await this.forward(renderedPrompt, imagePaths);
     } else {
-      const countTokensCallback = (messages: Message[]) => {
-        const rendered = this.applyChatTemplate(
-          messages,
-          this.tokenizerConfig,
-          this.toolsConfig?.tools,
-          // eslint-disable-next-line camelcase
-          { tools_in_user_message: false, add_generation_prompt: true }
-        );
-        return this.nativeModule.countTextTokens(rendered);
-      };
-      const maxContextLength = this.nativeModule.getMaxContextLength();
-      const messageHistoryWithPrompt =
-        this.chatConfig.contextStrategy.buildContext(
-          this.chatConfig.systemPrompt,
-          updatedHistory,
-          maxContextLength,
-          countTokensCallback
-        );
       response = await this.generate(
         messageHistoryWithPrompt,
         this.toolsConfig?.tools

From 13f631e618e1d812c1ca6fd4231f4c389a6b0e22 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 2 Mar 2026 16:04:19 +0100
Subject: [PATCH 20/46] fix: collect imagePaths from messageHistoryWithPrompt,
 not full history

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../react-native-executorch/src/controllers/LLMController.ts    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts
index 7062d78cd..7e3b8baf2 100644
--- a/packages/react-native-executorch/src/controllers/LLMController.ts
+++ b/packages/react-native-executorch/src/controllers/LLMController.ts
@@ -369,7 +369,7 @@ export class LLMController {
         // eslint-disable-next-line camelcase
         { tools_in_user_message: false, add_generation_prompt: true }
       );
-      const imagePaths = updatedHistory
+      const imagePaths = messageHistoryWithPrompt
         .filter((m) => m.mediaPath)
         .map((m) => m.mediaPath!);
       response = await this.forward(renderedPrompt, imagePaths);

From 76f9c7c035ed0cc005399bf71e9beffdf48e6f52 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 2 Mar 2026 16:07:36 +0100
Subject: [PATCH 21/46] fix: typing - make LLMProps.tokenizerConfigSource
 optional again and assert it non-null in useLLM

---
 .../src/hooks/natural_language_processing/useLLM.ts             | 2 +-
 packages/react-native-executorch/src/types/llm.ts               | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
index 502b06f1d..99210e357 100644
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
@@ -51,7 +51,7 @@ export const useLLM = ({ model, preventLoad = false }: LLMProps): LLMType => {
         await controllerInstance.load({
           modelSource: model.modelSource,
           tokenizerSource: model.tokenizerSource,
-          tokenizerConfigSource: model.tokenizerConfigSource,
+          tokenizerConfigSource: model.tokenizerConfigSource!,
           onDownloadProgressCallback: setDownloadProgress,
         });
       } catch (e) {
diff --git a/packages/react-native-executorch/src/types/llm.ts b/packages/react-native-executorch/src/types/llm.ts
index c660e3a7d..df0b3a06d 100644
--- a/packages/react-native-executorch/src/types/llm.ts
+++ b/packages/react-native-executorch/src/types/llm.ts
@@ -19,7 +19,7 @@ export interface LLMProps {
     /**
      * `ResourceSource` pointing to the JSON file which contains the tokenizer config.
      */
-    tokenizerConfigSource: ResourceSource;
+    tokenizerConfigSource?: ResourceSource;
   };
   /**
    * Boolean that can prevent automatic model loading (and downloading the data if you load it for the first time) after running the hook.

From ab8c088dc16cf9b47fff6f9583f4951a88a94d6b Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 2 Mar 2026 17:00:40 +0100
Subject: [PATCH 22/46] feat: correctly calculate image tokens

---
 .../common/runner/multimodal_prefiller.cpp                   | 1 -
 .../react-native-executorch/src/controllers/LLMController.ts | 5 ++++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp b/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp
index c39c7cc0f..098763550 100644
--- a/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp
+++ b/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp
@@ -11,7 +11,6 @@
 
 #include "multimodal_prefiller.h"
 #include "constants.h"
-#include "util.h"
 
 namespace executorch {
 namespace extension {
diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts
index 7e3b8baf2..7e9a37c86 100644
--- a/packages/react-native-executorch/src/controllers/LLMController.ts
+++ b/packages/react-native-executorch/src/controllers/LLMController.ts
@@ -342,6 +342,7 @@ export class LLMController {
         )
       : updatedHistory;
 
+    const IMAGE_VISUAL_TOKENS = 256;
     const countTokensCallback = (messages: Message[]) => {
       const rendered = this.applyChatTemplate(
         messages,
@@ -350,7 +351,9 @@ export class LLMController {
         // eslint-disable-next-line camelcase
         { tools_in_user_message: false, add_generation_prompt: true }
       );
-      return this.nativeModule.countTextTokens(rendered);
+      const textTokens = this.nativeModule.countTextTokens(rendered);
+      const imageCount = messages.filter((m) => m.mediaPath).length;
+      return textTokens + imageCount * (IMAGE_VISUAL_TOKENS - 1);
     };
     const maxContextLength = this.nativeModule.getMaxContextLength();
     const messageHistoryWithPrompt =

From c211ba9519f9cac177bcc0ddf01ca299eeeef14e Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 2 Mar 2026 17:01:57 +0100
Subject: [PATCH 23/46] fix: restore missing util.h include in
 multimodal_prefiller.cpp

---
 .../common/runner/multimodal_prefiller.cpp                       | 1 +
 1 file changed, 1 insertion(+)

diff --git a/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp b/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp
index 098763550..c39c7cc0f 100644
--- a/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp
+++ b/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp
@@ -11,6 +11,7 @@
 
 #include "multimodal_prefiller.h"
 #include "constants.h"
+#include "util.h"
 
 namespace executorch {
 namespace extension {

From 0e29349e838a349e436a8412ab319ad4a11c8531 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 2 Mar 2026 17:20:31 +0100
Subject: [PATCH 24/46] fix: fall back to max_seq_len when model doesn't export
 max_context_len

---
 .../common/runner/unified_runner.cpp                        | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/packages/react-native-executorch/common/runner/unified_runner.cpp b/packages/react-native-executorch/common/runner/unified_runner.cpp
index a136835a3..12b431e8b 100644
--- a/packages/react-native-executorch/common/runner/unified_runner.cpp
+++ b/packages/react-native-executorch/common/runner/unified_runner.cpp
@@ -73,9 +73,11 @@ Error UnifiedRunner::load() {
 
   if (config_.max_seq_len < 0)
     config_.max_seq_len = static_cast<int32_t>(metadata_.at(kMaxSeqLen));
-  if (config_.max_context_length < 0)
+  if (config_.max_context_length < 0) {
+    auto ctx = metadata_.at(kMaxContextLen);
     config_.max_context_length =
-        static_cast<int32_t>(metadata_.at(kMaxContextLen));
+        static_cast<int32_t>(ctx > 128 ? ctx : metadata_.at(kMaxSeqLen));
+  }
   if (config_.max_new_tokens < 0)
     config_.max_new_tokens =
         std::min(config_.max_seq_len, config_.max_context_length);

From 520233f0d889ae69bb9f48ac33a0dd8d961beae8 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 2 Mar 2026 17:25:31 +0100
Subject: [PATCH 25/46] =?UTF-8?q?fix:=20address=20code=20review=20?=
 =?UTF-8?q?=E2=80=94=20error=20on=20image/placeholder=20mismatch,=20remove?=
 =?UTF-8?q?=20double=20reset,=20fix=20max=5Fcontext=5Flen=20fallback,=20re?=
 =?UTF-8?q?quire=20tokenizerConfigSource,=20pass=20tools=20in=20multimodal?=
 =?UTF-8?q?=20branch,=20capture=20callback=20by=20value?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../common/rnexecutorch/models/llm/LLM.cpp           | 12 +++++++-----
 .../common/runner/unified_runner.cpp                 |  5 +++--
 .../src/controllers/LLMController.ts                 |  2 +-
 packages/react-native-executorch/src/types/llm.ts    |  2 +-
 4 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
index b26cd3b20..acccedbd0 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
@@ -133,10 +133,13 @@ std::string LLM::generate(std::string prompt,
           llm::make_text_input(prompt.substr(searchPos, found - searchPos)));
     }
     // Image at this position
-    if (imageIdx < imagePaths.size()) {
-      const llm::Image &img = getOrLoadImage(imagePaths[imageIdx++]);
-      inputs.push_back(llm::make_image_input(img));
+    if (imageIdx >= imagePaths.size()) {
+      throw RnExecutorchError(
+          RnExecutorchErrorCode::InvalidUserInput,
+          "More <image> placeholders in prompt than image paths provided");
     }
+    const llm::Image &img = getOrLoadImage(imagePaths[imageIdx++]);
+    inputs.push_back(llm::make_image_input(img));
     searchPos = found + kImageTokenLen;
   }
 
@@ -146,7 +149,7 @@ std::string LLM::generate(std::string prompt,
   }
 
   std::string output;
-  auto nativeCallback = [this, &callback, &output](const std::string &token) {
+  auto nativeCallback = [this, callback, &output](const std::string &token) {
     output += token;
     if (callback && callInvoker) {
       callInvoker->invokeAsync([callback, token](jsi::Runtime &runtime) {
@@ -155,7 +158,6 @@ std::string LLM::generate(std::string prompt,
     }
   };
 
-  runner_->reset();
   auto error =
       runner_->generate(inputs, temperature_, topp_, -1, nativeCallback);
   if (error != Error::Ok) {
diff --git a/packages/react-native-executorch/common/runner/unified_runner.cpp b/packages/react-native-executorch/common/runner/unified_runner.cpp
index 12b431e8b..6f4238ca6 100644
--- a/packages/react-native-executorch/common/runner/unified_runner.cpp
+++ b/packages/react-native-executorch/common/runner/unified_runner.cpp
@@ -74,9 +74,10 @@ Error UnifiedRunner::load() {
   if (config_.max_seq_len < 0)
     config_.max_seq_len = static_cast<int32_t>(metadata_.at(kMaxSeqLen));
   if (config_.max_context_length < 0) {
-    auto ctx = metadata_.at(kMaxContextLen);
     config_.max_context_length =
-        static_cast<int32_t>(ctx > 128 ? ctx : metadata_.at(kMaxSeqLen));
+        method_names.count(kMaxContextLen)
+            ? static_cast<int32_t>(metadata_.at(kMaxContextLen))
+            : static_cast<int32_t>(metadata_.at(kMaxSeqLen));
   }
   if (config_.max_new_tokens < 0)
     config_.max_new_tokens =
diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts
index 7e9a37c86..180be9677 100644
--- a/packages/react-native-executorch/src/controllers/LLMController.ts
+++ b/packages/react-native-executorch/src/controllers/LLMController.ts
@@ -368,7 +368,7 @@ export class LLMController {
       const renderedPrompt = this.applyChatTemplate(
         messageHistoryWithPrompt,
         this.tokenizerConfig,
-        undefined,
+        this.toolsConfig?.tools,
         // eslint-disable-next-line camelcase
         { tools_in_user_message: false, add_generation_prompt: true }
       );
diff --git a/packages/react-native-executorch/src/types/llm.ts b/packages/react-native-executorch/src/types/llm.ts
index df0b3a06d..c660e3a7d 100644
--- a/packages/react-native-executorch/src/types/llm.ts
+++ b/packages/react-native-executorch/src/types/llm.ts
@@ -19,7 +19,7 @@ export interface LLMProps {
     /**
      * `ResourceSource` pointing to the JSON file which contains the tokenizer config.
      */
-    tokenizerConfigSource?: ResourceSource;
+    tokenizerConfigSource: ResourceSource;
   };
   /**
    * Boolean that can prevent automatic model loading (and downloading the data if you load it for the first time) after running the hook.

From dfd1a811d38f9dc2acaddbe89b1a4893a668ced9 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Mon, 2 Mar 2026 17:42:04 +0100
Subject: [PATCH 26/46] feat: dynamic sendMessage type based on flag

---
 apps/llm/app/multimodal_llm/index.tsx         | 22 ++++----
 .../natural_language_processing/useLLM.ts     | 14 ++++-
 .../react-native-executorch/src/types/llm.ts  | 55 ++++++++++++++-----
 3 files changed, 63 insertions(+), 28 deletions(-)

diff --git a/apps/llm/app/multimodal_llm/index.tsx b/apps/llm/app/multimodal_llm/index.tsx
index cf9a3f4e6..3178b602b 100644
--- a/apps/llm/app/multimodal_llm/index.tsx
+++ b/apps/llm/app/multimodal_llm/index.tsx
@@ -21,13 +21,6 @@ import Messages from '../../components/Messages';
 import Spinner from '../../components/Spinner';
 import { GeneratingContext } from '../../context';
 
-const MODEL_SOURCE =
-  'https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/lfm2p5_vl_1.6B_quantized_xnnpack.pte';
-const TOKENIZER_SOURCE =
-  'https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_2.5.json';
-const TOKENIZER_CONFIG_SOURCE =
-  'https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_config_2_5.json';
-
 export default function MultimodalLLMScreenWrapper() {
   const isFocused = useIsFocused();
   return isFocused ? <MultimodalLLMScreen /> : null;
@@ -42,9 +35,13 @@ function MultimodalLLMScreen() {
 
   const vlm = useLLM({
     model: {
-      modelSource: MODEL_SOURCE,
-      tokenizerSource: TOKENIZER_SOURCE,
-      tokenizerConfigSource: TOKENIZER_CONFIG_SOURCE,
+      isMultimodal: true,
+      modelSource:
+        'https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/lfm2p5_vl_1.6B_quantized_xnnpack.pte',
+      tokenizerSource:
+        'https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_2.5.json',
+      tokenizerConfigSource:
+        'https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_config_2_5.json',
     },
   });
 
@@ -70,9 +67,10 @@ function MultimodalLLMScreen() {
     setUserInput('');
     textInputRef.current?.clear();
     Keyboard.dismiss();
+    const currentImageUri = imageUri;
+    setImageUri(null);
     try {
-      await vlm.sendMessage(text, imageUri ?? undefined);
-      setImageUri(null);
+      await vlm.sendMessage(text, currentImageUri ?? undefined);
     } catch (e) {
       console.error('Generation error:', e);
     }
diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
index 99210e357..9846bbff7 100644
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
@@ -4,6 +4,7 @@ import {
   LLMProps,
   LLMTool,
   LLMType,
+  LLMTypeMultimodal,
   Message,
 } from '../../types/llm';
 import { LLMController } from '../../controllers/LLMController';
@@ -14,9 +15,16 @@ import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils';
  *
  * @category Hooks
  * @param model - Object containing model, tokenizer, and tokenizer config sources.
- * @returns An object implementing the `LLMType` interface for interacting with the LLM.
+ * @returns An object implementing the `LLMTypeMultimodal` interface when `model.isMultimodal` is `true`, otherwise `LLMType`.
  */
-export const useLLM = ({ model, preventLoad = false }: LLMProps): LLMType => {
+export function useLLM(
+  props: LLMProps & { model: { isMultimodal: true } }
+): LLMTypeMultimodal;
+export function useLLM(props: LLMProps): LLMType;
+export function useLLM({
+  model,
+  preventLoad = false,
+}: LLMProps): LLMType | LLMTypeMultimodal {
   const [token, setToken] = useState<string>('');
   const [response, setResponse] = useState<string>('');
   const [messageHistory, setMessageHistory] = useState<Message[]>([]);
@@ -141,4 +149,4 @@ export const useLLM = ({ model, preventLoad = false }: LLMProps): LLMType => {
     deleteMessage: deleteMessage,
     interrupt: interrupt,
   };
-};
+}
diff --git a/packages/react-native-executorch/src/types/llm.ts b/packages/react-native-executorch/src/types/llm.ts
index c660e3a7d..1429b7c84 100644
--- a/packages/react-native-executorch/src/types/llm.ts
+++ b/packages/react-native-executorch/src/types/llm.ts
@@ -20,6 +20,10 @@ export interface LLMProps {
      * `ResourceSource` pointing to the JSON file which contains the tokenizer config.
      */
     tokenizerConfigSource: ResourceSource;
+    /**
+     * Set to `true` for vision-language models that accept image inputs via `sendMessage`.
+     */
+    isMultimodal?: boolean;
   };
   /**
    * Boolean that can prevent automatic model loading (and downloading the data if you load it for the first time) after running the hook.
@@ -28,11 +32,11 @@ export interface LLMProps {
 }
 
 /**
- * React hook for managing a Large Language Model (LLM) instance.
+ * Base return type for `useLLM`. Contains all fields except `sendMessage`.
  *
  * @category Types
  */
-export interface LLMType {
+export interface LLMTypeBase {
   /**
    * History containing all messages in conversation. This field is updated after model responds to sendMessage.
    */
@@ -91,7 +95,7 @@ export interface LLMType {
    */
   generate: (messages: Message[], tools?: LLMTool[]) => Promise<string>;
   /**
-   * Returns the number of total tokens from the previous generation.This is a sum of prompt tokens and generated tokens.
+   * Returns the number of total tokens from the previous generation. This is a sum of prompt tokens and generated tokens.
    *
    * @returns The count of prompt and generated tokens.
    */
@@ -103,28 +107,53 @@ export interface LLMType {
    */
   getPromptTokenCount: () => number;
 
+  /**
+   * Deletes all messages starting from the message at position `index`. After deletion, `messageHistory` will be updated.
+   *
+   * @param index - The index of the message to delete from history.
+   */
+  deleteMessage: (index: number) => void;
+
+  /**
+   * Function to interrupt the current inference.
+   */
+  interrupt: () => void;
+}
+
+/**
+ * Return type for `useLLM` when `model.isMultimodal` is `true`.
+ * `sendMessage` accepts an optional `mediaPath` argument for image inputs.
+ *
+ * @category Types
+ */
+export interface LLMTypeMultimodal extends LLMTypeBase {
   /**
    * Function to add user message to conversation.
-   * Pass `mediaPath` for a multimodal message (image, audio, etc.).
+   * Pass `mediaPath` with a local image path to send a multimodal message.
    * After model responds, `messageHistory` will be updated.
    *
    * @param message - The message string to send.
-   * @param mediaPath - Optional local file path to media.
+   * @param mediaPath - Optional local file path to an image.
    * @returns The model's response as a `string`.
    */
   sendMessage: (message: string, mediaPath?: string) => Promise<string>;
+}
 
+/**
+ * Return type for `useLLM` when `model.isMultimodal` is absent or `false`.
+ * `sendMessage` accepts only text.
+ *
+ * @category Types
+ */
+export interface LLMType extends LLMTypeBase {
   /**
-   * Deletes all messages starting with message on `index` position. After deletion `messageHistory` will be updated.
+   * Function to add user message to conversation.
+   * After model responds, `messageHistory` will be updated.
    *
-   * @param index - The index of the message to delete from history.
-   */
-  deleteMessage: (index: number) => void;
-
-  /**
-   * Function to interrupt the current inference.
+   * @param message - The message string to send.
+   * @returns The model's response as a `string`.
    */
-  interrupt: () => void;
+  sendMessage: (message: string) => Promise<string>;
 }
 
 /**

From 3d67b669c610a57d6db6107a7f34515a7bee4bef Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Tue, 3 Mar 2026 10:32:00 +0100
Subject: [PATCH 27/46] fix: model stopping generation in the middle of its
 answer

---
 .../common/runner/unified_runner.cpp                       | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/packages/react-native-executorch/common/runner/unified_runner.cpp b/packages/react-native-executorch/common/runner/unified_runner.cpp
index 6f4238ca6..5dcdf127c 100644
--- a/packages/react-native-executorch/common/runner/unified_runner.cpp
+++ b/packages/react-native-executorch/common/runner/unified_runner.cpp
@@ -263,7 +263,6 @@ Error UnifiedRunner::generate(
 
   stats_.inference_start_ms = llm::time_in_ms();
 
-  int64_t pos_before_prefill = pos_;
   uint64_t prefill_next_token = 0;
   for (size_t i = 0; i < inputs.size(); ++i) {
     auto prefill_result = mm_prefiller_->prefill(inputs[i], pos_);
@@ -276,14 +275,10 @@ Error UnifiedRunner::generate(
   stats_.prompt_eval_end_ms = llm::time_in_ms();
   stats_.num_prompt_tokens = pos_;
 
-  int64_t context_len = metadata_.count(kMaxContextLen)
-                            ? metadata_.at(kMaxContextLen)
-                        : metadata_.count(kMaxSeqLen) ? metadata_.at(kMaxSeqLen)
-                                                      : 2048;
   int32_t resolved_max_new =
       max_new_tokens > 0
           ? max_new_tokens
-          : static_cast<int32_t>(context_len - pos_before_prefill);
+          : static_cast<int32_t>(config_.max_context_length - pos_);
   resolved_max_new = std::max(0, resolved_max_new);
 
   std::vector<uint64_t> seed_tokens = {prefill_next_token};

From 2b26c5dd152f6b9ee02bc51097e5baea7db24e8d Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Tue, 3 Mar 2026 11:44:12 +0100
Subject: [PATCH 28/46] feat: add LLMCapability type and parameterize
 LLMTypeMultimodal

---
 apps/llm/app/multimodal_llm/index.tsx         |  7 ++--
 .../natural_language_processing/useLLM.ts     |  9 ++---
 .../react-native-executorch/src/types/llm.ts  | 35 ++++++++++++++-----
 3 files changed, 36 insertions(+), 15 deletions(-)

diff --git a/apps/llm/app/multimodal_llm/index.tsx b/apps/llm/app/multimodal_llm/index.tsx
index 3178b602b..542af4740 100644
--- a/apps/llm/app/multimodal_llm/index.tsx
+++ b/apps/llm/app/multimodal_llm/index.tsx
@@ -35,7 +35,7 @@ function MultimodalLLMScreen() {
 
   const vlm = useLLM({
     model: {
-      isMultimodal: true,
+      capabilities: ['vision'] as const,
       modelSource:
         'https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/lfm2p5_vl_1.6B_quantized_xnnpack.pte',
       tokenizerSource:
@@ -70,7 +70,10 @@ function MultimodalLLMScreen() {
     const currentImageUri = imageUri;
     setImageUri(null);
     try {
-      await vlm.sendMessage(text, currentImageUri ?? undefined);
+      await vlm.sendMessage(
+        text,
+        currentImageUri ? { imagePath: currentImageUri } : undefined
+      );
     } catch (e) {
       console.error('Generation error:', e);
     }
diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
index 9846bbff7..570ae640d 100644
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
@@ -1,5 +1,6 @@
 import { useCallback, useEffect, useState } from 'react';
 import {
+  LLMCapability,
   LLMConfig,
   LLMProps,
   LLMTool,
@@ -15,11 +16,11 @@ import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils';
  *
  * @category Hooks
  * @param model - Object containing model, tokenizer, and tokenizer config sources.
- * @returns An object implementing the `LLMTypeMultimodal` interface when `model.isMultimodal` is `true`, otherwise `LLMType`.
+ * @returns An object implementing the `LLMTypeMultimodal` interface when `model.capabilities` is provided, otherwise `LLMType`.
  */
-export function useLLM(
-  props: LLMProps & { model: { isMultimodal: true } }
-): LLMTypeMultimodal;
+export function useLLM<C extends readonly LLMCapability[]>(
+  props: LLMProps & { model: { capabilities: C } }
+): LLMTypeMultimodal<C>;
 export function useLLM(props: LLMProps): LLMType;
 export function useLLM({
   model,
diff --git a/packages/react-native-executorch/src/types/llm.ts b/packages/react-native-executorch/src/types/llm.ts
index 1429b7c84..aea9817bb 100644
--- a/packages/react-native-executorch/src/types/llm.ts
+++ b/packages/react-native-executorch/src/types/llm.ts
@@ -1,6 +1,20 @@
 import { RnExecutorchError } from '../errors/errorUtils';
 import { ResourceSource } from './common';
 
+/**
+ * Capabilities a multimodal LLM can have.
+ * @category Types
+ */
+export type LLMCapability = 'vision' | 'audio';
+
+/**
+ * Derives the media argument shape for `sendMessage` from a capabilities tuple.
+ * @category Types
+ */
+export type MediaArg<C extends readonly LLMCapability[]> =
+  ('vision' extends C[number] ? { imagePath?: string } : object) &
+    ('audio' extends C[number] ? { audioPath?: string } : object);
+
 /**
  * Properties for initializing and configuring a Large Language Model (LLM) instance.
  *
@@ -21,9 +35,11 @@ export interface LLMProps {
      */
     tokenizerConfigSource: ResourceSource;
     /**
-     * Set to `true` for vision-language models that accept image inputs via `sendMessage`.
+     * Optional list of modality capabilities the model supports.
+     * Determines the type of the `media` argument in `sendMessage`.
+     * Example: `['vision']` enables `sendMessage(text, { imagePath })`.
      */
-    isMultimodal?: boolean;
+    capabilities?: readonly LLMCapability[];
   };
   /**
    * Boolean that can prevent automatic model loading (and downloading the data if you load it for the first time) after running the hook.
@@ -121,22 +137,23 @@ export interface LLMTypeBase {
 }
 
 /**
- * Return type for `useLLM` when `model.isMultimodal` is `true`.
- * `sendMessage` accepts an optional `mediaPath` argument for image inputs.
- *
+ * Return type for `useLLM` when `model.capabilities` is provided.
+ * `sendMessage` accepts a typed `media` object based on declared capabilities.
  * @category Types
  */
-export interface LLMTypeMultimodal extends LLMTypeBase {
+export interface LLMTypeMultimodal<
+  C extends readonly LLMCapability[] = readonly LLMCapability[],
+> extends LLMTypeBase {
   /**
    * Function to add user message to conversation.
-   * Pass `mediaPath` with a local image path to send a multimodal message.
+   * Pass a `media` object whose shape is determined by the declared capabilities.
    * After model responds, `messageHistory` will be updated.
    *
    * @param message - The message string to send.
-   * @param mediaPath - Optional local file path to an image.
+   * @param media - Optional media object (e.g. `{ imagePath }` for vision, `{ audioPath }` for audio).
    * @returns The model's response as a `string`.
    */
-  sendMessage: (message: string, mediaPath?: string) => Promise<string>;
+  sendMessage: (message: string, media?: MediaArg<C>) => Promise<string>;
 }
 
 /**

From 8d1b4ebccaabf16e3a3d8b996c187d369d8dd9c7 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Tue, 3 Mar 2026 11:45:49 +0100
Subject: [PATCH 29/46] feat: update sendMessage to accept typed media object

---
 .../react-native-executorch/src/controllers/LLMController.ts  | 3 ++-
 .../src/hooks/natural_language_processing/useLLM.ts           | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts
index 180be9677..e09646d08 100644
--- a/packages/react-native-executorch/src/controllers/LLMController.ts
+++ b/packages/react-native-executorch/src/controllers/LLMController.ts
@@ -312,8 +312,9 @@ export class LLMController {
 
   public async sendMessage(
     message: string,
-    mediaPath?: string
+    media?: { imagePath?: string; audioPath?: string }
   ): Promise<string> {
+    const mediaPath = media?.imagePath ?? media?.audioPath;
     const newMessage: Message = {
       content: message,
       role: 'user',
diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
index 570ae640d..f83d39352 100644
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
@@ -101,9 +101,9 @@ export function useLLM({
   );
 
   const sendMessage = useCallback(
-    (message: string, mediaPath?: string) => {
+    (message: string, media?: { imagePath?: string; audioPath?: string }) => {
       setResponse('');
-      return controllerInstance.sendMessage(message, mediaPath);
+      return controllerInstance.sendMessage(message, media);
     },
     [controllerInstance]
   );

From f3edf5d0aafd04c0c7e4122c4fa39ba68971a6a6 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Tue, 3 Mar 2026 11:46:16 +0100
Subject: [PATCH 30/46] feat: add LFM2_VL_1_6B and LFM2_VL_1_6B_QUANTIZED model
 constants

---
 .../src/constants/modelUrls.ts                | 26 +++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts
index 499abf63a..325a13133 100644
--- a/packages/react-native-executorch/src/constants/modelUrls.ts
+++ b/packages/react-native-executorch/src/constants/modelUrls.ts
@@ -371,6 +371,32 @@ export const LFM2_5_1_2B_INSTRUCT_QUANTIZED = {
   tokenizerConfigSource: LFM2_5_1_2B_TOKENIZER_CONFIG,
 };
 
+// LFM2.5-VL-1.6B (Vision-Language)
+const LFM2_VL_1_6B_MODEL = `https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/lfm2p5_vl_1.6B_xnnpack.pte`;
+const LFM2_VL_1_6B_QUANTIZED_MODEL = `https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/lfm2p5_vl_1.6B_quantized_xnnpack.pte`;
+const LFM2_VL_TOKENIZER = `https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_2.5.json`;
+const LFM2_VL_TOKENIZER_CONFIG = `https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_config_2_5.json`;
+
+/**
+ * @category Models - VLM
+ */
+export const LFM2_VL_1_6B = {
+  capabilities: ['vision'] as const,
+  modelSource: LFM2_VL_1_6B_MODEL,
+  tokenizerSource: LFM2_VL_TOKENIZER,
+  tokenizerConfigSource: LFM2_VL_TOKENIZER_CONFIG,
+};
+
+/**
+ * @category Models - VLM
+ */
+export const LFM2_VL_1_6B_QUANTIZED = {
+  capabilities: ['vision'] as const,
+  modelSource: LFM2_VL_1_6B_QUANTIZED_MODEL,
+  tokenizerSource: LFM2_VL_TOKENIZER,
+  tokenizerConfigSource: LFM2_VL_TOKENIZER_CONFIG,
+};
+
 // Classification
 const EFFICIENTNET_V2_S_MODEL =
   Platform.OS === `ios`

From 6eba3f78ea4fce5ff86971c8248db89fbbd4a034 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Tue, 3 Mar 2026 12:55:44 +0100
Subject: [PATCH 31/46] feat: add IEncoder interface and VisionEncoder

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../common/rnexecutorch/tests/CMakeLists.txt  |  3 +-
 .../tests/integration/LLMTest.cpp             | 15 ++++++
 .../common/runner/encoders/iencoder.h         | 21 ++++++++
 .../common/runner/encoders/vision_encoder.cpp | 49 +++++++++++++++++++
 .../common/runner/encoders/vision_encoder.h   | 22 +++++++++
 5 files changed, 109 insertions(+), 1 deletion(-)
 create mode 100644 packages/react-native-executorch/common/runner/encoders/iencoder.h
 create mode 100644 packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
 create mode 100644 packages/react-native-executorch/common/runner/encoders/vision_encoder.h

diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
index e2a8c16bf..874b96732 100644
--- a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
@@ -210,11 +210,12 @@ add_rn_test(SpeechToTextTests integration/SpeechToTextTest.cpp
 add_rn_test(LLMTests integration/LLMTest.cpp
     SOURCES
         ${RNEXECUTORCH_DIR}/models/llm/LLM.cpp
-        ${COMMON_DIR}/runner/runner.cpp
+        ${COMMON_DIR}/runner/runner.cpp          # keep until Task 5
         ${COMMON_DIR}/runner/text_prefiller.cpp
         ${COMMON_DIR}/runner/text_decoder_runner.cpp
         ${COMMON_DIR}/runner/sampler.cpp
         ${COMMON_DIR}/runner/arange_util.cpp
+        ${COMMON_DIR}/runner/encoders/vision_encoder.cpp
     LIBS tokenizers_deps
 )
 
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
index e79294090..f44fff810 100644
--- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
@@ -6,6 +6,7 @@
 #include <ReactCommon/CallInvoker.h>
 #include <rnexecutorch/Error.h>
 #include <rnexecutorch/models/llm/LLM.h>
+#include <runner/encoders/vision_encoder.h>
 
 using namespace rnexecutorch;
 using namespace rnexecutorch::models::llm;
@@ -153,3 +154,17 @@ TEST_F(LLMTest, EmptyPromptThrows) {
   LLM model(kValidModelPath, kValidTokenizerPath, mockInvoker_);
   EXPECT_THROW((void)model.generate("", nullptr), RnExecutorchError);
 }
+
+TEST(VisionEncoderTest, LoadFailsWithClearErrorWhenMethodMissing) {
+  // smolLm2_135M_8da4w.pte has no vision_encoder method
+  auto module = std::make_unique<::executorch::extension::Module>(
+      "smolLm2_135M_8da4w.pte",
+      ::executorch::extension::Module::LoadMode::File);
+
+  auto encoder =
+      std::make_unique<executorch::extension::llm::VisionEncoder>(module.get());
+
+  EXPECT_THROW(
+      { ET_CHECK_OK_OR_RETURN_ERROR(encoder->load()); },
+      rnexecutorch::RnExecutorchError);
+}
diff --git a/packages/react-native-executorch/common/runner/encoders/iencoder.h b/packages/react-native-executorch/common/runner/encoders/iencoder.h
new file mode 100644
index 000000000..3f46ef775
--- /dev/null
+++ b/packages/react-native-executorch/common/runner/encoders/iencoder.h
@@ -0,0 +1,21 @@
+// common/runner/encoders/iencoder.h
+#pragma once
+
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/evalue.h>
+#include <executorch/runtime/core/result.h>
+#include <runner/multimodal_input.h>
+
+namespace executorch::extension::llm {
+
+class IEncoder {
+public:
+  virtual ~IEncoder() = default;
+  virtual ::executorch::runtime::Error load() = 0;
+  virtual bool is_loaded() const = 0;
+  // Encodes one input segment, returns embeddings EValue
+  virtual ::executorch::runtime::Result<::executorch::runtime::EValue>
+  encode(const MultimodalInput &input) = 0;
+};
+
+} // namespace executorch::extension::llm
diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
new file mode 100644
index 000000000..0b7a56bb7
--- /dev/null
+++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
@@ -0,0 +1,49 @@
+// common/runner/encoders/vision_encoder.cpp
+#include "vision_encoder.h"
+
+#include <rnexecutorch/Error.h>
+#include <runner/constants.h>
+#include <runner/image.h>
+
+namespace executorch::extension::llm {
+
+using ::executorch::runtime::Error;
+using ::executorch::runtime::EValue;
+using ::executorch::runtime::Result;
+
+VisionEncoder::VisionEncoder(::executorch::extension::Module *module)
+    : module_(module) {}
+
+Error VisionEncoder::load() {
+  auto method_names_result = module_->method_names();
+  if (!method_names_result.ok() ||
+      method_names_result->count(kVisionEncoderMethod) == 0) {
+    throw rnexecutorch::RnExecutorchError(
+        rnexecutorch::RnExecutorchErrorCode::InvalidConfig,
+        "Model does not support vision: 'vision_encoder' method not found. "
+        "Check that the .pte file matches the declared capabilities.");
+  }
+  return module_->load_method(kVisionEncoderMethod);
+}
+
+bool VisionEncoder::is_loaded() const {
+  return module_->is_method_loaded(kVisionEncoderMethod);
+}
+
+Result<EValue> VisionEncoder::encode(const MultimodalInput &input) {
+  if (!input.is_image()) {
+    return Error::InvalidArgument;
+  }
+  const Image &image = input.get_image();
+  auto image_tensor_result = image.toTensor(/*with_batch=*/true);
+  if (!image_tensor_result.ok()) {
+    return image_tensor_result.error();
+  }
+  auto result = module_->execute(kVisionEncoderMethod, *image_tensor_result);
+  if (!result.ok()) {
+    return result.error();
+  }
+  return (*result)[0];
+}
+
+} // namespace executorch::extension::llm
diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h
new file mode 100644
index 000000000..688b0cf3a
--- /dev/null
+++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h
@@ -0,0 +1,22 @@
+// common/runner/encoders/vision_encoder.h
+#pragma once
+
+#include "iencoder.h"
+#include <executorch/extension/module/module.h>
+
+namespace executorch::extension::llm {
+
+class VisionEncoder : public IEncoder {
+public:
+  explicit VisionEncoder(::executorch::extension::Module *module);
+
+  ::executorch::runtime::Error load() override;
+  bool is_loaded() const override;
+  ::executorch::runtime::Result<::executorch::runtime::EValue>
+  encode(const MultimodalInput &input) override;
+
+private:
+  ::executorch::extension::Module *module_;
+};
+
+} // namespace executorch::extension::llm

From 0819c20b01f4200c5a62b0efde3d18e4fbe3ea87 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Tue, 3 Mar 2026 13:03:21 +0100
Subject: [PATCH 32/46] fix: address vision_encoder quality review issues

---
 .../tests/integration/LLMTest.cpp             |  4 +--
 .../common/runner/encoders/vision_encoder.cpp | 25 ++++++++++++++++---
 .../common/runner/encoders/vision_encoder.h   |  1 +
 3 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
index f44fff810..e3885eba4 100644
--- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
@@ -164,7 +164,5 @@ TEST(VisionEncoderTest, LoadFailsWithClearErrorWhenMethodMissing) {
   auto encoder =
       std::make_unique<executorch::extension::llm::VisionEncoder>(module.get());
 
-  EXPECT_THROW(
-      { ET_CHECK_OK_OR_RETURN_ERROR(encoder->load()); },
-      rnexecutorch::RnExecutorchError);
+  EXPECT_THROW(encoder->load(), rnexecutorch::RnExecutorchError);
 }
diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
index 0b7a56bb7..35ce84c6d 100644
--- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
+++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
@@ -15,9 +15,14 @@ VisionEncoder::VisionEncoder(::executorch::extension::Module *module)
     : module_(module) {}
 
 Error VisionEncoder::load() {
+  if (is_loaded()) {
+    return Error::Ok;
+  }
   auto method_names_result = module_->method_names();
-  if (!method_names_result.ok() ||
-      method_names_result->count(kVisionEncoderMethod) == 0) {
+  if (!method_names_result.ok()) {
+    return method_names_result.error();
+  }
+  if (method_names_result->count(kVisionEncoderMethod) == 0) {
     throw rnexecutorch::RnExecutorchError(
         rnexecutorch::RnExecutorchErrorCode::InvalidConfig,
         "Model does not support vision: 'vision_encoder' method not found. "
@@ -31,11 +36,25 @@ bool VisionEncoder::is_loaded() const {
 }
 
 Result<EValue> VisionEncoder::encode(const MultimodalInput &input) {
+  if (!is_loaded()) {
+    return Error::InvalidState;
+  }
   if (!input.is_image()) {
     return Error::InvalidArgument;
   }
   const Image &image = input.get_image();
-  auto image_tensor_result = image.toTensor(/*with_batch=*/true);
+  auto method_meta_result = module_->method_meta(kVisionEncoderMethod);
+  if (!method_meta_result.ok()) {
+    return method_meta_result.error();
+  }
+  auto &method_meta = *method_meta_result;
+  auto input_meta_result = method_meta.input_tensor_meta(0);
+  if (!input_meta_result.ok()) {
+    return input_meta_result.error();
+  }
+  auto expected_dims = input_meta_result->sizes();
+  auto image_tensor_result =
+      image.toTensor(/*with_batch=*/expected_dims.size() == 4);
   if (!image_tensor_result.ok()) {
     return image_tensor_result.error();
   }
diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h
index 688b0cf3a..5b3dd0aec 100644
--- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h
+++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h
@@ -3,6 +3,7 @@
 
 #include "iencoder.h"
 #include <executorch/extension/module/module.h>
+#include <runner/multimodal_input.h>
 
 namespace executorch::extension::llm {
 

From 1de96bb70f531b0b03d8ee819cf8f31b8576e533 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Tue, 3 Mar 2026 13:08:26 +0100
Subject: [PATCH 33/46] feat: add BaseLLMRunner with shared state and load()

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../common/rnexecutorch/tests/CMakeLists.txt  |   1 +
 .../tests/integration/LLMTest.cpp             |  40 +++++
 .../common/runner/base_llm_runner.cpp         | 163 ++++++++++++++++++
 .../common/runner/base_llm_runner.h           |  88 ++++++++++
 4 files changed, 292 insertions(+)
 create mode 100644 packages/react-native-executorch/common/runner/base_llm_runner.cpp
 create mode 100644 packages/react-native-executorch/common/runner/base_llm_runner.h

diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
index 874b96732..aebecc1b6 100644
--- a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
@@ -211,6 +211,7 @@ add_rn_test(LLMTests integration/LLMTest.cpp
     SOURCES
         ${RNEXECUTORCH_DIR}/models/llm/LLM.cpp
         ${COMMON_DIR}/runner/runner.cpp          # keep until Task 5
+        ${COMMON_DIR}/runner/base_llm_runner.cpp
         ${COMMON_DIR}/runner/text_prefiller.cpp
         ${COMMON_DIR}/runner/text_decoder_runner.cpp
         ${COMMON_DIR}/runner/sampler.cpp
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
index e3885eba4..7f35519af 100644
--- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
@@ -166,3 +166,43 @@ TEST(VisionEncoderTest, LoadFailsWithClearErrorWhenMethodMissing) {
 
   EXPECT_THROW(encoder->load(), rnexecutorch::RnExecutorchError);
 }
+
+#include <runner/base_llm_runner.h>
+
+// Test-only BaseLLMRunner stub: trivial overrides that record what was called.
+class StubRunner : public example::BaseLLMRunner {
+public:
+  using BaseLLMRunner::BaseLLMRunner;
+  bool is_loaded() const override { return loaded_; }
+  ::executorch::runtime::Error load_subcomponents() override {
+    loaded_ = true;
+    return ::executorch::runtime::Error::Ok;
+  }
+  ::executorch::runtime::Error generate_internal(
+      const std::vector<::executorch::extension::llm::MultimodalInput> &,
+      std::function<void(const std::string &)>) override {
+    return ::executorch::runtime::Error::Ok;
+  }
+  void stop_impl() override {}
+  void set_temperature_impl(float t) override { last_temp_ = t; }
+  void set_topp_impl(float) override {}
+  void set_count_interval_impl(size_t) override {}
+  void set_time_interval_impl(size_t) override {}
+  // State the tests inspect directly.
+  bool loaded_ = false;    // flipped by load_subcomponents()
+  float last_temp_ = -1.f; // -1 until set_temperature_impl() is called
+};
+
+TEST(BaseLLMRunnerTest, SetTemperatureWritesConfigAndCallsImpl) {
+  StubRunner runner(nullptr, nullptr, "dummy_tokenizer.json"); // never loaded
+  runner.set_temperature(0.5f);
+  EXPECT_FLOAT_EQ(runner.config_.temperature, 0.5f); // base wrote config_
+  EXPECT_FLOAT_EQ(runner.last_temp_, 0.5f);          // subclass hook ran
+}
+
+TEST(BaseLLMRunnerTest, ResetZerosPos) {
+  StubRunner runner(nullptr, nullptr, "dummy_tokenizer.json"); // never loaded
+  runner.pos_ = 42; // any non-zero position
+  runner.reset();
+  EXPECT_EQ(runner.pos_, 0);
+}
diff --git a/packages/react-native-executorch/common/runner/base_llm_runner.cpp b/packages/react-native-executorch/common/runner/base_llm_runner.cpp
new file mode 100644
index 000000000..a987528a0
--- /dev/null
+++ b/packages/react-native-executorch/common/runner/base_llm_runner.cpp
@@ -0,0 +1,163 @@
+// common/runner/base_llm_runner.cpp
+#include "base_llm_runner.h"
+#include "constants.h"
+#include "util.h"
+#include <cstdint>
+#include <rnexecutorch/Error.h>
+
+namespace example {
+
+using namespace executorch::extension::llm;
+using ::executorch::extension::Module;
+using ::executorch::runtime::Error;
+
+BaseLLMRunner::BaseLLMRunner(Module *module, // used if owned_module is null
+                             std::unique_ptr<Module> owned_module,
+                             const std::string &tokenizer_path,
+                             const llm::GenerationConfig &config)
+    : config_(config), module_(owned_module ? owned_module.get() : module),
+      owned_module_(std::move(owned_module)), tokenizer_path_(tokenizer_path),
+      tokenizer_(std::make_unique<tokenizers::HFTokenizer>()),
+      metadata_({ // defaults; load() replaces entries the model exports
+          {kEnableDynamicShape, false},
+          {kMaxSeqLen, 128},
+          {kMaxContextLen, 128},
+          {kUseKVCache, true},
+          {kUseSDPAWithKVCache, false},
+      }) {}
+
+Error BaseLLMRunner::load() {
+  if (is_loaded())
+    return Error::Ok;
+  // Tokenizer first; a failure here is reported by throwing, not by Error.
+  auto status = tokenizer_->load(tokenizer_path_);
+  if (status != tokenizers::Error::Ok) {
+    throw rnexecutorch::RnExecutorchError(
+        rnexecutorch::RnExecutorchErrorCode::TokenizerError,
+        "Unexpected issue occurred while loading tokenizer");
+  }
+
+  const auto method_names =
+      ET_UNWRAP(module_->method_names(), "Failed reading method names");
+  // Seed vocab size, then prefer model-exported values over the defaults.
+  metadata_[kVocabSize] = tokenizer_->vocab_size();
+  for (auto &pair : metadata_) {
+    const auto &method_name = pair.first;
+    auto &value = pair.second;
+    if (method_names.count(method_name)) {
+      value = ET_UNWRAP(module_->get(method_name))
+                  .toScalar()
+                  .to<decltype(metadata_)::mapped_type>();
+    }
+    ET_LOG(Info, "Metadata: %s = %" PRId64, method_name.c_str(), value);
+  }
+  // Negative config_ limits mean "unset" and are filled in from metadata_.
+  if (config_.max_seq_len < 0)
+    config_.max_seq_len = static_cast<int32_t>(metadata_.at(kMaxSeqLen));
+  if (config_.max_context_length < 0) {
+    config_.max_context_length =
+        method_names.count(kMaxContextLen)
+            ? static_cast<int32_t>(metadata_.at(kMaxContextLen))
+            : static_cast<int32_t>(metadata_.at(kMaxSeqLen));
+  }
+  if (config_.max_new_tokens < 0)
+    config_.max_new_tokens =
+        std::min(config_.max_seq_len, config_.max_context_length);
+  if (config_.enable_dynamic_shape) // stays on only if metadata agrees
+    config_.enable_dynamic_shape =
+        static_cast<bool>(metadata_.at(kEnableDynamicShape));
+  if (config_.enable_kv_cache) // stays on only if metadata agrees
+    config_.enable_kv_cache = static_cast<bool>(metadata_.at(kUseKVCache));
+  // NOTE(review): eos_ids below is filled but not used again in this function.
+  auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>();
+  if (method_names.count(kEosIds)) {
+    for (const auto &eos_id : ET_UNWRAP(module_->execute(kEosIds))) {
+      eos_ids->emplace(static_cast<uint64_t>(eos_id.toScalar().to<int64_t>()));
+    }
+  }
+  if (eos_ids->empty()) {
+    eos_ids->emplace(7); // fallback <|im_end|>
+  }
+
+  io_manager_ = std::make_unique<llm::IOManager>(*module_);
+  // Remaining, runner-specific setup is delegated to the subclass.
+  return load_subcomponents();
+}
+
+Error BaseLLMRunner::generate(
+    const std::string &prompt, const llm::GenerationConfig &generation_config,
+    std::function<void(const std::string &)> token_callback,
+    std::function<void(const llm::Stats &)> stats_callback) {
+  // Text-only entry point; generation_config is not consulted here.
+  // An empty prompt is a caller error: report it instead of aborting.
+  ET_CHECK_OR_RETURN_ERROR(!prompt.empty(), InvalidArgument,
+                           "Prompt cannot be empty");
+  std::vector<llm::MultimodalInput> inputs = {llm::make_text_input(prompt)};
+  auto err = generate_internal(inputs, token_callback);
+  // Stats are delivered whether or not generation succeeded.
+  if (stats_callback)
+    stats_callback(stats_);
+  return err;
+}
+
+void BaseLLMRunner::stop() { stop_impl(); } // forwards to the subclass hook
+
+void BaseLLMRunner::reset() { // clears stats and rewinds the position
+  stats_.reset();
+  pos_ = 0; // the next generation starts from position 0
+}
+
+int32_t BaseLLMRunner::count_text_tokens(const std::string &text) const {
+  // Encode with the configured BOS/EOS counts and report how many ids result.
+  auto encoded =
+      tokenizer_->encode(text, numOfAddedBoSTokens, numOfAddedEoSTokens);
+  if (encoded.ok())
+    return static_cast<int32_t>(encoded.get().size());
+  throw rnexecutorch::RnExecutorchError(
+      rnexecutorch::RnExecutorchErrorCode::TokenizerError,
+      "Encoding failed during token count check.");
+}
+
+int32_t BaseLLMRunner::get_max_context_length() const {
+  // Until loaded, answer from the metadata entry instead of config_.
+  return is_loaded() ? config_.max_context_length
+                     : static_cast<int32_t>(metadata_.at(kMaxContextLen));
+}
+
+void BaseLLMRunner::set_temperature(float temperature) noexcept {
+  config_.temperature = temperature; // stored first, then pushed down
+  set_temperature_impl(temperature); // NOTE(review): hook is not noexcept
+}
+
+void BaseLLMRunner::set_topp(float topp) noexcept {
+  config_.topp = topp; // stored first, then pushed down
+  set_topp_impl(topp); // NOTE(review): hook is not noexcept
+}
+
+void BaseLLMRunner::set_count_interval(size_t count_interval) {
+  set_count_interval_impl(count_interval); // nothing is stored in the base
+}
+
+void BaseLLMRunner::set_time_interval(size_t time_interval) {
+  set_time_interval_impl(time_interval); // nothing is stored in the base
+}
+
+int32_t BaseLLMRunner::resolve_max_new_tokens(int32_t num_prompt_tokens,
+                                              int32_t max_seq_len,
+                                              int32_t max_context_len,
+                                              int32_t max_new_tokens) const {
+  // -1 marks max_seq_len / max_new_tokens as unset; an unset limit simply
+  // does not constrain the budget. The result is never negative.
+  const bool has_seq_len = max_seq_len != -1;
+  const bool has_new_tokens = max_new_tokens != -1;
+  // Sequence cap: context length, tightened by max_seq_len when provided.
+  const int32_t seq_cap =
+      has_seq_len ? std::min(max_seq_len, max_context_len) : max_context_len;
+  // Tokens still available once the prompt has been accounted for.
+  int32_t budget = seq_cap - num_prompt_tokens;
+  if (has_new_tokens)
+    budget = std::min(budget, max_new_tokens);
+  return std::max(0, budget);
+}
+
+} // namespace example
diff --git a/packages/react-native-executorch/common/runner/base_llm_runner.h b/packages/react-native-executorch/common/runner/base_llm_runner.h
new file mode 100644
index 000000000..7d2eef285
--- /dev/null
+++ b/packages/react-native-executorch/common/runner/base_llm_runner.h
@@ -0,0 +1,88 @@
+// common/runner/base_llm_runner.h
+#pragma once
+
+#include "io_manager.h"
+#include "irunner.h"
+#include "multimodal_input.h"
+#include "stats.h"
+#include <cstdint>
+#include <executorch/extension/module/module.h>
+#include <functional>
+#include <memory>
+#include <pytorch/tokenizers/hf_tokenizer.h>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+namespace example {
+
+namespace llm = ::executorch::extension::llm;
+
+class BaseLLMRunner {
+public:
+  explicit BaseLLMRunner(
+      ::executorch::extension::Module *module, // used if owned_module is null
+      std::unique_ptr<::executorch::extension::Module> owned_module,
+      const std::string &tokenizer_path,
+      const llm::GenerationConfig &config = {.temperature = 0.8F,
+                                             .topp = 0.9F});
+
+  virtual ~BaseLLMRunner() = default;
+
+  virtual bool is_loaded() const = 0;
+
+  // Loads tokenizer + metadata, resolves config_, then load_subcomponents()
+  virtual ::executorch::runtime::Error load();
+
+  // Text convenience: wraps prompt via make_text_input -> generate_internal
+  ::executorch::runtime::Error
+  generate(const std::string &prompt,
+           const llm::GenerationConfig &generation_config = {}, // unused
+           std::function<void(const std::string &)> token_callback = {},
+           std::function<void(const llm::Stats &)> stats_callback = {});
+
+  // Multimodal entry point; subclasses implement this
+  virtual ::executorch::runtime::Error generate_internal(
+      const std::vector<llm::MultimodalInput> &inputs,
+      std::function<void(const std::string &)> token_callback) = 0;
+  // stop() forwards to stop_impl(); reset() clears stats_ and pos_.
+  void stop();
+  void reset();
+  int32_t count_text_tokens(const std::string &text) const;
+  int32_t get_max_context_length() const;
+
+  // set_temperature/set_topp write config_ first; all four call their impl
+  void set_temperature(float temperature) noexcept;
+  void set_topp(float topp) noexcept;
+  void set_count_interval(size_t count_interval);
+  void set_time_interval(size_t time_interval);
+
+  llm::Stats stats_; // handed to stats_callback by generate()
+
+  // Public for test access
+  llm::GenerationConfig config_;
+  int64_t pos_{0}; // reset() sets this back to 0
+
+protected:
+  virtual ::executorch::runtime::Error load_subcomponents() = 0;
+  virtual void stop_impl() = 0;
+  virtual void set_temperature_impl(float temperature) = 0;
+  virtual void set_topp_impl(float topp) = 0;
+  virtual void set_count_interval_impl(size_t count_interval) = 0;
+  virtual void set_time_interval_impl(size_t time_interval) = 0;
+  // -1 for max_seq_len / max_new_tokens means that limit is not applied.
+  int32_t resolve_max_new_tokens(int32_t num_prompt_tokens, int32_t max_seq_len,
+                                 int32_t max_context_len,
+                                 int32_t max_new_tokens = -1) const;
+
+  ::executorch::extension::Module *module_; // owned_module_.get() or external
+  std::unique_ptr<::executorch::extension::Module> owned_module_;
+  std::string tokenizer_path_;
+  std::unique_ptr<tokenizers::HFTokenizer> tokenizer_;
+  std::unordered_map<std::string, int64_t> metadata_; // updated by load()
+  std::unique_ptr<llm::IOManager> io_manager_;        // created by load()
+  bool shouldStop_{false};
+};
+
+} // namespace example

From e08b391611e477d720bcab046e6d917063a48320 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Tue, 3 Mar 2026 13:11:20 +0100
Subject: [PATCH 34/46] feat: add TextRunner

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../common/rnexecutorch/tests/CMakeLists.txt  |   1 +
 .../tests/integration/LLMTest.cpp             |  24 +++
 .../common/runner/text_runner.cpp             | 168 ++++++++++++++++++
 .../common/runner/text_runner.h               |  42 +++++
 4 files changed, 235 insertions(+)
 create mode 100644 packages/react-native-executorch/common/runner/text_runner.cpp
 create mode 100644 packages/react-native-executorch/common/runner/text_runner.h

diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
index aebecc1b6..66e65cd6d 100644
--- a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
@@ -212,6 +212,7 @@ add_rn_test(LLMTests integration/LLMTest.cpp
         ${RNEXECUTORCH_DIR}/models/llm/LLM.cpp
         ${COMMON_DIR}/runner/runner.cpp          # keep until Task 5
         ${COMMON_DIR}/runner/base_llm_runner.cpp
+        ${COMMON_DIR}/runner/text_runner.cpp
         ${COMMON_DIR}/runner/text_prefiller.cpp
         ${COMMON_DIR}/runner/text_decoder_runner.cpp
         ${COMMON_DIR}/runner/sampler.cpp
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
index 7f35519af..dd253b487 100644
--- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
@@ -206,3 +206,27 @@ TEST(BaseLLMRunnerTest, ResetZerosPos) {
   runner.reset();
   EXPECT_EQ(runner.pos_, 0);
 }
+
+#include <runner/text_runner.h>
+
+TEST(TextRunnerTest, LoadsSuccessfully) {
+  auto module = std::make_unique<::executorch::extension::Module>(
+      "smolLm2_135M_8da4w.pte",
+      ::executorch::extension::Module::LoadMode::File);
+  // Non-owning use: the runner gets a raw pointer, the test keeps ownership.
+  example::TextRunner runner(module.get(), nullptr, "smollm_tokenizer.json");
+  auto err = runner.load();
+  EXPECT_EQ(err, ::executorch::runtime::Error::Ok);
+  EXPECT_TRUE(runner.is_loaded());
+}
+
+TEST(TextRunnerTest, SetTemperaturePropagatesToDecoder) {
+  auto module = std::make_unique<::executorch::extension::Module>(
+      "smolLm2_135M_8da4w.pte",
+      ::executorch::extension::Module::LoadMode::File);
+  // Without a loaded decoder set_temperature_impl() is a no-op, so fail early.
+  example::TextRunner runner(module.get(), nullptr, "smollm_tokenizer.json");
+  ASSERT_EQ(runner.load(), ::executorch::runtime::Error::Ok);
+  EXPECT_NO_THROW(runner.set_temperature(0.5f));
+  EXPECT_FLOAT_EQ(runner.config_.temperature, 0.5f);
+}
diff --git a/packages/react-native-executorch/common/runner/text_runner.cpp b/packages/react-native-executorch/common/runner/text_runner.cpp
new file mode 100644
index 000000000..279244855
--- /dev/null
+++ b/packages/react-native-executorch/common/runner/text_runner.cpp
@@ -0,0 +1,168 @@
+// common/runner/text_runner.cpp
+#include "text_runner.h"
+#include "constants.h"
+#include "util.h"
+#include <cstdint>
+#include <rnexecutorch/Error.h>
+
+namespace example {
+
+using namespace executorch::extension::llm;
+using ::executorch::extension::Module;
+using ::executorch::runtime::Error;
+
+TextRunner::TextRunner(Module *module, std::unique_ptr<Module> owned_module,
+                       const std::string &tokenizer_path,
+                       const llm::GenerationConfig &config) // all to the base
+    : BaseLLMRunner(module, std::move(owned_module), tokenizer_path, config) {}
+
+bool TextRunner::is_loaded() const {
+  const bool runners = text_decoder_runner_ && text_prefiller_;
+  return runners && text_token_generator_ && module_ && tokenizer_ &&
+         module_->is_loaded() && tokenizer_->is_loaded();
+}
+
+Error TextRunner::load_subcomponents() {
+  ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("forward"));
+
+  // EOS ids are not handed down by the base class, so they are read from the
+  // module here; token id 7 is used when the model provides none.
+  const auto exported =
+      ET_UNWRAP(module_->method_names(), "Failed reading method names");
+  auto stop_ids = std::make_unique<std::unordered_set<uint64_t>>();
+  if (exported.count(kEosIds)) {
+    for (const auto &id : ET_UNWRAP(module_->execute(kEosIds))) {
+      stop_ids->emplace(static_cast<uint64_t>(id.toScalar().to<int64_t>()));
+    }
+  }
+  if (stop_ids->empty())
+    stop_ids->emplace(7); // fallback <|im_end|>
+
+  // The decoder runner is created first because the prefiller and the token
+  // generator are both constructed from a pointer to it.
+  text_decoder_runner_ = std::make_unique<llm::TextDecoderRunner>(
+      module_, io_manager_.get(), config_.temperature, config_.topp);
+  text_prefiller_ = std::make_unique<llm::TextPrefiller>(
+      text_decoder_runner_.get(), config_.enable_kv_cache,
+      config_.enable_dynamic_shape, config_.max_seq_len);
+  text_token_generator_ = std::make_unique<llm::TextTokenGenerator>(
+      tokenizer_.get(), text_decoder_runner_.get(), config_.enable_kv_cache,
+      std::move(stop_ids), &stats_);
+
+  return Error::Ok;
+}
+
+Error TextRunner::generate_internal(
+    const std::vector<llm::MultimodalInput> &inputs,
+    std::function<void(const std::string &)> token_callback) {
+  // Text-only generation: only inputs[0] is read, and it is read as text.
+  if (inputs.empty()) {
+    ET_LOG(Error, "MultimodalInput vector cannot be empty");
+    return Error::InvalidArgument;
+  }
+  // NOTE(review): get_text() is called without checking the input's kind.
+  const std::string &prompt = inputs[0].get_text();
+  ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null");
+  // Load lazily on first use and record how long loading took.
+  if (!is_loaded()) {
+    stats_.model_load_start_ms = llm::time_in_ms();
+    ET_CHECK_OK_OR_RETURN_ERROR(load());
+    stats_.model_load_end_ms = llm::time_in_ms();
+  }
+  // Each piece is echoed to stdout, then forwarded to the caller's callback.
+  std::function<void(const std::string &)> wrapped_callback =
+      [token_callback](const std::string &piece) {
+        llm::safe_printf(piece.c_str());
+        fflush(stdout);
+        if (token_callback)
+          token_callback(piece);
+      };
+
+  stats_.inference_start_ms = llm::time_in_ms();
+  shouldStop_ = false;
+  // Context still available after the positions consumed so far.
+  int64_t context_len_left =
+      static_cast<int64_t>(config_.max_context_length) - pos_;
+
+  auto encodeResult =
+      tokenizer_->encode(prompt, numOfAddedBoSTokens, numOfAddedEoSTokens);
+  if (!encodeResult.ok()) {
+    throw rnexecutorch::RnExecutorchError(
+        rnexecutorch::RnExecutorchErrorCode::TokenizerError,
+        "Unexpected issue occurred while encoding: " +
+            std::to_string(static_cast<int32_t>(encodeResult.error())));
+  }
+  std::vector<uint64_t> prompt_tokens = encodeResult.get();
+  int num_prompt_tokens = prompt_tokens.size();
+
+  ET_CHECK_OR_RETURN_ERROR(num_prompt_tokens >= 1, InvalidArgument,
+                           "Expected at least 1 prompt token");
+  ET_CHECK_OR_RETURN_ERROR(num_prompt_tokens < config_.max_seq_len,
+                           InvalidArgument,
+                           "num_prompt_tokens %d >= max_seq_len %" PRId32,
+                           num_prompt_tokens, config_.max_seq_len);
+  // Budget is bounded by max_seq_len, remaining context and max_new_tokens.
+  int32_t max_new_tokens = resolve_max_new_tokens(
+      num_prompt_tokens, config_.max_seq_len,
+      static_cast<int32_t>(context_len_left), config_.max_new_tokens);
+
+  ET_CHECK_OR_RETURN_ERROR(max_new_tokens > 0, InvalidArgument,
+                           "Max new tokens %d is <= 0", max_new_tokens);
+
+  if (config_.echo)
+    wrapped_callback(prompt);
+  // Prefill the prompt from pos_; the result carries the next token id.
+  auto prefill_res = text_prefiller_->prefill(prompt_tokens, pos_);
+  stats_.first_token_ms = llm::time_in_ms();
+  stats_.prompt_eval_end_ms = llm::time_in_ms();
+  ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error());
+
+  uint64_t cur_token = prefill_res.get();
+  auto decodeResult = tokenizer_->decode({cur_token});
+  if (!decodeResult.ok()) {
+    throw rnexecutorch::RnExecutorchError(
+        rnexecutorch::RnExecutorchErrorCode::TokenizerError,
+        "Unexpected issue occurred while decoding: " +
+            std::to_string(static_cast<int32_t>(decodeResult.error())));
+  }
+  // NOTE(review): decodeResult is validated but its text is not used here.
+  prompt_tokens.push_back(cur_token);
+  int64_t num_generated = ET_UNWRAP(text_token_generator_->generate(
+      prompt_tokens, pos_, max_new_tokens - 1, config_.temperature,
+      config_.topp, wrapped_callback));
+  // Advance the position by what was generated and publish the statistics.
+  pos_ += num_generated;
+  stats_.inference_end_ms = llm::time_in_ms();
+  stats_.num_prompt_tokens = num_prompt_tokens;
+  stats_.num_generated_tokens = num_generated;
+
+  return Error::Ok;
+}
+
+void TextRunner::stop_impl() {
+  if (text_token_generator_) // no-op until the generator has been created
+    text_token_generator_->stop();
+}
+
+void TextRunner::set_temperature_impl(float temperature) {
+  if (text_decoder_runner_) // no-op until the decoder has been created
+    text_decoder_runner_->set_temperature(temperature);
+}
+
+void TextRunner::set_topp_impl(float topp) {
+  if (text_decoder_runner_) // no-op until the decoder has been created
+    text_decoder_runner_->set_topp(topp);
+}
+
+void TextRunner::set_count_interval_impl(size_t count_interval) {
+  if (text_token_generator_) // no-op until the generator has been created
+    text_token_generator_->set_count_interval(count_interval);
+}
+
+void TextRunner::set_time_interval_impl(size_t time_interval) {
+  if (text_token_generator_) // no-op until the generator has been created
+    text_token_generator_->set_time_interval(time_interval);
+}
+
+} // namespace example
diff --git a/packages/react-native-executorch/common/runner/text_runner.h b/packages/react-native-executorch/common/runner/text_runner.h
new file mode 100644
index 000000000..e590f4c88
--- /dev/null
+++ b/packages/react-native-executorch/common/runner/text_runner.h
@@ -0,0 +1,42 @@
+// common/runner/text_runner.h
+#pragma once
+
+#include "base_llm_runner.h"
+#include "text_decoder_runner.h"
+#include "text_prefiller.h"
+#include "text_token_generator.h"
+
+namespace example {
+
+class TextRunner : public BaseLLMRunner { // text-only LLM runner
+public:
+  explicit TextRunner(
+      ::executorch::extension::Module *module,
+      std::unique_ptr<::executorch::extension::Module> owned_module,
+      const std::string &tokenizer_path,
+      const ::executorch::extension::llm::GenerationConfig &config = {
+          .temperature = 0.8F, .topp = 0.9F});
+
+  bool is_loaded() const override;
+  // Reads inputs[0] as the text prompt; other entries are not consulted.
+  ::executorch::runtime::Error generate_internal(
+      const std::vector<::executorch::extension::llm::MultimodalInput> &inputs,
+      std::function<void(const std::string &)> token_callback) override;
+
+protected:
+  ::executorch::runtime::Error load_subcomponents() override;
+  void stop_impl() override;
+  void set_temperature_impl(float temperature) override;
+  void set_topp_impl(float topp) override;
+  void set_count_interval_impl(size_t count_interval) override;
+  void set_time_interval_impl(size_t time_interval) override;
+
+private: // all three are created by load_subcomponents()
+  std::unique_ptr<::executorch::extension::llm::TextDecoderRunner>
+      text_decoder_runner_;
+  std::unique_ptr<::executorch::extension::llm::TextPrefiller> text_prefiller_;
+  std::unique_ptr<::executorch::extension::llm::TextTokenGenerator>
+      text_token_generator_;
+};
+
+} // namespace example

From 6703559d48ed00cb9858769f8c8a8563b2aec8db Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Tue, 3 Mar 2026 13:14:18 +0100
Subject: [PATCH 35/46] feat: add MultimodalRunner with plug-in encoder map

---
 .../common/rnexecutorch/tests/CMakeLists.txt  |   4 +-
 .../tests/integration/LLMTest.cpp             |  21 +++
 .../common/runner/multimodal_runner.cpp       | 121 ++++++++++++++++++
 .../common/runner/multimodal_runner.h         |  57 +++++++++
 4 files changed, 202 insertions(+), 1 deletion(-)
 create mode 100644 packages/react-native-executorch/common/runner/multimodal_runner.cpp
 create mode 100644 packages/react-native-executorch/common/runner/multimodal_runner.h

diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
index 66e65cd6d..a077a0c5a 100644
--- a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
@@ -213,12 +213,14 @@ add_rn_test(LLMTests integration/LLMTest.cpp
         ${COMMON_DIR}/runner/runner.cpp          # keep until Task 5
         ${COMMON_DIR}/runner/base_llm_runner.cpp
         ${COMMON_DIR}/runner/text_runner.cpp
+        ${COMMON_DIR}/runner/multimodal_runner.cpp
+        ${COMMON_DIR}/runner/multimodal_prefiller.cpp
         ${COMMON_DIR}/runner/text_prefiller.cpp
         ${COMMON_DIR}/runner/text_decoder_runner.cpp
         ${COMMON_DIR}/runner/sampler.cpp
         ${COMMON_DIR}/runner/arange_util.cpp
         ${COMMON_DIR}/runner/encoders/vision_encoder.cpp   # add this
-    LIBS tokenizers_deps
+    LIBS tokenizers_deps opencv_deps
 )
 
 add_rn_test(TextToImageTests integration/TextToImageTest.cpp
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
index dd253b487..9ee8a64b2 100644
--- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
@@ -230,3 +230,24 @@ TEST(TextRunnerTest, SetTemperaturePropagatesToDecoder) {
   EXPECT_NO_THROW(runner.set_temperature(0.5f));
   EXPECT_FLOAT_EQ(runner.config_.temperature, 0.5f);
 }
+
+#include <runner/multimodal_runner.h>
+
+TEST(MultimodalRunnerTest, LoadFailsWithClearErrorWhenCapabilityMismatch) {
+  // smolLm2_135M_8da4w.pte is text-only — declaring vision capability should
+  // throw
+  auto module = std::make_unique<::executorch::extension::Module>(
+      "smolLm2_135M_8da4w.pte",
+      ::executorch::extension::Module::LoadMode::File);
+  // The encoder gets a raw pointer to the module that the runner takes over.
+  std::map<executorch::extension::llm::MultimodalType,
+           std::unique_ptr<executorch::extension::llm::IEncoder>>
+      encoders;
+  encoders[executorch::extension::llm::MultimodalType::Image] =
+      std::make_unique<executorch::extension::llm::VisionEncoder>(module.get());
+
+  example::MultimodalRunner runner(std::move(module), "smollm_tokenizer.json",
+                                   std::move(encoders));
+
+  EXPECT_THROW(runner.load(), rnexecutorch::RnExecutorchError);
+}
diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.cpp b/packages/react-native-executorch/common/runner/multimodal_runner.cpp
new file mode 100644
index 000000000..363f11d11
--- /dev/null
+++ b/packages/react-native-executorch/common/runner/multimodal_runner.cpp
@@ -0,0 +1,121 @@
+// common/runner/multimodal_runner.cpp
+#include "multimodal_runner.h"
+#include "constants.h"
+#include "util.h"
+#include <rnexecutorch/Error.h>
+
+namespace example {
+
+using namespace executorch::extension::llm;
+using ::executorch::extension::Module;
+using ::executorch::runtime::Error;
+
+MultimodalRunner::MultimodalRunner(
+    std::unique_ptr<Module> owned_module, const std::string &tokenizer_path,
+    std::map<MultimodalType, std::unique_ptr<IEncoder>> encoders,
+    const llm::GenerationConfig &config) // the module is always owned here
+    : BaseLLMRunner(nullptr, std::move(owned_module), tokenizer_path, config),
+      encoders_(std::move(encoders)) {} // loaded in load_subcomponents()
+
+bool MultimodalRunner::is_loaded() const {
+  // Core pieces must exist and report loaded before encoders are consulted.
+  const bool core_ready = mm_prefiller_ && mm_token_generator_ &&
+                          mm_prefiller_->is_method_loaded() &&
+                          mm_token_generator_->is_loaded();
+  if (!core_ready)
+    return false;
+  return std::all_of(
+      encoders_.begin(), encoders_.end(),
+      [](const auto &entry) { return entry.second->is_loaded(); });
+}
+
+Error MultimodalRunner::load_subcomponents() {
+  // Load every declared encoder. A returned error stops the load here; an
+  // encoder may also throw (e.g. when the model lacks the required method).
+  for (auto &[type, encoder] : encoders_) {
+    ET_CHECK_OK_OR_RETURN_ERROR(encoder->load());
+  }
+
+  auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>();
+  eos_ids->emplace(7); // fixed fallback; model EOS ids are not read here
+
+  mm_decoder_runner_ = std::make_unique<llm::MultimodalDecoderRunner>(
+      module_, io_manager_.get());
+  mm_prefiller_ = std::make_unique<llm::MultimodalPrefiller>(
+      module_, mm_decoder_runner_.get(), tokenizer_.get(), io_manager_.get());
+  mm_token_generator_ = std::make_unique<llm::TextTokenGenerator>(
+      tokenizer_.get(), mm_decoder_runner_.get(), /*use_kv_cache=*/true,
+      std::move(eos_ids), &stats_);
+
+  ET_CHECK_OK_OR_RETURN_ERROR(mm_prefiller_->load());
+  ET_CHECK_OK_OR_RETURN_ERROR(mm_token_generator_->load());
+  return Error::Ok;
+}
+
+Error MultimodalRunner::generate_internal(
+    const std::vector<llm::MultimodalInput> &inputs,
+    std::function<void(const std::string &)> token_callback) {
+  // Prefills every input in order, then generates from the last prefill token.
+  if (inputs.empty())
+    return Error::InvalidArgument;
+  if (!is_loaded())
+    ET_CHECK_OK_OR_RETURN_ERROR(load());
+
+  stats_.inference_start_ms = llm::time_in_ms();
+  // pos_ is handed to each prefill call (presumably advanced by it; confirm).
+  uint64_t prefill_next_token = 0;
+  for (const auto &input : inputs) {
+    auto prefill_result = mm_prefiller_->prefill(input, pos_);
+    if (!prefill_result.ok())
+      return prefill_result.error();
+    prefill_next_token = prefill_result.get();
+  }
+  // NOTE(review): pos_ is not reset here, so this is the running position.
+  stats_.first_token_ms = llm::time_in_ms();
+  stats_.prompt_eval_end_ms = llm::time_in_ms();
+  stats_.num_prompt_tokens = pos_;
+  // Whatever context is left after the current position bounds generation.
+  int32_t resolved_max_new =
+      static_cast<int32_t>(config_.max_context_length - pos_);
+  resolved_max_new = std::max(0, resolved_max_new);
+  // Generation is seeded with the token returned by the last prefill call.
+  std::vector<uint64_t> seed_tokens = {prefill_next_token};
+  auto wrapped_callback = [&](const std::string &piece) {
+    llm::safe_printf(piece.c_str());
+    fflush(stdout);
+    if (token_callback)
+      token_callback(piece);
+  };
+
+  auto generate_result = mm_token_generator_->generate(
+      seed_tokens, pos_,
+      static_cast<uint64_t>(std::max(0, resolved_max_new - 1)),
+      config_.temperature, config_.topp, wrapped_callback);
+
+  if (!generate_result.ok())
+    return generate_result.error();
+
+  int64_t num_generated = generate_result.get();
+  pos_ += num_generated;
+  stats_.inference_end_ms = llm::time_in_ms();
+  stats_.num_generated_tokens = num_generated;
+
+  return Error::Ok;
+}
+
+void MultimodalRunner::stop_impl() {
+  if (mm_token_generator_) // no-op until the generator has been created
+    mm_token_generator_->stop();
+}
+
+void MultimodalRunner::set_count_interval_impl(size_t count_interval) {
+  if (mm_token_generator_) // no-op until the generator has been created
+    mm_token_generator_->set_count_interval(count_interval);
+}
+
+void MultimodalRunner::set_time_interval_impl(size_t time_interval) {
+  if (mm_token_generator_) // no-op until the generator has been created
+    mm_token_generator_->set_time_interval(time_interval);
+}
+
+} // namespace example
diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.h b/packages/react-native-executorch/common/runner/multimodal_runner.h
new file mode 100644
index 000000000..4190127e6
--- /dev/null
+++ b/packages/react-native-executorch/common/runner/multimodal_runner.h
@@ -0,0 +1,57 @@
+// common/runner/multimodal_runner.h
+#pragma once
+
+#include "base_llm_runner.h"
+#include "encoders/iencoder.h"
+#include "multimodal_decoder_runner.h"
+#include "multimodal_input.h"
+#include "multimodal_prefiller.h"
+#include "text_token_generator.h"
+#include <map>
+
+namespace executorch::extension::llm {
+// Modality tag; MultimodalRunner keys its encoder map by this enum.
+enum class MultimodalType { Image, Audio };
+} // namespace executorch::extension::llm
+
+namespace example {
+
+class MultimodalRunner : public BaseLLMRunner { // BaseLLMRunner variant driven by MultimodalInput sequences
+public:
+  explicit MultimodalRunner(
+      std::unique_ptr<::executorch::extension::Module> owned_module, // by-value unique_ptr: the caller hands the module over
+      const std::string &tokenizer_path,
+      std::map<::executorch::extension::llm::MultimodalType,
+               std::unique_ptr<::executorch::extension::llm::IEncoder>>
+          encoders, // keyed by modality, so at most one encoder per MultimodalType
+      const ::executorch::extension::llm::GenerationConfig &config = {
+          .temperature = 0.8F, .topp = 0.9F}); // default sampling settings when no config is passed
+
+  bool is_loaded() const override;
+
+  ::executorch::runtime::Error generate_internal( // NOTE(review): presumably prefills each input in order, then generates - confirm in the .cpp
+      const std::vector<::executorch::extension::llm::MultimodalInput> &inputs,
+      std::function<void(const std::string &)> token_callback) override; // NOTE(review): presumably called per generated piece and may be empty - confirm in the .cpp
+
+protected:
+  ::executorch::runtime::Error load_subcomponents() override;
+  void stop_impl() override; // forwards to mm_token_generator_ when it exists
+  void set_temperature_impl(float) override {
+  }                                     // config_ already updated by base
+  void set_topp_impl(float) override {} // config_ already updated by base
+  void set_count_interval_impl(size_t count_interval) override;
+  void set_time_interval_impl(size_t time_interval) override;
+
+private:
+  std::map<::executorch::extension::llm::MultimodalType,
+           std::unique_ptr<::executorch::extension::llm::IEncoder>>
+      encoders_; // one entry per modality
+  std::unique_ptr<::executorch::extension::llm::MultimodalDecoderRunner>
+      mm_decoder_runner_;
+  std::unique_ptr<::executorch::extension::llm::MultimodalPrefiller>
+      mm_prefiller_; // NOTE(review): presumably what generate_internal() prefills inputs through - confirm
+  std::unique_ptr<::executorch::extension::llm::TextTokenGenerator>
+      mm_token_generator_; // may be null: the stop/interval hooks check before use
+};
+
+} // namespace example

From a1edb3c809bf1a5d91fc373225e1bb4fc1b314a4 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Tue, 3 Mar 2026 13:19:28 +0100
Subject: [PATCH 36/46] feat: wire capabilities through LLM.cpp, delete
 UnifiedRunner

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../common/rnexecutorch/models/llm/LLM.cpp    |  38 +-
 .../common/rnexecutorch/models/llm/LLM.h      |  10 +-
 .../common/rnexecutorch/tests/CMakeLists.txt  |   3 +-
 .../tests/integration/LLMTest.cpp             |  36 +-
 .../common/runner/unified_runner.cpp          | 387 ------------------
 .../common/runner/unified_runner.h            | 101 -----
 6 files changed, 46 insertions(+), 529 deletions(-)
 delete mode 100644 packages/react-native-executorch/common/runner/unified_runner.cpp
 delete mode 100644 packages/react-native-executorch/common/runner/unified_runner.h

diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
index acccedbd0..a23957bb6 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
@@ -2,10 +2,14 @@
 
 #include <executorch/extension/tensor/tensor.h>
 #include <filesystem>
+#include <map>
 #include <rnexecutorch/Error.h>
 #include <rnexecutorch/data_processing/ImageProcessing.h>
 #include <rnexecutorch/threads/GlobalThreadPool.h>
+#include <runner/encoders/vision_encoder.h>
 #include <runner/image.h>
+#include <runner/multimodal_runner.h>
+#include <runner/text_runner.h>
 
 namespace rnexecutorch::models::llm {
 namespace llm = ::executorch::extension::llm;
@@ -43,24 +47,23 @@ const llm::Image &LLM::getOrLoadImage(const std::string &path) {
 }
 
 LLM::LLM(const std::string &modelSource, const std::string &tokenizerSource,
+         std::vector<std::string> capabilities,
          std::shared_ptr<react::CallInvoker> callInvoker)
     : BaseModel(modelSource, callInvoker, Module::LoadMode::File) {
 
-  // Peek at method names to decide text vs multimodal before constructing
-  // runner
-  auto method_names_result = module_->method_names();
-  multimodal_ = method_names_result.ok() &&
-                method_names_result->count(llm::kTokenEmbeddingMethod) > 0 &&
-                method_names_result->count(llm::kTextModelMethod) > 0;
-
-  if (multimodal_) {
-    // Transfer module_ ownership to the runner (same as old MultimodalLLM)
-    runner_ = std::make_unique<example::UnifiedRunner>(
-        nullptr, std::move(module_), tokenizerSource);
+  if (capabilities.empty()) {
+    runner_ = std::make_unique<example::TextRunner>(module_.get(), nullptr,
+                                                    tokenizerSource);
   } else {
-    // Lend module_ as a raw pointer (same as old LLM)
-    runner_ = std::make_unique<example::UnifiedRunner>(module_.get(), nullptr,
-                                                       tokenizerSource);
+    std::map<llm::MultimodalType, std::unique_ptr<llm::IEncoder>> encoders;
+    for (const auto &cap : capabilities) {
+      if (cap == "vision") {
+        encoders[llm::MultimodalType::Image] =
+            std::make_unique<llm::VisionEncoder>(module_.get());
+      }
+    }
+    runner_ = std::make_unique<example::MultimodalRunner>(
+        std::move(module_), tokenizerSource, std::move(encoders));
   }
 
   auto loadResult = runner_->load();
@@ -104,7 +107,7 @@ std::string LLM::generate(std::string prompt,
     throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
                             "Runner is not loaded");
   }
-  if (!multimodal_) {
+  if (!dynamic_cast<example::MultimodalRunner *>(runner_.get())) {
     throw RnExecutorchError(
         RnExecutorchErrorCode::InvalidUserInput,
         "This is a text-only model. Call generate(prompt, cb).");
@@ -158,8 +161,7 @@ std::string LLM::generate(std::string prompt,
     }
   };
 
-  auto error =
-      runner_->generate(inputs, temperature_, topp_, -1, nativeCallback);
+  auto error = runner_->generate_internal(inputs, nativeCallback);
   if (error != Error::Ok) {
     throw RnExecutorchError(error, "Failed to generate multimodal response");
   }
@@ -242,7 +244,6 @@ void LLM::setTemperature(float temperature) {
     throw RnExecutorchError(RnExecutorchErrorCode::InvalidConfig,
                             "Temperature must be non-negative");
   }
-  temperature_ = temperature;
   runner_->set_temperature(temperature);
 }
 
@@ -255,7 +256,6 @@ void LLM::setTopp(float topp) {
     throw RnExecutorchError(RnExecutorchErrorCode::InvalidConfig,
                             "Top-p must be between 0.0 and 1.0");
   }
-  topp_ = topp;
   runner_->set_topp(topp);
 }
 
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
index 6e47dbed0..60c8bc148 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
@@ -2,12 +2,13 @@
 
 #include <memory>
 #include <string>
+#include <vector>
 
 #include <ReactCommon/CallInvoker.h>
 #include <jsi/jsi.h>
 #include <rnexecutorch/models/BaseModel.h>
+#include <runner/base_llm_runner.h>
 #include <runner/image.h>
-#include <runner/unified_runner.h>
 
 namespace rnexecutorch {
 namespace models::llm {
@@ -17,6 +18,7 @@ class LLM : public BaseModel {
 public:
   explicit LLM(const std::string &modelSource,
                const std::string &tokenizerSource,
+               std::vector<std::string> capabilities,
                std::shared_ptr<react::CallInvoker> callInvoker);
 
   // Text-only: pre-rendered prompt string
@@ -42,10 +44,7 @@ class LLM : public BaseModel {
   int32_t getMaxContextLength() const;
 
 private:
-  std::unique_ptr<example::UnifiedRunner> runner_;
-  bool multimodal_;
-  float temperature_ = 0.8f;
-  float topp_ = 0.9f;
+  std::unique_ptr<example::BaseLLMRunner> runner_;
   std::unordered_map<std::string, executorch::extension::llm::Image>
       imageCache_;
   const executorch::extension::llm::Image &
@@ -54,5 +53,6 @@ class LLM : public BaseModel {
 } // namespace models::llm
 
 REGISTER_CONSTRUCTOR(models::llm::LLM, std::string, std::string,
+                     std::vector<std::string>,
                      std::shared_ptr<react::CallInvoker>);
 } // namespace rnexecutorch
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
index a077a0c5a..56b640cc0 100644
--- a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
@@ -210,7 +210,6 @@ add_rn_test(SpeechToTextTests integration/SpeechToTextTest.cpp
 add_rn_test(LLMTests integration/LLMTest.cpp
     SOURCES
         ${RNEXECUTORCH_DIR}/models/llm/LLM.cpp
-        ${COMMON_DIR}/runner/runner.cpp          # keep until Task 5
         ${COMMON_DIR}/runner/base_llm_runner.cpp
         ${COMMON_DIR}/runner/text_runner.cpp
         ${COMMON_DIR}/runner/multimodal_runner.cpp
@@ -219,7 +218,7 @@ add_rn_test(LLMTests integration/LLMTest.cpp
         ${COMMON_DIR}/runner/text_decoder_runner.cpp
         ${COMMON_DIR}/runner/sampler.cpp
         ${COMMON_DIR}/runner/arange_util.cpp
-        ${COMMON_DIR}/runner/encoders/vision_encoder.cpp   # add this
+        ${COMMON_DIR}/runner/encoders/vision_encoder.cpp
     LIBS tokenizers_deps opencv_deps
 )
 
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
index 9ee8a64b2..cad94fa10 100644
--- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
@@ -38,12 +38,12 @@ template <> struct ModelTraits<LLM> {
   using ModelType = LLM;
 
   static ModelType createValid() {
-    return ModelType(kValidModelPath, kValidTokenizerPath,
+    return ModelType(kValidModelPath, kValidTokenizerPath, {},
                      rnexecutorch::createMockCallInvoker());
   }
 
   static ModelType createInvalid() {
-    return ModelType("nonexistent.pte", kValidTokenizerPath,
+    return ModelType("nonexistent.pte", kValidTokenizerPath, {},
                      rnexecutorch::createMockCallInvoker());
   }
 
@@ -68,18 +68,24 @@ class LLMTest : public ::testing::Test {
 };
 
 TEST(LLMCtorTests, InvalidTokenizerPathThrows) {
-  EXPECT_THROW(LLM(kValidModelPath, "nonexistent_tokenizer.json",
+  EXPECT_THROW(LLM(kValidModelPath, "nonexistent_tokenizer.json", {},
                    createMockCallInvoker()),
                RnExecutorchError);
 }
 
+TEST(LLMCtorTests, WrongCapabilitiesThrowsClearError) {
+  EXPECT_THROW(LLM(kValidModelPath, kValidTokenizerPath, {"vision"},
+                   createMockCallInvoker()), // same model the other tests load with {} capabilities
+               RnExecutorchError); // unqualified, matching InvalidTokenizerPathThrows; only the type is checked
+}
+
 TEST_F(LLMTest, GetGeneratedTokenCountInitiallyZero) {
-  LLM model(kValidModelPath, kValidTokenizerPath, mockInvoker_);
+  LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_);
   EXPECT_EQ(model.getGeneratedTokenCount(), 0);
 }
 
 TEST_F(LLMTest, SetTemperature) {
-  LLM model(kValidModelPath, kValidTokenizerPath, mockInvoker_);
+  LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_);
   // Should not throw for valid values
   EXPECT_NO_THROW(model.setTemperature(0.5f));
   EXPECT_NO_THROW(model.setTemperature(1.0f));
@@ -87,43 +93,43 @@ TEST_F(LLMTest, SetTemperature) {
 }
 
 TEST_F(LLMTest, SetTemperatureNegativeThrows) {
-  LLM model(kValidModelPath, kValidTokenizerPath, mockInvoker_);
+  LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_);
   EXPECT_THROW(model.setTemperature(-0.1f), RnExecutorchError);
 }
 
 TEST_F(LLMTest, SetTopp) {
-  LLM model(kValidModelPath, kValidTokenizerPath, mockInvoker_);
+  LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_);
   EXPECT_NO_THROW(model.setTopp(0.9f));
   EXPECT_NO_THROW(model.setTopp(0.5f));
   EXPECT_NO_THROW(model.setTopp(1.0f));
 }
 
 TEST_F(LLMTest, SetToppInvalidThrows) {
-  LLM model(kValidModelPath, kValidTokenizerPath, mockInvoker_);
+  LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_);
   EXPECT_THROW(model.setTopp(-0.1f), RnExecutorchError);
   EXPECT_THROW(model.setTopp(1.1f), RnExecutorchError);
 }
 
 TEST_F(LLMTest, SetCountInterval) {
-  LLM model(kValidModelPath, kValidTokenizerPath, mockInvoker_);
+  LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_);
   EXPECT_NO_THROW(model.setCountInterval(5));
   EXPECT_NO_THROW(model.setCountInterval(10));
 }
 
 TEST_F(LLMTest, SetTimeInterval) {
-  LLM model(kValidModelPath, kValidTokenizerPath, mockInvoker_);
+  LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_);
   EXPECT_NO_THROW(model.setTimeInterval(100));
   EXPECT_NO_THROW(model.setTimeInterval(500));
 }
 
 TEST_F(LLMTest, InterruptThrowsWhenUnloaded) {
-  LLM model(kValidModelPath, kValidTokenizerPath, mockInvoker_);
+  LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_);
   model.unload();
   EXPECT_THROW(model.interrupt(), RnExecutorchError);
 }
 
 TEST_F(LLMTest, SettersThrowWhenUnloaded) {
-  LLM model(kValidModelPath, kValidTokenizerPath, mockInvoker_);
+  LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_);
   model.unload();
   // All setters should throw when model is unloaded
   EXPECT_THROW(model.setTemperature(0.5f), RnExecutorchError);
@@ -133,7 +139,7 @@ TEST_F(LLMTest, SettersThrowWhenUnloaded) {
 }
 
 TEST_F(LLMTest, GenerateProducesValidOutput) {
-  LLM model(kValidModelPath, kValidTokenizerPath, mockInvoker_);
+  LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_);
   model.setTemperature(0.0f);
   std::string prompt =
       formatChatML(kSystemPrompt, "Repeat exactly this: `naszponcilem testy`");
@@ -142,7 +148,7 @@ TEST_F(LLMTest, GenerateProducesValidOutput) {
 }
 
 TEST_F(LLMTest, GenerateUpdatesTokenCount) {
-  LLM model(kValidModelPath, kValidTokenizerPath, mockInvoker_);
+  LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_);
   EXPECT_EQ(model.getGeneratedTokenCount(), 0);
   std::string prompt =
       formatChatML(kSystemPrompt, "Repeat exactly this: 'naszponcilem testy'");
@@ -151,7 +157,7 @@ TEST_F(LLMTest, GenerateUpdatesTokenCount) {
 }
 
 TEST_F(LLMTest, EmptyPromptThrows) {
-  LLM model(kValidModelPath, kValidTokenizerPath, mockInvoker_);
+  LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_);
   EXPECT_THROW((void)model.generate("", nullptr), RnExecutorchError);
 }
 
diff --git a/packages/react-native-executorch/common/runner/unified_runner.cpp b/packages/react-native-executorch/common/runner/unified_runner.cpp
deleted file mode 100644
index 5dcdf127c..000000000
--- a/packages/react-native-executorch/common/runner/unified_runner.cpp
+++ /dev/null
@@ -1,387 +0,0 @@
-// packages/react-native-executorch/common/runner/unified_runner.cpp
-#include "unified_runner.h"
-#include "constants.h"
-#include "util.h"
-#include <cstdint>
-#include <ctime>
-#include <rnexecutorch/Error.h>
-
-namespace example {
-
-using namespace executorch::extension::llm;
-using ::executorch::extension::Module;
-using ::executorch::runtime::Error;
-using ::executorch::runtime::Result;
-
-UnifiedRunner::UnifiedRunner(Module *module,
-                             std::unique_ptr<Module> owned_module,
-                             const std::string &tokenizer_path,
-                             const llm::GenerationConfig &config)
-    : config_(config), module_(owned_module ? owned_module.get() : module),
-      owned_module_(std::move(owned_module)), tokenizer_path_(tokenizer_path),
-      tokenizer_(std::make_unique<tokenizers::HFTokenizer>()),
-      metadata_({
-          {kEnableDynamicShape, false},
-          {kMaxSeqLen, 128},
-          {kMaxContextLen, 128},
-          {kUseKVCache, true},
-          {kUseSDPAWithKVCache, false},
-      }) {}
-
-bool UnifiedRunner::is_multimodal() const noexcept { return multimodal_; }
-
-bool UnifiedRunner::is_loaded() const {
-  if (multimodal_) {
-    return mm_prefiller_ && mm_prefiller_->is_method_loaded() &&
-           mm_token_generator_ && mm_token_generator_->is_loaded();
-  }
-  return module_->is_loaded() && tokenizer_->is_loaded() &&
-         text_decoder_runner_ && text_prefiller_ && text_token_generator_;
-}
-
-Error UnifiedRunner::load() {
-  if (is_loaded()) {
-    return Error::Ok;
-  }
-
-  auto status = tokenizer_->load(tokenizer_path_);
-  if (status != tokenizers::Error::Ok) {
-    throw rnexecutorch::RnExecutorchError(
-        rnexecutorch::RnExecutorchErrorCode::TokenizerError,
-        "Unexpected issue occurred while loading tokenizer");
-  }
-
-  // Detect mode by inspecting method names
-  const auto method_names =
-      ET_UNWRAP(module_->method_names(), "Failed reading method names");
-
-  multimodal_ = method_names.count(kTokenEmbeddingMethod) > 0 &&
-                method_names.count(kTextModelMethod) > 0;
-
-  // Load metadata
-  metadata_[kVocabSize] = tokenizer_->vocab_size();
-  for (auto &pair : metadata_) {
-    const auto &method_name = pair.first;
-    auto &value = pair.second;
-    if (method_names.count(method_name)) {
-      value = ET_UNWRAP(module_->get(method_name))
-                  .toScalar()
-                  .to<decltype(metadata_)::mapped_type>();
-    }
-    ET_LOG(Info, "Metadata: %s = %" PRId64, method_name.c_str(), value);
-  }
-
-  if (config_.max_seq_len < 0)
-    config_.max_seq_len = static_cast<int32_t>(metadata_.at(kMaxSeqLen));
-  if (config_.max_context_length < 0) {
-    config_.max_context_length =
-        method_names.count(kMaxContextLen)
-            ? static_cast<int32_t>(metadata_.at(kMaxContextLen))
-            : static_cast<int32_t>(metadata_.at(kMaxSeqLen));
-  }
-  if (config_.max_new_tokens < 0)
-    config_.max_new_tokens =
-        std::min(config_.max_seq_len, config_.max_context_length);
-  if (config_.enable_dynamic_shape)
-    config_.enable_dynamic_shape =
-        static_cast<bool>(metadata_.at(kEnableDynamicShape));
-  if (config_.enable_kv_cache)
-    config_.enable_kv_cache = static_cast<bool>(metadata_.at(kUseKVCache));
-
-  // Load EOS ids
-  auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>();
-  if (method_names.count(kEosIds)) {
-    for (const auto &eos_id : ET_UNWRAP(module_->execute(kEosIds))) {
-      eos_ids->emplace(static_cast<uint64_t>(eos_id.toScalar().to<int64_t>()));
-    }
-  }
-  if (eos_ids->empty()) {
-    eos_ids->emplace(7); // fallback <|im_end|>
-  }
-
-  io_manager_ = std::make_unique<llm::IOManager>(*module_);
-  llm::Stats *stats_ptr = &stats_;
-
-  if (multimodal_) {
-    mm_decoder_runner_ = std::make_unique<llm::MultimodalDecoderRunner>(
-        module_, io_manager_.get());
-    mm_prefiller_ = std::make_unique<llm::MultimodalPrefiller>(
-        module_, mm_decoder_runner_.get(), tokenizer_.get(), io_manager_.get());
-    mm_token_generator_ = std::make_unique<llm::TextTokenGenerator>(
-        tokenizer_.get(), mm_decoder_runner_.get(), /*use_kv_cache=*/true,
-        std::move(eos_ids), stats_ptr);
-
-    ET_CHECK_OK_OR_RETURN_ERROR(mm_prefiller_->load());
-    ET_CHECK_OK_OR_RETURN_ERROR(mm_token_generator_->load());
-  } else {
-    ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("forward"));
-
-    text_decoder_runner_ = std::make_unique<llm::TextDecoderRunner>(
-        module_, io_manager_.get(), config_.temperature, config_.topp);
-    text_prefiller_ = std::make_unique<llm::TextPrefiller>(
-        text_decoder_runner_.get(), config_.enable_kv_cache,
-        config_.enable_dynamic_shape, config_.max_seq_len);
-    text_token_generator_ = std::make_unique<llm::TextTokenGenerator>(
-        tokenizer_.get(), text_decoder_runner_.get(), config_.enable_kv_cache,
-        std::move(eos_ids), stats_ptr);
-  }
-
-  return Error::Ok;
-}
-
-Error UnifiedRunner::generate(
-    const std::string &prompt, const llm::GenerationConfig &generation_config,
-    std::function<void(const std::string &)> token_callback,
-    std::function<void(const llm::Stats &)> stats_callback) {
-
-  ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null");
-
-  // In multimodal mode, delegate to the multimodal generate path with
-  // text-only input (no image).
-  if (multimodal_) {
-    std::vector<llm::MultimodalInput> text_inputs = {
-        llm::make_text_input(prompt)};
-    float temp =
-        generation_config.temperature >= 0.F
-            ? generation_config.temperature
-            : (config_.temperature >= 0.F ? config_.temperature : 0.8F);
-    float topp = generation_config.topp >= 0.F
-                     ? generation_config.topp
-                     : (config_.topp >= 0.F ? config_.topp : 0.9F);
-    return generate(text_inputs, temp, topp, -1, token_callback);
-  }
-
-  if (!is_loaded()) {
-    stats_.model_load_start_ms = llm::time_in_ms();
-    ET_CHECK_OK_OR_RETURN_ERROR(load());
-    stats_.model_load_end_ms = llm::time_in_ms();
-  }
-
-  std::function<void(const std::string &)> wrapped_callback =
-      [token_callback, &generation_config](const std::string &piece) {
-        if (!generation_config.warming) {
-          llm::safe_printf(piece.c_str());
-          fflush(stdout);
-        }
-        if (token_callback)
-          token_callback(piece);
-      };
-
-  stats_.inference_start_ms = llm::time_in_ms();
-  shouldStop_ = false;
-
-  int32_t max_seq_len = generation_config.max_seq_len >= 0
-                            ? generation_config.max_seq_len
-                            : config_.max_seq_len;
-  int32_t max_context_length = generation_config.max_context_length >= 0
-                                   ? generation_config.max_context_length
-                                   : config_.max_context_length;
-  int32_t new_tokens_limit = generation_config.max_new_tokens >= 0
-                                 ? generation_config.max_new_tokens
-                                 : config_.max_new_tokens;
-  float temperature = generation_config.temperature >= 0.F
-                          ? generation_config.temperature
-                          : config_.temperature;
-  float topp =
-      generation_config.topp >= 0.F ? generation_config.topp : config_.topp;
-
-  int64_t context_len_left = static_cast<int64_t>(max_context_length) - pos_;
-
-  auto encodeResult =
-      tokenizer_->encode(prompt, numOfAddedBoSTokens, numOfAddedEoSTokens);
-  if (!encodeResult.ok()) {
-    throw rnexecutorch::RnExecutorchError(
-        rnexecutorch::RnExecutorchErrorCode::TokenizerError,
-        "Unexpected issue occurred while encoding: " +
-            std::to_string(static_cast<int32_t>(encodeResult.error())));
-  }
-  std::vector<uint64_t> prompt_tokens = encodeResult.get();
-  int num_prompt_tokens = prompt_tokens.size();
-
-  ET_CHECK_OR_RETURN_ERROR(num_prompt_tokens >= 1, InvalidArgument,
-                           "Expected at least 1 prompt token");
-  ET_CHECK_OR_RETURN_ERROR(num_prompt_tokens < max_seq_len, InvalidArgument,
-                           "num_prompt_tokens %d >= max_seq_len %" PRId32,
-                           num_prompt_tokens, max_seq_len);
-
-  int32_t max_new_tokens = resolve_max_new_tokens(
-      num_prompt_tokens, max_seq_len, static_cast<int32_t>(context_len_left),
-      new_tokens_limit);
-
-  ET_CHECK_OR_RETURN_ERROR(max_new_tokens > 0, InvalidArgument,
-                           "Max new tokens %d is <= 0", max_new_tokens);
-
-  if (generation_config.echo)
-    wrapped_callback(prompt);
-
-  auto prefill_res = text_prefiller_->prefill(prompt_tokens, pos_);
-  stats_.first_token_ms = llm::time_in_ms();
-  stats_.prompt_eval_end_ms = llm::time_in_ms();
-  ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error());
-
-  uint64_t cur_token = prefill_res.get();
-  auto decodeResult = tokenizer_->decode({cur_token});
-  if (!decodeResult.ok()) {
-    throw rnexecutorch::RnExecutorchError(
-        rnexecutorch::RnExecutorchErrorCode::TokenizerError,
-        "Unexpected issue occurred while decoding: " +
-            std::to_string(static_cast<int32_t>(decodeResult.error())));
-  }
-
-  prompt_tokens.push_back(cur_token);
-  int64_t num_generated = ET_UNWRAP(
-      text_token_generator_->generate(prompt_tokens, pos_, max_new_tokens - 1,
-                                      temperature, topp, wrapped_callback));
-
-  pos_ += num_generated;
-  stats_.inference_end_ms = llm::time_in_ms();
-  stats_.num_prompt_tokens = num_prompt_tokens;
-  stats_.num_generated_tokens = num_generated;
-
-  if (stats_callback)
-    stats_callback(stats_);
-
-  return Error::Ok;
-}
-
-Error UnifiedRunner::generate(
-    const std::vector<llm::MultimodalInput> &inputs, float temperature,
-    float topp, int32_t max_new_tokens,
-    std::function<void(const std::string &)> token_callback) {
-
-  ET_CHECK_MSG(multimodal_,
-               "generate(MultimodalInput) called on a text-only runner. Use "
-               "generate(string) instead.");
-
-  if (inputs.empty()) {
-    ET_LOG(Error, "MultimodalInput vector cannot be empty");
-    return Error::InvalidArgument;
-  }
-
-  if (!is_loaded())
-    ET_CHECK_OK_OR_RETURN_ERROR(load());
-
-  stats_.inference_start_ms = llm::time_in_ms();
-
-  uint64_t prefill_next_token = 0;
-  for (size_t i = 0; i < inputs.size(); ++i) {
-    auto prefill_result = mm_prefiller_->prefill(inputs[i], pos_);
-    if (!prefill_result.ok())
-      return prefill_result.error();
-    prefill_next_token = prefill_result.get();
-  }
-
-  stats_.first_token_ms = llm::time_in_ms();
-  stats_.prompt_eval_end_ms = llm::time_in_ms();
-  stats_.num_prompt_tokens = pos_;
-
-  int32_t resolved_max_new =
-      max_new_tokens > 0
-          ? max_new_tokens
-          : static_cast<int32_t>(config_.max_context_length - pos_);
-  resolved_max_new = std::max(0, resolved_max_new);
-
-  std::vector<uint64_t> seed_tokens = {prefill_next_token};
-  auto wrapped_callback = [&](const std::string &piece) {
-    llm::safe_printf(piece.c_str());
-    fflush(stdout);
-    if (token_callback)
-      token_callback(piece);
-  };
-
-  auto generate_result = mm_token_generator_->generate(
-      seed_tokens, pos_,
-      static_cast<uint64_t>(std::max(0, resolved_max_new - 1)), temperature,
-      topp, wrapped_callback);
-
-  if (!generate_result.ok())
-    return generate_result.error();
-
-  int64_t num_generated = generate_result.get();
-  pos_ += num_generated;
-
-  stats_.inference_end_ms = llm::time_in_ms();
-  stats_.num_generated_tokens = num_generated;
-
-  return Error::Ok;
-}
-
-void UnifiedRunner::stop() {
-  if (multimodal_) {
-    if (mm_token_generator_)
-      mm_token_generator_->stop();
-  } else {
-    if (text_token_generator_)
-      text_token_generator_->stop();
-  }
-}
-
-void UnifiedRunner::reset() {
-  stats_.reset();
-  pos_ = 0;
-}
-
-int32_t UnifiedRunner::count_text_tokens(const std::string &text) const {
-  auto encodeResult =
-      tokenizer_->encode(text, numOfAddedBoSTokens, numOfAddedEoSTokens);
-  if (!encodeResult.ok()) {
-    throw rnexecutorch::RnExecutorchError(
-        rnexecutorch::RnExecutorchErrorCode::TokenizerError,
-        "Encoding failed during token count check.");
-  }
-  return static_cast<int32_t>(encodeResult.get().size());
-}
-
-int32_t UnifiedRunner::get_max_context_length() const {
-  if (!is_loaded()) {
-    return static_cast<int32_t>(metadata_.at(kMaxContextLen));
-  }
-  return config_.max_context_length;
-}
-
-void UnifiedRunner::set_temperature(float temperature) noexcept {
-  config_.temperature = temperature;
-  if (text_decoder_runner_)
-    text_decoder_runner_->set_temperature(temperature);
-}
-
-void UnifiedRunner::set_topp(float topp) noexcept {
-  config_.topp = topp;
-  if (text_decoder_runner_)
-    text_decoder_runner_->set_topp(topp);
-}
-
-void UnifiedRunner::set_count_interval(size_t count_interval) {
-  if (text_token_generator_)
-    text_token_generator_->set_count_interval(count_interval);
-  if (mm_token_generator_)
-    mm_token_generator_->set_count_interval(count_interval);
-}
-
-void UnifiedRunner::set_time_interval(size_t time_interval) {
-  if (text_token_generator_)
-    text_token_generator_->set_time_interval(time_interval);
-  if (mm_token_generator_)
-    mm_token_generator_->set_time_interval(time_interval);
-}
-
-int32_t UnifiedRunner::resolve_max_new_tokens(int32_t num_prompt_tokens,
-                                              int32_t max_seq_len,
-                                              int32_t max_context_len,
-                                              int32_t max_new_tokens) const {
-  int32_t result;
-  if (max_seq_len == -1 && max_new_tokens == -1) {
-    result = max_context_len - num_prompt_tokens;
-  } else if (max_seq_len == -1) {
-    result = std::min(max_new_tokens, max_context_len - num_prompt_tokens);
-  } else if (max_new_tokens == -1) {
-    result = std::min(max_seq_len, max_context_len) - num_prompt_tokens;
-  } else {
-    result =
-        std::min(std::min(max_seq_len, max_context_len) - num_prompt_tokens,
-                 max_new_tokens);
-  }
-  return std::max(0, result);
-}
-
-} // namespace example
diff --git a/packages/react-native-executorch/common/runner/unified_runner.h b/packages/react-native-executorch/common/runner/unified_runner.h
deleted file mode 100644
index ae7789bbe..000000000
--- a/packages/react-native-executorch/common/runner/unified_runner.h
+++ /dev/null
@@ -1,101 +0,0 @@
-// packages/react-native-executorch/common/runner/unified_runner.h
-#pragma once
-
-#include "irunner.h"
-#include "multimodal_decoder_runner.h"
-#include "multimodal_input.h"
-#include "multimodal_prefiller.h"
-#include "stats.h"
-#include "text_decoder_runner.h"
-#include "text_prefiller.h"
-#include "text_token_generator.h"
-#include <cstdint>
-#include <executorch/extension/module/module.h>
-#include <functional>
-#include <memory>
-#include <optional>
-#include <pytorch/tokenizers/hf_tokenizer.h>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-namespace example {
-
-namespace llm = ::executorch::extension::llm;
-
-class UnifiedRunner {
-public:
-  // module: raw pointer borrowed from BaseModel (text mode uses this)
-  // owned_module: unique_ptr taken for multimodal mode (nullptr in text mode)
-  // tokenizer_path: path to tokenizer JSON
-  // config: generation defaults
-  explicit UnifiedRunner(
-      ::executorch::extension::Module *module,
-      std::unique_ptr<::executorch::extension::Module> owned_module,
-      const std::string &tokenizer_path,
-      const llm::GenerationConfig &config = {.temperature = 0.8F,
-                                             .topp = 0.9F});
-
-  bool is_multimodal() const noexcept;
-  bool is_loaded() const;
-  ::executorch::runtime::Error load();
-
-  // Text-only generate — mirrors Runner::generate signature
-  ::executorch::runtime::Error
-  generate(const std::string &prompt,
-           const llm::GenerationConfig &generation_config = {},
-           std::function<void(const std::string &)> token_callback = {},
-           std::function<void(const llm::Stats &)> stats_callback = {});
-
-  // Multimodal generate — mirrors MultimodalRunner::generate signature
-  ::executorch::runtime::Error
-  generate(const std::vector<llm::MultimodalInput> &inputs, float temperature,
-           float topp, int32_t max_new_tokens,
-           std::function<void(const std::string &)> token_callback = {});
-
-  void stop();
-  void reset();
-
-  // Available for both modes
-  int32_t count_text_tokens(const std::string &text) const;
-  int32_t get_max_context_length() const;
-  void set_temperature(float temperature) noexcept;
-  void set_topp(float topp) noexcept;
-  void set_count_interval(size_t count_interval);
-  void set_time_interval(size_t time_interval);
-
-  llm::Stats stats_;
-
-private:
-  int32_t resolve_max_new_tokens(int32_t num_prompt_tokens, int32_t max_seq_len,
-                                 int32_t max_context_len,
-                                 int32_t max_new_tokens = -1) const;
-
-  bool multimodal_{false};
-  llm::GenerationConfig config_;
-  bool shouldStop_{false};
-  int64_t pos_{0};
-
-  // module access — module_ is always a valid raw pointer
-  // In text mode: points to BaseModel's module_ (borrowed)
-  // In multimodal mode: points to owned_module_.get() (owned)
-  ::executorch::extension::Module *module_;
-  std::unique_ptr<::executorch::extension::Module> owned_module_;
-
-  std::string tokenizer_path_;
-  std::unique_ptr<tokenizers::HFTokenizer> tokenizer_;
-  std::unordered_map<std::string, int64_t> metadata_;
-  std::unique_ptr<llm::IOManager> io_manager_;
-
-  // Text-only subcomponents (null in multimodal mode)
-  std::unique_ptr<llm::TextDecoderRunner> text_decoder_runner_;
-  std::unique_ptr<llm::TextPrefiller> text_prefiller_;
-  std::unique_ptr<llm::TextTokenGenerator> text_token_generator_;
-
-  // Multimodal subcomponents (null in text mode)
-  std::unique_ptr<llm::MultimodalDecoderRunner> mm_decoder_runner_;
-  std::unique_ptr<llm::MultimodalPrefiller> mm_prefiller_;
-  std::unique_ptr<llm::TextTokenGenerator> mm_token_generator_;
-};
-
-} // namespace example

From 7076a9f403b0fe30cbcdf1b37df8835433e8fec8 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Tue, 3 Mar 2026 13:21:57 +0100
Subject: [PATCH 37/46] feat: forward capabilities from LLMController to native

---
 .../src/controllers/LLMController.ts                       | 7 ++++++-
 .../src/hooks/natural_language_processing/useLLM.ts        | 2 ++
 packages/react-native-executorch/src/index.ts              | 6 +++++-
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts
index e09646d08..c52e537e4 100644
--- a/packages/react-native-executorch/src/controllers/LLMController.ts
+++ b/packages/react-native-executorch/src/controllers/LLMController.ts
@@ -5,6 +5,7 @@ import { DEFAULT_CHAT_CONFIG } from '../constants/llmDefaults';
 import {
   ChatConfig,
   GenerationConfig,
+  LLMCapability,
   LLMTool,
   Message,
   SPECIAL_TOKENS,
@@ -74,11 +75,13 @@ export class LLMController {
     modelSource,
     tokenizerSource,
     tokenizerConfigSource,
+    capabilities,
     onDownloadProgressCallback,
   }: {
     modelSource: ResourceSource;
     tokenizerSource: ResourceSource;
     tokenizerConfigSource: ResourceSource;
+    capabilities?: readonly LLMCapability[];
     onDownloadProgressCallback?: (downloadProgress: number) => void;
   }) {
     // reset inner state when loading new model
@@ -117,7 +120,9 @@ export class LLMController {
       this.tokenizerConfig = JSON.parse(
         await ResourceFetcher.fs.readAsString(tokenizerConfigPath!)
       );
-      this.nativeModule = global.loadLLM(modelPath, tokenizerPath);
+      this.nativeModule = global.loadLLM(modelPath, tokenizerPath, [
+        ...(capabilities ?? []),
+      ]);
       this.isReadyCallback(true);
       this.onToken = (data: string) => {
         if (!data) {
diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
index f83d39352..c2fcd01bc 100644
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
@@ -61,6 +61,7 @@ export function useLLM({
           modelSource: model.modelSource,
           tokenizerSource: model.tokenizerSource,
           tokenizerConfigSource: model.tokenizerConfigSource!,
+          capabilities: model.capabilities,
           onDownloadProgressCallback: setDownloadProgress,
         });
       } catch (e) {
@@ -78,6 +79,7 @@ export function useLLM({
     model.modelSource,
     model.tokenizerSource,
     model.tokenizerConfigSource,
+    model.capabilities,
     preventLoad,
   ]);
 
diff --git a/packages/react-native-executorch/src/index.ts b/packages/react-native-executorch/src/index.ts
index dd7557ca2..3e6723b55 100644
--- a/packages/react-native-executorch/src/index.ts
+++ b/packages/react-native-executorch/src/index.ts
@@ -48,7 +48,11 @@ declare global {
   var loadImageEmbeddings: (source: string) => any;
   var loadVAD: (source: string) => any;
   var loadTextEmbeddings: (modelSource: string, tokenizerSource: string) => any;
-  var loadLLM: (modelSource: string, tokenizerSource: string) => any;
+  var loadLLM: (
+    modelSource: string,
+    tokenizerSource: string,
+    capabilities: string[]
+  ) => any;
   var loadTextToImage: (
     tokenizerSource: string,
     encoderSource: string,

From 96525bcfe208ebbe34bd715895d866fd6a96c69e Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Thu, 5 Mar 2026 13:34:54 +0100
Subject: [PATCH 38/46] feat: add logging, fix metadata application, fix module
 ownership and EOS IDs

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 apps/llm/app/multimodal_llm/index.tsx         | 12 ++-----
 .../common/rnexecutorch/models/llm/LLM.cpp    |  2 +-
 .../common/rnexecutorch/tests/CMakeLists.txt  |  1 +
 .../common/runner/base_llm_runner.cpp         | 13 ++++----
 .../common/runner/constants.h                 |  1 -
 .../common/runner/encoders/vision_encoder.cpp | 20 ++++++++++++
 .../common/runner/multimodal_runner.cpp       | 32 ++++++++++++++++++-
 .../common/runner/text_runner.cpp             | 11 +++++--
 .../common/runner/text_runner.h               |  1 -
 .../src/constants/modelUrls.ts                | 13 +-------
 .../natural_language_processing/useLLM.ts     |  4 ++-
 11 files changed, 74 insertions(+), 36 deletions(-)

diff --git a/apps/llm/app/multimodal_llm/index.tsx b/apps/llm/app/multimodal_llm/index.tsx
index 542af4740..1781684a0 100644
--- a/apps/llm/app/multimodal_llm/index.tsx
+++ b/apps/llm/app/multimodal_llm/index.tsx
@@ -13,7 +13,7 @@ import {
 } from 'react-native';
 import { launchImageLibrary } from 'react-native-image-picker';
 import { useIsFocused } from '@react-navigation/native';
-import { useLLM } from 'react-native-executorch';
+import { useLLM, LFM2_VL_1_6B_QUANTIZED } from 'react-native-executorch';
 import SendIcon from '../../assets/icons/send_icon.svg';
 import PauseIcon from '../../assets/icons/pause_icon.svg';
 import ColorPalette from '../../colors';
@@ -34,15 +34,7 @@ function MultimodalLLMScreen() {
   const { setGlobalGenerating } = useContext(GeneratingContext);
 
   const vlm = useLLM({
-    model: {
-      capabilities: ['vision'] as const,
-      modelSource:
-        'https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/lfm2p5_vl_1.6B_quantized_xnnpack.pte',
-      tokenizerSource:
-        'https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_2.5.json',
-      tokenizerConfigSource:
-        'https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_config_2_5.json',
-    },
+    model: LFM2_VL_1_6B_QUANTIZED,
   });
 
   useEffect(() => {
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
index a23957bb6..8d046d6e9 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
@@ -52,7 +52,7 @@ LLM::LLM(const std::string &modelSource, const std::string &tokenizerSource,
     : BaseModel(modelSource, callInvoker, Module::LoadMode::File) {
 
   if (capabilities.empty()) {
-    runner_ = std::make_unique<example::TextRunner>(module_.get(), nullptr,
+    runner_ = std::make_unique<example::TextRunner>(std::move(module_),
                                                     tokenizerSource);
   } else {
     std::map<llm::MultimodalType, std::unique_ptr<llm::IEncoder>> encoders;
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
index 56b640cc0..159f00159 100644
--- a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
@@ -219,6 +219,7 @@ add_rn_test(LLMTests integration/LLMTest.cpp
         ${COMMON_DIR}/runner/sampler.cpp
         ${COMMON_DIR}/runner/arange_util.cpp
         ${COMMON_DIR}/runner/encoders/vision_encoder.cpp
+        ${IMAGE_UTILS_SOURCES}
     LIBS tokenizers_deps opencv_deps
 )
 
diff --git a/packages/react-native-executorch/common/runner/base_llm_runner.cpp b/packages/react-native-executorch/common/runner/base_llm_runner.cpp
index a987528a0..fcb647ec5 100644
--- a/packages/react-native-executorch/common/runner/base_llm_runner.cpp
+++ b/packages/react-native-executorch/common/runner/base_llm_runner.cpp
@@ -4,6 +4,7 @@
 #include "util.h"
 #include <cstdint>
 #include <rnexecutorch/Error.h>
+#include <rnexecutorch/Log.h>
 
 namespace example {
 
@@ -23,7 +24,6 @@ BaseLLMRunner::BaseLLMRunner(Module *module,
           {kMaxSeqLen, 128},
           {kMaxContextLen, 128},
           {kUseKVCache, true},
-          {kUseSDPAWithKVCache, false},
       }) {}
 
 Error BaseLLMRunner::load() {
@@ -49,7 +49,8 @@ Error BaseLLMRunner::load() {
                   .toScalar()
                   .to<decltype(metadata_)::mapped_type>();
     }
-    ET_LOG(Info, "Metadata: %s = %" PRId64, method_name.c_str(), value);
+    rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
+                      "[BaseLLMRunner] Metadata:", method_name, "=", value);
   }
 
   if (config_.max_seq_len < 0)
@@ -63,11 +64,9 @@ Error BaseLLMRunner::load() {
   if (config_.max_new_tokens < 0)
     config_.max_new_tokens =
         std::min(config_.max_seq_len, config_.max_context_length);
-  if (config_.enable_dynamic_shape)
-    config_.enable_dynamic_shape =
-        static_cast<bool>(metadata_.at(kEnableDynamicShape));
-  if (config_.enable_kv_cache)
-    config_.enable_kv_cache = static_cast<bool>(metadata_.at(kUseKVCache));
+  config_.enable_dynamic_shape =
+      static_cast<bool>(metadata_.at(kEnableDynamicShape));
+  config_.enable_kv_cache = static_cast<bool>(metadata_.at(kUseKVCache));
 
   auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>();
   if (method_names.count(kEosIds)) {
diff --git a/packages/react-native-executorch/common/runner/constants.h b/packages/react-native-executorch/common/runner/constants.h
index e75466829..f1fee2347 100644
--- a/packages/react-native-executorch/common/runner/constants.h
+++ b/packages/react-native-executorch/common/runner/constants.h
@@ -17,7 +17,6 @@ inline constexpr auto kMaxSeqLen = "get_max_seq_len";
 inline constexpr auto kMaxContextLen = "get_max_context_len";
 inline constexpr auto kVocabSize = "get_vocab_size";
 inline constexpr auto kUseKVCache = "use_kv_cache";
-inline constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache";
 
 // Multimodal method name conventions
 inline constexpr auto kVisionEncoderMethod = "vision_encoder";
diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
index 35ce84c6d..0e49abe5a 100644
--- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
+++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
@@ -2,6 +2,7 @@
 #include "vision_encoder.h"
 
 #include <rnexecutorch/Error.h>
+#include <rnexecutorch/Log.h>
 #include <runner/constants.h>
 #include <runner/image.h>
 
@@ -22,12 +23,26 @@ Error VisionEncoder::load() {
   if (!method_names_result.ok()) {
     return method_names_result.error();
   }
+  rnexecutorch::log(rnexecutorch::LOG_LEVEL::Debug,
+                    "[VisionEncoder] Available methods:");
+  for (const auto &name : *method_names_result) {
+    auto val = module_->get(name);
+    if (val.ok()) {
+      rnexecutorch::log(rnexecutorch::LOG_LEVEL::Debug, " -", name, "=",
+                        val->toScalar().to<int64_t>());
+    } else {
+      rnexecutorch::log(rnexecutorch::LOG_LEVEL::Debug, " -", name);
+    }
+  }
+
   if (method_names_result->count(kVisionEncoderMethod) == 0) {
     throw rnexecutorch::RnExecutorchError(
         rnexecutorch::RnExecutorchErrorCode::InvalidConfig,
         "Model does not support vision: 'vision_encoder' method not found. "
         "Check that the .pte file matches the declared capabilities.");
   }
+  rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
+                    "[VisionEncoder] Loading method:", kVisionEncoderMethod);
   return module_->load_method(kVisionEncoderMethod);
 }
 
@@ -53,11 +68,16 @@ Result<EValue> VisionEncoder::encode(const MultimodalInput &input) {
     return input_meta_result.error();
   }
   auto expected_dims = input_meta_result->sizes();
+  rnexecutorch::log(
+      rnexecutorch::LOG_LEVEL::Debug, "[VisionEncoder] Expected input dims:",
+      std::vector<int32_t>(expected_dims.begin(), expected_dims.end()));
   auto image_tensor_result =
       image.toTensor(/*with_batch=*/expected_dims.size() == 4);
   if (!image_tensor_result.ok()) {
     return image_tensor_result.error();
   }
+  rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
+                    "[VisionEncoder] Running encode");
   auto result = module_->execute(kVisionEncoderMethod, *image_tensor_result);
   if (!result.ok()) {
     return result.error();
diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.cpp b/packages/react-native-executorch/common/runner/multimodal_runner.cpp
index 363f11d11..4960f7845 100644
--- a/packages/react-native-executorch/common/runner/multimodal_runner.cpp
+++ b/packages/react-native-executorch/common/runner/multimodal_runner.cpp
@@ -3,6 +3,7 @@
 #include "constants.h"
 #include "util.h"
 #include <rnexecutorch/Error.h>
+#include <rnexecutorch/Log.h>
 
 namespace example {
 
@@ -30,14 +31,37 @@ bool MultimodalRunner::is_loaded() const {
 }
 
 Error MultimodalRunner::load_subcomponents() {
+  rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info, "[MultimodalRunner] Loading",
+                    encoders_.size(), "encoder(s)");
   // Load and validate all declared encoders — throws on mismatch
   for (auto &[type, encoder] : encoders_) {
+    rnexecutorch::log(
+        rnexecutorch::LOG_LEVEL::Debug,
+        "[MultimodalRunner] Loading encoder type:", static_cast<int>(type));
     encoder->load();
+    rnexecutorch::log(
+        rnexecutorch::LOG_LEVEL::Info,
+        "[MultimodalRunner] Encoder loaded, type:", static_cast<int>(type));
   }
 
   llm::Stats *stats_ptr = &stats_;
   auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>();
-  eos_ids->emplace(7); // fallback
+  const auto method_names =
+      ET_UNWRAP(module_->method_names(), "Failed reading method names");
+  if (method_names.count(kEosIds)) {
+    for (const auto &eos_id : ET_UNWRAP(module_->execute(kEosIds))) {
+      eos_ids->emplace(static_cast<uint64_t>(eos_id.toScalar().to<int64_t>()));
+    }
+  }
+  if (eos_ids->empty()) {
+    rnexecutorch::log(rnexecutorch::LOG_LEVEL::Warn,
+                      "[MultimodalRunner] get_eos_ids not found in model, "
+                      "falling back to {7}");
+    eos_ids->emplace(7);
+  } else {
+    rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
+                      "[MultimodalRunner] EOS IDs loaded:", *eos_ids);
+  }
 
   mm_decoder_runner_ = std::make_unique<llm::MultimodalDecoderRunner>(
       module_, io_manager_.get());
@@ -49,6 +73,8 @@ Error MultimodalRunner::load_subcomponents() {
 
   ET_CHECK_OK_OR_RETURN_ERROR(mm_prefiller_->load());
   ET_CHECK_OK_OR_RETURN_ERROR(mm_token_generator_->load());
+  rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
+                    "[MultimodalRunner] All subcomponents loaded successfully");
   return Error::Ok;
 }
 
@@ -74,6 +100,10 @@ Error MultimodalRunner::generate_internal(
   stats_.first_token_ms = llm::time_in_ms();
   stats_.prompt_eval_end_ms = llm::time_in_ms();
   stats_.num_prompt_tokens = pos_;
+  rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
+                    "[MultimodalRunner] Prefill took",
+                    stats_.prompt_eval_end_ms - stats_.inference_start_ms,
+                    "ms for", pos_, "tokens");
 
   int32_t resolved_max_new =
       static_cast<int32_t>(config_.max_context_length - pos_);
diff --git a/packages/react-native-executorch/common/runner/text_runner.cpp b/packages/react-native-executorch/common/runner/text_runner.cpp
index 279244855..fa2225f3d 100644
--- a/packages/react-native-executorch/common/runner/text_runner.cpp
+++ b/packages/react-native-executorch/common/runner/text_runner.cpp
@@ -4,6 +4,7 @@
 #include "util.h"
 #include <cstdint>
 #include <rnexecutorch/Error.h>
+#include <rnexecutorch/Log.h>
 
 namespace example {
 
@@ -11,10 +12,10 @@ using namespace executorch::extension::llm;
 using ::executorch::extension::Module;
 using ::executorch::runtime::Error;
 
-TextRunner::TextRunner(Module *module, std::unique_ptr<Module> owned_module,
+TextRunner::TextRunner(std::unique_ptr<Module> owned_module,
                        const std::string &tokenizer_path,
                        const llm::GenerationConfig &config)
-    : BaseLLMRunner(module, std::move(owned_module), tokenizer_path, config) {}
+    : BaseLLMRunner(nullptr, std::move(owned_module), tokenizer_path, config) {}
 
 bool TextRunner::is_loaded() const {
   return module_ && module_->is_loaded() && tokenizer_ &&
@@ -43,6 +44,9 @@ Error TextRunner::load_subcomponents() {
 
   text_decoder_runner_ = std::make_unique<llm::TextDecoderRunner>(
       module_, io_manager_.get(), config_.temperature, config_.topp);
+  rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
+                    "[TextRunner] Parallel prefill (enable_dynamic_shape):",
+                    config_.enable_dynamic_shape);
   text_prefiller_ = std::make_unique<llm::TextPrefiller>(
       text_decoder_runner_.get(), config_.enable_kv_cache,
       config_.enable_dynamic_shape, config_.max_seq_len);
@@ -116,6 +120,9 @@ Error TextRunner::generate_internal(
   auto prefill_res = text_prefiller_->prefill(prompt_tokens, pos_);
   stats_.first_token_ms = llm::time_in_ms();
   stats_.prompt_eval_end_ms = llm::time_in_ms();
+  rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info, "[TextRunner] Prefill took",
+                    stats_.prompt_eval_end_ms - stats_.inference_start_ms,
+                    "ms for", num_prompt_tokens, "tokens");
   ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error());
 
   uint64_t cur_token = prefill_res.get();
diff --git a/packages/react-native-executorch/common/runner/text_runner.h b/packages/react-native-executorch/common/runner/text_runner.h
index e590f4c88..17394ee3f 100644
--- a/packages/react-native-executorch/common/runner/text_runner.h
+++ b/packages/react-native-executorch/common/runner/text_runner.h
@@ -11,7 +11,6 @@ namespace example {
 class TextRunner : public BaseLLMRunner {
 public:
   explicit TextRunner(
-      ::executorch::extension::Module *module,
       std::unique_ptr<::executorch::extension::Module> owned_module,
       const std::string &tokenizer_path,
       const ::executorch::extension::llm::GenerationConfig &config = {
diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts
index 325a13133..0dec9a6b0 100644
--- a/packages/react-native-executorch/src/constants/modelUrls.ts
+++ b/packages/react-native-executorch/src/constants/modelUrls.ts
@@ -372,21 +372,10 @@ export const LFM2_5_1_2B_INSTRUCT_QUANTIZED = {
 };
 
 // LFM2.5-VL-1.6B (Vision-Language)
-const LFM2_VL_1_6B_MODEL = `https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/lfm2p5_vl_1.6B_xnnpack.pte`;
-const LFM2_VL_1_6B_QUANTIZED_MODEL = `https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/lfm2p5_vl_1.6B_quantized_xnnpack.pte`;
+const LFM2_VL_1_6B_QUANTIZED_MODEL = `https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/lfm2_5_vl_quantized_xnnpack_v2.pte`;
 const LFM2_VL_TOKENIZER = `https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_2.5.json`;
 const LFM2_VL_TOKENIZER_CONFIG = `https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_config_2_5.json`;
 
-/**
- * @category Models - VLM
- */
-export const LFM2_VL_1_6B = {
-  capabilities: ['vision'] as const,
-  modelSource: LFM2_VL_1_6B_MODEL,
-  tokenizerSource: LFM2_VL_TOKENIZER,
-  tokenizerConfigSource: LFM2_VL_TOKENIZER_CONFIG,
-};
-
 /**
  * @category Models - VLM
  */
diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
index c2fcd01bc..877f3a02d 100644
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
@@ -33,6 +33,7 @@ export function useLLM({
   const [isGenerating, setIsGenerating] = useState(false);
   const [downloadProgress, setDownloadProgress] = useState(0);
   const [error, setError] = useState<null | RnExecutorchError>(null);
+  const capabilitiesKey = model.capabilities?.join(',') ?? '';
 
   const tokenCallback = useCallback((newToken: string) => {
     setToken(newToken);
@@ -74,12 +75,13 @@ export function useLLM({
         controllerInstance.delete();
       }
     };
+    // eslint-disable-next-line react-hooks/exhaustive-deps
   }, [
     controllerInstance,
     model.modelSource,
     model.tokenizerSource,
     model.tokenizerConfigSource,
-    model.capabilities,
+    capabilitiesKey, // intentional: serialized string to avoid array reference re-runs
     preventLoad,
   ]);
 

From b3ce27eaae5771c17a62af2205fec2710fb59bdb Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Thu, 5 Mar 2026 14:51:19 +0100
Subject: [PATCH 39/46] refactor: replace Image class with ImagePath +
 VisionEncoder embedding cache

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../common/rnexecutorch/models/llm/LLM.cpp    | 34 +------
 .../common/rnexecutorch/models/llm/LLM.h      |  5 --
 .../common/runner/encoders/vision_encoder.cpp | 65 +++++++++++---
 .../common/runner/encoders/vision_encoder.h   |  5 ++
 .../common/runner/image.h                     | 88 -------------------
 .../common/runner/multimodal_input.h          | 25 +++---
 .../common/runner/multimodal_prefiller.cpp    | 41 ++-------
 .../common/runner/multimodal_prefiller.h      |  5 +-
 .../common/runner/multimodal_runner.cpp       |  8 +-
 9 files changed, 90 insertions(+), 186 deletions(-)
 delete mode 100644 packages/react-native-executorch/common/runner/image.h

diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
index 8d046d6e9..8cdeffbe0 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
@@ -4,10 +4,8 @@
 #include <filesystem>
 #include <map>
 #include <rnexecutorch/Error.h>
-#include <rnexecutorch/data_processing/ImageProcessing.h>
 #include <rnexecutorch/threads/GlobalThreadPool.h>
 #include <runner/encoders/vision_encoder.h>
-#include <runner/image.h>
 #include <runner/multimodal_runner.h>
 #include <runner/text_runner.h>
 
@@ -18,34 +16,6 @@ using namespace facebook;
 using executorch::extension::module::Module;
 using executorch::runtime::Error;
 
-// LFM2-VL vision encoder expects [1, 3, 512, 512] NCHW float32, values [0,255]
-static constexpr int kImageSize = 512;
-static constexpr int kImageChannels = 3;
-
-static llm::Image loadImageForVLM(const std::string &imagePath) {
-  cv::Mat mat = image_processing::readImage(imagePath);
-  cv::resize(mat, mat, cv::Size(kImageSize, kImageSize));
-  cv::cvtColor(mat, mat, cv::COLOR_BGR2RGB);
-
-  std::vector<float> chw(kImageChannels * kImageSize * kImageSize);
-  const int pixelCount = kImageSize * kImageSize;
-  for (int i = 0; i < pixelCount; ++i) {
-    cv::Vec3b px = mat.at<cv::Vec3b>(i / kImageSize, i % kImageSize);
-    for (int c = 0; c < kImageChannels; ++c) {
-      chw[c * pixelCount + i] = static_cast<float>(px[c]);
-    }
-  }
-  return llm::Image(std::move(chw), kImageSize, kImageSize, kImageChannels);
-}
-
-const llm::Image &LLM::getOrLoadImage(const std::string &path) {
-  auto it = imageCache_.find(path);
-  if (it != imageCache_.end()) {
-    return it->second;
-  }
-  return imageCache_.emplace(path, loadImageForVLM(path)).first->second;
-}
-
 LLM::LLM(const std::string &modelSource, const std::string &tokenizerSource,
          std::vector<std::string> capabilities,
          std::shared_ptr<react::CallInvoker> callInvoker)
@@ -141,8 +111,7 @@ std::string LLM::generate(std::string prompt,
           RnExecutorchErrorCode::InvalidUserInput,
           "More <image> placeholders in prompt than image paths provided");
     }
-    const llm::Image &img = getOrLoadImage(imagePaths[imageIdx++]);
-    inputs.push_back(llm::make_image_input(img));
+    inputs.push_back(llm::make_image_input(imagePaths[imageIdx++]));
     searchPos = found + kImageTokenLen;
   }
 
@@ -183,7 +152,6 @@ void LLM::reset() {
                             "Can't reset a model that's not loaded");
   }
   runner_->reset();
-  imageCache_.clear();
 }
 
 size_t LLM::getGeneratedTokenCount() const noexcept {
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
index 60c8bc148..b341e3811 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
@@ -8,7 +8,6 @@
 #include <jsi/jsi.h>
 #include <rnexecutorch/models/BaseModel.h>
 #include <runner/base_llm_runner.h>
-#include <runner/image.h>
 
 namespace rnexecutorch {
 namespace models::llm {
@@ -45,10 +44,6 @@ class LLM : public BaseModel {
 
 private:
   std::unique_ptr<example::BaseLLMRunner> runner_;
-  std::unordered_map<std::string, executorch::extension::llm::Image>
-      imageCache_;
-  const executorch::extension::llm::Image &
-  getOrLoadImage(const std::string &path);
 };
 } // namespace models::llm
 
diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
index 0e49abe5a..44beb2a7c 100644
--- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
+++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
@@ -3,8 +3,11 @@
 
 #include <rnexecutorch/Error.h>
 #include <rnexecutorch/Log.h>
+#include <rnexecutorch/data_processing/ImageProcessing.h>
 #include <runner/constants.h>
-#include <runner/image.h>
+
+#include <executorch/extension/tensor/tensor.h>
+#include <opencv2/opencv.hpp>
 
 namespace executorch::extension::llm {
 
@@ -12,6 +15,10 @@ using ::executorch::runtime::Error;
 using ::executorch::runtime::EValue;
 using ::executorch::runtime::Result;
 
+// LFM2-VL vision encoder expects [1, 3, H, W] NCHW float32, values [0, 255]
+static constexpr int kImageSize = 512;
+static constexpr int kImageChannels = 3;
+
 VisionEncoder::VisionEncoder(::executorch::extension::Module *module)
     : module_(module) {}
 
@@ -57,32 +64,62 @@ Result<EValue> VisionEncoder::encode(const MultimodalInput &input) {
   if (!input.is_image()) {
     return Error::InvalidArgument;
   }
-  const Image &image = input.get_image();
+
+  const std::string &path = input.get_image_path();
+
+  // Return cached embedding if available
+  auto it = embedding_cache_.find(path);
+  if (it != embedding_cache_.end()) {
+    rnexecutorch::log(rnexecutorch::LOG_LEVEL::Debug,
+                      "[VisionEncoder] Cache hit for:", path);
+    return it->second;
+  }
+
+  // Load and preprocess image: resize → BGR→RGB → HWC uint8 → CHW float32
+  cv::Mat mat = rnexecutorch::image_processing::readImage(path);
+  cv::resize(mat, mat, cv::Size(kImageSize, kImageSize));
+  cv::cvtColor(mat, mat, cv::COLOR_BGR2RGB);
+
+  std::vector<float> chw(kImageChannels * kImageSize * kImageSize);
+  const int pixelCount = kImageSize * kImageSize;
+  for (int i = 0; i < pixelCount; ++i) {
+    cv::Vec3b px = mat.at<cv::Vec3b>(i / kImageSize, i % kImageSize);
+    for (int c = 0; c < kImageChannels; ++c) {
+      chw[c * pixelCount + i] = static_cast<float>(px[c]);
+    }
+  }
+
+  // Determine expected input shape (with or without batch dim)
   auto method_meta_result = module_->method_meta(kVisionEncoderMethod);
   if (!method_meta_result.ok()) {
     return method_meta_result.error();
   }
-  auto &method_meta = *method_meta_result;
-  auto input_meta_result = method_meta.input_tensor_meta(0);
+  auto input_meta_result = method_meta_result->input_tensor_meta(0);
   if (!input_meta_result.ok()) {
     return input_meta_result.error();
   }
   auto expected_dims = input_meta_result->sizes();
-  rnexecutorch::log(
-      rnexecutorch::LOG_LEVEL::Debug, "[VisionEncoder] Expected input dims:",
-      std::vector<int32_t>(expected_dims.begin(), expected_dims.end()));
-  auto image_tensor_result =
-      image.toTensor(/*with_batch=*/expected_dims.size() == 4);
-  if (!image_tensor_result.ok()) {
-    return image_tensor_result.error();
+  const bool with_batch = expected_dims.size() == 4;
+
+  std::vector<::executorch::aten::SizesType> sizes = {kImageChannels,
+                                                      kImageSize, kImageSize};
+  if (with_batch) {
+    sizes.insert(sizes.begin(), 1);
   }
+
+  auto image_tensor = ::executorch::extension::from_blob(
+      chw.data(), sizes, ::executorch::aten::ScalarType::Float);
+
   rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
-                    "[VisionEncoder] Running encode");
-  auto result = module_->execute(kVisionEncoderMethod, *image_tensor_result);
+                    "[VisionEncoder] Running encode for:", path);
+  auto result = module_->execute(kVisionEncoderMethod, image_tensor);
   if (!result.ok()) {
     return result.error();
   }
-  return (*result)[0];
+
+  EValue embedding = (*result)[0];
+  embedding_cache_.emplace(path, embedding);
+  return embedding;
 }
 
 } // namespace executorch::extension::llm
diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h
index 5b3dd0aec..5af0491bd 100644
--- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h
+++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h
@@ -3,7 +3,10 @@
 
 #include "iencoder.h"
 #include <executorch/extension/module/module.h>
+#include <executorch/runtime/core/evalue.h>
 #include <runner/multimodal_input.h>
+#include <string>
+#include <unordered_map>
 
 namespace executorch::extension::llm {
 
@@ -18,6 +21,8 @@ class VisionEncoder : public IEncoder {
 
 private:
   ::executorch::extension::Module *module_;
+  std::unordered_map<std::string, ::executorch::runtime::EValue>
+      embedding_cache_;
 };
 
 } // namespace executorch::extension::llm
diff --git a/packages/react-native-executorch/common/runner/image.h b/packages/react-native-executorch/common/runner/image.h
deleted file mode 100644
index 86373ca91..000000000
--- a/packages/react-native-executorch/common/runner/image.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-// Ported from executorch/extension/llm/runner/image.h
-
-#pragma once
-
-#include <cstdint>
-#include <variant>
-#include <vector>
-
-#include <executorch/extension/tensor/tensor.h>
-#include <executorch/runtime/platform/log.h>
-
-namespace executorch {
-namespace extension {
-namespace llm {
-
-class Image {
-public:
-  Image() : width_(0), height_(0), channels_(0) {}
-
-  Image(std::vector<uint8_t> &&data, int32_t width, int32_t height,
-        int32_t channels)
-      : data_(std::move(data)), width_(width), height_(height),
-        channels_(channels) {}
-
-  Image(std::vector<float> &&data, int32_t width, int32_t height,
-        int32_t channels)
-      : data_(std::move(data)), width_(width), height_(height),
-        channels_(channels) {}
-
-  int32_t width() const { return width_; }
-  int32_t height() const { return height_; }
-  int32_t channels() const { return channels_; }
-
-  bool is_uint8() const {
-    return std::holds_alternative<std::vector<uint8_t>>(data_);
-  }
-  bool is_float() const {
-    return std::holds_alternative<std::vector<float>>(data_);
-  }
-
-  const std::vector<uint8_t> &get_uint8_data() const & {
-    return std::get<std::vector<uint8_t>>(data_);
-  }
-  const std::vector<float> &get_float_data() const & {
-    return std::get<std::vector<float>>(data_);
-  }
-  std::vector<float> &get_float_data() & {
-    return std::get<std::vector<float>>(data_);
-  }
-
-  ::executorch::runtime::Result<::executorch::extension::TensorPtr>
-  toTensor(bool with_batch = false) const {
-    std::vector<::executorch::aten::SizesType> sizes = {channels(), height(),
-                                                        width()};
-    if (with_batch) {
-      sizes.insert(sizes.begin(), 1);
-    }
-    if (is_float()) {
-      return ::executorch::extension::from_blob(
-          const_cast<float *>(get_float_data().data()), sizes,
-          ::executorch::aten::ScalarType::Float);
-    } else if (is_uint8()) {
-      return ::executorch::extension::from_blob(
-          const_cast<uint8_t *>(get_uint8_data().data()), sizes,
-          ::executorch::aten::ScalarType::Byte);
-    }
-    ET_LOG(Error, "Image data is not initialized.");
-    return ::executorch::runtime::Error::NotSupported;
-  }
-
-private:
-  std::variant<std::vector<uint8_t>, std::vector<float>> data_;
-  int32_t width_;
-  int32_t height_;
-  int32_t channels_;
-};
-
-} // namespace llm
-} // namespace extension
-} // namespace executorch
diff --git a/packages/react-native-executorch/common/runner/multimodal_input.h b/packages/react-native-executorch/common/runner/multimodal_input.h
index 4ce588db6..1d56f5f5a 100644
--- a/packages/react-native-executorch/common/runner/multimodal_input.h
+++ b/packages/react-native-executorch/common/runner/multimodal_input.h
@@ -11,7 +11,6 @@
 
 #pragma once
 
-#include <runner/image.h>
 #include <string>
 #include <variant>
 #include <vector>
@@ -20,6 +19,11 @@ namespace executorch {
 namespace extension {
 namespace llm {
 
+// Tag type so MultimodalInput's variant can tell an image path from text.
+struct ImagePath {
+  std::string path; // image location, as passed to make_image_input()
+};
+
 class MultimodalInput {
 public:
   explicit MultimodalInput(const std::string &text) : data_(text) {}
@@ -28,8 +32,8 @@ class MultimodalInput {
       : data_(tokens) {}
   explicit MultimodalInput(std::vector<uint64_t> &&tokens)
       : data_(std::move(tokens)) {}
-  explicit MultimodalInput(const Image &image) : data_(image) {}
-  explicit MultimodalInput(Image &&image) : data_(std::move(image)) {}
+  explicit MultimodalInput(ImagePath image_path)
+      : data_(std::move(image_path)) {}
 
   MultimodalInput(const MultimodalInput &) = default;
   MultimodalInput &operator=(const MultimodalInput &) = default;
@@ -43,17 +47,19 @@ class MultimodalInput {
     return std::holds_alternative<std::vector<uint64_t>>(data_);
   }
   bool is_image() const noexcept {
-    return std::holds_alternative<Image>(data_);
+    return std::holds_alternative<ImagePath>(data_);
   }
 
   const std::string &get_text() const & { return std::get<std::string>(data_); }
   const std::vector<uint64_t> &get_tokens() const & {
     return std::get<std::vector<uint64_t>>(data_);
   }
-  const Image &get_image() const & { return std::get<Image>(data_); }
+  const std::string &get_image_path() const & {
+    return std::get<ImagePath>(data_).path;
+  }
 
 private:
-  std::variant<std::string, std::vector<uint64_t>, Image> data_;
+  std::variant<std::string, std::vector<uint64_t>, ImagePath> data_;
 };
 
 inline MultimodalInput make_text_input(const std::string &text) noexcept {
@@ -62,11 +68,8 @@ inline MultimodalInput make_text_input(const std::string &text) noexcept {
 inline MultimodalInput make_text_input(std::string &&text) noexcept {
   return MultimodalInput(std::move(text));
 }
-inline MultimodalInput make_image_input(const Image &image) noexcept {
-  return MultimodalInput(image);
-}
-inline MultimodalInput make_image_input(Image &&image) noexcept {
-  return MultimodalInput(std::move(image));
+inline MultimodalInput make_image_input(std::string path) noexcept {
+  return MultimodalInput(ImagePath{std::move(path)});
 }
 
 } // namespace llm
diff --git a/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp b/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp
index c39c7cc0f..a9a4715a7 100644
--- a/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp
+++ b/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp
@@ -24,9 +24,10 @@ using ::executorch::runtime::Result;
 
 MultimodalPrefiller::MultimodalPrefiller(
     Module *module, MultimodalDecoderRunner *decoder_runner,
-    tokenizers::HFTokenizer *tokenizer, IOManager *io_manager)
+    tokenizers::HFTokenizer *tokenizer, IOManager *io_manager,
+    IEncoder *image_encoder)
     : module_(module), decoder_runner_(decoder_runner), tokenizer_(tokenizer),
-      io_manager_(io_manager) {}
+      io_manager_(io_manager), image_encoder_(image_encoder) {}
 
 Result<uint64_t> MultimodalPrefiller::prefill(const MultimodalInput &input,
                                               int64_t &start_pos) {
@@ -36,37 +37,11 @@ Result<uint64_t> MultimodalPrefiller::prefill(const MultimodalInput &input,
   TensorPtr sliced_embed_storage;
 
   if (input.is_image()) {
-    const Image &image = input.get_image();
-
-    // Query input dtype expected by vision_encoder.
-    auto method_meta_result = module_->method_meta(kVisionEncoderMethod);
-    ET_CHECK_OK_OR_RETURN_ERROR(method_meta_result.error(),
-                                "Failed to get method_meta for %s",
-                                kVisionEncoderMethod);
-    auto &method_meta = *method_meta_result;
-
-    ET_CHECK_OR_RETURN_ERROR(method_meta.num_inputs() > 0, InvalidArgument,
-                             "vision_encoder has no inputs");
-    auto input_meta_result = method_meta.input_tensor_meta(0);
-    ET_CHECK_OK_OR_RETURN_ERROR(input_meta_result.error(),
-                                "Cannot get vision_encoder input meta at 0");
-    auto expected_dtype = input_meta_result->scalar_type();
-
-    ET_CHECK_OR_RETURN_ERROR(
-        expected_dtype == ::executorch::aten::ScalarType::Float &&
-            image.is_float(),
-        InvalidArgument, "vision_encoder expects float32 image data");
-
-    auto expected_dims = input_meta_result->sizes();
-    auto image_tensor_result =
-        image.toTensor(/*with_batch=*/expected_dims.size() == 4);
-    ET_CHECK_OK_OR_RETURN_ERROR(image_tensor_result.error(),
-                                "Failed to convert image to tensor");
-
-    auto image_encoder_result =
-        module_->execute(kVisionEncoderMethod, *image_tensor_result);
-    ET_CHECK_OK_OR_RETURN_ERROR(image_encoder_result.error());
-    encoder_output = (*image_encoder_result)[0];
+    ET_CHECK_OR_RETURN_ERROR(image_encoder_ != nullptr, InvalidState,
+                             "No image encoder registered");
+    auto encode_result = image_encoder_->encode(input);
+    ET_CHECK_OK_OR_RETURN_ERROR(encode_result.error(), "Image encoding failed");
+    encoder_output = *encode_result;
 
   } else if (input.is_text() || input.is_tokens()) {
     std::vector<uint64_t> tokens;
diff --git a/packages/react-native-executorch/common/runner/multimodal_prefiller.h b/packages/react-native-executorch/common/runner/multimodal_prefiller.h
index ee0f99a5b..4effee7b7 100644
--- a/packages/react-native-executorch/common/runner/multimodal_prefiller.h
+++ b/packages/react-native-executorch/common/runner/multimodal_prefiller.h
@@ -14,6 +14,7 @@
 #include "multimodal_input.h"
 #include <executorch/extension/module/module.h>
 #include <pytorch/tokenizers/hf_tokenizer.h>
+#include <runner/encoders/iencoder.h>
 
 namespace executorch {
 namespace extension {
@@ -26,7 +27,8 @@ class MultimodalPrefiller {
   explicit MultimodalPrefiller(Module *module,
                                MultimodalDecoderRunner *decoder_runner,
                                tokenizers::HFTokenizer *tokenizer,
-                               IOManager *io_manager);
+                               IOManager *io_manager,
+                               IEncoder *image_encoder = nullptr);
 
   // Prefill one input segment. Updates start_pos in-place.
   // Returns the first predicted token after this segment.
@@ -41,6 +43,7 @@ class MultimodalPrefiller {
   MultimodalDecoderRunner *decoder_runner_;
   tokenizers::HFTokenizer *tokenizer_;
   IOManager *io_manager_;
+  IEncoder *image_encoder_;
 };
 
 } // namespace llm
diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.cpp b/packages/react-native-executorch/common/runner/multimodal_runner.cpp
index 4960f7845..3d70e04b5 100644
--- a/packages/react-native-executorch/common/runner/multimodal_runner.cpp
+++ b/packages/react-native-executorch/common/runner/multimodal_runner.cpp
@@ -65,8 +65,14 @@ Error MultimodalRunner::load_subcomponents() {
 
   mm_decoder_runner_ = std::make_unique<llm::MultimodalDecoderRunner>(
       module_, io_manager_.get());
+  llm::IEncoder *image_encoder = nullptr;
+  auto enc_it = encoders_.find(llm::MultimodalType::Image);
+  if (enc_it != encoders_.end()) {
+    image_encoder = enc_it->second.get();
+  }
   mm_prefiller_ = std::make_unique<llm::MultimodalPrefiller>(
-      module_, mm_decoder_runner_.get(), tokenizer_.get(), io_manager_.get());
+      module_, mm_decoder_runner_.get(), tokenizer_.get(), io_manager_.get(),
+      image_encoder);
   mm_token_generator_ = std::make_unique<llm::TextTokenGenerator>(
       tokenizer_.get(), mm_decoder_runner_.get(), /*use_kv_cache=*/true,
       std::move(eos_ids), stats_ptr);

From ce6856d6f4ad4f26795ab6e3be3399c8667f4883 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Thu, 5 Mar 2026 14:54:57 +0100
Subject: [PATCH 40/46] test: add TextRunnerTests and VLMTests suites, register
 in CMake and run_tests.sh

---
 .../common/rnexecutorch/tests/CMakeLists.txt  |  24 ++++
 .../tests/integration/LLMTest.cpp             |  45 -------
 .../integration/MultimodalRunnerTest.cpp      | 118 ++++++++++++++++++
 .../tests/integration/TextRunnerTest.cpp      | 101 +++++++++++++++
 .../common/rnexecutorch/tests/run_tests.sh    |   4 +
 5 files changed, 247 insertions(+), 45 deletions(-)
 create mode 100644 packages/react-native-executorch/common/rnexecutorch/tests/integration/MultimodalRunnerTest.cpp
 create mode 100644 packages/react-native-executorch/common/rnexecutorch/tests/integration/TextRunnerTest.cpp

diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
index 159f00159..ebf390691 100644
--- a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
@@ -223,6 +223,30 @@ add_rn_test(LLMTests integration/LLMTest.cpp
     LIBS tokenizers_deps opencv_deps
 )
 
+add_rn_test(TextRunnerTests integration/TextRunnerTest.cpp
+    SOURCES
+        ${COMMON_DIR}/runner/base_llm_runner.cpp
+        ${COMMON_DIR}/runner/text_runner.cpp
+        ${COMMON_DIR}/runner/text_prefiller.cpp
+        ${COMMON_DIR}/runner/text_decoder_runner.cpp
+        ${COMMON_DIR}/runner/sampler.cpp
+        ${COMMON_DIR}/runner/arange_util.cpp
+    LIBS tokenizers_deps
+)
+
+add_rn_test(VLMTests integration/MultimodalRunnerTest.cpp
+    SOURCES
+        ${COMMON_DIR}/runner/base_llm_runner.cpp
+        ${COMMON_DIR}/runner/multimodal_runner.cpp
+        ${COMMON_DIR}/runner/multimodal_prefiller.cpp
+        ${COMMON_DIR}/runner/text_decoder_runner.cpp
+        ${COMMON_DIR}/runner/sampler.cpp
+        ${COMMON_DIR}/runner/arange_util.cpp
+        ${COMMON_DIR}/runner/encoders/vision_encoder.cpp
+        ${IMAGE_UTILS_SOURCES}
+    LIBS tokenizers_deps opencv_deps
+)
+
 add_rn_test(TextToImageTests integration/TextToImageTest.cpp
     SOURCES
         ${RNEXECUTORCH_DIR}/models/text_to_image/TextToImage.cpp
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
index cad94fa10..65bd1917a 100644
--- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
@@ -212,48 +212,3 @@ TEST(BaseLLMRunnerTest, ResetZerosPos) {
   runner.reset();
   EXPECT_EQ(runner.pos_, 0);
 }
-
-#include <runner/text_runner.h>
-
-TEST(TextRunnerTest, LoadsSuccessfully) {
-  auto module = std::make_unique<::executorch::extension::Module>(
-      "smolLm2_135M_8da4w.pte",
-      ::executorch::extension::Module::LoadMode::File);
-
-  example::TextRunner runner(module.get(), nullptr, "smollm_tokenizer.json");
-  auto err = runner.load();
-  EXPECT_EQ(err, ::executorch::runtime::Error::Ok);
-  EXPECT_TRUE(runner.is_loaded());
-}
-
-TEST(TextRunnerTest, SetTemperaturePropagatesToDecoder) {
-  auto module = std::make_unique<::executorch::extension::Module>(
-      "smolLm2_135M_8da4w.pte",
-      ::executorch::extension::Module::LoadMode::File);
-
-  example::TextRunner runner(module.get(), nullptr, "smollm_tokenizer.json");
-  runner.load();
-  EXPECT_NO_THROW(runner.set_temperature(0.5f));
-  EXPECT_FLOAT_EQ(runner.config_.temperature, 0.5f);
-}
-
-#include <runner/multimodal_runner.h>
-
-TEST(MultimodalRunnerTest, LoadFailsWithClearErrorWhenCapabilityMismatch) {
-  // smolLm2_135M_8da4w.pte is text-only — declaring vision capability should
-  // throw
-  auto module = std::make_unique<::executorch::extension::Module>(
-      "smolLm2_135M_8da4w.pte",
-      ::executorch::extension::Module::LoadMode::File);
-
-  std::map<executorch::extension::llm::MultimodalType,
-           std::unique_ptr<executorch::extension::llm::IEncoder>>
-      encoders;
-  encoders[executorch::extension::llm::MultimodalType::Image] =
-      std::make_unique<executorch::extension::llm::VisionEncoder>(module.get());
-
-  example::MultimodalRunner runner(std::move(module), "smollm_tokenizer.json",
-                                   std::move(encoders));
-
-  EXPECT_THROW(runner.load(), rnexecutorch::RnExecutorchError);
-}
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/MultimodalRunnerTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/MultimodalRunnerTest.cpp
new file mode 100644
index 000000000..fbd9da03c
--- /dev/null
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/MultimodalRunnerTest.cpp
@@ -0,0 +1,118 @@
+#include <gtest/gtest.h>
+#include <map>
+#include <memory>
+
+#include <executorch/extension/module/module.h>
+#include <rnexecutorch/Error.h>
+#include <runner/encoders/vision_encoder.h>
+#include <runner/multimodal_input.h>
+#include <runner/multimodal_runner.h>
+
+using ::executorch::extension::Module;
+using ::executorch::extension::llm::MultimodalType;
+using ::executorch::extension::llm::VisionEncoder;
+using ::executorch::runtime::Error;
+
+constexpr auto kTextModel = "smolLm2_135M_8da4w.pte";
+constexpr auto kTextTokenizer = "smollm_tokenizer.json";
+constexpr auto kVLMModel = "lfm2_5_vl_quantized_xnnpack_v2.pte";
+constexpr auto kVLMTokenizer = "tokenizer_2.5.json";
+constexpr auto kTestImage = "test_image.jpg";
+
+static std::map<MultimodalType,
+                std::unique_ptr<::executorch::extension::llm::IEncoder>>
+makeVisionEncoders(Module *module) {
+  using ::executorch::extension::llm::IEncoder;
+  // Single-entry registry: a VisionEncoder bound to `module`, keyed by Image.
+  std::map<MultimodalType, std::unique_ptr<IEncoder>> out;
+  out.emplace(MultimodalType::Image, std::make_unique<VisionEncoder>(module));
+  return out;
+}
+
+// ============================================================================
+// Text-only SmolLM2 (no vision_encoder method): load failure, pre-load state
+// ============================================================================
+
+TEST(MultimodalRunnerTest, LoadFailsWhenVisionEncoderMissing) {
+  auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
+  auto encoders = makeVisionEncoders(module.get()); // before std::move(module)
+  example::MultimodalRunner runner(std::move(module), kTextTokenizer,
+                                   std::move(encoders));
+  EXPECT_THROW(runner.load(), rnexecutorch::RnExecutorchError);
+}
+
+TEST(MultimodalRunnerTest, IsLoadedReturnsFalseBeforeLoad) {
+  auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
+  auto encoders = makeVisionEncoders(module.get()); // before std::move(module)
+  example::MultimodalRunner runner(std::move(module), kTextTokenizer,
+                                   std::move(encoders));
+  EXPECT_FALSE(runner.is_loaded()); // load() was never called
+}
+
+// ============================================================================
+// Integration tests (require VLM .pte)
+// ============================================================================
+
+class VLMTest : public ::testing::Test {
+protected:
+  std::unique_ptr<example::MultimodalRunner> runner_; // rebuilt in SetUp()
+
+  void SetUp() override {
+    auto module = std::make_unique<Module>(kVLMModel, Module::LoadMode::File);
+    auto encoders = makeVisionEncoders(module.get()); // before std::move
+    runner_ = std::make_unique<example::MultimodalRunner>(
+        std::move(module), kVLMTokenizer, std::move(encoders));
+    auto err = runner_->load();
+    ASSERT_EQ(err, Error::Ok) << "VLM model load failed";
+  }
+};
+
+TEST_F(VLMTest, LoadSucceedsWithRealVLMModel) {
+  EXPECT_TRUE(runner_->is_loaded()); // SetUp() already asserted load() == Ok
+}
+
+TEST_F(VLMTest, MetadataApplied_KVCache) { // presumably from .pte metadata
+  EXPECT_TRUE(runner_->config_.enable_kv_cache);
+}
+
+TEST_F(VLMTest, GenerateTextOnlyInputWorks) {
+  runner_->set_temperature(0.0f);
+  constexpr auto kPrompt =
+      "<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\n";
+  EXPECT_EQ(runner_->generate(kPrompt), Error::Ok);
+  EXPECT_GT(runner_->pos_, 0);
+}
+
+TEST_F(VLMTest, GenerateWithImageProducesTokens) {
+  namespace llm = ::executorch::extension::llm;
+  runner_->set_temperature(0.0f);
+
+  const std::vector<llm::MultimodalInput> inputs = {
+      llm::make_image_input(kTestImage),
+      llm::make_text_input("<|im_start|>user\nDescribe this image briefly."
+                           "<|im_end|>\n<|im_start|>assistant\n"),
+  };
+
+  const auto status = runner_->generate_internal(inputs, nullptr);
+  EXPECT_EQ(status, Error::Ok);
+  EXPECT_GT(runner_->pos_, 0);
+}
+
+TEST_F(VLMTest, EmbeddingCacheHitOnRepeatedImage) {
+  runner_->set_temperature(0.0f);
+
+  // First call — cache miss, runs vision_encoder
+  std::vector<::executorch::extension::llm::MultimodalInput> inputs = {
+      ::executorch::extension::llm::make_image_input(kTestImage),
+      ::executorch::extension::llm::make_text_input(
+          "<|im_start|>user\nWhat is this?<|im_end|>\n<|im_start|>assistant\n"),
+  };
+  ASSERT_EQ(runner_->generate_internal(inputs, nullptr), Error::Ok);
+  runner_->reset();
+
+  // Second call — same image path, should hit cache
+  // (no functional assertion possible without instrumenting the encoder,
+  //  but this at least verifies it doesn't crash or error)
+  auto err = runner_->generate_internal(inputs, nullptr);
+  EXPECT_EQ(err, Error::Ok);
+}
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextRunnerTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextRunnerTest.cpp
new file mode 100644
index 000000000..3253758cc
--- /dev/null
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextRunnerTest.cpp
@@ -0,0 +1,101 @@
+#include <gtest/gtest.h>
+#include <memory>
+
+#include <executorch/extension/module/module.h>
+#include <rnexecutorch/Error.h>
+#include <runner/text_runner.h>
+
+using ::executorch::extension::Module;
+using ::executorch::runtime::Error;
+
+constexpr auto kTextModel = "smolLm2_135M_8da4w.pte";
+constexpr auto kTextTokenizer = "smollm_tokenizer.json";
+constexpr auto kSystemPrompt = "You are a helpful assistant. Assist the user "
+                               "to the best of your abilities.";
+
+static std::string formatChatML(const std::string &systemPrompt,
+                                const std::string &userMessage) {
+  std::string prompt = "<|im_start|>system\n" + systemPrompt + "<|im_end|>\n";
+  prompt += "<|im_start|>user\n" + userMessage + "<|im_end|>\n";
+  return prompt += "<|im_start|>assistant\n";
+}
+
+TEST(TextRunnerTest, ConstructorAndLoadSucceeds) {
+  example::TextRunner runner(
+      std::make_unique<Module>(kTextModel, Module::LoadMode::File),
+      kTextTokenizer);
+  EXPECT_EQ(runner.load(), Error::Ok);
+  EXPECT_TRUE(runner.is_loaded());
+}
+
+TEST(TextRunnerTest, MetadataApplied_EnableDynamicShape) {
+  // SmolLM2-135M exports enable_dynamic_shape = 1
+  // After load(), config_.enable_dynamic_shape must be true (our fix)
+  auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
+  example::TextRunner runner(std::move(module), kTextTokenizer);
+  ASSERT_EQ(runner.load(), Error::Ok); // fail fast if load fails
+  EXPECT_TRUE(runner.config_.enable_dynamic_shape);
+}
+
+TEST(TextRunnerTest, MetadataApplied_KVCache) {
+  // SmolLM2-135M exports use_kv_cache = 1
+  auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
+  example::TextRunner runner(std::move(module), kTextTokenizer);
+  ASSERT_EQ(runner.load(), Error::Ok); // fail fast if load fails
+  EXPECT_TRUE(runner.config_.enable_kv_cache);
+}
+
+TEST(TextRunnerTest, SetTemperaturePropagatesAfterLoad) {
+  auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
+  example::TextRunner runner(std::move(module), kTextTokenizer);
+  ASSERT_EQ(runner.load(), Error::Ok); // the "after load" precondition
+  runner.set_temperature(0.3f);
+  EXPECT_FLOAT_EQ(runner.config_.temperature, 0.3f);
+}
+
+TEST(TextRunnerTest, ResetZerosPos) {
+  auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
+  example::TextRunner runner(std::move(module), kTextTokenizer);
+  runner.pos_ = 42; // arbitrary non-zero position
+  runner.reset();   // note: exercised without calling load()
+  EXPECT_EQ(runner.pos_, 0);
+}
+
+TEST(TextRunnerTest, GenerateProducesTokens) {
+  auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
+  example::TextRunner runner(std::move(module), kTextTokenizer);
+  ASSERT_EQ(runner.load(), Error::Ok); // fail fast if load fails
+  runner.set_temperature(0.0f);
+
+  std::string prompt = formatChatML(kSystemPrompt, "Say: hello");
+  auto err = runner.generate(prompt);
+  EXPECT_EQ(err, Error::Ok);
+  EXPECT_GT(runner.pos_, 0);
+}
+
+TEST(TextRunnerTest, ParallelPrefillEnabled) {
+  // Confirms the fix: enable_dynamic_shape from metadata now unconditionally
+  // applied
+  auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
+  example::TextRunner runner(std::move(module), kTextTokenizer);
+  ASSERT_EQ(runner.load(), Error::Ok); // fail fast if load fails
+  EXPECT_TRUE(runner.config_.enable_dynamic_shape);
+}
+
+TEST(TextRunnerTest, StopHaltsGeneration) {
+  auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
+  example::TextRunner runner(std::move(module), kTextTokenizer);
+  ASSERT_EQ(runner.load(), Error::Ok); // fail fast if load fails
+  runner.set_temperature(0.0f);
+
+  int token_count = 0;
+  std::string prompt = formatChatML(kSystemPrompt, "Count to one hundred");
+  runner.generate(prompt, {}, [&](const std::string &) {
+    token_count++;
+    if (token_count >= 3) {
+      runner.stop();
+    }
+  });
+  EXPECT_GT(token_count, 0);
+  EXPECT_LE(token_count, 5); // stopped early
+}
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh
index 360aa9d11..324841d9b 100755
--- a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh
@@ -29,6 +29,8 @@ TEST_EXECUTABLES=(
   "TokenizerModuleTests"
   "SpeechToTextTests"
   "LLMTests"
+  "TextRunnerTests"
+  "VLMTests"
   "ImageSegmentationTests"
   "TextToImageTests"
   "OCRTests"
@@ -60,6 +62,8 @@ MODELS=(
   "whisper_tokenizer.json|https://huggingface.co/software-mansion/react-native-executorch-whisper-tiny.en/resolve/v0.6.0/tokenizer.json"
   "smolLm2_135M_8da4w.pte|https://huggingface.co/software-mansion/react-native-executorch-smolLm-2/resolve/v0.6.0/smolLm-2-135M/quantized/smolLm2_135M_8da4w.pte"
   "smollm_tokenizer.json|https://huggingface.co/software-mansion/react-native-executorch-smolLm-2/resolve/v0.6.0/tokenizer.json"
+  "lfm2_5_vl_quantized_xnnpack_v2.pte|https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/lfm2_5_vl_quantized_xnnpack_v2.pte"
+  "tokenizer_2.5.json|https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_2.5.json"
   "deeplabV3_xnnpack_fp32.pte|https://huggingface.co/software-mansion/react-native-executorch-deeplab-v3/resolve/v0.6.0/xnnpack/deeplabV3_xnnpack_fp32.pte"
   "xnnpack_crnn_english.pte|https://huggingface.co/software-mansion/react-native-executorch-recognizer-crnn.en/resolve/v0.7.0/xnnpack/english/xnnpack_crnn_english.pte"
   "xnnpack_craft_quantized.pte|https://huggingface.co/software-mansion/react-native-executorch-detector-craft/resolve/v0.7.0/xnnpack/xnnpack_craft.pte"

From 4184bb3dbf132d7946643e030f6df01c3400f201 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Thu, 5 Mar 2026 15:44:42 +0100
Subject: [PATCH 41/46] refactor: unify multimodal/text paths in sendMessage,
 add getVisualTokenCount JSI

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../host_objects/ModelHostObject.h            |  5 ++
 .../common/rnexecutorch/models/llm/LLM.cpp    |  7 ++
 .../common/rnexecutorch/models/llm/LLM.h      |  1 +
 .../common/runner/base_llm_runner.h           |  1 +
 .../common/runner/encoders/iencoder.h         |  4 ++
 .../common/runner/encoders/vision_encoder.cpp | 20 ++++++
 .../common/runner/encoders/vision_encoder.h   |  1 +
 .../common/runner/multimodal_runner.cpp       |  8 +++
 .../common/runner/multimodal_runner.h         |  3 +-
 .../src/controllers/LLMController.ts          | 68 ++++++++-----------
 .../react-native-executorch/src/types/llm.ts  |  6 +-
 11 files changed, 81 insertions(+), 43 deletions(-)

diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
index a4af6eb8f..35b34ed56 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
@@ -159,6 +159,11 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
               std::string, std::vector<std::string>,
               std::shared_ptr<jsi::Function>)>(&Model::generate)>,
           "generateMultimodal"));
+
+      addFunctions(JSI_EXPORT_FUNCTION(
+          ModelHostObject<Model>,
+          synchronousHostFunction<&Model::getVisualTokenCount>,
+          "getVisualTokenCount"));
     }
 
     if constexpr (meta::SameAs<Model, models::text_to_image::TextToImage>) {
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
index 8cdeffbe0..a634b372b 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
@@ -166,6 +166,13 @@ size_t LLM::getPromptTokenCount() const noexcept {
   return runner_->stats_.num_prompt_tokens;
 }
 
+// Visual-token count reported by the runner; 0 while no runner exists or it
+// has not finished loading.
+int32_t LLM::getVisualTokenCount() const {
+  const bool ready = runner_ && runner_->is_loaded();
+  return ready ? runner_->get_visual_token_count() : 0;
+}
+
 int32_t LLM::countTextTokens(std::string text) const {
   if (!runner_ || !runner_->is_loaded()) {
     throw RnExecutorchError(
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
index b341e3811..e73b7771d 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
@@ -35,6 +35,7 @@ class LLM : public BaseModel {
   size_t getGeneratedTokenCount() const noexcept;
   size_t getPromptTokenCount() const noexcept;
   int32_t countTextTokens(std::string text) const;
+  int32_t getVisualTokenCount() const;
   size_t getMemoryLowerBound() const noexcept;
   void setCountInterval(size_t countInterval);
   void setTemperature(float temperature);
diff --git a/packages/react-native-executorch/common/runner/base_llm_runner.h b/packages/react-native-executorch/common/runner/base_llm_runner.h
index 7d2eef285..d888256ec 100644
--- a/packages/react-native-executorch/common/runner/base_llm_runner.h
+++ b/packages/react-native-executorch/common/runner/base_llm_runner.h
@@ -51,6 +51,7 @@ class BaseLLMRunner {
   void reset();
   int32_t count_text_tokens(const std::string &text) const;
   int32_t get_max_context_length() const;
+  virtual int32_t get_visual_token_count() const { return 0; }
 
   // Writes config_ then propagates to subclass impl
   void set_temperature(float temperature) noexcept;
diff --git a/packages/react-native-executorch/common/runner/encoders/iencoder.h b/packages/react-native-executorch/common/runner/encoders/iencoder.h
index 3f46ef775..78abe80ce 100644
--- a/packages/react-native-executorch/common/runner/encoders/iencoder.h
+++ b/packages/react-native-executorch/common/runner/encoders/iencoder.h
@@ -16,6 +16,10 @@ class IEncoder {
   // Encodes one input segment, returns embeddings EValue
   virtual ::executorch::runtime::Result<::executorch::runtime::EValue>
   encode(const MultimodalInput &input) = 0;
+
+  // Returns the number of tokens produced per encoded input (e.g. visual
+  // tokens per image). Returns 0 if not loaded or unknown.
+  virtual int32_t encoderTokenCount() const { return 0; }
 };
 
 } // namespace executorch::extension::llm
diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
index 44beb2a7c..191182b12 100644
--- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
+++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
@@ -57,6 +57,26 @@ bool VisionEncoder::is_loaded() const {
   return module_->is_method_loaded(kVisionEncoderMethod);
 }
 
+// Visual tokens emitted per image, read from the vision encoder's output
+// metadata. Yields 0 if the method is not loaded, the metadata cannot be
+// read, or the output has fewer than two dimensions.
+int32_t VisionEncoder::encoderTokenCount() const {
+  if (!is_loaded()) {
+    return 0;
+  }
+  auto methodMeta = module_->method_meta(kVisionEncoderMethod);
+  if (!methodMeta.ok()) {
+    return 0;
+  }
+  auto outputMeta = methodMeta->output_tensor_meta(0);
+  if (!outputMeta.ok()) {
+    return 0;
+  }
+  // Output shape is [1, num_visual_tokens, embed_dim]
+  auto dims = outputMeta->sizes();
+  return dims.size() < 2 ? 0 : static_cast<int32_t>(dims[1]);
+}
+
 Result<EValue> VisionEncoder::encode(const MultimodalInput &input) {
   if (!is_loaded()) {
     return Error::InvalidState;
diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h
index 5af0491bd..c7adb118a 100644
--- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h
+++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h
@@ -18,6 +18,7 @@ class VisionEncoder : public IEncoder {
   bool is_loaded() const override;
   ::executorch::runtime::Result<::executorch::runtime::EValue>
   encode(const MultimodalInput &input) override;
+  int32_t encoderTokenCount() const override;
 
 private:
   ::executorch::extension::Module *module_;
diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.cpp b/packages/react-native-executorch/common/runner/multimodal_runner.cpp
index 3d70e04b5..f0b836248 100644
--- a/packages/react-native-executorch/common/runner/multimodal_runner.cpp
+++ b/packages/react-native-executorch/common/runner/multimodal_runner.cpp
@@ -18,6 +18,14 @@ MultimodalRunner::MultimodalRunner(
     : BaseLLMRunner(nullptr, std::move(owned_module), tokenizer_path, config),
       encoders_(std::move(encoders)) {}
 
+// Token count reported by the registered image encoder, or 0 when no
+// encoder is registered for MultimodalType::Image.
+int32_t MultimodalRunner::get_visual_token_count() const {
+  const auto imageEncoder = encoders_.find(llm::MultimodalType::Image);
+  const bool found = imageEncoder != encoders_.end();
+  return found ? imageEncoder->second->encoderTokenCount() : 0;
+}
+
 bool MultimodalRunner::is_loaded() const {
   if (!mm_prefiller_ || !mm_token_generator_)
     return false;
diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.h b/packages/react-native-executorch/common/runner/multimodal_runner.h
index 4190127e6..6139c0fc2 100644
--- a/packages/react-native-executorch/common/runner/multimodal_runner.h
+++ b/packages/react-native-executorch/common/runner/multimodal_runner.h
@@ -28,6 +28,7 @@ class MultimodalRunner : public BaseLLMRunner {
           .temperature = 0.8F, .topp = 0.9F});
 
   bool is_loaded() const override;
+  int32_t get_visual_token_count() const override;
 
   ::executorch::runtime::Error generate_internal(
       const std::vector<::executorch::extension::llm::MultimodalInput> &inputs,
@@ -37,7 +38,7 @@ class MultimodalRunner : public BaseLLMRunner {
   ::executorch::runtime::Error load_subcomponents() override;
   void stop_impl() override;
   void set_temperature_impl(float) override {
-  }                                     // config_ already updated by base
+  } // config_ already updated by base
   void set_topp_impl(float) override {} // config_ already updated by base
   void set_count_interval_impl(size_t count_interval) override;
   void set_time_interval_impl(size_t time_interval) override;
diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts
index c52e537e4..0a4629a0b 100644
--- a/packages/react-native-executorch/src/controllers/LLMController.ts
+++ b/packages/react-native-executorch/src/controllers/LLMController.ts
@@ -284,7 +284,8 @@ export class LLMController {
 
   public async generate(
     messages: Message[],
-    tools?: LLMTool[]
+    tools?: LLMTool[],
+    imagePaths?: string[]
   ): Promise<string> {
     if (!this._isReady) {
       throw new RnExecutorchError(
@@ -312,7 +313,7 @@ export class LLMController {
       { tools_in_user_message: false, add_generation_prompt: true }
     );
 
-    return await this.forward(renderedChat);
+    return await this.forward(renderedChat, imagePaths);
   }
 
   public async sendMessage(
@@ -328,27 +329,21 @@ export class LLMController {
     const updatedHistory = [...this._messageHistory, newMessage];
     this.messageHistoryCallback(updatedHistory);
 
-    let response: string;
-
-    const isMultimodal = updatedHistory.some((m) => m.mediaPath);
-
-    // For multimodal messages, convert mediaPath into structured content so
+    // For messages with images, convert mediaPath into structured content so
     // the chat template emits <image> placeholders in the right position.
-    const historyForTemplate = isMultimodal
-      ? updatedHistory.map((m) =>
-          m.mediaPath
-            ? {
-                ...m,
-                content: [
-                  { type: 'image' },
-                  { type: 'text', text: m.content },
-                ] as any,
-              }
-            : m
-        )
-      : updatedHistory;
-
-    const IMAGE_VISUAL_TOKENS = 256;
+    const historyForTemplate = updatedHistory.map((m) =>
+      m.mediaPath
+        ? {
+            ...m,
+            content: [
+              { type: 'image' },
+              { type: 'text', text: m.content },
+            ] as any,
+          }
+        : m
+    );
+
+    const visualTokenCount = this.nativeModule.getVisualTokenCount();
     const countTokensCallback = (messages: Message[]) => {
       const rendered = this.applyChatTemplate(
         messages,
@@ -359,7 +354,7 @@ export class LLMController {
       );
       const textTokens = this.nativeModule.countTextTokens(rendered);
       const imageCount = messages.filter((m) => m.mediaPath).length;
-      return textTokens + imageCount * (IMAGE_VISUAL_TOKENS - 1);
+      return textTokens + imageCount * (visualTokenCount - 1);
     };
     const maxContextLength = this.nativeModule.getMaxContextLength();
     const messageHistoryWithPrompt =
@@ -370,24 +365,15 @@ export class LLMController {
         countTokensCallback
       );
 
-    if (isMultimodal) {
-      const renderedPrompt = this.applyChatTemplate(
-        messageHistoryWithPrompt,
-        this.tokenizerConfig,
-        this.toolsConfig?.tools,
-        // eslint-disable-next-line camelcase
-        { tools_in_user_message: false, add_generation_prompt: true }
-      );
-      const imagePaths = messageHistoryWithPrompt
-        .filter((m) => m.mediaPath)
-        .map((m) => m.mediaPath!);
-      response = await this.forward(renderedPrompt, imagePaths);
-    } else {
-      response = await this.generate(
-        messageHistoryWithPrompt,
-        this.toolsConfig?.tools
-      );
-    }
+    const imagePaths = messageHistoryWithPrompt
+      .filter((m) => m.mediaPath)
+      .map((m) => m.mediaPath!);
+
+    const response = await this.generate(
+      messageHistoryWithPrompt,
+      this.toolsConfig?.tools,
+      imagePaths.length > 0 ? imagePaths : undefined
+    );
 
     if (!this.toolsConfig || this.toolsConfig.displayToolCalls) {
       this.messageHistoryCallback([
diff --git a/packages/react-native-executorch/src/types/llm.ts b/packages/react-native-executorch/src/types/llm.ts
index aea9817bb..f906f8b3f 100644
--- a/packages/react-native-executorch/src/types/llm.ts
+++ b/packages/react-native-executorch/src/types/llm.ts
@@ -109,7 +109,11 @@ export interface LLMTypeBase {
    * @param tools - Optional array of tools that can be used during generation.
    * @returns The generated tokens as `string`.
    */
-  generate: (messages: Message[], tools?: LLMTool[]) => Promise<string>;
+  generate: (
+    messages: Message[],
+    tools?: LLMTool[],
+    imagePaths?: string[]
+  ) => Promise<string>;
   /**
    * Returns the number of total tokens from the previous generation. This is a sum of prompt tokens and generated tokens.
    *

From c88d97c30c01fda4d5b4dfd53c398b038c273dc3 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Thu, 5 Mar 2026 16:02:43 +0100
Subject: [PATCH 42/46] refactor: replace example namespace with
 rnexecutorch::llm::runner in runner classes

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../common/rnexecutorch/models/llm/LLM.cpp    | 14 +++++------
 .../common/rnexecutorch/models/llm/LLM.h      |  2 +-
 .../tests/integration/LLMTest.cpp             |  2 +-
 .../integration/MultimodalRunnerTest.cpp      | 14 +++++------
 .../tests/integration/TextRunnerTest.cpp      | 24 ++++++++++++-------
 .../common/runner/base_llm_runner.cpp         |  4 ++--
 .../common/runner/base_llm_runner.h           |  4 ++--
 .../common/runner/multimodal_runner.cpp       |  8 +++----
 .../common/runner/multimodal_runner.h         | 14 +++++------
 .../common/runner/text_runner.cpp             |  4 ++--
 .../common/runner/text_runner.h               |  4 ++--
 11 files changed, 50 insertions(+), 44 deletions(-)

diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
index a634b372b..94d4aa1ec 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
@@ -11,6 +11,7 @@
 
 namespace rnexecutorch::models::llm {
 namespace llm = ::executorch::extension::llm;
+namespace runner = ::rnexecutorch::llm::runner;
 namespace fs = std::filesystem;
 using namespace facebook;
 using executorch::extension::module::Module;
@@ -22,17 +23,17 @@ LLM::LLM(const std::string &modelSource, const std::string &tokenizerSource,
     : BaseModel(modelSource, callInvoker, Module::LoadMode::File) {
 
   if (capabilities.empty()) {
-    runner_ = std::make_unique<example::TextRunner>(std::move(module_),
-                                                    tokenizerSource);
+    runner_ = std::make_unique<runner::TextRunner>(std::move(module_),
+                                                   tokenizerSource);
   } else {
-    std::map<llm::MultimodalType, std::unique_ptr<llm::IEncoder>> encoders;
+    std::map<runner::MultimodalType, std::unique_ptr<llm::IEncoder>> encoders;
     for (const auto &cap : capabilities) {
       if (cap == "vision") {
-        encoders[llm::MultimodalType::Image] =
+        encoders[runner::MultimodalType::Image] =
             std::make_unique<llm::VisionEncoder>(module_.get());
       }
     }
-    runner_ = std::make_unique<example::MultimodalRunner>(
+    runner_ = std::make_unique<runner::MultimodalRunner>(
         std::move(module_), tokenizerSource, std::move(encoders));
   }
 
@@ -51,7 +52,6 @@ std::string LLM::generate(std::string input,
     throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
                             "Runner is not loaded");
   }
-
   std::string output;
   auto nativeCallback = [this, callback, &output](const std::string &token) {
     output += token;
@@ -77,7 +77,7 @@ std::string LLM::generate(std::string prompt,
     throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
                             "Runner is not loaded");
   }
-  if (!dynamic_cast<example::MultimodalRunner *>(runner_.get())) {
+  if (!dynamic_cast<runner::MultimodalRunner *>(runner_.get())) {
     throw RnExecutorchError(
         RnExecutorchErrorCode::InvalidUserInput,
         "This is a text-only model. Call generate(prompt, cb).");
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
index e73b7771d..514760908 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
@@ -44,7 +44,7 @@ class LLM : public BaseModel {
   int32_t getMaxContextLength() const;
 
 private:
-  std::unique_ptr<example::BaseLLMRunner> runner_;
+  std::unique_ptr<::rnexecutorch::llm::runner::BaseLLMRunner> runner_;
 };
 } // namespace models::llm
 
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
index 65bd1917a..acd667118 100644
--- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
@@ -176,7 +176,7 @@ TEST(VisionEncoderTest, LoadFailsWithClearErrorWhenMethodMissing) {
 #include <runner/base_llm_runner.h>
 
 // Minimal concrete subclass — only used in tests to verify base class behavior
-class StubRunner : public example::BaseLLMRunner {
+class StubRunner : public rnexecutorch::llm::runner::BaseLLMRunner {
 public:
   using BaseLLMRunner::BaseLLMRunner;
   bool is_loaded() const override { return loaded_; }
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/MultimodalRunnerTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/MultimodalRunnerTest.cpp
index fbd9da03c..038fa7f6e 100644
--- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/MultimodalRunnerTest.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/MultimodalRunnerTest.cpp
@@ -9,9 +9,9 @@
 #include <runner/multimodal_runner.h>
 
 using ::executorch::extension::Module;
-using ::executorch::extension::llm::MultimodalType;
 using ::executorch::extension::llm::VisionEncoder;
 using ::executorch::runtime::Error;
+using ::rnexecutorch::llm::runner::MultimodalType;
 
 constexpr auto kTextModel = "smolLm2_135M_8da4w.pte";
 constexpr auto kTextTokenizer = "smollm_tokenizer.json";
@@ -36,16 +36,16 @@ makeVisionEncoders(Module *module) {
 TEST(MultimodalRunnerTest, LoadFailsWhenVisionEncoderMissing) {
   auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
   auto encoders = makeVisionEncoders(module.get());
-  example::MultimodalRunner runner(std::move(module), kTextTokenizer,
-                                   std::move(encoders));
+  rnexecutorch::llm::runner::MultimodalRunner runner(
+      std::move(module), kTextTokenizer, std::move(encoders));
   EXPECT_THROW(runner.load(), rnexecutorch::RnExecutorchError);
 }
 
 TEST(MultimodalRunnerTest, IsLoadedReturnsFalseBeforeLoad) {
   auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
   auto encoders = makeVisionEncoders(module.get());
-  example::MultimodalRunner runner(std::move(module), kTextTokenizer,
-                                   std::move(encoders));
+  rnexecutorch::llm::runner::MultimodalRunner runner(
+      std::move(module), kTextTokenizer, std::move(encoders));
   EXPECT_FALSE(runner.is_loaded());
 }
 
@@ -55,12 +55,12 @@ TEST(MultimodalRunnerTest, IsLoadedReturnsFalseBeforeLoad) {
 
 class VLMTest : public ::testing::Test {
 protected:
-  std::unique_ptr<example::MultimodalRunner> runner_;
+  std::unique_ptr<rnexecutorch::llm::runner::MultimodalRunner> runner_;
 
   void SetUp() override {
     auto module = std::make_unique<Module>(kVLMModel, Module::LoadMode::File);
     auto encoders = makeVisionEncoders(module.get());
-    runner_ = std::make_unique<example::MultimodalRunner>(
+    runner_ = std::make_unique<rnexecutorch::llm::runner::MultimodalRunner>(
         std::move(module), kVLMTokenizer, std::move(encoders));
     auto err = runner_->load();
     ASSERT_EQ(err, Error::Ok) << "VLM model load failed";
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextRunnerTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextRunnerTest.cpp
index 3253758cc..169310ed3 100644
--- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextRunnerTest.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextRunnerTest.cpp
@@ -22,7 +22,8 @@ static std::string formatChatML(const std::string &systemPrompt,
 
 TEST(TextRunnerTest, ConstructorAndLoadSucceeds) {
   auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
-  example::TextRunner runner(std::move(module), kTextTokenizer);
+  rnexecutorch::llm::runner::TextRunner runner(std::move(module),
+                                               kTextTokenizer);
   auto err = runner.load();
   EXPECT_EQ(err, Error::Ok);
   EXPECT_TRUE(runner.is_loaded());
@@ -32,7 +33,8 @@ TEST(TextRunnerTest, MetadataApplied_EnableDynamicShape) {
   // SmolLM2-135M exports enable_dynamic_shape = 1
   // After load(), config_.enable_dynamic_shape must be true (our fix)
   auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
-  example::TextRunner runner(std::move(module), kTextTokenizer);
+  rnexecutorch::llm::runner::TextRunner runner(std::move(module),
+                                               kTextTokenizer);
   runner.load();
   EXPECT_TRUE(runner.config_.enable_dynamic_shape);
 }
@@ -40,14 +42,16 @@ TEST(TextRunnerTest, MetadataApplied_EnableDynamicShape) {
 TEST(TextRunnerTest, MetadataApplied_KVCache) {
   // SmolLM2-135M exports use_kv_cache = 1
   auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
-  example::TextRunner runner(std::move(module), kTextTokenizer);
+  rnexecutorch::llm::runner::TextRunner runner(std::move(module),
+                                               kTextTokenizer);
   runner.load();
   EXPECT_TRUE(runner.config_.enable_kv_cache);
 }
 
 TEST(TextRunnerTest, SetTemperaturePropagatesAfterLoad) {
   auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
-  example::TextRunner runner(std::move(module), kTextTokenizer);
+  rnexecutorch::llm::runner::TextRunner runner(std::move(module),
+                                               kTextTokenizer);
   runner.load();
   runner.set_temperature(0.3f);
   EXPECT_FLOAT_EQ(runner.config_.temperature, 0.3f);
@@ -55,7 +59,8 @@ TEST(TextRunnerTest, SetTemperaturePropagatesAfterLoad) {
 
 TEST(TextRunnerTest, ResetZerosPos) {
   auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
-  example::TextRunner runner(std::move(module), kTextTokenizer);
+  rnexecutorch::llm::runner::TextRunner runner(std::move(module),
+                                               kTextTokenizer);
   runner.pos_ = 42;
   runner.reset();
   EXPECT_EQ(runner.pos_, 0);
@@ -63,7 +68,8 @@ TEST(TextRunnerTest, ResetZerosPos) {
 
 TEST(TextRunnerTest, GenerateProducesTokens) {
   auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
-  example::TextRunner runner(std::move(module), kTextTokenizer);
+  rnexecutorch::llm::runner::TextRunner runner(std::move(module),
+                                               kTextTokenizer);
   runner.load();
   runner.set_temperature(0.0f);
 
@@ -77,14 +83,16 @@ TEST(TextRunnerTest, ParallelPrefillEnabled) {
   // Confirms the fix: enable_dynamic_shape from metadata now unconditionally
   // applied
   auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
-  example::TextRunner runner(std::move(module), kTextTokenizer);
+  rnexecutorch::llm::runner::TextRunner runner(std::move(module),
+                                               kTextTokenizer);
   runner.load();
   EXPECT_TRUE(runner.config_.enable_dynamic_shape);
 }
 
 TEST(TextRunnerTest, StopHaltsGeneration) {
   auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
-  example::TextRunner runner(std::move(module), kTextTokenizer);
+  rnexecutorch::llm::runner::TextRunner runner(std::move(module),
+                                               kTextTokenizer);
   runner.load();
   runner.set_temperature(0.0f);
 
diff --git a/packages/react-native-executorch/common/runner/base_llm_runner.cpp b/packages/react-native-executorch/common/runner/base_llm_runner.cpp
index fcb647ec5..37adde77a 100644
--- a/packages/react-native-executorch/common/runner/base_llm_runner.cpp
+++ b/packages/react-native-executorch/common/runner/base_llm_runner.cpp
@@ -6,7 +6,7 @@
 #include <rnexecutorch/Error.h>
 #include <rnexecutorch/Log.h>
 
-namespace example {
+namespace rnexecutorch::llm::runner {
 
 using namespace executorch::extension::llm;
 using ::executorch::extension::Module;
@@ -159,4 +159,4 @@ int32_t BaseLLMRunner::resolve_max_new_tokens(int32_t num_prompt_tokens,
   return std::max(0, result);
 }
 
-} // namespace example
+} // namespace rnexecutorch::llm::runner
diff --git a/packages/react-native-executorch/common/runner/base_llm_runner.h b/packages/react-native-executorch/common/runner/base_llm_runner.h
index d888256ec..161463580 100644
--- a/packages/react-native-executorch/common/runner/base_llm_runner.h
+++ b/packages/react-native-executorch/common/runner/base_llm_runner.h
@@ -15,7 +15,7 @@
 #include <unordered_set>
 #include <vector>
 
-namespace example {
+namespace rnexecutorch::llm::runner {
 
 namespace llm = ::executorch::extension::llm;
 
@@ -86,4 +86,4 @@ class BaseLLMRunner {
   bool shouldStop_{false};
 };
 
-} // namespace example
+} // namespace rnexecutorch::llm::runner
diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.cpp b/packages/react-native-executorch/common/runner/multimodal_runner.cpp
index f0b836248..c211ee922 100644
--- a/packages/react-native-executorch/common/runner/multimodal_runner.cpp
+++ b/packages/react-native-executorch/common/runner/multimodal_runner.cpp
@@ -5,7 +5,7 @@
 #include <rnexecutorch/Error.h>
 #include <rnexecutorch/Log.h>
 
-namespace example {
+namespace rnexecutorch::llm::runner {
 
 using namespace executorch::extension::llm;
 using ::executorch::extension::Module;
@@ -19,7 +19,7 @@ MultimodalRunner::MultimodalRunner(
       encoders_(std::move(encoders)) {}
 
 int32_t MultimodalRunner::get_visual_token_count() const {
-  auto it = encoders_.find(llm::MultimodalType::Image);
+  auto it = encoders_.find(MultimodalType::Image);
   if (it == encoders_.end()) {
     return 0;
   }
@@ -74,7 +74,7 @@ Error MultimodalRunner::load_subcomponents() {
   mm_decoder_runner_ = std::make_unique<llm::MultimodalDecoderRunner>(
       module_, io_manager_.get());
   llm::IEncoder *image_encoder = nullptr;
-  auto enc_it = encoders_.find(llm::MultimodalType::Image);
+  auto enc_it = encoders_.find(MultimodalType::Image);
   if (enc_it != encoders_.end()) {
     image_encoder = enc_it->second.get();
   }
@@ -162,4 +162,4 @@ void MultimodalRunner::set_time_interval_impl(size_t time_interval) {
     mm_token_generator_->set_time_interval(time_interval);
 }
 
-} // namespace example
+} // namespace rnexecutorch::llm::runner
diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.h b/packages/react-native-executorch/common/runner/multimodal_runner.h
index 6139c0fc2..58e676a94 100644
--- a/packages/react-native-executorch/common/runner/multimodal_runner.h
+++ b/packages/react-native-executorch/common/runner/multimodal_runner.h
@@ -9,19 +9,17 @@
 #include "text_token_generator.h"
 #include <map>
 
-namespace executorch::extension::llm {
-// Tag enum for keying encoder map
-enum class MultimodalType { Image, Audio };
-} // namespace executorch::extension::llm
+namespace rnexecutorch::llm::runner {
 
-namespace example {
+// Tag enum for keying encoder map
+enum class MultimodalType { Image };
 
 class MultimodalRunner : public BaseLLMRunner {
 public:
   explicit MultimodalRunner(
       std::unique_ptr<::executorch::extension::Module> owned_module,
       const std::string &tokenizer_path,
-      std::map<::executorch::extension::llm::MultimodalType,
+      std::map<MultimodalType,
                std::unique_ptr<::executorch::extension::llm::IEncoder>>
           encoders,
       const ::executorch::extension::llm::GenerationConfig &config = {
@@ -44,7 +42,7 @@ class MultimodalRunner : public BaseLLMRunner {
   void set_time_interval_impl(size_t time_interval) override;
 
 private:
-  std::map<::executorch::extension::llm::MultimodalType,
+  std::map<MultimodalType,
            std::unique_ptr<::executorch::extension::llm::IEncoder>>
       encoders_;
   std::unique_ptr<::executorch::extension::llm::MultimodalDecoderRunner>
@@ -55,4 +53,4 @@ class MultimodalRunner : public BaseLLMRunner {
       mm_token_generator_;
 };
 
-} // namespace example
+} // namespace rnexecutorch::llm::runner
diff --git a/packages/react-native-executorch/common/runner/text_runner.cpp b/packages/react-native-executorch/common/runner/text_runner.cpp
index fa2225f3d..d61d70c41 100644
--- a/packages/react-native-executorch/common/runner/text_runner.cpp
+++ b/packages/react-native-executorch/common/runner/text_runner.cpp
@@ -6,7 +6,7 @@
 #include <rnexecutorch/Error.h>
 #include <rnexecutorch/Log.h>
 
-namespace example {
+namespace rnexecutorch::llm::runner {
 
 using namespace executorch::extension::llm;
 using ::executorch::extension::Module;
@@ -172,4 +172,4 @@ void TextRunner::set_time_interval_impl(size_t time_interval) {
     text_token_generator_->set_time_interval(time_interval);
 }
 
-} // namespace example
+} // namespace rnexecutorch::llm::runner
diff --git a/packages/react-native-executorch/common/runner/text_runner.h b/packages/react-native-executorch/common/runner/text_runner.h
index 17394ee3f..857cf452f 100644
--- a/packages/react-native-executorch/common/runner/text_runner.h
+++ b/packages/react-native-executorch/common/runner/text_runner.h
@@ -6,7 +6,7 @@
 #include "text_prefiller.h"
 #include "text_token_generator.h"
 
-namespace example {
+namespace rnexecutorch::llm::runner {
 
 class TextRunner : public BaseLLMRunner {
 public:
@@ -38,4 +38,4 @@ class TextRunner : public BaseLLMRunner {
       text_token_generator_;
 };
 
-} // namespace example
+} // namespace rnexecutorch::llm::runner

From c7357d3b679c1f1f8ca550fb7e98c70774dcfa07 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Thu, 5 Mar 2026 16:54:13 +0100
Subject: [PATCH 43/46] refactor: collapse BaseLLMRunner constructor,
 deduplicate eos_ids, read image shape from model metadata

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../common/rnexecutorch/models/llm/LLM.cpp    |  4 +-
 .../tests/integration/LLMTest.cpp             |  4 +-
 .../common/runner/base_llm_runner.cpp         | 29 ++++--
 .../common/runner/base_llm_runner.h           | 17 ++--
 .../common/runner/encoders/vision_encoder.cpp | 93 ++++++++-----------
 .../common/runner/encoders/vision_encoder.h   |  9 ++
 .../common/runner/multimodal_runner.cpp       | 43 ++-------
 .../common/runner/multimodal_runner.h         |  9 +-
 .../common/runner/text_runner.cpp             | 23 +----
 .../common/runner/text_runner.h               |  9 +-
 10 files changed, 100 insertions(+), 140 deletions(-)

diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
index 94d4aa1ec..2dd342702 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
@@ -77,7 +77,7 @@ std::string LLM::generate(std::string prompt,
     throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
                             "Runner is not loaded");
   }
-  if (!dynamic_cast<runner::MultimodalRunner *>(runner_.get())) {
+  if (!runner_->is_multimodal()) {
     throw RnExecutorchError(
         RnExecutorchErrorCode::InvalidUserInput,
         "This is a text-only model. Call generate(prompt, cb).");
@@ -130,7 +130,7 @@ std::string LLM::generate(std::string prompt,
     }
   };
 
-  auto error = runner_->generate_internal(inputs, nativeCallback);
+  auto error = runner_->generate(inputs, nativeCallback);
   if (error != Error::Ok) {
     throw RnExecutorchError(error, "Failed to generate multimodal response");
   }
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
index acd667118..5ebb96fbe 100644
--- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp
@@ -200,14 +200,14 @@ class StubRunner : public rnexecutorch::llm::runner::BaseLLMRunner {
 };
 
 TEST(BaseLLMRunnerTest, SetTemperatureWritesConfigAndCallsImpl) {
-  StubRunner runner(nullptr, nullptr, "dummy_tokenizer.json");
+  StubRunner runner(nullptr, "dummy_tokenizer.json");
   runner.set_temperature(0.5f);
   EXPECT_FLOAT_EQ(runner.config_.temperature, 0.5f);
   EXPECT_FLOAT_EQ(runner.last_temp_, 0.5f);
 }
 
 TEST(BaseLLMRunnerTest, ResetZerosPos) {
-  StubRunner runner(nullptr, nullptr, "dummy_tokenizer.json");
+  StubRunner runner(nullptr, "dummy_tokenizer.json");
   runner.pos_ = 42;
   runner.reset();
   EXPECT_EQ(runner.pos_, 0);
diff --git a/packages/react-native-executorch/common/runner/base_llm_runner.cpp b/packages/react-native-executorch/common/runner/base_llm_runner.cpp
index 37adde77a..4a382a530 100644
--- a/packages/react-native-executorch/common/runner/base_llm_runner.cpp
+++ b/packages/react-native-executorch/common/runner/base_llm_runner.cpp
@@ -1,7 +1,6 @@
 // common/runner/base_llm_runner.cpp
 #include "base_llm_runner.h"
 #include "constants.h"
-#include "util.h"
 #include <cstdint>
 #include <rnexecutorch/Error.h>
 #include <rnexecutorch/Log.h>
@@ -12,12 +11,11 @@ using namespace executorch::extension::llm;
 using ::executorch::extension::Module;
 using ::executorch::runtime::Error;
 
-BaseLLMRunner::BaseLLMRunner(Module *module,
-                             std::unique_ptr<Module> owned_module,
+BaseLLMRunner::BaseLLMRunner(std::unique_ptr<Module> module,
                              const std::string &tokenizer_path,
                              const llm::GenerationConfig &config)
-    : config_(config), module_(owned_module ? owned_module.get() : module),
-      owned_module_(std::move(owned_module)), tokenizer_path_(tokenizer_path),
+    : config_(config), module_(std::move(module)),
+      tokenizer_path_(tokenizer_path),
       tokenizer_(std::make_unique<tokenizers::HFTokenizer>()),
       metadata_({
           {kEnableDynamicShape, false},
@@ -68,14 +66,14 @@ Error BaseLLMRunner::load() {
       static_cast<bool>(metadata_.at(kEnableDynamicShape));
   config_.enable_kv_cache = static_cast<bool>(metadata_.at(kUseKVCache));
 
-  auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>();
+  eos_ids_ = std::make_unique<std::unordered_set<uint64_t>>();
   if (method_names.count(kEosIds)) {
     for (const auto &eos_id : ET_UNWRAP(module_->execute(kEosIds))) {
-      eos_ids->emplace(static_cast<uint64_t>(eos_id.toScalar().to<int64_t>()));
+      eos_ids_->emplace(static_cast<uint64_t>(eos_id.toScalar().to<int64_t>()));
     }
   }
-  if (eos_ids->empty()) {
-    eos_ids->emplace(7); // fallback <|im_end|>
+  if (eos_ids_->empty()) {
+    eos_ids_->emplace(7); // fallback <|im_end|>
   }
 
   io_manager_ = std::make_unique<llm::IOManager>(*module_);
@@ -99,6 +97,19 @@ Error BaseLLMRunner::generate(
   return err;
 }
 
+Error BaseLLMRunner::generate(
+    const std::vector<llm::MultimodalInput> &inputs,
+    std::function<void(const std::string &)> token_callback,
+    std::function<void(const llm::Stats &)> stats_callback) {
+
+  auto err = generate_internal(inputs, token_callback);
+
+  if (stats_callback)
+    stats_callback(stats_);
+
+  return err;
+}
+
 void BaseLLMRunner::stop() { stop_impl(); }
 
 void BaseLLMRunner::reset() {
diff --git a/packages/react-native-executorch/common/runner/base_llm_runner.h b/packages/react-native-executorch/common/runner/base_llm_runner.h
index 161463580..c26a3c2d3 100644
--- a/packages/react-native-executorch/common/runner/base_llm_runner.h
+++ b/packages/react-native-executorch/common/runner/base_llm_runner.h
@@ -22,8 +22,7 @@ namespace llm = ::executorch::extension::llm;
 class BaseLLMRunner {
 public:
   explicit BaseLLMRunner(
-      ::executorch::extension::Module *module,
-      std::unique_ptr<::executorch::extension::Module> owned_module,
+      std::unique_ptr<::executorch::extension::Module> module,
       const std::string &tokenizer_path,
       const llm::GenerationConfig &config = {.temperature = 0.8F,
                                              .topp = 0.9F});
@@ -32,17 +31,19 @@ class BaseLLMRunner {
 
   virtual bool is_loaded() const = 0;
 
-  // Loads tokenizer + metadata + eos, then calls load_subcomponents()
   virtual ::executorch::runtime::Error load();
 
-  // Text convenience — wraps string in make_text_input, calls generate_internal
   ::executorch::runtime::Error
   generate(const std::string &prompt,
            const llm::GenerationConfig &generation_config = {},
            std::function<void(const std::string &)> token_callback = {},
            std::function<void(const llm::Stats &)> stats_callback = {});
 
-  // Multimodal entry point — subclasses implement this
+  ::executorch::runtime::Error
+  generate(const std::vector<llm::MultimodalInput> &inputs,
+           std::function<void(const std::string &)> token_callback = {},
+           std::function<void(const llm::Stats &)> stats_callback = {});
+
   virtual ::executorch::runtime::Error generate_internal(
       const std::vector<llm::MultimodalInput> &inputs,
       std::function<void(const std::string &)> token_callback) = 0;
@@ -51,9 +52,9 @@ class BaseLLMRunner {
   void reset();
   int32_t count_text_tokens(const std::string &text) const;
   int32_t get_max_context_length() const;
+  virtual bool is_multimodal() const { return false; }
   virtual int32_t get_visual_token_count() const { return 0; }
 
-  // Writes config_ then propagates to subclass impl
   void set_temperature(float temperature) noexcept;
   void set_topp(float topp) noexcept;
   void set_count_interval(size_t count_interval);
@@ -77,12 +78,12 @@ class BaseLLMRunner {
                                  int32_t max_context_len,
                                  int32_t max_new_tokens = -1) const;
 
-  ::executorch::extension::Module *module_;
-  std::unique_ptr<::executorch::extension::Module> owned_module_;
+  std::unique_ptr<::executorch::extension::Module> module_;
   std::string tokenizer_path_;
   std::unique_ptr<tokenizers::HFTokenizer> tokenizer_;
   std::unordered_map<std::string, int64_t> metadata_;
   std::unique_ptr<llm::IOManager> io_manager_;
+  std::unique_ptr<std::unordered_set<uint64_t>> eos_ids_;
   bool shouldStop_{false};
 };
 
diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
index 191182b12..800c76cab 100644
--- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
+++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp
@@ -15,10 +15,6 @@ using ::executorch::runtime::Error;
 using ::executorch::runtime::EValue;
 using ::executorch::runtime::Result;
 
-// LFM2-VL vision encoder expects [1, 3, H, W] NCHW float32, values [0, 255]
-static constexpr int kImageSize = 512;
-static constexpr int kImageChannels = 3;
-
 VisionEncoder::VisionEncoder(::executorch::extension::Module *module)
     : module_(module) {}
 
@@ -30,17 +26,6 @@ Error VisionEncoder::load() {
   if (!method_names_result.ok()) {
     return method_names_result.error();
   }
-  rnexecutorch::log(rnexecutorch::LOG_LEVEL::Debug,
-                    "[VisionEncoder] Available methods:");
-  for (const auto &name : *method_names_result) {
-    auto val = module_->get(name);
-    if (val.ok()) {
-      rnexecutorch::log(rnexecutorch::LOG_LEVEL::Debug, " -", name, "=",
-                        val->toScalar().to<int64_t>());
-    } else {
-      rnexecutorch::log(rnexecutorch::LOG_LEVEL::Debug, " -", name);
-    }
-  }
 
   if (method_names_result->count(kVisionEncoderMethod) == 0) {
     throw rnexecutorch::RnExecutorchError(
@@ -77,6 +62,38 @@ int32_t VisionEncoder::encoderTokenCount() const {
   return static_cast<int32_t>(sizes[1]);
 }
 
+// Reads the image layout the vision encoder expects from its method metadata.
+Result<VisionEncoder::ImageShape> VisionEncoder::getInputShape() const {
+  auto method_meta = ET_UNWRAP(module_->method_meta(kVisionEncoderMethod));
+  auto input_meta = ET_UNWRAP(method_meta.input_tensor_meta(0));
+  auto dims = input_meta.sizes();
+  // Only CHW (rank 3) or NCHW (rank 4) inputs can be indexed safely below.
+  if (dims.size() != 3 && dims.size() != 4) return Error::InvalidArgument;
+  const bool with_batch = dims.size() == 4;
+  const int32_t offset = with_batch ? 1 : 0;
+  return ImageShape{static_cast<int32_t>(dims[offset]),
+                    static_cast<int32_t>(dims[offset + 1]),
+                    static_cast<int32_t>(dims[offset + 2]), with_batch};
+}
+
+std::vector<float>
+VisionEncoder::preprocessImage(const std::string &path,
+                               const ImageShape &shape) const {
+  cv::Mat mat = rnexecutorch::image_processing::readImage(path);
+  cv::resize(mat, mat, cv::Size(shape.width, shape.height)); // to encoder size
+  cv::cvtColor(mat, mat, cv::COLOR_BGR2RGB);
+  // Repack interleaved pixels into planar CHW floats; values are not rescaled.
+  const int32_t pixelCount = shape.height * shape.width;
+  std::vector<float> chw(shape.channels * pixelCount);
+  for (int32_t i = 0; i < pixelCount; ++i) {
+    cv::Vec3b px = mat.at<cv::Vec3b>(i / shape.width, i % shape.width);
+    for (int32_t c = 0; c < shape.channels; ++c) {
+      chw[c * pixelCount + i] = static_cast<float>(px[c]);
+    } // NOTE(review): px is a Vec3b; presumably assumes channels <= 3 - confirm
+  }
+  return chw;
+}
+
 Result<EValue> VisionEncoder::encode(const MultimodalInput &input) {
   if (!is_loaded()) {
     return Error::InvalidState;
@@ -87,57 +104,25 @@ Result<EValue> VisionEncoder::encode(const MultimodalInput &input) {
 
   const std::string &path = input.get_image_path();
 
-  // Return cached embedding if available
   auto it = embedding_cache_.find(path);
   if (it != embedding_cache_.end()) {
-    rnexecutorch::log(rnexecutorch::LOG_LEVEL::Debug,
-                      "[VisionEncoder] Cache hit for:", path);
     return it->second;
   }
 
-  // Load and preprocess image: resize → BGR→RGB → HWC uint8 → CHW float32
-  cv::Mat mat = rnexecutorch::image_processing::readImage(path);
-  cv::resize(mat, mat, cv::Size(kImageSize, kImageSize));
-  cv::cvtColor(mat, mat, cv::COLOR_BGR2RGB);
-
-  std::vector<float> chw(kImageChannels * kImageSize * kImageSize);
-  const int pixelCount = kImageSize * kImageSize;
-  for (int i = 0; i < pixelCount; ++i) {
-    cv::Vec3b px = mat.at<cv::Vec3b>(i / kImageSize, i % kImageSize);
-    for (int c = 0; c < kImageChannels; ++c) {
-      chw[c * pixelCount + i] = static_cast<float>(px[c]);
-    }
-  }
+  auto shape = ET_UNWRAP(getInputShape());
+  auto chw = preprocessImage(path, shape);
 
-  // Determine expected input shape (with or without batch dim)
-  auto method_meta_result = module_->method_meta(kVisionEncoderMethod);
-  if (!method_meta_result.ok()) {
-    return method_meta_result.error();
-  }
-  auto input_meta_result = method_meta_result->input_tensor_meta(0);
-  if (!input_meta_result.ok()) {
-    return input_meta_result.error();
-  }
-  auto expected_dims = input_meta_result->sizes();
-  const bool with_batch = expected_dims.size() == 4;
-
-  std::vector<::executorch::aten::SizesType> sizes = {kImageChannels,
-                                                      kImageSize, kImageSize};
-  if (with_batch) {
+  std::vector<::executorch::aten::SizesType> sizes = {
+      shape.channels, shape.height, shape.width};
+  if (shape.with_batch) {
     sizes.insert(sizes.begin(), 1);
   }
 
   auto image_tensor = ::executorch::extension::from_blob(
       chw.data(), sizes, ::executorch::aten::ScalarType::Float);
 
-  rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
-                    "[VisionEncoder] Running encode for:", path);
-  auto result = module_->execute(kVisionEncoderMethod, image_tensor);
-  if (!result.ok()) {
-    return result.error();
-  }
-
-  EValue embedding = (*result)[0];
+  auto result = ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor));
+  EValue embedding = result[0];
   embedding_cache_.emplace(path, embedding);
   return embedding;
 }
diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h
index c7adb118a..8a54bfb6b 100644
--- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h
+++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h
@@ -21,6 +21,15 @@ class VisionEncoder : public IEncoder {
   int32_t encoderTokenCount() const override;
 
 private:
+  struct ImageShape {
+    int32_t channels, height, width;
+    bool with_batch;
+  };
+
+  ::executorch::runtime::Result<ImageShape> getInputShape() const;
+  std::vector<float> preprocessImage(const std::string &path,
+                                     const ImageShape &shape) const;
+
   ::executorch::extension::Module *module_;
   std::unordered_map<std::string, ::executorch::runtime::EValue>
       embedding_cache_;
diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.cpp b/packages/react-native-executorch/common/runner/multimodal_runner.cpp
index c211ee922..7eda70870 100644
--- a/packages/react-native-executorch/common/runner/multimodal_runner.cpp
+++ b/packages/react-native-executorch/common/runner/multimodal_runner.cpp
@@ -12,10 +12,10 @@ using ::executorch::extension::Module;
 using ::executorch::runtime::Error;
 
 MultimodalRunner::MultimodalRunner(
-    std::unique_ptr<Module> owned_module, const std::string &tokenizer_path,
+    std::unique_ptr<Module> module, const std::string &tokenizer_path,
     std::map<MultimodalType, std::unique_ptr<IEncoder>> encoders,
     const llm::GenerationConfig &config)
-    : BaseLLMRunner(nullptr, std::move(owned_module), tokenizer_path, config),
+    : BaseLLMRunner(std::move(module), tokenizer_path, config),
       encoders_(std::move(encoders)) {}
 
 int32_t MultimodalRunner::get_visual_token_count() const {
@@ -41,54 +41,29 @@ bool MultimodalRunner::is_loaded() const {
 Error MultimodalRunner::load_subcomponents() {
   rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info, "[MultimodalRunner] Loading",
                     encoders_.size(), "encoder(s)");
-  // Load and validate all declared encoders — throws on mismatch
   for (auto &[type, encoder] : encoders_) {
-    rnexecutorch::log(
-        rnexecutorch::LOG_LEVEL::Debug,
-        "[MultimodalRunner] Loading encoder type:", static_cast<int>(type));
     encoder->load();
-    rnexecutorch::log(
-        rnexecutorch::LOG_LEVEL::Info,
-        "[MultimodalRunner] Encoder loaded, type:", static_cast<int>(type));
   }
 
   llm::Stats *stats_ptr = &stats_;
-  auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>();
-  const auto method_names =
-      ET_UNWRAP(module_->method_names(), "Failed reading method names");
-  if (method_names.count(kEosIds)) {
-    for (const auto &eos_id : ET_UNWRAP(module_->execute(kEosIds))) {
-      eos_ids->emplace(static_cast<uint64_t>(eos_id.toScalar().to<int64_t>()));
-    }
-  }
-  if (eos_ids->empty()) {
-    rnexecutorch::log(rnexecutorch::LOG_LEVEL::Warn,
-                      "[MultimodalRunner] get_eos_ids not found in model, "
-                      "falling back to {7}");
-    eos_ids->emplace(7);
-  } else {
-    rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
-                      "[MultimodalRunner] EOS IDs loaded:", *eos_ids);
-  }
 
   mm_decoder_runner_ = std::make_unique<llm::MultimodalDecoderRunner>(
-      module_, io_manager_.get());
+      module_.get(), io_manager_.get());
   llm::IEncoder *image_encoder = nullptr;
   auto enc_it = encoders_.find(MultimodalType::Image);
   if (enc_it != encoders_.end()) {
     image_encoder = enc_it->second.get();
   }
   mm_prefiller_ = std::make_unique<llm::MultimodalPrefiller>(
-      module_, mm_decoder_runner_.get(), tokenizer_.get(), io_manager_.get(),
-      image_encoder);
+      module_.get(), mm_decoder_runner_.get(), tokenizer_.get(),
+      io_manager_.get(), image_encoder);
   mm_token_generator_ = std::make_unique<llm::TextTokenGenerator>(
       tokenizer_.get(), mm_decoder_runner_.get(), /*use_kv_cache=*/true,
-      std::move(eos_ids), stats_ptr);
+      std::move(eos_ids_), stats_ptr);
 
   ET_CHECK_OK_OR_RETURN_ERROR(mm_prefiller_->load());
   ET_CHECK_OK_OR_RETURN_ERROR(mm_token_generator_->load());
-  rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
-                    "[MultimodalRunner] All subcomponents loaded successfully");
+
   return Error::Ok;
 }
 
@@ -114,10 +89,6 @@ Error MultimodalRunner::generate_internal(
   stats_.first_token_ms = llm::time_in_ms();
   stats_.prompt_eval_end_ms = llm::time_in_ms();
   stats_.num_prompt_tokens = pos_;
-  rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
-                    "[MultimodalRunner] Prefill took",
-                    stats_.prompt_eval_end_ms - stats_.inference_start_ms,
-                    "ms for", pos_, "tokens");
 
   int32_t resolved_max_new =
       static_cast<int32_t>(config_.max_context_length - pos_);
diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.h b/packages/react-native-executorch/common/runner/multimodal_runner.h
index 58e676a94..f96916de3 100644
--- a/packages/react-native-executorch/common/runner/multimodal_runner.h
+++ b/packages/react-native-executorch/common/runner/multimodal_runner.h
@@ -11,13 +11,12 @@
 
 namespace rnexecutorch::llm::runner {
 
-// Tag enum for keying encoder map
 enum class MultimodalType { Image };
 
 class MultimodalRunner : public BaseLLMRunner {
 public:
   explicit MultimodalRunner(
-      std::unique_ptr<::executorch::extension::Module> owned_module,
+      std::unique_ptr<::executorch::extension::Module> module,
       const std::string &tokenizer_path,
       std::map<MultimodalType,
                std::unique_ptr<::executorch::extension::llm::IEncoder>>
@@ -26,6 +25,7 @@ class MultimodalRunner : public BaseLLMRunner {
           .temperature = 0.8F, .topp = 0.9F});
 
   bool is_loaded() const override;
+  bool is_multimodal() const override { return true; }
   int32_t get_visual_token_count() const override;
 
   ::executorch::runtime::Error generate_internal(
@@ -35,9 +35,8 @@ class MultimodalRunner : public BaseLLMRunner {
 protected:
   ::executorch::runtime::Error load_subcomponents() override;
   void stop_impl() override;
-  void set_temperature_impl(float) override {
-  } // config_ already updated by base
-  void set_topp_impl(float) override {} // config_ already updated by base
+  void set_temperature_impl(float) override {}
+  void set_topp_impl(float) override {}
   void set_count_interval_impl(size_t count_interval) override;
   void set_time_interval_impl(size_t time_interval) override;
 
diff --git a/packages/react-native-executorch/common/runner/text_runner.cpp b/packages/react-native-executorch/common/runner/text_runner.cpp
index d61d70c41..063775be4 100644
--- a/packages/react-native-executorch/common/runner/text_runner.cpp
+++ b/packages/react-native-executorch/common/runner/text_runner.cpp
@@ -12,10 +12,10 @@ using namespace executorch::extension::llm;
 using ::executorch::extension::Module;
 using ::executorch::runtime::Error;
 
-TextRunner::TextRunner(std::unique_ptr<Module> owned_module,
+TextRunner::TextRunner(std::unique_ptr<Module> module,
                        const std::string &tokenizer_path,
                        const llm::GenerationConfig &config)
-    : BaseLLMRunner(nullptr, std::move(owned_module), tokenizer_path, config) {}
+    : BaseLLMRunner(std::move(module), tokenizer_path, config) {}
 
 bool TextRunner::is_loaded() const {
   return module_ && module_->is_loaded() && tokenizer_ &&
@@ -26,24 +26,10 @@ bool TextRunner::is_loaded() const {
 Error TextRunner::load_subcomponents() {
   ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("forward"));
 
-  // Re-detect eos_ids from the module (base class built them but doesn't pass
-  // them down yet — reconstruct with the same fallback logic).
-  auto eos_ids = std::make_unique<std::unordered_set<uint64_t>>();
-  const auto method_names =
-      ET_UNWRAP(module_->method_names(), "Failed reading method names");
-  if (method_names.count(kEosIds)) {
-    for (const auto &eos_id : ET_UNWRAP(module_->execute(kEosIds))) {
-      eos_ids->emplace(static_cast<uint64_t>(eos_id.toScalar().to<int64_t>()));
-    }
-  }
-  if (eos_ids->empty()) {
-    eos_ids->emplace(7); // fallback <|im_end|>
-  }
-
   llm::Stats *stats_ptr = &stats_;
 
   text_decoder_runner_ = std::make_unique<llm::TextDecoderRunner>(
-      module_, io_manager_.get(), config_.temperature, config_.topp);
+      module_.get(), io_manager_.get(), config_.temperature, config_.topp);
   rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info,
                     "[TextRunner] Parallel prefill (enable_dynamic_shape):",
                     config_.enable_dynamic_shape);
@@ -52,7 +38,7 @@ Error TextRunner::load_subcomponents() {
       config_.enable_dynamic_shape, config_.max_seq_len);
   text_token_generator_ = std::make_unique<llm::TextTokenGenerator>(
       tokenizer_.get(), text_decoder_runner_.get(), config_.enable_kv_cache,
-      std::move(eos_ids), stats_ptr);
+      std::move(eos_ids_), stats_ptr);
 
   return Error::Ok;
 }
@@ -62,7 +48,6 @@ Error TextRunner::generate_internal(
     std::function<void(const std::string &)> token_callback) {
 
   if (inputs.empty()) {
-    ET_LOG(Error, "MultimodalInput vector cannot be empty");
     return Error::InvalidArgument;
   }
 
diff --git a/packages/react-native-executorch/common/runner/text_runner.h b/packages/react-native-executorch/common/runner/text_runner.h
index 857cf452f..5944943b9 100644
--- a/packages/react-native-executorch/common/runner/text_runner.h
+++ b/packages/react-native-executorch/common/runner/text_runner.h
@@ -10,11 +10,10 @@ namespace rnexecutorch::llm::runner {
 
 class TextRunner : public BaseLLMRunner {
 public:
-  explicit TextRunner(
-      std::unique_ptr<::executorch::extension::Module> owned_module,
-      const std::string &tokenizer_path,
-      const ::executorch::extension::llm::GenerationConfig &config = {
-          .temperature = 0.8F, .topp = 0.9F});
+  explicit TextRunner(std::unique_ptr<::executorch::extension::Module> module,
+                      const std::string &tokenizer_path,
+                      const ::executorch::extension::llm::GenerationConfig
+                          &config = {.temperature = 0.8F, .topp = 0.9F});
 
   bool is_loaded() const override;
 

From 69d454b21a2f963e612c5be832867fa05962c308 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Thu, 5 Mar 2026 16:55:02 +0100
Subject: [PATCH 44/46] refactor: comments etc.

---
 .../common/runner/encoders/iencoder.h              |  2 +-
 .../common/runner/multimodal_decoder_runner.h      | 14 ++------------
 .../common/runner/multimodal_input.h               |  1 -
 .../common/runner/multimodal_prefiller.cpp         |  1 -
 .../common/runner/multimodal_prefiller.h           | 10 ++--------
 .../src/controllers/LLMController.ts               |  2 --
 packages/react-native-executorch/src/types/llm.ts  |  5 ++---
 7 files changed, 7 insertions(+), 28 deletions(-)

diff --git a/packages/react-native-executorch/common/runner/encoders/iencoder.h b/packages/react-native-executorch/common/runner/encoders/iencoder.h
index 78abe80ce..8a6bf7e51 100644
--- a/packages/react-native-executorch/common/runner/encoders/iencoder.h
+++ b/packages/react-native-executorch/common/runner/encoders/iencoder.h
@@ -13,7 +13,7 @@ class IEncoder {
   virtual ~IEncoder() = default;
   virtual ::executorch::runtime::Error load() = 0;
   virtual bool is_loaded() const = 0;
-  // Encodes one input segment, returns embeddings EValue
+
   virtual ::executorch::runtime::Result<::executorch::runtime::EValue>
   encode(const MultimodalInput &input) = 0;
 
diff --git a/packages/react-native-executorch/common/runner/multimodal_decoder_runner.h b/packages/react-native-executorch/common/runner/multimodal_decoder_runner.h
index 2eafe3901..3b6fe4660 100644
--- a/packages/react-native-executorch/common/runner/multimodal_decoder_runner.h
+++ b/packages/react-native-executorch/common/runner/multimodal_decoder_runner.h
@@ -13,19 +13,12 @@
 #include "constants.h"
 #include "text_decoder_runner.h"
 
-namespace executorch {
-namespace extension {
-namespace llm {
-
-// Extends TextDecoderRunner to use the multi-method PTE layout:
-//   token_embedding method  → embeddings
-//   text_decoder method     → logits
+namespace executorch::extension::llm {
 class MultimodalDecoderRunner : public TextDecoderRunner {
 public:
   explicit MultimodalDecoderRunner(Module *module, IOManager *io_manager)
       : TextDecoderRunner(module, io_manager) {}
 
-  // Step: embed single token, then decode.
   inline ::executorch::runtime::Result<::executorch::aten::Tensor>
   step(TensorPtr &tokens, int64_t start_pos) override {
     auto embed_result = module_->execute(kTokenEmbeddingMethod, tokens);
@@ -35,7 +28,6 @@ class MultimodalDecoderRunner : public TextDecoderRunner {
     return decode((*embed_result)[0], start_pos);
   }
 
-  // Decode an embedding EValue to logits.
   inline ::executorch::runtime::Result<::executorch::aten::Tensor>
   decode(const ::executorch::runtime::EValue &embeddings, int64_t start_pos) {
     auto start_pos_tensor = ::executorch::extension::from_blob(
@@ -68,6 +60,4 @@ class MultimodalDecoderRunner : public TextDecoderRunner {
   }
 };
 
-} // namespace llm
-} // namespace extension
-} // namespace executorch
+} // namespace executorch::extension::llm
diff --git a/packages/react-native-executorch/common/runner/multimodal_input.h b/packages/react-native-executorch/common/runner/multimodal_input.h
index 1d56f5f5a..e7d28bdd6 100644
--- a/packages/react-native-executorch/common/runner/multimodal_input.h
+++ b/packages/react-native-executorch/common/runner/multimodal_input.h
@@ -19,7 +19,6 @@ namespace executorch {
 namespace extension {
 namespace llm {
 
-// Tagged struct to distinguish image paths from text strings in the variant.
 struct ImagePath {
   std::string path;
 };
diff --git a/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp b/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp
index a9a4715a7..f2bce9bc5 100644
--- a/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp
+++ b/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp
@@ -31,7 +31,6 @@ MultimodalPrefiller::MultimodalPrefiller(
 
 Result<uint64_t> MultimodalPrefiller::prefill(const MultimodalInput &input,
                                               int64_t &start_pos) {
-  // Keep backing storage alive for the duration of the prefill call.
   EValue encoder_output;
   std::vector<int64_t> padded_tokens_storage;
   TensorPtr sliced_embed_storage;
diff --git a/packages/react-native-executorch/common/runner/multimodal_prefiller.h b/packages/react-native-executorch/common/runner/multimodal_prefiller.h
index 4effee7b7..5f1978943 100644
--- a/packages/react-native-executorch/common/runner/multimodal_prefiller.h
+++ b/packages/react-native-executorch/common/runner/multimodal_prefiller.h
@@ -16,12 +16,8 @@
 #include <pytorch/tokenizers/hf_tokenizer.h>
 #include <runner/encoders/iencoder.h>
 
-namespace executorch {
-namespace extension {
-namespace llm {
+namespace executorch::extension::llm {
 
-// Prefills all multimodal inputs (image + text segments) into the KV cache.
-// Implements the same padding logic as the ET repo's multimodal_prefiller.cpp.
 class MultimodalPrefiller {
 public:
   explicit MultimodalPrefiller(Module *module,
@@ -46,6 +42,4 @@ class MultimodalPrefiller {
   IEncoder *image_encoder_;
 };
 
-} // namespace llm
-} // namespace extension
-} // namespace executorch
+} // namespace executorch::extension::llm
diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts
index 0a4629a0b..378817833 100644
--- a/packages/react-native-executorch/src/controllers/LLMController.ts
+++ b/packages/react-native-executorch/src/controllers/LLMController.ts
@@ -329,8 +329,6 @@ export class LLMController {
     const updatedHistory = [...this._messageHistory, newMessage];
     this.messageHistoryCallback(updatedHistory);
 
-    // For messages with images, convert mediaPath into structured content so
-    // the chat template emits <image> placeholders in the right position.
     const historyForTemplate = updatedHistory.map((m) =>
       m.mediaPath
         ? {
diff --git a/packages/react-native-executorch/src/types/llm.ts b/packages/react-native-executorch/src/types/llm.ts
index f906f8b3f..15b070bc5 100644
--- a/packages/react-native-executorch/src/types/llm.ts
+++ b/packages/react-native-executorch/src/types/llm.ts
@@ -12,8 +12,7 @@ export type LLMCapability = 'vision' | 'audio';
  * @category Types
  */
 export type MediaArg<C extends readonly LLMCapability[]> =
-  ('vision' extends C[number] ? { imagePath?: string } : object) &
-    ('audio' extends C[number] ? { audioPath?: string } : object);
+  'vision' extends C[number] ? { imagePath?: string } : object;
 
 /**
  * Properties for initializing and configuring a Large Language Model (LLM) instance.
@@ -154,7 +153,7 @@ export interface LLMTypeMultimodal<
    * After model responds, `messageHistory` will be updated.
    *
    * @param message - The message string to send.
-   * @param media - Optional media object (e.g. `{ imagePath }` for vision, `{ audioPath }` for audio).
+   * @param media - Optional media object (e.g. `{ imagePath }` for vision).
    * @returns The model's response as a `string`.
    */
   sendMessage: (message: string, media?: MediaArg<C>) => Promise<string>;

From 6a3857b5d3985c66becef9166fe017c18beb2726 Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Thu, 5 Mar 2026 17:12:14 +0100
Subject: [PATCH 45/46] fix: cap VLM generation tokens, propagate encoder load
 errors, pass image_token from config

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../rnexecutorch/host_objects/ModelHostObject.h      |  2 +-
 .../common/rnexecutorch/models/llm/LLM.cpp           | 12 ++++++++----
 .../common/rnexecutorch/models/llm/LLM.h             |  3 ++-
 .../common/rnexecutorch/tests/run_tests.sh           |  2 +-
 .../common/runner/base_llm_runner.h                  |  1 -
 .../common/runner/multimodal_runner.cpp              |  8 ++++----
 .../common/runner/text_runner.cpp                    |  1 -
 .../src/controllers/LLMController.ts                 |  1 +
 .../src/hooks/natural_language_processing/useLLM.ts  |  4 ++--
 packages/react-native-executorch/src/types/llm.ts    |  2 +-
 10 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
index 35b34ed56..88a0e0dd3 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
@@ -156,7 +156,7 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
       addFunctions(JSI_EXPORT_FUNCTION(
           ModelHostObject<Model>,
           promiseHostFunction<static_cast<std::string (Model::*)(
-              std::string, std::vector<std::string>,
+              std::string, std::vector<std::string>, std::string,
               std::shared_ptr<jsi::Function>)>(&Model::generate)>,
           "generateMultimodal"));
 
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
index 2dd342702..e929ead9b 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp
@@ -72,6 +72,7 @@ std::string LLM::generate(std::string input,
 
 std::string LLM::generate(std::string prompt,
                           std::vector<std::string> imagePaths,
+                          std::string imageToken,
                           std::shared_ptr<jsi::Function> callback) {
   if (!runner_ || !runner_->is_loaded()) {
     throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
@@ -82,17 +83,20 @@ std::string LLM::generate(std::string prompt,
         RnExecutorchErrorCode::InvalidUserInput,
         "This is a text-only model. Call generate(prompt, cb).");
   }
+  if (imageToken.empty()) {
+    imageToken = "<image>";
+  }
 
-  // Split rendered prompt on "<image>" placeholders and interleave with images.
-  static constexpr const char *kImageToken = "<image>";
-  static constexpr size_t kImageTokenLen = 7; // strlen("<image>")
+  // Split rendered prompt on imageToken placeholders and interleave with
+  // images.
+  const size_t kImageTokenLen = imageToken.size();
 
   std::vector<llm::MultimodalInput> inputs;
   size_t imageIdx = 0;
   size_t searchPos = 0;
 
   while (true) {
-    size_t found = prompt.find(kImageToken, searchPos);
+    size_t found = prompt.find(imageToken, searchPos);
     if (found == std::string::npos) {
       // Remaining text after last image (or entire prompt if no images)
       if (searchPos < prompt.size()) {
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
index 514760908..d4e44ec8d 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h
@@ -24,9 +24,10 @@ class LLM : public BaseModel {
   std::string generate(std::string prompt,
                        std::shared_ptr<jsi::Function> callback);
 
-  // Multimodal: pre-rendered prompt string with <image> placeholders +
+  // Multimodal: pre-rendered prompt string with imageToken placeholders +
   // ordered list of image paths (one per placeholder)
   std::string generate(std::string prompt, std::vector<std::string> imagePaths,
+                       std::string imageToken,
                        std::shared_ptr<jsi::Function> callback);
 
   void interrupt();
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh
index 324841d9b..941885e54 100755
--- a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh
@@ -62,7 +62,7 @@ MODELS=(
   "whisper_tokenizer.json|https://huggingface.co/software-mansion/react-native-executorch-whisper-tiny.en/resolve/v0.6.0/tokenizer.json"
   "smolLm2_135M_8da4w.pte|https://huggingface.co/software-mansion/react-native-executorch-smolLm-2/resolve/v0.6.0/smolLm-2-135M/quantized/smolLm2_135M_8da4w.pte"
   "smollm_tokenizer.json|https://huggingface.co/software-mansion/react-native-executorch-smolLm-2/resolve/v0.6.0/tokenizer.json"
-  "lfm2_5_vl_quantized_xnnpack_v2.pte|https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/lfm2_5_vl_quantized_xnnpack_v2.pte"
+  "lfm2_5_vl_quantized_xnnpack_v2.pte|https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/lfm2_5_vl_quantized_xnnpack_latest.pte"
   "tokenizer_2.5.json|https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_2.5.json"
   "deeplabV3_xnnpack_fp32.pte|https://huggingface.co/software-mansion/react-native-executorch-deeplab-v3/resolve/v0.6.0/xnnpack/deeplabV3_xnnpack_fp32.pte"
   "xnnpack_crnn_english.pte|https://huggingface.co/software-mansion/react-native-executorch-recognizer-crnn.en/resolve/v0.7.0/xnnpack/english/xnnpack_crnn_english.pte"
diff --git a/packages/react-native-executorch/common/runner/base_llm_runner.h b/packages/react-native-executorch/common/runner/base_llm_runner.h
index c26a3c2d3..2dd929be7 100644
--- a/packages/react-native-executorch/common/runner/base_llm_runner.h
+++ b/packages/react-native-executorch/common/runner/base_llm_runner.h
@@ -84,7 +84,6 @@ class BaseLLMRunner {
   std::unordered_map<std::string, int64_t> metadata_;
   std::unique_ptr<llm::IOManager> io_manager_;
   std::unique_ptr<std::unordered_set<uint64_t>> eos_ids_;
-  bool shouldStop_{false};
 };
 
 } // namespace rnexecutorch::llm::runner
diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.cpp b/packages/react-native-executorch/common/runner/multimodal_runner.cpp
index 7eda70870..c2aa69204 100644
--- a/packages/react-native-executorch/common/runner/multimodal_runner.cpp
+++ b/packages/react-native-executorch/common/runner/multimodal_runner.cpp
@@ -42,7 +42,7 @@ Error MultimodalRunner::load_subcomponents() {
   rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info, "[MultimodalRunner] Loading",
                     encoders_.size(), "encoder(s)");
   for (auto &[type, encoder] : encoders_) {
-    encoder->load();
+    ET_CHECK_OK_OR_RETURN_ERROR(encoder->load());
   }
 
   llm::Stats *stats_ptr = &stats_;
@@ -90,9 +90,9 @@ Error MultimodalRunner::generate_internal(
   stats_.prompt_eval_end_ms = llm::time_in_ms();
   stats_.num_prompt_tokens = pos_;
 
-  int32_t resolved_max_new =
-      static_cast<int32_t>(config_.max_context_length - pos_);
-  resolved_max_new = std::max(0, resolved_max_new);
+  int32_t resolved_max_new = resolve_max_new_tokens(
+      static_cast<int32_t>(pos_), config_.max_seq_len,
+      config_.max_context_length, config_.max_new_tokens);
 
   std::vector<uint64_t> seed_tokens = {prefill_next_token};
   auto wrapped_callback = [&](const std::string &piece) {
diff --git a/packages/react-native-executorch/common/runner/text_runner.cpp b/packages/react-native-executorch/common/runner/text_runner.cpp
index 063775be4..d535dba6c 100644
--- a/packages/react-native-executorch/common/runner/text_runner.cpp
+++ b/packages/react-native-executorch/common/runner/text_runner.cpp
@@ -69,7 +69,6 @@ Error TextRunner::generate_internal(
       };
 
   stats_.inference_start_ms = llm::time_in_ms();
-  shouldStop_ = false;
 
   int64_t context_len_left =
       static_cast<int64_t>(config_.max_context_length) - pos_;
diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts
index 378817833..13a6c4c34 100644
--- a/packages/react-native-executorch/src/controllers/LLMController.ts
+++ b/packages/react-native-executorch/src/controllers/LLMController.ts
@@ -237,6 +237,7 @@ export class LLMController {
           ? await this.nativeModule.generateMultimodal(
               input,
               imagePaths,
+              this.tokenizerConfig?.image_token ?? '<image>',
               this.onToken
             )
           : await this.nativeModule.generate(input, this.onToken);
diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
index 877f3a02d..72c7f4d96 100644
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts
@@ -97,9 +97,9 @@ export function useLLM({
   );
 
   const generate = useCallback(
-    (messages: Message[], tools?: LLMTool[]) => {
+    (messages: Message[], tools?: LLMTool[], imagePaths?: string[]) => {
       setResponse('');
-      return controllerInstance.generate(messages, tools);
+      return controllerInstance.generate(messages, tools, imagePaths);
     },
     [controllerInstance]
   );
diff --git a/packages/react-native-executorch/src/types/llm.ts b/packages/react-native-executorch/src/types/llm.ts
index 15b070bc5..ac57355f1 100644
--- a/packages/react-native-executorch/src/types/llm.ts
+++ b/packages/react-native-executorch/src/types/llm.ts
@@ -5,7 +5,7 @@ import { ResourceSource } from './common';
  * Capabilities a multimodal LLM can have.
  * @category Types
  */
-export type LLMCapability = 'vision' | 'audio';
+export type LLMCapability = 'vision';
 
 /**
  * Derives the media argument shape for `sendMessage` from a capabilities tuple.

From 551a30656326663e0f1904f2b4730946ca8bf8ce Mon Sep 17 00:00:00 2001
From: Norbert Klockiewicz <Nklockiewicz12@gmail.com>
Date: Thu, 5 Mar 2026 17:16:15 +0100
Subject: [PATCH 46/46] revert: remove TextRunnerTests and VLMTests suites

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../common/rnexecutorch/tests/CMakeLists.txt  |  24 ----
 .../integration/MultimodalRunnerTest.cpp      | 118 ------------------
 .../tests/integration/TextRunnerTest.cpp      | 109 ----------------
 .../common/rnexecutorch/tests/run_tests.sh    |   4 -
 4 files changed, 255 deletions(-)
 delete mode 100644 packages/react-native-executorch/common/rnexecutorch/tests/integration/MultimodalRunnerTest.cpp
 delete mode 100644 packages/react-native-executorch/common/rnexecutorch/tests/integration/TextRunnerTest.cpp

diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
index ebf390691..159f00159 100644
--- a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt
@@ -223,30 +223,6 @@ add_rn_test(LLMTests integration/LLMTest.cpp
     LIBS tokenizers_deps opencv_deps
 )
 
-add_rn_test(TextRunnerTests integration/TextRunnerTest.cpp
-    SOURCES
-        ${COMMON_DIR}/runner/base_llm_runner.cpp
-        ${COMMON_DIR}/runner/text_runner.cpp
-        ${COMMON_DIR}/runner/text_prefiller.cpp
-        ${COMMON_DIR}/runner/text_decoder_runner.cpp
-        ${COMMON_DIR}/runner/sampler.cpp
-        ${COMMON_DIR}/runner/arange_util.cpp
-    LIBS tokenizers_deps
-)
-
-add_rn_test(VLMTests integration/MultimodalRunnerTest.cpp
-    SOURCES
-        ${COMMON_DIR}/runner/base_llm_runner.cpp
-        ${COMMON_DIR}/runner/multimodal_runner.cpp
-        ${COMMON_DIR}/runner/multimodal_prefiller.cpp
-        ${COMMON_DIR}/runner/text_decoder_runner.cpp
-        ${COMMON_DIR}/runner/sampler.cpp
-        ${COMMON_DIR}/runner/arange_util.cpp
-        ${COMMON_DIR}/runner/encoders/vision_encoder.cpp
-        ${IMAGE_UTILS_SOURCES}
-    LIBS tokenizers_deps opencv_deps
-)
-
 add_rn_test(TextToImageTests integration/TextToImageTest.cpp
     SOURCES
         ${RNEXECUTORCH_DIR}/models/text_to_image/TextToImage.cpp
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/MultimodalRunnerTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/MultimodalRunnerTest.cpp
deleted file mode 100644
index 038fa7f6e..000000000
--- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/MultimodalRunnerTest.cpp
+++ /dev/null
@@ -1,118 +0,0 @@
-#include <gtest/gtest.h>
-#include <map>
-#include <memory>
-
-#include <executorch/extension/module/module.h>
-#include <rnexecutorch/Error.h>
-#include <runner/encoders/vision_encoder.h>
-#include <runner/multimodal_input.h>
-#include <runner/multimodal_runner.h>
-
-using ::executorch::extension::Module;
-using ::executorch::extension::llm::VisionEncoder;
-using ::executorch::runtime::Error;
-using ::rnexecutorch::llm::runner::MultimodalType;
-
-constexpr auto kTextModel = "smolLm2_135M_8da4w.pte";
-constexpr auto kTextTokenizer = "smollm_tokenizer.json";
-constexpr auto kVLMModel = "lfm2_5_vl_quantized_xnnpack_v2.pte";
-constexpr auto kVLMTokenizer = "tokenizer_2.5.json";
-constexpr auto kTestImage = "test_image.jpg";
-
-static std::map<MultimodalType,
-                std::unique_ptr<::executorch::extension::llm::IEncoder>>
-makeVisionEncoders(Module *module) {
-  std::map<MultimodalType,
-           std::unique_ptr<::executorch::extension::llm::IEncoder>>
-      encoders;
-  encoders[MultimodalType::Image] = std::make_unique<VisionEncoder>(module);
-  return encoders;
-}
-
-// ============================================================================
-// Error-path tests (text-only SmolLM2 — no vision_encoder method)
-// ============================================================================
-
-TEST(MultimodalRunnerTest, LoadFailsWhenVisionEncoderMissing) {
-  auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
-  auto encoders = makeVisionEncoders(module.get());
-  rnexecutorch::llm::runner::MultimodalRunner runner(
-      std::move(module), kTextTokenizer, std::move(encoders));
-  EXPECT_THROW(runner.load(), rnexecutorch::RnExecutorchError);
-}
-
-TEST(MultimodalRunnerTest, IsLoadedReturnsFalseBeforeLoad) {
-  auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
-  auto encoders = makeVisionEncoders(module.get());
-  rnexecutorch::llm::runner::MultimodalRunner runner(
-      std::move(module), kTextTokenizer, std::move(encoders));
-  EXPECT_FALSE(runner.is_loaded());
-}
-
-// ============================================================================
-// Integration tests (require VLM .pte)
-// ============================================================================
-
-class VLMTest : public ::testing::Test {
-protected:
-  std::unique_ptr<rnexecutorch::llm::runner::MultimodalRunner> runner_;
-
-  void SetUp() override {
-    auto module = std::make_unique<Module>(kVLMModel, Module::LoadMode::File);
-    auto encoders = makeVisionEncoders(module.get());
-    runner_ = std::make_unique<rnexecutorch::llm::runner::MultimodalRunner>(
-        std::move(module), kVLMTokenizer, std::move(encoders));
-    auto err = runner_->load();
-    ASSERT_EQ(err, Error::Ok) << "VLM model load failed";
-  }
-};
-
-TEST_F(VLMTest, LoadSucceedsWithRealVLMModel) {
-  EXPECT_TRUE(runner_->is_loaded());
-}
-
-TEST_F(VLMTest, MetadataApplied_KVCache) {
-  EXPECT_TRUE(runner_->config_.enable_kv_cache);
-}
-
-TEST_F(VLMTest, GenerateTextOnlyInputWorks) {
-  runner_->set_temperature(0.0f);
-  auto err = runner_->generate(
-      "<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\n");
-  EXPECT_EQ(err, Error::Ok);
-  EXPECT_GT(runner_->pos_, 0);
-}
-
-TEST_F(VLMTest, GenerateWithImageProducesTokens) {
-  runner_->set_temperature(0.0f);
-
-  std::vector<::executorch::extension::llm::MultimodalInput> inputs = {
-      ::executorch::extension::llm::make_image_input(kTestImage),
-      ::executorch::extension::llm::make_text_input(
-          "<|im_start|>user\nDescribe this image briefly."
-          "<|im_end|>\n<|im_start|>assistant\n"),
-  };
-
-  auto err = runner_->generate_internal(inputs, nullptr);
-  EXPECT_EQ(err, Error::Ok);
-  EXPECT_GT(runner_->pos_, 0);
-}
-
-TEST_F(VLMTest, EmbeddingCacheHitOnRepeatedImage) {
-  runner_->set_temperature(0.0f);
-
-  // First call — cache miss, runs vision_encoder
-  std::vector<::executorch::extension::llm::MultimodalInput> inputs = {
-      ::executorch::extension::llm::make_image_input(kTestImage),
-      ::executorch::extension::llm::make_text_input(
-          "<|im_start|>user\nWhat is this?<|im_end|>\n<|im_start|>assistant\n"),
-  };
-  runner_->generate_internal(inputs, nullptr);
-  runner_->reset();
-
-  // Second call — same image path, should hit cache
-  // (no functional assertion possible without instrumenting the encoder,
-  //  but this at least verifies it doesn't crash or error)
-  auto err = runner_->generate_internal(inputs, nullptr);
-  EXPECT_EQ(err, Error::Ok);
-}
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextRunnerTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextRunnerTest.cpp
deleted file mode 100644
index 169310ed3..000000000
--- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextRunnerTest.cpp
+++ /dev/null
@@ -1,109 +0,0 @@
-#include <gtest/gtest.h>
-#include <memory>
-
-#include <executorch/extension/module/module.h>
-#include <rnexecutorch/Error.h>
-#include <runner/text_runner.h>
-
-using ::executorch::extension::Module;
-using ::executorch::runtime::Error;
-
-constexpr auto kTextModel = "smolLm2_135M_8da4w.pte";
-constexpr auto kTextTokenizer = "smollm_tokenizer.json";
-constexpr auto kSystemPrompt = "You are a helpful assistant. Assist the user "
-                               "to the best of your abilities.";
-
-static std::string formatChatML(const std::string &systemPrompt,
-                                const std::string &userMessage) {
-  return "<|im_start|>system\n" + systemPrompt + "<|im_end|>\n" +
-         "<|im_start|>user\n" + userMessage + "<|im_end|>\n" +
-         "<|im_start|>assistant\n";
-}
-
-TEST(TextRunnerTest, ConstructorAndLoadSucceeds) {
-  auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
-  rnexecutorch::llm::runner::TextRunner runner(std::move(module),
-                                               kTextTokenizer);
-  auto err = runner.load();
-  EXPECT_EQ(err, Error::Ok);
-  EXPECT_TRUE(runner.is_loaded());
-}
-
-TEST(TextRunnerTest, MetadataApplied_EnableDynamicShape) {
-  // SmolLM2-135M exports enable_dynamic_shape = 1
-  // After load(), config_.enable_dynamic_shape must be true (our fix)
-  auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
-  rnexecutorch::llm::runner::TextRunner runner(std::move(module),
-                                               kTextTokenizer);
-  runner.load();
-  EXPECT_TRUE(runner.config_.enable_dynamic_shape);
-}
-
-TEST(TextRunnerTest, MetadataApplied_KVCache) {
-  // SmolLM2-135M exports use_kv_cache = 1
-  auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
-  rnexecutorch::llm::runner::TextRunner runner(std::move(module),
-                                               kTextTokenizer);
-  runner.load();
-  EXPECT_TRUE(runner.config_.enable_kv_cache);
-}
-
-TEST(TextRunnerTest, SetTemperaturePropagatesAfterLoad) {
-  auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
-  rnexecutorch::llm::runner::TextRunner runner(std::move(module),
-                                               kTextTokenizer);
-  runner.load();
-  runner.set_temperature(0.3f);
-  EXPECT_FLOAT_EQ(runner.config_.temperature, 0.3f);
-}
-
-TEST(TextRunnerTest, ResetZerosPos) {
-  auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
-  rnexecutorch::llm::runner::TextRunner runner(std::move(module),
-                                               kTextTokenizer);
-  runner.pos_ = 42;
-  runner.reset();
-  EXPECT_EQ(runner.pos_, 0);
-}
-
-TEST(TextRunnerTest, GenerateProducesTokens) {
-  auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
-  rnexecutorch::llm::runner::TextRunner runner(std::move(module),
-                                               kTextTokenizer);
-  runner.load();
-  runner.set_temperature(0.0f);
-
-  std::string prompt = formatChatML(kSystemPrompt, "Say: hello");
-  auto err = runner.generate(prompt);
-  EXPECT_EQ(err, Error::Ok);
-  EXPECT_GT(runner.pos_, 0);
-}
-
-TEST(TextRunnerTest, ParallelPrefillEnabled) {
-  // Confirms the fix: enable_dynamic_shape from metadata now unconditionally
-  // applied
-  auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
-  rnexecutorch::llm::runner::TextRunner runner(std::move(module),
-                                               kTextTokenizer);
-  runner.load();
-  EXPECT_TRUE(runner.config_.enable_dynamic_shape);
-}
-
-TEST(TextRunnerTest, StopHaltsGeneration) {
-  auto module = std::make_unique<Module>(kTextModel, Module::LoadMode::File);
-  rnexecutorch::llm::runner::TextRunner runner(std::move(module),
-                                               kTextTokenizer);
-  runner.load();
-  runner.set_temperature(0.0f);
-
-  int token_count = 0;
-  std::string prompt = formatChatML(kSystemPrompt, "Count to one hundred");
-  runner.generate(prompt, {}, [&](const std::string &) {
-    token_count++;
-    if (token_count >= 3) {
-      runner.stop();
-    }
-  });
-  EXPECT_GT(token_count, 0);
-  EXPECT_LE(token_count, 5); // stopped early
-}
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh
index 941885e54..360aa9d11 100755
--- a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh
@@ -29,8 +29,6 @@ TEST_EXECUTABLES=(
   "TokenizerModuleTests"
   "SpeechToTextTests"
   "LLMTests"
-  "TextRunnerTests"
-  "VLMTests"
   "ImageSegmentationTests"
   "TextToImageTests"
   "OCRTests"
@@ -62,8 +60,6 @@ MODELS=(
   "whisper_tokenizer.json|https://huggingface.co/software-mansion/react-native-executorch-whisper-tiny.en/resolve/v0.6.0/tokenizer.json"
   "smolLm2_135M_8da4w.pte|https://huggingface.co/software-mansion/react-native-executorch-smolLm-2/resolve/v0.6.0/smolLm-2-135M/quantized/smolLm2_135M_8da4w.pte"
   "smollm_tokenizer.json|https://huggingface.co/software-mansion/react-native-executorch-smolLm-2/resolve/v0.6.0/tokenizer.json"
-  "lfm2_5_vl_quantized_xnnpack_v2.pte|https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/lfm2_5_vl_quantized_xnnpack_latest.pte"
-  "tokenizer_2.5.json|https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_2.5.json"
   "deeplabV3_xnnpack_fp32.pte|https://huggingface.co/software-mansion/react-native-executorch-deeplab-v3/resolve/v0.6.0/xnnpack/deeplabV3_xnnpack_fp32.pte"
   "xnnpack_crnn_english.pte|https://huggingface.co/software-mansion/react-native-executorch-recognizer-crnn.en/resolve/v0.7.0/xnnpack/english/xnnpack_crnn_english.pte"
   "xnnpack_craft_quantized.pte|https://huggingface.co/software-mansion/react-native-executorch-detector-craft/resolve/v0.7.0/xnnpack/xnnpack_craft.pte"