From 0ac42b4a049664264cb6b4fd1e4110447ea5c2e4 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Thu, 19 Feb 2026 16:18:38 +0100 Subject: [PATCH 01/46] feat: initial implementation of multimodal runner with lfm vlm --- apps/llm/app.json | 3 +- apps/llm/app/_layout.tsx | 22 +- apps/llm/app/index.tsx | 22 +- apps/llm/app/multimodal_llm/index.tsx | 389 ++++++++++++++++++ apps/llm/package.json | 2 + .../rnexecutorch/RnExecutorchInstaller.cpp | 6 + .../host_objects/ModelHostObject.h | 24 +- .../models/multimodal_llm/MultimodalLLM.cpp | 197 +++++++++ .../models/multimodal_llm/MultimodalLLM.h | 40 ++ .../common/runner/image.h | 88 ++++ .../common/runner/multimodal_decoder_runner.h | 73 ++++ .../common/runner/multimodal_input.h | 74 ++++ .../common/runner/multimodal_prefiller.cpp | 179 ++++++++ .../common/runner/multimodal_prefiller.h | 48 +++ .../common/runner/multimodal_runner.cpp | 149 +++++++ .../common/runner/multimodal_runner.h | 68 +++ .../useMultimodalLLM.ts | 153 +++++++ packages/react-native-executorch/src/index.ts | 3 + 18 files changed, 1511 insertions(+), 29 deletions(-) create mode 100644 apps/llm/app/multimodal_llm/index.tsx create mode 100644 packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.cpp create mode 100644 packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.h create mode 100644 packages/react-native-executorch/common/runner/image.h create mode 100644 packages/react-native-executorch/common/runner/multimodal_decoder_runner.h create mode 100644 packages/react-native-executorch/common/runner/multimodal_input.h create mode 100644 packages/react-native-executorch/common/runner/multimodal_prefiller.cpp create mode 100644 packages/react-native-executorch/common/runner/multimodal_prefiller.h create mode 100644 packages/react-native-executorch/common/runner/multimodal_runner.cpp create mode 100644 packages/react-native-executorch/common/runner/multimodal_runner.h create mode 
100644 packages/react-native-executorch/src/hooks/natural_language_processing/useMultimodalLLM.ts diff --git a/apps/llm/app.json b/apps/llm/app.json index e4d2da0d6..36a042341 100644 --- a/apps/llm/app.json +++ b/apps/llm/app.json @@ -55,7 +55,8 @@ }, "entitlements": { "com.apple.developer.kernel.increased-memory-limit": true - } + }, + "appleTeamId": "B357MU264T" }, "android": { "adaptiveIcon": { diff --git a/apps/llm/app/_layout.tsx b/apps/llm/app/_layout.tsx index 5ece80f1f..523d3aaf7 100644 --- a/apps/llm/app/_layout.tsx +++ b/apps/llm/app/_layout.tsx @@ -57,37 +57,45 @@ export default function _layout() { headerTitleStyle: { color: ColorPalette.primary }, }} > - - */} + {/* - */} + {/* - */} + {/* */} + router.navigate('llm/')} + onPress={() => router.navigate('multimodal_llm/')} > - LLM - - router.navigate('llm_tool_calling/')} - > - LLM Tool Calling - - router.navigate('llm_structured_output/')} - > - LLM Structured Output - - router.navigate('voice_chat/')} - > - Voice Chat + Multimodal LLM (VLM) diff --git a/apps/llm/app/multimodal_llm/index.tsx b/apps/llm/app/multimodal_llm/index.tsx new file mode 100644 index 000000000..3a3b4692b --- /dev/null +++ b/apps/llm/app/multimodal_llm/index.tsx @@ -0,0 +1,389 @@ +import { useContext, useEffect, useRef, useState } from 'react'; +import { + Image, + Keyboard, + KeyboardAvoidingView, + Platform, + ScrollView, + StyleSheet, + Text, + TextInput, + TouchableOpacity, + TouchableWithoutFeedback, + View, +} from 'react-native'; +import * as DocumentPicker from 'expo-document-picker'; +import { launchImageLibrary } from 'react-native-image-picker'; +import { useIsFocused } from '@react-navigation/native'; +import { useMultimodalLLM } from 'react-native-executorch'; +import ColorPalette from '../../colors'; +import Spinner from '../../components/Spinner'; +import { GeneratingContext } from '../../context'; + +export default function MultimodalLLMScreenWrapper() { + const isFocused = useIsFocused(); + return isFocused ? 
: null; +} + +// Outer component: collect model + tokenizer paths before mounting the hook +function MultimodalLLMScreenOuter() { + const [modelUri, setModelUri] = useState(null); + const [tokenizerUri, setTokenizerUri] = useState(null); + const [confirmed, setConfirmed] = useState(false); + + const pickFile = async (setter: (uri: string) => void) => { + const result = await DocumentPicker.getDocumentAsync({ + copyToCacheDirectory: false, + multiple: false, + }); + if (result.canceled) return; + const asset = result.assets[0]; + if (asset?.uri) { + setter(asset.uri); + } + }; + + if (!confirmed) { + return ( + + Select model files + + Pick the .pte model and tokenizer.json from your device storage. + + + pickFile(setModelUri)} + /> + pickFile(setTokenizerUri)} + /> + + setConfirmed(true)} + > + Load model + + + ); + } + + return ( + + ); +} + +function FilePicker({ + label, + uri, + onPick, +}: { + label: string; + uri: string | null; + onPick: () => void; +}) { + const fileName = uri ? (uri.split('/').pop() ?? uri) : null; + return ( + + + {label} + + {fileName ?? 
'Tap to pick file'} + + + + + ); +} + +function MultimodalLLMScreen({ + modelSource, + tokenizerSource, +}: { + modelSource: string; + tokenizerSource: string; +}) { + const [imageUri, setImageUri] = useState(null); + const [prompt, setPrompt] = useState(''); + const [isTextInputFocused, setIsTextInputFocused] = useState(false); + const scrollViewRef = useRef(null); + const { setGlobalGenerating } = useContext(GeneratingContext); + + const vlm = useMultimodalLLM({ model: { modelSource, tokenizerSource } }); + + useEffect(() => { + setGlobalGenerating(vlm.isGenerating); + }, [vlm.isGenerating, setGlobalGenerating]); + + useEffect(() => { + if (vlm.error) { + console.error('MultimodalLLM error:', vlm.error); + } + }, [vlm.error]); + + const pickImage = async () => { + const result = await launchImageLibrary({ mediaType: 'photo' }); + if (result.assets && result.assets.length > 0) { + const uri = result.assets[0]?.uri; + if (uri) { + setImageUri(uri); + } + } + }; + + const handleGenerate = async () => { + if (!imageUri || !prompt.trim() || !vlm.isReady || vlm.isGenerating) return; + Keyboard.dismiss(); + try { + await vlm.generate(imageUri, prompt.trim()); + } catch (e) { + console.error('Generation error:', e); + } + }; + + if (!vlm.isReady) { + return ( + + ); + } + + return ( + + + + scrollViewRef.current?.scrollToEnd({ animated: true }) + } + > + {/* Image picker */} + + {imageUri ? ( + + ) : ( + Tap to pick an image + )} + + + {/* Response area */} + {vlm.response ? ( + + Response: + {vlm.response} + + ) : vlm.isGenerating ? ( + + Generating… + + ) : null} + + + {/* Bottom bar */} + + setIsTextInputFocused(true)} + onBlur={() => setIsTextInputFocused(false)} + style={[ + styles.textInput, + { + borderColor: isTextInputFocused + ? ColorPalette.blueDark + : ColorPalette.blueLight, + }, + ]} + placeholder="Ask about the image…" + placeholderTextColor="#C1C6E5" + multiline + value={prompt} + onChangeText={setPrompt} + /> + {vlm.isGenerating ? 
( + + Stop + + ) : ( + + Ask + + )} + + + + ); +} + +const styles = StyleSheet.create({ + // Setup phase + setupContainer: { + flex: 1, + padding: 24, + backgroundColor: '#fff', + justifyContent: 'center', + }, + setupTitle: { + fontSize: 20, + fontFamily: 'medium', + color: ColorPalette.primary, + marginBottom: 8, + }, + setupHint: { + fontSize: 13, + fontFamily: 'regular', + color: ColorPalette.blueDark, + marginBottom: 32, + lineHeight: 18, + }, + filePickerRow: { + flexDirection: 'row', + alignItems: 'center', + borderWidth: 1, + borderColor: ColorPalette.blueLight, + borderRadius: 10, + padding: 14, + marginBottom: 12, + backgroundColor: '#fafbff', + }, + filePickerInfo: { flex: 1 }, + filePickerLabel: { + fontSize: 12, + fontFamily: 'medium', + color: ColorPalette.blueDark, + marginBottom: 2, + }, + filePickerValue: { fontSize: 14, fontFamily: 'regular' }, + filePickerValueSet: { color: ColorPalette.primary }, + filePickerValueEmpty: { color: ColorPalette.blueLight }, + filePickerChevron: { + fontSize: 24, + color: ColorPalette.blueLight, + marginLeft: 8, + }, + loadButton: { + marginTop: 16, + backgroundColor: ColorPalette.strongPrimary, + borderRadius: 10, + padding: 14, + alignItems: 'center', + }, + loadButtonDisabled: { backgroundColor: ColorPalette.blueLight }, + loadButtonText: { color: '#fff', fontFamily: 'medium', fontSize: 15 }, + + // Chat phase + container: { flex: 1, backgroundColor: '#fff' }, + scrollView: { flex: 1 }, + scrollContent: { padding: 16, paddingBottom: 8 }, + imagePicker: { + width: '100%', + height: 220, + borderRadius: 12, + borderWidth: 1, + borderColor: ColorPalette.blueLight, + borderStyle: 'dashed', + justifyContent: 'center', + alignItems: 'center', + overflow: 'hidden', + marginBottom: 16, + }, + previewImage: { width: '100%', height: '100%' }, + imagePickerText: { + color: ColorPalette.blueLight, + fontSize: 16, + fontFamily: 'regular', + }, + responseContainer: { + backgroundColor: ColorPalette.seaBlueLight, + 
borderRadius: 8, + padding: 12, + marginBottom: 8, + }, + responseLabel: { + fontSize: 12, + color: ColorPalette.blueDark, + fontFamily: 'medium', + marginBottom: 4, + }, + responseText: { + fontSize: 14, + lineHeight: 20, + color: ColorPalette.primary, + fontFamily: 'regular', + }, + bottomContainer: { + flexDirection: 'row', + alignItems: 'center', + paddingHorizontal: 16, + paddingVertical: 12, + borderTopWidth: 1, + borderTopColor: ColorPalette.blueLight, + backgroundColor: '#fff', + }, + textInput: { + flex: 1, + borderWidth: 1, + borderRadius: 8, + fontSize: 14, + lineHeight: 19.6, + fontFamily: 'regular', + color: ColorPalette.primary, + padding: 12, + maxHeight: 100, + }, + actionButton: { + marginLeft: 8, + backgroundColor: ColorPalette.strongPrimary, + borderRadius: 8, + paddingHorizontal: 16, + paddingVertical: 12, + justifyContent: 'center', + alignItems: 'center', + }, + actionButtonDisabled: { backgroundColor: ColorPalette.blueLight }, + actionButtonText: { color: '#fff', fontFamily: 'medium', fontSize: 14 }, +}); diff --git a/apps/llm/package.json b/apps/llm/package.json index f58bc8127..d0fbb6401 100644 --- a/apps/llm/package.json +++ b/apps/llm/package.json @@ -19,6 +19,7 @@ "expo-brightness": "~14.0.8", "expo-calendar": "~15.0.8", "expo-constants": "~18.0.11", + "expo-document-picker": "~13.0.3", "expo-font": "~14.0.10", "expo-linking": "~8.0.10", "expo-router": "~6.0.17", @@ -30,6 +31,7 @@ "react-native-device-info": "^15.0.2", "react-native-executorch": "workspace:*", "react-native-gesture-handler": "~2.28.0", + "react-native-image-picker": "^7.2.2", "react-native-loading-spinner-overlay": "^3.0.1", "react-native-markdown-display": "^7.0.2", "react-native-reanimated": "~4.1.1", diff --git a/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp b/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp index 9d4b419e2..4cb6afef8 100644 --- 
a/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -88,6 +89,11 @@ void RnExecutorchInstaller::injectJSIBindings( RnExecutorchInstaller::loadModel( jsiRuntime, jsCallInvoker, "loadLLM")); + jsiRuntime->global().setProperty( + *jsiRuntime, "loadMultimodalLLM", + RnExecutorchInstaller::loadModel( + jsiRuntime, jsCallInvoker, "loadMultimodalLLM")); + jsiRuntime->global().setProperty( *jsiRuntime, "loadOCR", RnExecutorchInstaller::loadModel( diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h index 7712b2b9d..2b7cbc2e1 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -32,7 +33,11 @@ template class ModelHostObject : public JsiHostObject { explicit ModelHostObject(const std::shared_ptr &model, std::shared_ptr callInvoker) : model(model), callInvoker(callInvoker) { - if constexpr (meta::DerivedFromOrSameAs) { + // MultimodalLLM moves module_ into its runner during construction, so + // the base class methods that go through module_ (forward, getInputShape) + // are unsafe to expose. Its unload is registered separately below. 
+ if constexpr (meta::DerivedFromOrSameAs && + !meta::SameAs) { addFunctions( JSI_EXPORT_FUNCTION(ModelHostObject, unload, "unload")); @@ -172,6 +177,23 @@ template class ModelHostObject : public JsiHostObject { ModelHostObject, synchronousHostFunction<&Model::streamStop>, "streamStop")); } + + if constexpr (meta::SameAs) { + addFunctions(JSI_EXPORT_FUNCTION( + ModelHostObject, synchronousHostFunction<&Model::interrupt>, + "interrupt")); + + addFunctions(JSI_EXPORT_FUNCTION( + ModelHostObject, + synchronousHostFunction<&Model::setTemperature>, "setTemperature")); + + addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject, + synchronousHostFunction<&Model::setTopp>, + "setTopp")); + + addFunctions( + JSI_EXPORT_FUNCTION(ModelHostObject, unload, "unload")); + } } // A generic host function that runs synchronously, works analogously to the diff --git a/packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.cpp new file mode 100644 index 000000000..7187b3d57 --- /dev/null +++ b/packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.cpp @@ -0,0 +1,197 @@ +#include "MultimodalLLM.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace rnexecutorch::models::multimodal_llm { +namespace llm = ::executorch::extension::llm; +namespace fs = std::filesystem; +using namespace facebook; +using ::executorch::extension::module::Module; +using ::executorch::runtime::Error; + +// LFM2-VL vision encoder expects [1, 3, 512, 512] NCHW float32, values in +// [0,255]. Normalization and patch unfolding are baked into the exported PTE. 
+static constexpr int kImageSize = 512; +static constexpr int kImageChannels = 3; + +// LFM2-VL chat template +static constexpr const char *kChatPrefix = "<|startoftext|><|im_start|>user\n"; +static constexpr const char *kChatSuffix = + "<|im_end|>\n<|im_start|>assistant\n"; + +static llm::Image loadImageForLFM2(const std::string &imagePath) { + cv::Mat mat = image_processing::readImage(imagePath); + cv::resize(mat, mat, cv::Size(kImageSize, kImageSize)); + cv::cvtColor(mat, mat, cv::COLOR_BGR2RGB); + + // HWC uint8 → CHW float32, values in [0, 255] + std::vector chw(kImageChannels * kImageSize * kImageSize); + const int pixelCount = kImageSize * kImageSize; + for (int i = 0; i < pixelCount; ++i) { + cv::Vec3b px = mat.at(i / kImageSize, i % kImageSize); + for (int c = 0; c < kImageChannels; ++c) { + chw[c * pixelCount + i] = static_cast(px[c]); + } + } + return llm::Image(std::move(chw), kImageSize, kImageSize, kImageChannels); +} + +MultimodalLLM::MultimodalLLM(const std::string &modelSource, + const std::string &tokenizerSource, + std::shared_ptr callInvoker) + : BaseModel(modelSource, callInvoker, Module::LoadMode::File) { + // Build the multimodal runner from parts — all referencing module_ owned by + // BaseModel so we don't load the PTE twice. 
+ auto tokenizer = std::make_unique(); + auto tokenizer_status = tokenizer->load(tokenizerSource); + if (tokenizer_status != tokenizers::Error::Ok) { + throw RnExecutorchError(RnExecutorchErrorCode::TokenizerError, + "Failed to load tokenizer"); + } + + auto io_manager = std::make_unique(*module_); + auto decoder_runner = std::make_unique( + module_.get(), io_manager.get()); + + auto eos_ids = std::make_unique>(); + // Read EOS ids from PTE constant method if present, default to 7 (<|im_end|>) + auto method_names_result = module_->method_names(); + if (method_names_result.ok()) { + if (method_names_result->count(llm::kEosIds)) { + auto eos_result = module_->execute(llm::kEosIds); + if (eos_result.ok()) { + for (const auto &ev : *eos_result) { + eos_ids->emplace(static_cast(ev.toScalar().to())); + } + } + } + } + if (eos_ids->empty()) { + eos_ids->emplace(7); // <|im_end|> fallback + } + + auto stats = std::make_unique(); + // Keep a raw pointer before moving into the runner so TextTokenGenerator + // can safely reference the same Stats object owned by the runner. 
+ llm::Stats *stats_ptr = stats.get(); + auto token_generator = std::make_unique( + tokenizer.get(), decoder_runner.get(), /*use_kv_cache=*/true, + std::move(eos_ids), stats_ptr); + + auto prefiller = std::make_unique( + module_.get(), decoder_runner.get(), tokenizer.get(), io_manager.get()); + + // Read metadata from the PTE + std::unordered_map metadata = { + {llm::kMaxSeqLen, 2048}, + {llm::kMaxContextLen, 2048}, + }; + if (method_names_result.ok()) { + for (auto &pair : metadata) { + if (method_names_result->count(pair.first)) { + auto val = module_->get(pair.first); + if (val.ok()) { + pair.second = val->toScalar().to(); + } + } + } + } + + runner_ = std::make_unique( + std::move(metadata), std::move(tokenizer), std::move(module_), + std::move(decoder_runner), std::move(prefiller), std::move(io_manager), + std::move(token_generator), std::move(stats)); + + auto loadError = runner_->load(); + if (loadError != Error::Ok) { + throw RnExecutorchError(loadError, "Failed to load multimodal runner"); + } + + memorySizeLowerBound = fs::file_size(fs::path(modelSource)) + + fs::file_size(fs::path(tokenizerSource)); +} + +std::string MultimodalLLM::generate(std::string imagePath, std::string prompt, + std::shared_ptr callback) { + if (!runner_) { + throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded, + "Runner is not loaded"); + } + + llm::Image image = loadImageForLFM2(imagePath); + + std::vector inputs = { + llm::make_text_input(std::string(kChatPrefix)), + llm::make_image_input(std::move(image)), + llm::make_text_input(prompt + kChatSuffix), + }; + + std::string output; + auto nativeCallback = [this, &callback, &output](const std::string &token) { + output += token; + if (callback && callInvoker) { + callInvoker->invokeAsync([callback, token](jsi::Runtime &runtime) { + callback->call(runtime, jsi::String::createFromUtf8(runtime, token)); + }); + } + }; + + auto error = runner_->generate(inputs, temperature_, topp_, + /*max_new_tokens=*/-1, nativeCallback); 
+ if (error != Error::Ok) { + throw RnExecutorchError(error, "Failed to generate text"); + } + + runner_->reset(); + return output; +} + +void MultimodalLLM::interrupt() { + if (!runner_) { + throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded, + "Can't interrupt a model that's not loaded"); + } + runner_->stop(); +} + +size_t MultimodalLLM::getGeneratedTokenCount() const noexcept { + if (!runner_) + return 0; + return static_cast(runner_->stats().num_generated_tokens); +} + +size_t MultimodalLLM::getPromptTokenCount() const noexcept { + if (!runner_) + return 0; + return static_cast(runner_->stats().num_prompt_tokens); +} + +size_t MultimodalLLM::getMemoryLowerBound() const noexcept { + return memorySizeLowerBound; +} + +void MultimodalLLM::setTemperature(float temperature) { + if (temperature < 0.0f) { + throw RnExecutorchError(RnExecutorchErrorCode::InvalidConfig, + "Temperature must be non-negative"); + } + temperature_ = temperature; +} + +void MultimodalLLM::setTopp(float topp) { + if (topp < 0.0f || topp > 1.0f) { + throw RnExecutorchError(RnExecutorchErrorCode::InvalidConfig, + "Top-p must be between 0.0 and 1.0"); + } + topp_ = topp; +} + +void MultimodalLLM::unload() noexcept { runner_.reset(nullptr); } + +} // namespace rnexecutorch::models::multimodal_llm diff --git a/packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.h b/packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.h new file mode 100644 index 000000000..6b9f8698c --- /dev/null +++ b/packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.h @@ -0,0 +1,40 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include + +namespace rnexecutorch { +namespace models::multimodal_llm { +using namespace facebook; + +class MultimodalLLM : public BaseModel { +public: + explicit MultimodalLLM(const std::string &modelSource, + const std::string &tokenizerSource, + 
std::shared_ptr callInvoker); + + std::string generate(std::string imagePath, std::string prompt, + std::shared_ptr callback); + void interrupt(); + void unload() noexcept; + size_t getGeneratedTokenCount() const noexcept; + size_t getPromptTokenCount() const noexcept; + size_t getMemoryLowerBound() const noexcept; + void setTemperature(float temperature); + void setTopp(float topp); + +private: + float temperature_ = 0.8f; + float topp_ = 0.9f; + std::unique_ptr<::executorch::extension::llm::MultimodalRunner> runner_; +}; +} // namespace models::multimodal_llm + +REGISTER_CONSTRUCTOR(models::multimodal_llm::MultimodalLLM, std::string, + std::string, std::shared_ptr); +} // namespace rnexecutorch diff --git a/packages/react-native-executorch/common/runner/image.h b/packages/react-native-executorch/common/runner/image.h new file mode 100644 index 000000000..86373ca91 --- /dev/null +++ b/packages/react-native-executorch/common/runner/image.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// Ported from executorch/extension/llm/runner/image.h + +#pragma once + +#include +#include +#include + +#include +#include + +namespace executorch { +namespace extension { +namespace llm { + +class Image { +public: + Image() : width_(0), height_(0), channels_(0) {} + + Image(std::vector &&data, int32_t width, int32_t height, + int32_t channels) + : data_(std::move(data)), width_(width), height_(height), + channels_(channels) {} + + Image(std::vector &&data, int32_t width, int32_t height, + int32_t channels) + : data_(std::move(data)), width_(width), height_(height), + channels_(channels) {} + + int32_t width() const { return width_; } + int32_t height() const { return height_; } + int32_t channels() const { return channels_; } + + bool is_uint8() const { + return std::holds_alternative>(data_); + } + bool is_float() const { + return std::holds_alternative>(data_); + } + + const std::vector &get_uint8_data() const & { + return std::get>(data_); + } + const std::vector &get_float_data() const & { + return std::get>(data_); + } + std::vector &get_float_data() & { + return std::get>(data_); + } + + ::executorch::runtime::Result<::executorch::extension::TensorPtr> + toTensor(bool with_batch = false) const { + std::vector<::executorch::aten::SizesType> sizes = {channels(), height(), + width()}; + if (with_batch) { + sizes.insert(sizes.begin(), 1); + } + if (is_float()) { + return ::executorch::extension::from_blob( + const_cast(get_float_data().data()), sizes, + ::executorch::aten::ScalarType::Float); + } else if (is_uint8()) { + return ::executorch::extension::from_blob( + const_cast(get_uint8_data().data()), sizes, + ::executorch::aten::ScalarType::Byte); + } + ET_LOG(Error, "Image data is not initialized."); + return ::executorch::runtime::Error::NotSupported; + } + +private: + std::variant, std::vector> data_; + int32_t width_; + int32_t height_; + int32_t channels_; +}; + +} // namespace llm +} // namespace extension +} // namespace executorch diff --git 
a/packages/react-native-executorch/common/runner/multimodal_decoder_runner.h b/packages/react-native-executorch/common/runner/multimodal_decoder_runner.h new file mode 100644 index 000000000..2eafe3901 --- /dev/null +++ b/packages/react-native-executorch/common/runner/multimodal_decoder_runner.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Ported from executorch/extension/llm/runner/multimodal_decoder_runner.h + +#pragma once + +#include "constants.h" +#include "text_decoder_runner.h" + +namespace executorch { +namespace extension { +namespace llm { + +// Extends TextDecoderRunner to use the multi-method PTE layout: +// token_embedding method → embeddings +// text_decoder method → logits +class MultimodalDecoderRunner : public TextDecoderRunner { +public: + explicit MultimodalDecoderRunner(Module *module, IOManager *io_manager) + : TextDecoderRunner(module, io_manager) {} + + // Step: embed single token, then decode. + inline ::executorch::runtime::Result<::executorch::aten::Tensor> + step(TensorPtr &tokens, int64_t start_pos) override { + auto embed_result = module_->execute(kTokenEmbeddingMethod, tokens); + if (!embed_result.ok()) { + return embed_result.error(); + } + return decode((*embed_result)[0], start_pos); + } + + // Decode an embedding EValue to logits. 
+ inline ::executorch::runtime::Result<::executorch::aten::Tensor> + decode(const ::executorch::runtime::EValue &embeddings, int64_t start_pos) { + auto start_pos_tensor = ::executorch::extension::from_blob( + &start_pos, {1}, ::executorch::aten::ScalarType::Long); + auto outputs_result = + module_->execute(kTextModelMethod, {embeddings, start_pos_tensor}); + if (!outputs_result.ok()) { + return outputs_result.error(); + } + auto &outputs = *outputs_result; + ET_CHECK_MSG(outputs.size() == 1, + "Expected 1 output from text_decoder, got %zu", + outputs.size()); + ET_CHECK_MSG(outputs[0].isTensor(), "text_decoder output is not a tensor"); + return outputs[0].toTensor(); + } + + inline ::executorch::runtime::Error load() override { + if (is_method_loaded()) { + return ::executorch::runtime::Error::Ok; + } + ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTokenEmbeddingMethod)); + ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTextModelMethod)); + return ::executorch::runtime::Error::Ok; + } + + inline bool is_method_loaded() override { + return module_->is_method_loaded(kTokenEmbeddingMethod) && + module_->is_method_loaded(kTextModelMethod); + } +}; + +} // namespace llm +} // namespace extension +} // namespace executorch diff --git a/packages/react-native-executorch/common/runner/multimodal_input.h b/packages/react-native-executorch/common/runner/multimodal_input.h new file mode 100644 index 000000000..4ce588db6 --- /dev/null +++ b/packages/react-native-executorch/common/runner/multimodal_input.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Ported from executorch/extension/llm/runner/multimodal_input.h +// Audio support stripped — only text and image are used by LFM2-VL. 
+ +#pragma once + +#include +#include +#include +#include + +namespace executorch { +namespace extension { +namespace llm { + +class MultimodalInput { +public: + explicit MultimodalInput(const std::string &text) : data_(text) {} + explicit MultimodalInput(std::string &&text) : data_(std::move(text)) {} + explicit MultimodalInput(const std::vector &tokens) + : data_(tokens) {} + explicit MultimodalInput(std::vector &&tokens) + : data_(std::move(tokens)) {} + explicit MultimodalInput(const Image &image) : data_(image) {} + explicit MultimodalInput(Image &&image) : data_(std::move(image)) {} + + MultimodalInput(const MultimodalInput &) = default; + MultimodalInput &operator=(const MultimodalInput &) = default; + MultimodalInput(MultimodalInput &&) noexcept = default; + MultimodalInput &operator=(MultimodalInput &&) noexcept = default; + + bool is_text() const noexcept { + return std::holds_alternative(data_); + } + bool is_tokens() const noexcept { + return std::holds_alternative>(data_); + } + bool is_image() const noexcept { + return std::holds_alternative(data_); + } + + const std::string &get_text() const & { return std::get(data_); } + const std::vector &get_tokens() const & { + return std::get>(data_); + } + const Image &get_image() const & { return std::get(data_); } + +private: + std::variant, Image> data_; +}; + +inline MultimodalInput make_text_input(const std::string &text) noexcept { + return MultimodalInput(text); +} +inline MultimodalInput make_text_input(std::string &&text) noexcept { + return MultimodalInput(std::move(text)); +} +inline MultimodalInput make_image_input(const Image &image) noexcept { + return MultimodalInput(image); +} +inline MultimodalInput make_image_input(Image &&image) noexcept { + return MultimodalInput(std::move(image)); +} + +} // namespace llm +} // namespace extension +} // namespace executorch diff --git a/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp 
b/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp new file mode 100644 index 000000000..c39c7cc0f --- /dev/null +++ b/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp @@ -0,0 +1,179 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Ported from executorch/extension/llm/runner/multimodal_prefiller.cpp +// with our token-embedding padding fix and LFM2-VL adaptations. + +#include "multimodal_prefiller.h" +#include "constants.h" +#include "util.h" + +namespace executorch { +namespace extension { +namespace llm { + +using ::executorch::aten::SizesType; +using ::executorch::runtime::Error; +using ::executorch::runtime::EValue; +using ::executorch::runtime::Result; + +MultimodalPrefiller::MultimodalPrefiller( + Module *module, MultimodalDecoderRunner *decoder_runner, + tokenizers::HFTokenizer *tokenizer, IOManager *io_manager) + : module_(module), decoder_runner_(decoder_runner), tokenizer_(tokenizer), + io_manager_(io_manager) {} + +Result MultimodalPrefiller::prefill(const MultimodalInput &input, + int64_t &start_pos) { + // Keep backing storage alive for the duration of the prefill call. + EValue encoder_output; + std::vector padded_tokens_storage; + TensorPtr sliced_embed_storage; + + if (input.is_image()) { + const Image &image = input.get_image(); + + // Query input dtype expected by vision_encoder. 
+ auto method_meta_result = module_->method_meta(kVisionEncoderMethod); + ET_CHECK_OK_OR_RETURN_ERROR(method_meta_result.error(), + "Failed to get method_meta for %s", + kVisionEncoderMethod); + auto &method_meta = *method_meta_result; + + ET_CHECK_OR_RETURN_ERROR(method_meta.num_inputs() > 0, InvalidArgument, + "vision_encoder has no inputs"); + auto input_meta_result = method_meta.input_tensor_meta(0); + ET_CHECK_OK_OR_RETURN_ERROR(input_meta_result.error(), + "Cannot get vision_encoder input meta at 0"); + auto expected_dtype = input_meta_result->scalar_type(); + + ET_CHECK_OR_RETURN_ERROR( + expected_dtype == ::executorch::aten::ScalarType::Float && + image.is_float(), + InvalidArgument, "vision_encoder expects float32 image data"); + + auto expected_dims = input_meta_result->sizes(); + auto image_tensor_result = + image.toTensor(/*with_batch=*/expected_dims.size() == 4); + ET_CHECK_OK_OR_RETURN_ERROR(image_tensor_result.error(), + "Failed to convert image to tensor"); + + auto image_encoder_result = + module_->execute(kVisionEncoderMethod, *image_tensor_result); + ET_CHECK_OK_OR_RETURN_ERROR(image_encoder_result.error()); + encoder_output = (*image_encoder_result)[0]; + + } else if (input.is_text() || input.is_tokens()) { + std::vector tokens; + if (input.is_text()) { + auto encode_result = tokenizer_->encode(input.get_text()); + if (!encode_result.ok()) { + ET_LOG(Error, "Tokenizer encode error %d", + static_cast(encode_result.error())); + return Error::InvalidArgument; + } + tokens = std::move(*encode_result); + } else { + tokens = input.get_tokens(); + } + + const auto actual_seq_len = static_cast(tokens.size()); + + // The token_embedding PTE has a fixed MAX_SEQ_LEN input buffer. + // Pad with zeros, run embedding, then slice output back to actual length. 
+ int64_t max_seq_len = actual_seq_len; // fallback: no padding needed + auto max_seq_len_result = module_->get(kMaxSeqLen); + if (max_seq_len_result.error() == Error::Ok) { + max_seq_len = max_seq_len_result->toScalar().to(); + } + + padded_tokens_storage.assign(max_seq_len, 0); + std::copy(tokens.begin(), tokens.end(), padded_tokens_storage.begin()); + + auto text_tensor = ::executorch::extension::from_blob( + padded_tokens_storage.data(), {1, static_cast(max_seq_len)}, + ::executorch::aten::ScalarType::Long); + + auto embed_result = module_->execute(kTokenEmbeddingMethod, text_tensor); + ET_CHECK_OK_OR_RETURN_ERROR(embed_result.error()); + + auto full_embed = (*embed_result)[0].toTensor(); + const auto embed_dim = static_cast(full_embed.size(2)); + sliced_embed_storage = ::executorch::extension::from_blob( + full_embed.mutable_data_ptr(), {1, actual_seq_len, embed_dim}, + ::executorch::aten::ScalarType::Float); + encoder_output = EValue(*sliced_embed_storage); + + } else { + ET_LOG(Error, "Unsupported MultimodalInput type"); + return Error::NotSupported; + } + + // Run text_decoder for prefill. 
+ int64_t seq_len = encoder_output.toTensor().size(1); + if (seq_len == 0) { + ET_LOG(Error, "Encoder returned empty output"); + return Error::InvalidState; + } + + std::vector cache_positions; + auto cache_pos_result = populate_start_pos_or_cache_position( + module_, start_pos, cache_positions, seq_len, kTextModelMethod); + ET_CHECK_OK_OR_RETURN_ERROR(cache_pos_result.error()); + + auto prefill_result = + module_->execute(kTextModelMethod, {encoder_output, *cache_pos_result}); + ET_CHECK_OK_OR_RETURN_ERROR(prefill_result.error()); + + auto &prefill_outputs = *prefill_result; + ET_CHECK_OR_RETURN_ERROR(!prefill_outputs.empty(), InvalidState, + "text_decoder returned no outputs during prefill"); + + auto logits = prefill_outputs[0].toTensor(); + start_pos += seq_len; + + return static_cast(decoder_runner_->logits_to_token(logits)); +} + +Error MultimodalPrefiller::load() { + if (is_method_loaded()) { + return Error::Ok; + } + ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTokenEmbeddingMethod)); + ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kTextModelMethod)); + + auto method_names_result = module_->method_names(); + ET_CHECK_OK_OR_RETURN_ERROR(method_names_result.error(), + "Failed to get method names"); + const auto &methods = *method_names_result; + + if (methods.find(kVisionEncoderMethod) != methods.end()) { + ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kVisionEncoderMethod)); + } + return Error::Ok; +} + +bool MultimodalPrefiller::is_method_loaded() { + auto methods_res = module_->method_names(); + if (methods_res.error() != Error::Ok) { + return false; + } + if (!module_->is_method_loaded(kTokenEmbeddingMethod) || + !module_->is_method_loaded(kTextModelMethod)) { + return false; + } + const auto &methods = *methods_res; + if (methods.find(kVisionEncoderMethod) != methods.end()) { + return module_->is_method_loaded(kVisionEncoderMethod); + } + return true; +} + +} // namespace llm +} // namespace extension +} // namespace executorch diff --git 
a/packages/react-native-executorch/common/runner/multimodal_prefiller.h b/packages/react-native-executorch/common/runner/multimodal_prefiller.h new file mode 100644 index 000000000..ee0f99a5b --- /dev/null +++ b/packages/react-native-executorch/common/runner/multimodal_prefiller.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Ported from executorch/extension/llm/runner/multimodal_prefiller.h + +#pragma once + +#include "multimodal_decoder_runner.h" +#include "multimodal_input.h" +#include +#include + +namespace executorch { +namespace extension { +namespace llm { + +// Prefills all multimodal inputs (image + text segments) into the KV cache. +// Implements the same padding logic as the ET repo's multimodal_prefiller.cpp. +class MultimodalPrefiller { +public: + explicit MultimodalPrefiller(Module *module, + MultimodalDecoderRunner *decoder_runner, + tokenizers::HFTokenizer *tokenizer, + IOManager *io_manager); + + // Prefill one input segment. Updates start_pos in-place. + // Returns the first predicted token after this segment. + ::executorch::runtime::Result prefill(const MultimodalInput &input, + int64_t &start_pos); + + ::executorch::runtime::Error load(); + bool is_method_loaded(); + +private: + Module *module_; + MultimodalDecoderRunner *decoder_runner_; + tokenizers::HFTokenizer *tokenizer_; + IOManager *io_manager_; +}; + +} // namespace llm +} // namespace extension +} // namespace executorch diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.cpp b/packages/react-native-executorch/common/runner/multimodal_runner.cpp new file mode 100644 index 000000000..842b96c72 --- /dev/null +++ b/packages/react-native-executorch/common/runner/multimodal_runner.cpp @@ -0,0 +1,149 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Ported from executorch/extension/llm/runner/multimodal_runner.cpp + +#include "multimodal_runner.h" +#include "constants.h" +#include "util.h" +#include + +namespace executorch { +namespace extension { +namespace llm { + +using ::executorch::extension::Module; +using ::executorch::runtime::Error; + +MultimodalRunner::MultimodalRunner( + std::unordered_map metadata, + std::unique_ptr tokenizer, + std::unique_ptr module, + std::unique_ptr decoder_runner, + std::unique_ptr prefiller, + std::unique_ptr io_manager, + std::unique_ptr token_generator, + std::unique_ptr stats) + : metadata_(std::move(metadata)), tokenizer_(std::move(tokenizer)), + module_(std::move(module)), decoder_runner_(std::move(decoder_runner)), + prefiller_(std::move(prefiller)), io_manager_(std::move(io_manager)), + token_generator_(std::move(token_generator)), stats_(std::move(stats)), + pos_(0) {} + +bool MultimodalRunner::is_loaded() { + return prefiller_->is_method_loaded() && token_generator_->is_loaded(); +} + +Error MultimodalRunner::load() { + if (is_loaded()) { + return Error::Ok; + } + ET_CHECK_OK_OR_RETURN_ERROR(prefiller_->load()); + ET_CHECK_OK_OR_RETURN_ERROR(token_generator_->load()); + return Error::Ok; +} + +Error MultimodalRunner::generate( + const std::vector &inputs, float temperature, float topp, + int32_t max_new_tokens, + std::function token_callback) { + if (inputs.empty()) { + ET_LOG(Error, "MultimodalInput vector cannot be empty"); + return Error::InvalidArgument; + } + + if (!is_loaded()) { + ET_CHECK_OK_OR_RETURN_ERROR(load()); + } + + stats_->inference_start_ms = time_in_ms(); + + // Prefill all input segments in order. 
+ uint64_t prefill_next_token = 0; + for (size_t i = 0; i < inputs.size(); ++i) { + ET_LOG(Info, "Prefilling input %zu/%zu", i + 1, inputs.size()); + auto prefill_result = prefiller_->prefill(inputs[i], pos_); + if (!prefill_result.ok()) { + return prefill_result.error(); + } + prefill_next_token = prefill_result.get(); + } + + stats_->first_token_ms = time_in_ms(); + stats_->prompt_eval_end_ms = time_in_ms(); + stats_->num_prompt_tokens = pos_; + + // Decode and emit the first token from prefill. + auto decode_result = + tokenizer_->decode(prefill_next_token, prefill_next_token); + if (!decode_result.ok()) { + ET_LOG(Error, "Tokenizer decode error %d", + static_cast(decode_result.error())); + return Error::InvalidArgument; + } + const std::string first_piece = std::move(*decode_result); + safe_printf(first_piece.c_str()); + fflush(stdout); + if (token_callback) { + token_callback(first_piece); + } + + // Resolve max_new_tokens from metadata if caller passed -1. + int64_t context_len = metadata_.count(kMaxContextLen) + ? metadata_.at(kMaxContextLen) + : metadata_.count(kMaxSeqLen) ? metadata_.at(kMaxSeqLen) + : 2048; + int32_t resolved_max_new = max_new_tokens > 0 + ? max_new_tokens + : static_cast(context_len - pos_); + resolved_max_new = std::max(0, resolved_max_new); + + // Autoregressive decode loop. 
+ std::vector prompt_tokens = {prefill_next_token}; + auto wrapped_callback = [&](const std::string &piece) { + safe_printf(piece.c_str()); + fflush(stdout); + if (token_callback) { + token_callback(piece); + } + }; + + auto generate_result = token_generator_->generate( + prompt_tokens, pos_, + static_cast(std::max(0, resolved_max_new - 1)), temperature, + topp, wrapped_callback); + + if (!generate_result.ok()) { + return generate_result.error(); + } + + int64_t num_generated = generate_result.get(); + pos_ += num_generated; + + stats_->inference_end_ms = time_in_ms(); + stats_->num_generated_tokens = num_generated; + + return Error::Ok; +} + +void MultimodalRunner::stop() { + if (token_generator_) { + token_generator_->stop(); + } +} + +void MultimodalRunner::reset() { + pos_ = 0; + if (stats_) { + stats_->reset(); + } +} + +} // namespace llm +} // namespace extension +} // namespace executorch diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.h b/packages/react-native-executorch/common/runner/multimodal_runner.h new file mode 100644 index 000000000..c8007a67e --- /dev/null +++ b/packages/react-native-executorch/common/runner/multimodal_runner.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// Ported from executorch/extension/llm/runner/multimodal_runner.h + +#pragma once + +#include "multimodal_decoder_runner.h" +#include "multimodal_input.h" +#include "multimodal_prefiller.h" +#include "stats.h" +#include "text_token_generator.h" +#include +#include +#include +#include +#include +#include + +namespace executorch { +namespace extension { +namespace llm { + +class MultimodalRunner { +public: + explicit MultimodalRunner( + std::unordered_map metadata, + std::unique_ptr tokenizer, + std::unique_ptr module, + std::unique_ptr decoder_runner, + std::unique_ptr prefiller, + std::unique_ptr io_manager, + std::unique_ptr token_generator, + std::unique_ptr stats); + + bool is_loaded(); + ::executorch::runtime::Error load(); + + ::executorch::runtime::Error + generate(const std::vector &inputs, float temperature, + float topp, int32_t max_new_tokens, + std::function token_callback = {}); + + void stop(); + void reset(); + + Stats &stats() { return *stats_; } + +private: + std::unordered_map metadata_; + std::unique_ptr tokenizer_; + std::unique_ptr module_; + std::unique_ptr decoder_runner_; + std::unique_ptr prefiller_; + std::unique_ptr io_manager_; + std::unique_ptr token_generator_; + std::unique_ptr stats_; + int64_t pos_ = 0; +}; + +} // namespace llm +} // namespace extension +} // namespace executorch diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useMultimodalLLM.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useMultimodalLLM.ts new file mode 100644 index 000000000..0a54239cc --- /dev/null +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useMultimodalLLM.ts @@ -0,0 +1,153 @@ +import { useCallback, useEffect, useRef, useState } from 'react'; +import { ResourceSource } from '../../types/common'; +import { ResourceFetcher } from '../../utils/ResourceFetcher'; +import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils'; +import { 
RnExecutorchErrorCode } from '../../errors/ErrorCodes'; + +export interface MultimodalLLMProps { + model: { + modelSource: ResourceSource; + tokenizerSource: ResourceSource; + }; + preventLoad?: boolean; +} + +export interface MultimodalLLMType { + isReady: boolean; + isGenerating: boolean; + downloadProgress: number; + response: string; + error: RnExecutorchError | null; + generate: (imagePath: string, prompt: string) => Promise; + interrupt: () => void; +} + +/** + * React hook for managing a Multimodal LLM (VLM) instance. + * Uses `loadMultimodalLLM` native global, which wraps a multi-method PTE + * with vision_encoder, token_embedding, and text_decoder methods. + * + * @category Hooks + */ +export const useMultimodalLLM = ({ + model, + preventLoad = false, +}: MultimodalLLMProps): MultimodalLLMType => { + const [nativeModule, setNativeModule] = useState(null); + const [isReady, setIsReady] = useState(false); + const [isGenerating, setIsGenerating] = useState(false); + const [downloadProgress, setDownloadProgress] = useState(0); + const [response, setResponse] = useState(''); + const [error, setError] = useState(null); + + useEffect(() => { + setDownloadProgress(0); + setError(null); + setIsReady(false); + + if (preventLoad) return; + + let cancelled = false; + + (async () => { + try { + const [modelResults, tokenizerResults] = await Promise.all([ + ResourceFetcher.fetch(setDownloadProgress, model.modelSource), + ResourceFetcher.fetch(undefined, model.tokenizerSource), + ]); + + if (cancelled) return; + + const modelPath = modelResults?.[0]; + const tokenizerPath = tokenizerResults?.[0]; + + if (!modelPath || !tokenizerPath) { + throw new RnExecutorchError( + RnExecutorchErrorCode.DownloadInterrupted, + 'Download interrupted — not all files were fetched.' 
+ ); + } + + const mod = global.loadMultimodalLLM(modelPath, tokenizerPath); + setNativeModule(mod); + setIsReady(true); + } catch (e) { + if (!cancelled) { + setError(parseUnknownError(e)); + } + } + })(); + + return () => { + cancelled = true; + }; + }, [model.modelSource, model.tokenizerSource, preventLoad]); + + const tokenBufferRef = useRef(''); + const rafRef = useRef | null>(null); + + const generate = useCallback( + async (imagePath: string, prompt: string): Promise => { + if (!nativeModule) { + throw new RnExecutorchError( + RnExecutorchErrorCode.ModuleNotLoaded, + 'Multimodal LLM is not loaded yet.' + ); + } + tokenBufferRef.current = ''; + if (rafRef.current !== null) { + cancelAnimationFrame(rafRef.current); + rafRef.current = null; + } + setResponse(''); + setIsGenerating(true); + try { + const result: string = await nativeModule.generate( + imagePath, + prompt, + (token: string) => { + tokenBufferRef.current += token; + if (rafRef.current === null) { + rafRef.current = requestAnimationFrame(() => { + rafRef.current = null; + const buffered = tokenBufferRef.current; + tokenBufferRef.current = ''; + setResponse((prev) => prev + buffered); + }); + } + } + ); + // Flush any remaining buffered tokens after generation completes + if (rafRef.current !== null) { + cancelAnimationFrame(rafRef.current); + rafRef.current = null; + } + if (tokenBufferRef.current) { + const remaining = tokenBufferRef.current; + tokenBufferRef.current = ''; + setResponse((prev) => prev + remaining); + } + return result; + } catch (e) { + throw parseUnknownError(e); + } finally { + setIsGenerating(false); + } + }, + [nativeModule] + ); + + const interrupt = useCallback(() => { + nativeModule?.interrupt(); + }, [nativeModule]); + + return { + isReady, + isGenerating, + downloadProgress, + response, + error, + generate, + interrupt, + }; +}; diff --git a/packages/react-native-executorch/src/index.ts b/packages/react-native-executorch/src/index.ts index dd7557ca2..e544d9cca 100644 --- 
a/packages/react-native-executorch/src/index.ts +++ b/packages/react-native-executorch/src/index.ts @@ -49,6 +49,7 @@ declare global { var loadVAD: (source: string) => any; var loadTextEmbeddings: (modelSource: string, tokenizerSource: string) => any; var loadLLM: (modelSource: string, tokenizerSource: string) => any; + var loadMultimodalLLM: (modelSource: string, tokenizerSource: string) => any; var loadTextToImage: ( tokenizerSource: string, encoderSource: string, @@ -97,6 +98,7 @@ if ( global.loadImageEmbeddings == null || global.loadVAD == null || global.loadLLM == null || + global.loadMultimodalLLM == null || global.loadSpeechToText == null || global.loadTextToSpeechKokoro == null || global.loadOCR == null || @@ -121,6 +123,7 @@ export * from './hooks/computer_vision/useImageEmbeddings'; export * from './hooks/computer_vision/useTextToImage'; export * from './hooks/natural_language_processing/useLLM'; +export * from './hooks/natural_language_processing/useMultimodalLLM'; export * from './hooks/natural_language_processing/useSpeechToText'; export * from './hooks/natural_language_processing/useTextToSpeech'; export * from './hooks/natural_language_processing/useTextEmbeddings'; From 57d96189bd4fa8490bf1d4b1a9778e73cd0bacc3 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Mon, 2 Mar 2026 11:13:13 +0100 Subject: [PATCH 02/46] feat: unified LLM runner for text-only and multimodal PTEs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add UnifiedRunner that auto-detects PTE layout at load time (forward method → text-only, token_embedding+text_decoder → multimodal) - Merge MultimodalLLM into LLM using UnifiedRunner - VLMs now have full feature parity: multi-turn, countTextTokens, getMaxContextLength, setCountInterval, setTimeInterval - Remove Runner, MultimodalRunner, MultimodalLLM classes - Add sendMessageWithImage to LLMController and useLLM hook - Remove useMultimodalLLM — callers use useLLM with isMultimodal: true 
- Migrate multimodal_llm example app to useLLM Co-Authored-By: Claude Sonnet 4.6 --- apps/llm/app/multimodal_llm/index.tsx | 8 +- .../rnexecutorch/RnExecutorchInstaller.cpp | 6 - .../host_objects/ModelHostObject.h | 45 +- .../common/rnexecutorch/models/llm/LLM.cpp | 152 +++++-- .../common/rnexecutorch/models/llm/LLM.h | 15 +- .../models/multimodal_llm/MultimodalLLM.cpp | 197 --------- .../models/multimodal_llm/MultimodalLLM.h | 40 -- .../common/runner/multimodal_runner.cpp | 149 ------- .../common/runner/multimodal_runner.h | 68 --- .../common/runner/runner.cpp | 391 ------------------ .../common/runner/runner.h | 87 ---- .../common/runner/unified_runner.cpp | 388 +++++++++++++++++ .../common/runner/unified_runner.h | 100 +++++ .../src/controllers/LLMController.ts | 134 ++++-- .../natural_language_processing/useLLM.ts | 13 +- .../useMultimodalLLM.ts | 153 ------- packages/react-native-executorch/src/index.ts | 3 - .../react-native-executorch/src/types/llm.ts | 15 + yarn.lock | 11 + 19 files changed, 793 insertions(+), 1182 deletions(-) delete mode 100644 packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.cpp delete mode 100644 packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.h delete mode 100644 packages/react-native-executorch/common/runner/multimodal_runner.cpp delete mode 100644 packages/react-native-executorch/common/runner/multimodal_runner.h delete mode 100644 packages/react-native-executorch/common/runner/runner.cpp delete mode 100644 packages/react-native-executorch/common/runner/runner.h create mode 100644 packages/react-native-executorch/common/runner/unified_runner.cpp create mode 100644 packages/react-native-executorch/common/runner/unified_runner.h delete mode 100644 packages/react-native-executorch/src/hooks/natural_language_processing/useMultimodalLLM.ts diff --git a/apps/llm/app/multimodal_llm/index.tsx b/apps/llm/app/multimodal_llm/index.tsx index 3a3b4692b..990f6cf1a 
100644 --- a/apps/llm/app/multimodal_llm/index.tsx +++ b/apps/llm/app/multimodal_llm/index.tsx @@ -15,7 +15,7 @@ import { import * as DocumentPicker from 'expo-document-picker'; import { launchImageLibrary } from 'react-native-image-picker'; import { useIsFocused } from '@react-navigation/native'; -import { useMultimodalLLM } from 'react-native-executorch'; +import { useLLM } from 'react-native-executorch'; import ColorPalette from '../../colors'; import Spinner from '../../components/Spinner'; import { GeneratingContext } from '../../context'; @@ -127,7 +127,9 @@ function MultimodalLLMScreen({ const scrollViewRef = useRef(null); const { setGlobalGenerating } = useContext(GeneratingContext); - const vlm = useMultimodalLLM({ model: { modelSource, tokenizerSource } }); + const vlm = useLLM({ + model: { modelSource, tokenizerSource, isMultimodal: true }, + }); useEffect(() => { setGlobalGenerating(vlm.isGenerating); @@ -153,7 +155,7 @@ function MultimodalLLMScreen({ if (!imageUri || !prompt.trim() || !vlm.isReady || vlm.isGenerating) return; Keyboard.dismiss(); try { - await vlm.generate(imageUri, prompt.trim()); + await vlm.sendMessageWithImage(imageUri, prompt.trim()); } catch (e) { console.error('Generation error:', e); } diff --git a/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp b/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp index 4cb6afef8..9d4b419e2 100644 --- a/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/RnExecutorchInstaller.cpp @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include @@ -89,11 +88,6 @@ void RnExecutorchInstaller::injectJSIBindings( RnExecutorchInstaller::loadModel( jsiRuntime, jsCallInvoker, "loadLLM")); - jsiRuntime->global().setProperty( - *jsiRuntime, "loadMultimodalLLM", - RnExecutorchInstaller::loadModel( - jsiRuntime, jsCallInvoker, "loadMultimodalLLM")); 
- jsiRuntime->global().setProperty( *jsiRuntime, "loadOCR", RnExecutorchInstaller::loadModel( diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h index 2b7cbc2e1..f41da1e45 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include @@ -33,11 +32,7 @@ template class ModelHostObject : public JsiHostObject { explicit ModelHostObject(const std::shared_ptr &model, std::shared_ptr callInvoker) : model(model), callInvoker(callInvoker) { - // MultimodalLLM moves module_ into its runner during construction, so - // the base class methods that go through module_ (forward, getInputShape) - // are unsafe to expose. Its unload is registered separately below. - if constexpr (meta::DerivedFromOrSameAs && - !meta::SameAs) { + if constexpr (meta::DerivedFromOrSameAs) { addFunctions( JSI_EXPORT_FUNCTION(ModelHostObject, unload, "unload")); @@ -50,7 +45,9 @@ template class ModelHostObject : public JsiHostObject { "getInputShape")); } - if constexpr (meta::HasGenerate) { + // LLM has overloaded generate — handled explicitly in the LLM block below + if constexpr (meta::HasGenerate && + !meta::SameAs) { addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject, promiseHostFunction<&Model::generate>, "generate")); @@ -103,6 +100,12 @@ template class ModelHostObject : public JsiHostObject { } if constexpr (meta::SameAs) { + addFunctions(JSI_EXPORT_FUNCTION( + ModelHostObject, + promiseHostFunction)>(&Model::generate)>, + "generate")); + addFunctions(JSI_EXPORT_FUNCTION( ModelHostObject, synchronousHostFunction<&Model::interrupt>, "interrupt")); @@ -149,6 +152,17 @@ template class ModelHostObject : public JsiHostObject { 
addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject, synchronousHostFunction<&Model::reset>, "reset")); + + addFunctions(JSI_EXPORT_FUNCTION( + ModelHostObject, + promiseHostFunction)>( + &Model::generate)>, + "generateWithImage")); + + addFunctions(JSI_EXPORT_FUNCTION( + ModelHostObject, synchronousHostFunction<&Model::isMultimodal>, + "isMultimodal")); } if constexpr (meta::SameAs) { @@ -177,23 +191,6 @@ template class ModelHostObject : public JsiHostObject { ModelHostObject, synchronousHostFunction<&Model::streamStop>, "streamStop")); } - - if constexpr (meta::SameAs) { - addFunctions(JSI_EXPORT_FUNCTION( - ModelHostObject, synchronousHostFunction<&Model::interrupt>, - "interrupt")); - - addFunctions(JSI_EXPORT_FUNCTION( - ModelHostObject, - synchronousHostFunction<&Model::setTemperature>, "setTemperature")); - - addFunctions(JSI_EXPORT_FUNCTION(ModelHostObject, - synchronousHostFunction<&Model::setTopp>, - "setTopp")); - - addFunctions( - JSI_EXPORT_FUNCTION(ModelHostObject, unload, "unload")); - } } // A generic host function that runs synchronously, works analogously to the diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp index 4a9d40033..66b151faa 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp @@ -3,22 +3,64 @@ #include #include #include +#include #include +#include namespace rnexecutorch::models::llm { namespace llm = ::executorch::extension::llm; namespace fs = std::filesystem; using namespace facebook; -using executorch::extension::TensorPtr; using executorch::extension::module::Module; using executorch::runtime::Error; +// LFM2-VL vision encoder expects [1, 3, 512, 512] NCHW float32, values [0,255] +static constexpr int kImageSize = 512; +static constexpr int kImageChannels = 3; + +// LFM2-VL chat template +static constexpr const char 
*kChatPrefix = "<|startoftext|><|im_start|>user\n"; +static constexpr const char *kChatSuffix = + "<|im_end|>\n<|im_start|>assistant\n"; + +static llm::Image loadImageForVLM(const std::string &imagePath) { + cv::Mat mat = image_processing::readImage(imagePath); + cv::resize(mat, mat, cv::Size(kImageSize, kImageSize)); + cv::cvtColor(mat, mat, cv::COLOR_BGR2RGB); + + std::vector chw(kImageChannels * kImageSize * kImageSize); + const int pixelCount = kImageSize * kImageSize; + for (int i = 0; i < pixelCount; ++i) { + cv::Vec3b px = mat.at(i / kImageSize, i % kImageSize); + for (int c = 0; c < kImageChannels; ++c) { + chw[c * pixelCount + i] = static_cast(px[c]); + } + } + return llm::Image(std::move(chw), kImageSize, kImageSize, kImageChannels); +} + LLM::LLM(const std::string &modelSource, const std::string &tokenizerSource, std::shared_ptr callInvoker) - : BaseModel(modelSource, callInvoker, Module::LoadMode::File), - runner( - std::make_unique(module_.get(), tokenizerSource)) { - auto loadResult = runner->load(); + : BaseModel(modelSource, callInvoker, Module::LoadMode::File) { + + // Peek at method names to decide text vs multimodal before constructing + // runner + auto method_names_result = module_->method_names(); + multimodal_ = method_names_result.ok() && + method_names_result->count(llm::kTokenEmbeddingMethod) > 0 && + method_names_result->count(llm::kTextModelMethod) > 0; + + if (multimodal_) { + // Transfer module_ ownership to the runner (same as old MultimodalLLM) + runner_ = std::make_unique( + nullptr, std::move(module_), tokenizerSource); + } else { + // Lend module_ as a raw pointer (same as old LLM) + runner_ = std::make_unique(module_.get(), nullptr, + tokenizerSource); + } + + auto loadResult = runner_->load(); if (loadResult != Error::Ok) { throw RnExecutorchError(loadResult, "Failed to load LLM runner"); } @@ -27,17 +69,21 @@ LLM::LLM(const std::string &modelSource, const std::string &tokenizerSource, fs::file_size(fs::path(tokenizerSource)); } 
-// TODO: add a way to manipulate the generation config with params +bool LLM::isMultimodal() const noexcept { return multimodal_; } + std::string LLM::generate(std::string input, std::shared_ptr callback) { - if (!runner || !runner->is_loaded()) { + if (!runner_ || !runner_->is_loaded()) { throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded, "Runner is not loaded"); } + if (multimodal_) { + throw RnExecutorchError( + RnExecutorchErrorCode::InvalidUserInput, + "This is a multimodal model. Call generate(imagePath, prompt, cb)."); + } std::string output; - - // Create a native callback that accumulates tokens and optionally invokes JS auto nativeCallback = [this, callback, &output](const std::string &token) { output += token; if (callback && callInvoker) { @@ -48,51 +94,87 @@ std::string LLM::generate(std::string input, }; auto config = llm::GenerationConfig{.echo = false, .warming = false}; - auto error = runner->generate(input, config, nativeCallback, {}); - if (error != executorch::runtime::Error::Ok) { + auto error = runner_->generate(input, config, nativeCallback, {}); + if (error != Error::Ok) { throw RnExecutorchError(error, "Failed to generate text"); } + return output; +} + +std::string LLM::generate(std::string imagePath, std::string prompt, + std::shared_ptr callback) { + if (!runner_ || !runner_->is_loaded()) { + throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded, + "Runner is not loaded"); + } + if (!multimodal_) { + throw RnExecutorchError( + RnExecutorchErrorCode::InvalidUserInput, + "This is a text-only model. 
Call generate(prompt, cb)."); + } + + llm::Image image = loadImageForVLM(imagePath); + std::vector inputs = { + llm::make_text_input(std::string(kChatPrefix)), + llm::make_image_input(std::move(image)), + llm::make_text_input(prompt + kChatSuffix), + }; + + std::string output; + auto nativeCallback = [this, &callback, &output](const std::string &token) { + output += token; + if (callback && callInvoker) { + callInvoker->invokeAsync([callback, token](jsi::Runtime &runtime) { + callback->call(runtime, jsi::String::createFromUtf8(runtime, token)); + }); + } + }; + + auto error = + runner_->generate(inputs, temperature_, topp_, -1, nativeCallback); + if (error != Error::Ok) { + throw RnExecutorchError(error, "Failed to generate multimodal response"); + } + runner_->reset(); return output; } void LLM::interrupt() { - if (!runner || !runner->is_loaded()) { + if (!runner_ || !runner_->is_loaded()) { throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded, "Can't interrupt a model that's not loaded"); } - runner->stop(); + runner_->stop(); } void LLM::reset() { - if (!runner || !runner->is_loaded()) { + if (!runner_ || !runner_->is_loaded()) { throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded, "Can't reset a model that's not loaded"); } - runner->reset(); + runner_->reset(); } size_t LLM::getGeneratedTokenCount() const noexcept { - if (!runner || !runner->is_loaded()) { + if (!runner_ || !runner_->is_loaded()) return 0; - } - return runner->stats_.num_generated_tokens; + return runner_->stats_.num_generated_tokens; } size_t LLM::getPromptTokenCount() const noexcept { - if (!runner || !runner->is_loaded()) { + if (!runner_ || !runner_->is_loaded()) return 0; - } - return runner->stats_.num_prompt_tokens; + return runner_->stats_.num_prompt_tokens; } int32_t LLM::countTextTokens(std::string text) const { - if (!runner || !runner->is_loaded()) { + if (!runner_ || !runner_->is_loaded()) { throw RnExecutorchError( RnExecutorchErrorCode::ModuleNotLoaded, 
"Can't count tokens from a model that's not loaded"); } - return runner->count_text_tokens(text); + return runner_->count_text_tokens(text); } size_t LLM::getMemoryLowerBound() const noexcept { @@ -100,7 +182,7 @@ size_t LLM::getMemoryLowerBound() const noexcept { } void LLM::setCountInterval(size_t countInterval) { - if (!runner || !runner->is_loaded()) { + if (!runner_ || !runner_->is_loaded()) { throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded, "Can't configure a model that's not loaded"); } @@ -108,11 +190,11 @@ void LLM::setCountInterval(size_t countInterval) { throw RnExecutorchError(RnExecutorchErrorCode::InvalidConfig, "Count interval must be greater than 0"); } - runner->set_count_interval(countInterval); + runner_->set_count_interval(countInterval); } void LLM::setTimeInterval(size_t timeInterval) { - if (!runner || !runner->is_loaded()) { + if (!runner_ || !runner_->is_loaded()) { throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded, "Can't configure a model that's not loaded"); } @@ -120,11 +202,11 @@ void LLM::setTimeInterval(size_t timeInterval) { throw RnExecutorchError(RnExecutorchErrorCode::InvalidConfig, "Time interval must be greater than 0"); } - runner->set_time_interval(timeInterval); + runner_->set_time_interval(timeInterval); } void LLM::setTemperature(float temperature) { - if (!runner || !runner->is_loaded()) { + if (!runner_ || !runner_->is_loaded()) { throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded, "Can't configure a model that's not loaded"); } @@ -132,11 +214,12 @@ void LLM::setTemperature(float temperature) { throw RnExecutorchError(RnExecutorchErrorCode::InvalidConfig, "Temperature must be non-negative"); } - runner->set_temperature(temperature); + temperature_ = temperature; + runner_->set_temperature(temperature); } void LLM::setTopp(float topp) { - if (!runner || !runner->is_loaded()) { + if (!runner_ || !runner_->is_loaded()) { throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded, 
"Can't configure a model that's not loaded"); } @@ -144,18 +227,19 @@ void LLM::setTopp(float topp) { throw RnExecutorchError(RnExecutorchErrorCode::InvalidConfig, "Top-p must be between 0.0 and 1.0"); } - runner->set_topp(topp); + topp_ = topp; + runner_->set_topp(topp); } int32_t LLM::getMaxContextLength() const { - if (!runner || !runner->is_loaded()) { + if (!runner_ || !runner_->is_loaded()) { throw RnExecutorchError( RnExecutorchErrorCode::ModuleNotLoaded, "Can't get context length from a model that's not loaded"); } - return runner->get_max_context_length(); + return runner_->get_max_context_length(); } -void LLM::unload() noexcept { runner.reset(nullptr); } +void LLM::unload() noexcept { runner_.reset(nullptr); } } // namespace rnexecutorch::models::llm diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h index 99daaf6f5..3763fe924 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h @@ -6,7 +6,7 @@ #include #include #include -#include +#include namespace rnexecutorch { namespace models::llm { @@ -18,8 +18,16 @@ class LLM : public BaseModel { const std::string &tokenizerSource, std::shared_ptr callInvoker); + // Text-only generate (existing signature — used by LLMController) std::string generate(std::string input, std::shared_ptr callback); + + // Multimodal generate (image + text prompt) + std::string generate(std::string imagePath, std::string prompt, + std::shared_ptr callback); + + bool isMultimodal() const noexcept; + void interrupt(); void reset(); void unload() noexcept; @@ -34,7 +42,10 @@ class LLM : public BaseModel { int32_t getMaxContextLength() const; private: - std::unique_ptr runner; + std::unique_ptr runner_; + bool multimodal_; + float temperature_ = 0.8f; + float topp_ = 0.9f; }; } // namespace models::llm diff --git 
a/packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.cpp deleted file mode 100644 index 7187b3d57..000000000 --- a/packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.cpp +++ /dev/null @@ -1,197 +0,0 @@ -#include "MultimodalLLM.h" - -#include -#include -#include -#include -#include -#include -#include - -namespace rnexecutorch::models::multimodal_llm { -namespace llm = ::executorch::extension::llm; -namespace fs = std::filesystem; -using namespace facebook; -using ::executorch::extension::module::Module; -using ::executorch::runtime::Error; - -// LFM2-VL vision encoder expects [1, 3, 512, 512] NCHW float32, values in -// [0,255]. Normalization and patch unfolding are baked into the exported PTE. -static constexpr int kImageSize = 512; -static constexpr int kImageChannels = 3; - -// LFM2-VL chat template -static constexpr const char *kChatPrefix = "<|startoftext|><|im_start|>user\n"; -static constexpr const char *kChatSuffix = - "<|im_end|>\n<|im_start|>assistant\n"; - -static llm::Image loadImageForLFM2(const std::string &imagePath) { - cv::Mat mat = image_processing::readImage(imagePath); - cv::resize(mat, mat, cv::Size(kImageSize, kImageSize)); - cv::cvtColor(mat, mat, cv::COLOR_BGR2RGB); - - // HWC uint8 → CHW float32, values in [0, 255] - std::vector chw(kImageChannels * kImageSize * kImageSize); - const int pixelCount = kImageSize * kImageSize; - for (int i = 0; i < pixelCount; ++i) { - cv::Vec3b px = mat.at(i / kImageSize, i % kImageSize); - for (int c = 0; c < kImageChannels; ++c) { - chw[c * pixelCount + i] = static_cast(px[c]); - } - } - return llm::Image(std::move(chw), kImageSize, kImageSize, kImageChannels); -} - -MultimodalLLM::MultimodalLLM(const std::string &modelSource, - const std::string &tokenizerSource, - std::shared_ptr callInvoker) - : BaseModel(modelSource, callInvoker, 
Module::LoadMode::File) { - // Build the multimodal runner from parts — all referencing module_ owned by - // BaseModel so we don't load the PTE twice. - auto tokenizer = std::make_unique(); - auto tokenizer_status = tokenizer->load(tokenizerSource); - if (tokenizer_status != tokenizers::Error::Ok) { - throw RnExecutorchError(RnExecutorchErrorCode::TokenizerError, - "Failed to load tokenizer"); - } - - auto io_manager = std::make_unique(*module_); - auto decoder_runner = std::make_unique( - module_.get(), io_manager.get()); - - auto eos_ids = std::make_unique>(); - // Read EOS ids from PTE constant method if present, default to 7 (<|im_end|>) - auto method_names_result = module_->method_names(); - if (method_names_result.ok()) { - if (method_names_result->count(llm::kEosIds)) { - auto eos_result = module_->execute(llm::kEosIds); - if (eos_result.ok()) { - for (const auto &ev : *eos_result) { - eos_ids->emplace(static_cast(ev.toScalar().to())); - } - } - } - } - if (eos_ids->empty()) { - eos_ids->emplace(7); // <|im_end|> fallback - } - - auto stats = std::make_unique(); - // Keep a raw pointer before moving into the runner so TextTokenGenerator - // can safely reference the same Stats object owned by the runner. 
- llm::Stats *stats_ptr = stats.get(); - auto token_generator = std::make_unique( - tokenizer.get(), decoder_runner.get(), /*use_kv_cache=*/true, - std::move(eos_ids), stats_ptr); - - auto prefiller = std::make_unique( - module_.get(), decoder_runner.get(), tokenizer.get(), io_manager.get()); - - // Read metadata from the PTE - std::unordered_map metadata = { - {llm::kMaxSeqLen, 2048}, - {llm::kMaxContextLen, 2048}, - }; - if (method_names_result.ok()) { - for (auto &pair : metadata) { - if (method_names_result->count(pair.first)) { - auto val = module_->get(pair.first); - if (val.ok()) { - pair.second = val->toScalar().to(); - } - } - } - } - - runner_ = std::make_unique( - std::move(metadata), std::move(tokenizer), std::move(module_), - std::move(decoder_runner), std::move(prefiller), std::move(io_manager), - std::move(token_generator), std::move(stats)); - - auto loadError = runner_->load(); - if (loadError != Error::Ok) { - throw RnExecutorchError(loadError, "Failed to load multimodal runner"); - } - - memorySizeLowerBound = fs::file_size(fs::path(modelSource)) + - fs::file_size(fs::path(tokenizerSource)); -} - -std::string MultimodalLLM::generate(std::string imagePath, std::string prompt, - std::shared_ptr callback) { - if (!runner_) { - throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded, - "Runner is not loaded"); - } - - llm::Image image = loadImageForLFM2(imagePath); - - std::vector inputs = { - llm::make_text_input(std::string(kChatPrefix)), - llm::make_image_input(std::move(image)), - llm::make_text_input(prompt + kChatSuffix), - }; - - std::string output; - auto nativeCallback = [this, &callback, &output](const std::string &token) { - output += token; - if (callback && callInvoker) { - callInvoker->invokeAsync([callback, token](jsi::Runtime &runtime) { - callback->call(runtime, jsi::String::createFromUtf8(runtime, token)); - }); - } - }; - - auto error = runner_->generate(inputs, temperature_, topp_, - /*max_new_tokens=*/-1, nativeCallback); 
- if (error != Error::Ok) { - throw RnExecutorchError(error, "Failed to generate text"); - } - - runner_->reset(); - return output; -} - -void MultimodalLLM::interrupt() { - if (!runner_) { - throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded, - "Can't interrupt a model that's not loaded"); - } - runner_->stop(); -} - -size_t MultimodalLLM::getGeneratedTokenCount() const noexcept { - if (!runner_) - return 0; - return static_cast(runner_->stats().num_generated_tokens); -} - -size_t MultimodalLLM::getPromptTokenCount() const noexcept { - if (!runner_) - return 0; - return static_cast(runner_->stats().num_prompt_tokens); -} - -size_t MultimodalLLM::getMemoryLowerBound() const noexcept { - return memorySizeLowerBound; -} - -void MultimodalLLM::setTemperature(float temperature) { - if (temperature < 0.0f) { - throw RnExecutorchError(RnExecutorchErrorCode::InvalidConfig, - "Temperature must be non-negative"); - } - temperature_ = temperature; -} - -void MultimodalLLM::setTopp(float topp) { - if (topp < 0.0f || topp > 1.0f) { - throw RnExecutorchError(RnExecutorchErrorCode::InvalidConfig, - "Top-p must be between 0.0 and 1.0"); - } - topp_ = topp; -} - -void MultimodalLLM::unload() noexcept { runner_.reset(nullptr); } - -} // namespace rnexecutorch::models::multimodal_llm diff --git a/packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.h b/packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.h deleted file mode 100644 index 6b9f8698c..000000000 --- a/packages/react-native-executorch/common/rnexecutorch/models/multimodal_llm/MultimodalLLM.h +++ /dev/null @@ -1,40 +0,0 @@ -#pragma once - -#include -#include - -#include -#include -#include -#include - -namespace rnexecutorch { -namespace models::multimodal_llm { -using namespace facebook; - -class MultimodalLLM : public BaseModel { -public: - explicit MultimodalLLM(const std::string &modelSource, - const std::string &tokenizerSource, - 
std::shared_ptr callInvoker); - - std::string generate(std::string imagePath, std::string prompt, - std::shared_ptr callback); - void interrupt(); - void unload() noexcept; - size_t getGeneratedTokenCount() const noexcept; - size_t getPromptTokenCount() const noexcept; - size_t getMemoryLowerBound() const noexcept; - void setTemperature(float temperature); - void setTopp(float topp); - -private: - float temperature_ = 0.8f; - float topp_ = 0.9f; - std::unique_ptr<::executorch::extension::llm::MultimodalRunner> runner_; -}; -} // namespace models::multimodal_llm - -REGISTER_CONSTRUCTOR(models::multimodal_llm::MultimodalLLM, std::string, - std::string, std::shared_ptr); -} // namespace rnexecutorch diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.cpp b/packages/react-native-executorch/common/runner/multimodal_runner.cpp deleted file mode 100644 index 842b96c72..000000000 --- a/packages/react-native-executorch/common/runner/multimodal_runner.cpp +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -// Ported from executorch/extension/llm/runner/multimodal_runner.cpp - -#include "multimodal_runner.h" -#include "constants.h" -#include "util.h" -#include - -namespace executorch { -namespace extension { -namespace llm { - -using ::executorch::extension::Module; -using ::executorch::runtime::Error; - -MultimodalRunner::MultimodalRunner( - std::unordered_map metadata, - std::unique_ptr tokenizer, - std::unique_ptr module, - std::unique_ptr decoder_runner, - std::unique_ptr prefiller, - std::unique_ptr io_manager, - std::unique_ptr token_generator, - std::unique_ptr stats) - : metadata_(std::move(metadata)), tokenizer_(std::move(tokenizer)), - module_(std::move(module)), decoder_runner_(std::move(decoder_runner)), - prefiller_(std::move(prefiller)), io_manager_(std::move(io_manager)), - token_generator_(std::move(token_generator)), stats_(std::move(stats)), - pos_(0) {} - -bool MultimodalRunner::is_loaded() { - return prefiller_->is_method_loaded() && token_generator_->is_loaded(); -} - -Error MultimodalRunner::load() { - if (is_loaded()) { - return Error::Ok; - } - ET_CHECK_OK_OR_RETURN_ERROR(prefiller_->load()); - ET_CHECK_OK_OR_RETURN_ERROR(token_generator_->load()); - return Error::Ok; -} - -Error MultimodalRunner::generate( - const std::vector &inputs, float temperature, float topp, - int32_t max_new_tokens, - std::function token_callback) { - if (inputs.empty()) { - ET_LOG(Error, "MultimodalInput vector cannot be empty"); - return Error::InvalidArgument; - } - - if (!is_loaded()) { - ET_CHECK_OK_OR_RETURN_ERROR(load()); - } - - stats_->inference_start_ms = time_in_ms(); - - // Prefill all input segments in order. 
- uint64_t prefill_next_token = 0; - for (size_t i = 0; i < inputs.size(); ++i) { - ET_LOG(Info, "Prefilling input %zu/%zu", i + 1, inputs.size()); - auto prefill_result = prefiller_->prefill(inputs[i], pos_); - if (!prefill_result.ok()) { - return prefill_result.error(); - } - prefill_next_token = prefill_result.get(); - } - - stats_->first_token_ms = time_in_ms(); - stats_->prompt_eval_end_ms = time_in_ms(); - stats_->num_prompt_tokens = pos_; - - // Decode and emit the first token from prefill. - auto decode_result = - tokenizer_->decode(prefill_next_token, prefill_next_token); - if (!decode_result.ok()) { - ET_LOG(Error, "Tokenizer decode error %d", - static_cast(decode_result.error())); - return Error::InvalidArgument; - } - const std::string first_piece = std::move(*decode_result); - safe_printf(first_piece.c_str()); - fflush(stdout); - if (token_callback) { - token_callback(first_piece); - } - - // Resolve max_new_tokens from metadata if caller passed -1. - int64_t context_len = metadata_.count(kMaxContextLen) - ? metadata_.at(kMaxContextLen) - : metadata_.count(kMaxSeqLen) ? metadata_.at(kMaxSeqLen) - : 2048; - int32_t resolved_max_new = max_new_tokens > 0 - ? max_new_tokens - : static_cast(context_len - pos_); - resolved_max_new = std::max(0, resolved_max_new); - - // Autoregressive decode loop. 
- std::vector prompt_tokens = {prefill_next_token}; - auto wrapped_callback = [&](const std::string &piece) { - safe_printf(piece.c_str()); - fflush(stdout); - if (token_callback) { - token_callback(piece); - } - }; - - auto generate_result = token_generator_->generate( - prompt_tokens, pos_, - static_cast(std::max(0, resolved_max_new - 1)), temperature, - topp, wrapped_callback); - - if (!generate_result.ok()) { - return generate_result.error(); - } - - int64_t num_generated = generate_result.get(); - pos_ += num_generated; - - stats_->inference_end_ms = time_in_ms(); - stats_->num_generated_tokens = num_generated; - - return Error::Ok; -} - -void MultimodalRunner::stop() { - if (token_generator_) { - token_generator_->stop(); - } -} - -void MultimodalRunner::reset() { - pos_ = 0; - if (stats_) { - stats_->reset(); - } -} - -} // namespace llm -} // namespace extension -} // namespace executorch diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.h b/packages/react-native-executorch/common/runner/multimodal_runner.h deleted file mode 100644 index c8007a67e..000000000 --- a/packages/react-native-executorch/common/runner/multimodal_runner.h +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -// Ported from executorch/extension/llm/runner/multimodal_runner.h - -#pragma once - -#include "multimodal_decoder_runner.h" -#include "multimodal_input.h" -#include "multimodal_prefiller.h" -#include "stats.h" -#include "text_token_generator.h" -#include -#include -#include -#include -#include -#include - -namespace executorch { -namespace extension { -namespace llm { - -class MultimodalRunner { -public: - explicit MultimodalRunner( - std::unordered_map metadata, - std::unique_ptr tokenizer, - std::unique_ptr module, - std::unique_ptr decoder_runner, - std::unique_ptr prefiller, - std::unique_ptr io_manager, - std::unique_ptr token_generator, - std::unique_ptr stats); - - bool is_loaded(); - ::executorch::runtime::Error load(); - - ::executorch::runtime::Error - generate(const std::vector &inputs, float temperature, - float topp, int32_t max_new_tokens, - std::function token_callback = {}); - - void stop(); - void reset(); - - Stats &stats() { return *stats_; } - -private: - std::unordered_map metadata_; - std::unique_ptr tokenizer_; - std::unique_ptr module_; - std::unique_ptr decoder_runner_; - std::unique_ptr prefiller_; - std::unique_ptr io_manager_; - std::unique_ptr token_generator_; - std::unique_ptr stats_; - int64_t pos_ = 0; -}; - -} // namespace llm -} // namespace extension -} // namespace executorch diff --git a/packages/react-native-executorch/common/runner/runner.cpp b/packages/react-native-executorch/common/runner/runner.cpp deleted file mode 100644 index 8e4660ac5..000000000 --- a/packages/react-native-executorch/common/runner/runner.cpp +++ /dev/null @@ -1,391 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - * @lint-ignore-every CLANGTIDY facebook-hte-Deprecated - */ - -// A simple llama2 runner that includes preprocessing and post processing logic. 
-// The module takes in a string as input and emits a string as output. - -#include "runner.h" -#include "constants.h" -#include "util.h" -#include -#include -#include - -namespace example { - -using namespace executorch::extension::llm; -using ::executorch::extension::Module; -using ::executorch::runtime::Error; -using ::executorch::runtime::Result; - -Runner::Runner(Module *module, const std::string &tokenizer_path, - const llm::GenerationConfig &config) - : config_(config), module_(module), tokenizer_path_(tokenizer_path), - tokenizer_(std::make_unique()), - metadata_({ - {kEnableDynamicShape, false}, - {kMaxSeqLen, 128}, - {kMaxContextLen, 128}, - {kUseKVCache, true}, - {kUseSDPAWithKVCache, false}, - }) {} - -bool Runner::is_loaded() const { - return module_->is_loaded() && tokenizer_->is_loaded() && - text_decoder_runner_ && text_prefiller_ && text_token_generator_; -} - -Error Runner::load() { - if (is_loaded()) { - return Error::Ok; - } - - auto status = tokenizer_->load(tokenizer_path_); - - if (status != tokenizers::Error::Ok) { - throw rnexecutorch::RnExecutorchError( - rnexecutorch::RnExecutorchErrorCode::TokenizerError, - "Unexpected issue occured while loading tokenizer"); - }; - - ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("forward")); - - ET_LOG(Info, "Reading metadata from model"); - - auto eos_ids = std::make_unique>(); - metadata_[kVocabSize] = tokenizer_->vocab_size(); - - // Load model metadata - const auto method_names = - ET_UNWRAP(module_->method_names(), "Failed reading method names"); - for (auto &pair : metadata_) { - const auto &method_name = pair.first; - auto &value = pair.second; - if (method_names.count(method_name)) { - value = ET_UNWRAP(module_->get(method_name)) - .toScalar() - .to(); - } else { - ET_LOG(Info, "Method %s not found, using the default value %" PRId64, - method_name.c_str(), value); - } - ET_LOG(Info, "Metadata: %s = %" PRId64, method_name.c_str(), value); - } - - // Load EOS token ids - if 
(method_names.count(kEosIds)) { - eos_ids->clear(); - for (const auto &eos_id : ET_UNWRAP(module_->execute(kEosIds))) { - auto value = eos_id.toScalar().to(); - eos_ids->emplace(value); - ET_LOG(Info, "eos_id = %" PRId64, value); - } - } - - // Determine missing config values - // If user does not directly specify configuration parameters such as - // max_seq_len (i.e. leaves them as default values), they are determined by - // reading the exported model's methods. - if (config_.max_seq_len < 0) - config_.max_seq_len = static_cast(metadata_.at(kMaxSeqLen)); - if (config_.max_context_length < 0) - config_.max_context_length = - static_cast(metadata_.at(kMaxContextLen)); - if (config_.max_new_tokens < 0) - config_.max_new_tokens = - std::min(config_.max_seq_len, config_.max_context_length); - if (config_.enable_dynamic_shape) - config_.enable_dynamic_shape = - static_cast(metadata_.at(kEnableDynamicShape)); - if (config_.enable_kv_cache) - config_.enable_kv_cache = static_cast(metadata_.at(kUseKVCache)); - - io_manager_ = std::make_unique(*module_); - text_decoder_runner_ = std::make_unique( - module_, io_manager_.get(), config_.temperature, config_.topp); - text_prefiller_ = std::make_unique( - text_decoder_runner_.get(), config_.enable_kv_cache, - config_.enable_dynamic_shape, config_.max_seq_len); - - text_token_generator_ = std::make_unique( - tokenizer_.get(), text_decoder_runner_.get(), config_.enable_kv_cache, - std::move(eos_ids), &stats_); - - return Error::Ok; -} - -// Don't print with the same priority during warmup -#define RUNNER_ET_LOG(warmup, format, ...) \ - if (warmup) { \ - ET_LOG(Debug, format, __VA_ARGS__); \ - } else { \ - ET_LOG(Info, format, __VA_ARGS__); \ - } - -Error Runner::generate(const std::string &prompt, - const llm::GenerationConfig &generation_config, - std::function token_callback, - std::function stats_callback) { - // Prepare the inputs. - // Use ones-initialized inputs. 
- ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null"); - if (!is_loaded()) { - stats_.model_load_start_ms = llm::time_in_ms(); - ET_CHECK_OK_OR_RETURN_ERROR(load()); - stats_.model_load_end_ms = llm::time_in_ms(); - } - - if (generation_config.warming) { - ET_LOG(Info, "Doing a warmup run..."); - } - - RUNNER_ET_LOG(generation_config.warming, - "RSS after loading model: %f MiB (0 if unsupported)", - llm::get_rss_bytes() / 1024.0 / 1024.0); - - // Wrap the token_callback with print function - std::function wrapped_callback = - [token_callback, &generation_config](const std::string &piece) { - if (!generation_config.warming) { - llm::safe_printf(piece.c_str()); - fflush(stdout); - } - if (token_callback) { - token_callback(piece); - } - }; - // First token time only measures the time it takes to encode the prompt and - // return a response token. - - stats_.inference_start_ms = llm::time_in_ms(); - shouldStop_ = false; - - // Override main config fields with given generation config if specified - int32_t max_seq_len = generation_config.max_seq_len >= 0 - ? generation_config.max_seq_len - : config_.max_seq_len; - int32_t max_context_length = generation_config.max_context_length >= 0 - ? generation_config.max_context_length - : config_.max_context_length; - int32_t new_tokens_limit = generation_config.max_new_tokens >= 0 - ? generation_config.max_new_tokens - : config_.max_new_tokens; - float temperature = generation_config.temperature >= 0.F - ? generation_config.temperature - : config_.temperature; - float topp = - generation_config.topp >= 0.F ? 
generation_config.topp : config_.topp; - - int64_t context_len_left = static_cast(max_context_length) - pos_; - - // If the used tokenizer.json has defined post_processor field, - // setting any of bos or eos arguments to value other than provided constant - // ( which is 0) will result in running the post_processor with - // 'add_special_token' flag - auto encodeResult = - tokenizer_->encode(prompt, numOfAddedBoSTokens, numOfAddedEoSTokens); - if (!encodeResult.ok()) { - throw rnexecutorch::RnExecutorchError( - rnexecutorch::RnExecutorchErrorCode::TokenizerError, - "Unexpected issue occured while encoding: " + - std::to_string(static_cast(encodeResult.error()))); - } - std::vector prompt_tokens = encodeResult.get(); - - std::vector prompt_tokens_uint64(prompt_tokens.begin(), - prompt_tokens.end()); - - // encode the (string) prompt into tokens sequence - int num_prompt_tokens = prompt_tokens.size(); - - ET_CHECK_OR_RETURN_ERROR(num_prompt_tokens >= 1, InvalidArgument, - "Expected at least 1 prompt token"); - ET_CHECK_OR_RETURN_ERROR(num_prompt_tokens < max_seq_len, InvalidArgument, - "num_prompt_tokens %d >= max_context_len %" PRId32 - ", Max seq length exceeded - please increase max " - "seq len value in your export script", - num_prompt_tokens, max_seq_len); - - // Determine max_new_tokens using the GenerationConfig's resolve method, - // then subtract pos_ for max_new_tokens. - int32_t max_new_tokens = resolve_max_new_tokens( - num_prompt_tokens, max_seq_len, static_cast(context_len_left), - new_tokens_limit); - - ET_LOG(Info, - "Max new tokens resolved: %d, given pos_ %" PRId64 - ", num_prompt_tokens %zu, max_context_len %" PRId64, - max_new_tokens, pos_, prompt_tokens.size(), - static_cast(max_context_length)); - ET_CHECK_OR_RETURN_ERROR(max_new_tokens > 0, InvalidArgument, - "Max new tokens %d is less than or equal to 0", - max_new_tokens); - - // Prefill first - // Here feed all tokens to the model and get the next predicted token - // after the prompt. 
After that we will enter generate loop. - - // print prompts - if (generation_config.echo) { - wrapped_callback(prompt); - } - auto prefill_res = text_prefiller_->prefill(prompt_tokens_uint64, pos_); - stats_.first_token_ms = llm::time_in_ms(); - stats_.prompt_eval_end_ms = llm::time_in_ms(); - ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error()); - uint64_t cur_token = prefill_res.get(); - auto decodeResult = tokenizer_->decode({cur_token}); - if (!decodeResult.ok()) { - throw rnexecutorch::RnExecutorchError( - rnexecutorch::RnExecutorchErrorCode::TokenizerError, - "Unexpected issue occured while decoding: " + - std::to_string(static_cast(decodeResult.error()))); - } - const std::string cur_decoded = decodeResult.get(); - RUNNER_ET_LOG(generation_config.warming, - "RSS after prompt prefill: %f MiB (0 if unsupported)", - llm::get_rss_bytes() / 1024.0 / 1024.0); - - // start the main loop - prompt_tokens_uint64.push_back(cur_token); - int64_t num_generated_tokens = ET_UNWRAP(text_token_generator_->generate( - prompt_tokens_uint64, pos_, max_new_tokens - 1, temperature, topp, - wrapped_callback)); - - pos_ += num_generated_tokens; - - stats_.inference_end_ms = llm::time_in_ms(); - if (!generation_config.warming) { - printf("\n"); - } - RUNNER_ET_LOG( - generation_config.warming, - "RSS after finishing text generation: %f MiB (0 if unsupported)", - llm::get_rss_bytes() / 1024.0 / 1024.0); - - if (num_generated_tokens == max_new_tokens) { - RUNNER_ET_LOG(generation_config.warming, "Max new tokens %i reached!", - max_new_tokens); - } - - stats_.num_prompt_tokens = num_prompt_tokens; - stats_.num_generated_tokens = num_generated_tokens; - - if (generation_config.warming) { - ET_LOG(Info, "Warmup run finished!"); - } else { - // Do not print report during warmup -#ifndef TEST_BUILD - ::executorch::llm::print_report(stats_); -#endif - } - if (stats_callback) { - stats_callback(stats_); - } - - return Error::Ok; -} - -Error Runner::warmup(const std::string &prompt) { - // 
Create a GenerationConfig for warmup - llm::GenerationConfig config{.echo = false, .warming = true}; - - // Call generate with the warmup config - Error err = generate(prompt, config, - /*token_callback=*/nullptr, - /*stats_callbak=*/nullptr); - - // Reset stats after warmup - reset(); - - return err; -} - -void Runner::stop() { - if (is_loaded()) { - text_token_generator_->stop(); - } else { - ET_LOG(Error, "Token generator is not loaded, cannot stop"); - } -} - -void Runner::reset() { - stats_.reset(); - pos_ = 0; -} - -void Runner::set_count_interval(size_t count_interval) { - text_token_generator_->set_count_interval(count_interval); -} - -void Runner::set_time_interval(size_t time_interval) { - text_token_generator_->set_time_interval(time_interval); -} - -void Runner::set_temperature(float temperature) noexcept { - config_.temperature = temperature; - if (text_decoder_runner_) { - text_decoder_runner_->set_temperature(temperature); - } -} - -void Runner::set_topp(float topp) noexcept { - config_.topp = topp; - if (text_decoder_runner_) { - text_decoder_runner_->set_topp(topp); - } -} - -int32_t Runner::get_max_context_length() const { - if (!is_loaded()) { - return metadata_.at(kMaxContextLen); - } - return config_.max_context_length; -} - -int32_t Runner::count_text_tokens(const std::string &text) const { - auto encodeResult = - tokenizer_->encode(text, numOfAddedBoSTokens, numOfAddedEoSTokens); - - if (!encodeResult.ok()) { - throw rnexecutorch::RnExecutorchError( - rnexecutorch::RnExecutorchErrorCode::TokenizerError, - "Encoding failed during token count check."); - } - - return encodeResult.get().size(); -} - -int32_t Runner::resolve_max_new_tokens(int32_t num_prompt_tokens, - int32_t max_seq_len, - int32_t max_context_len, - int32_t max_new_tokens) const { - int32_t result; - - if (max_seq_len == -1 && max_new_tokens == -1) { - // Both are -1, use max context len minus prompt tokens - result = max_context_len - num_prompt_tokens; - } else if (max_seq_len 
== -1 && max_new_tokens != -1) { - // Only max_new_tokens is specified - result = std::min(max_new_tokens, max_context_len - num_prompt_tokens); - } else if (max_seq_len != -1 && max_new_tokens == -1) { - // Only seq_len is specified - result = std::min(max_seq_len, max_context_len) - num_prompt_tokens; - } else { - // Both are specified - result = - std::min(std::min(max_seq_len, max_context_len) - num_prompt_tokens, - max_new_tokens); - } - - // Ensure result is not negative - return std::max(0, result); -} - -} // namespace example diff --git a/packages/react-native-executorch/common/runner/runner.h b/packages/react-native-executorch/common/runner/runner.h deleted file mode 100644 index 03dff39bc..000000000 --- a/packages/react-native-executorch/common/runner/runner.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -// A simple llama2 runner that includes preprocessing and post processing logic. -// The module takes in a string as input and emits a string as output. 
- -#pragma once - -#include "irunner.h" -#include "stats.h" -#include "text_decoder_runner.h" -#include "text_prefiller.h" -#include "text_token_generator.h" -#include -#include -#include -#include -#include -#include -#include -#include - -namespace example { - -namespace llm = ::executorch::extension::llm; - -class Runner : public llm::IRunner { -public: - explicit Runner(::executorch::extension::Module *module, - const std::string &tokenizer_path, - const llm::GenerationConfig &config = { - .temperature = 0.8F, .topp = 0.9F}); // The main config - - bool is_loaded() const override; - ::executorch::runtime::Error load() override; - ::executorch::runtime::Error generate( - const std::string &prompt, - const llm::GenerationConfig &generation_config = - {}, // An extra config which temporarily overrides previous model - // settings - std::function token_callback = {}, - std::function stats_callback = {}) override; - ::executorch::runtime::Error warmup(const std::string &prompt); - void set_count_interval(size_t count_interval); - void set_time_interval(size_t time_interval); - void set_temperature(float temperature) noexcept; - void set_topp(float topp) noexcept; - int32_t count_text_tokens(const std::string &text) const; - int32_t get_max_context_length() const; - - void stop() override; - void reset() override; - - llm::Stats stats_; - -private: - // Helper functions - int32_t resolve_max_new_tokens(int32_t num_prompt_tokens, int32_t max_seq_len, - int32_t max_context_len, - int32_t max_new_tokens = -1) const; - - // Main config - llm::GenerationConfig config_; - - // Flow control - bool shouldStop_{false}; - int64_t pos_ = 0; // The position in KV cache of the input, starting from 0. 
- - // Main model - ::executorch::extension::Module *module_; - - // Subcomponents - std::string tokenizer_path_; - std::unique_ptr tokenizer_; - std::unordered_map metadata_; - std::unique_ptr io_manager_; - std::unique_ptr text_decoder_runner_; - std::unique_ptr text_prefiller_; - std::unique_ptr text_token_generator_; -}; - -} // namespace example diff --git a/packages/react-native-executorch/common/runner/unified_runner.cpp b/packages/react-native-executorch/common/runner/unified_runner.cpp new file mode 100644 index 000000000..98955d593 --- /dev/null +++ b/packages/react-native-executorch/common/runner/unified_runner.cpp @@ -0,0 +1,388 @@ +// packages/react-native-executorch/common/runner/unified_runner.cpp +#include "unified_runner.h" +#include "constants.h" +#include "util.h" +#include +#include +#include + +namespace example { + +using namespace executorch::extension::llm; +using ::executorch::extension::Module; +using ::executorch::runtime::Error; +using ::executorch::runtime::Result; + +UnifiedRunner::UnifiedRunner(Module *module, + std::unique_ptr owned_module, + const std::string &tokenizer_path, + const llm::GenerationConfig &config) + : config_(config), module_(owned_module ? 
owned_module.get() : module), + owned_module_(std::move(owned_module)), tokenizer_path_(tokenizer_path), + tokenizer_(std::make_unique()), + metadata_({ + {kEnableDynamicShape, false}, + {kMaxSeqLen, 128}, + {kMaxContextLen, 128}, + {kUseKVCache, true}, + {kUseSDPAWithKVCache, false}, + }) {} + +bool UnifiedRunner::is_multimodal() const noexcept { return multimodal_; } + +bool UnifiedRunner::is_loaded() const { + if (multimodal_) { + return mm_prefiller_ && mm_prefiller_->is_method_loaded() && + mm_token_generator_ && mm_token_generator_->is_loaded(); + } + return module_->is_loaded() && tokenizer_->is_loaded() && + text_decoder_runner_ && text_prefiller_ && text_token_generator_; +} + +Error UnifiedRunner::load() { + if (is_loaded()) { + return Error::Ok; + } + + auto status = tokenizer_->load(tokenizer_path_); + if (status != tokenizers::Error::Ok) { + throw rnexecutorch::RnExecutorchError( + rnexecutorch::RnExecutorchErrorCode::TokenizerError, + "Unexpected issue occurred while loading tokenizer"); + } + + // Detect mode by inspecting method names + const auto method_names = + ET_UNWRAP(module_->method_names(), "Failed reading method names"); + + multimodal_ = method_names.count(kTokenEmbeddingMethod) > 0 && + method_names.count(kTextModelMethod) > 0; + + // Load metadata + metadata_[kVocabSize] = tokenizer_->vocab_size(); + for (auto &pair : metadata_) { + const auto &method_name = pair.first; + auto &value = pair.second; + if (method_names.count(method_name)) { + value = ET_UNWRAP(module_->get(method_name)) + .toScalar() + .to(); + } + ET_LOG(Info, "Metadata: %s = %" PRId64, method_name.c_str(), value); + } + + if (config_.max_seq_len < 0) + config_.max_seq_len = static_cast(metadata_.at(kMaxSeqLen)); + if (config_.max_context_length < 0) + config_.max_context_length = + static_cast(metadata_.at(kMaxContextLen)); + if (config_.max_new_tokens < 0) + config_.max_new_tokens = + std::min(config_.max_seq_len, config_.max_context_length); + if 
(config_.enable_dynamic_shape) + config_.enable_dynamic_shape = + static_cast(metadata_.at(kEnableDynamicShape)); + if (config_.enable_kv_cache) + config_.enable_kv_cache = static_cast(metadata_.at(kUseKVCache)); + + // Load EOS ids + auto eos_ids = std::make_unique>(); + if (method_names.count(kEosIds)) { + for (const auto &eos_id : ET_UNWRAP(module_->execute(kEosIds))) { + eos_ids->emplace(static_cast(eos_id.toScalar().to())); + } + } + if (eos_ids->empty()) { + eos_ids->emplace(7); // fallback <|im_end|> + } + + io_manager_ = std::make_unique(*module_); + llm::Stats *stats_ptr = &stats_; + + if (multimodal_) { + mm_decoder_runner_ = std::make_unique( + module_, io_manager_.get()); + mm_prefiller_ = std::make_unique( + module_, mm_decoder_runner_.get(), tokenizer_.get(), io_manager_.get()); + mm_token_generator_ = std::make_unique( + tokenizer_.get(), mm_decoder_runner_.get(), /*use_kv_cache=*/true, + std::move(eos_ids), stats_ptr); + + ET_CHECK_OK_OR_RETURN_ERROR(mm_prefiller_->load()); + ET_CHECK_OK_OR_RETURN_ERROR(mm_token_generator_->load()); + } else { + ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("forward")); + + text_decoder_runner_ = std::make_unique( + module_, io_manager_.get(), config_.temperature, config_.topp); + text_prefiller_ = std::make_unique( + text_decoder_runner_.get(), config_.enable_kv_cache, + config_.enable_dynamic_shape, config_.max_seq_len); + text_token_generator_ = std::make_unique( + tokenizer_.get(), text_decoder_runner_.get(), config_.enable_kv_cache, + std::move(eos_ids), stats_ptr); + } + + return Error::Ok; +} + +Error UnifiedRunner::generate( + const std::string &prompt, const llm::GenerationConfig &generation_config, + std::function token_callback, + std::function stats_callback) { + + ET_CHECK_MSG(!multimodal_, + "generate(prompt) called on a multimodal runner. 
Use " + "generate(vector) instead."); + ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null"); + + if (!is_loaded()) { + stats_.model_load_start_ms = llm::time_in_ms(); + ET_CHECK_OK_OR_RETURN_ERROR(load()); + stats_.model_load_end_ms = llm::time_in_ms(); + } + + std::function wrapped_callback = + [token_callback, &generation_config](const std::string &piece) { + if (!generation_config.warming) { + llm::safe_printf(piece.c_str()); + fflush(stdout); + } + if (token_callback) + token_callback(piece); + }; + + stats_.inference_start_ms = llm::time_in_ms(); + shouldStop_ = false; + + int32_t max_seq_len = generation_config.max_seq_len >= 0 + ? generation_config.max_seq_len + : config_.max_seq_len; + int32_t max_context_length = generation_config.max_context_length >= 0 + ? generation_config.max_context_length + : config_.max_context_length; + int32_t new_tokens_limit = generation_config.max_new_tokens >= 0 + ? generation_config.max_new_tokens + : config_.max_new_tokens; + float temperature = generation_config.temperature >= 0.F + ? generation_config.temperature + : config_.temperature; + float topp = + generation_config.topp >= 0.F ? 
generation_config.topp : config_.topp; + + int64_t context_len_left = static_cast(max_context_length) - pos_; + + auto encodeResult = + tokenizer_->encode(prompt, numOfAddedBoSTokens, numOfAddedEoSTokens); + if (!encodeResult.ok()) { + throw rnexecutorch::RnExecutorchError( + rnexecutorch::RnExecutorchErrorCode::TokenizerError, + "Unexpected issue occurred while encoding: " + + std::to_string(static_cast(encodeResult.error()))); + } + std::vector prompt_tokens = encodeResult.get(); + int num_prompt_tokens = prompt_tokens.size(); + + ET_CHECK_OR_RETURN_ERROR(num_prompt_tokens >= 1, InvalidArgument, + "Expected at least 1 prompt token"); + ET_CHECK_OR_RETURN_ERROR(num_prompt_tokens < max_seq_len, InvalidArgument, + "num_prompt_tokens %d >= max_seq_len %" PRId32, + num_prompt_tokens, max_seq_len); + + int32_t max_new_tokens = resolve_max_new_tokens( + num_prompt_tokens, max_seq_len, static_cast(context_len_left), + new_tokens_limit); + + ET_CHECK_OR_RETURN_ERROR(max_new_tokens > 0, InvalidArgument, + "Max new tokens %d is <= 0", max_new_tokens); + + if (generation_config.echo) + wrapped_callback(prompt); + + auto prefill_res = text_prefiller_->prefill(prompt_tokens, pos_); + stats_.first_token_ms = llm::time_in_ms(); + stats_.prompt_eval_end_ms = llm::time_in_ms(); + ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error()); + + uint64_t cur_token = prefill_res.get(); + auto decodeResult = tokenizer_->decode({cur_token}); + if (!decodeResult.ok()) { + throw rnexecutorch::RnExecutorchError( + rnexecutorch::RnExecutorchErrorCode::TokenizerError, + "Unexpected issue occurred while decoding: " + + std::to_string(static_cast(decodeResult.error()))); + } + + prompt_tokens.push_back(cur_token); + int64_t num_generated = ET_UNWRAP( + text_token_generator_->generate(prompt_tokens, pos_, max_new_tokens - 1, + temperature, topp, wrapped_callback)); + + pos_ += num_generated; + stats_.inference_end_ms = llm::time_in_ms(); + stats_.num_prompt_tokens = num_prompt_tokens; + 
stats_.num_generated_tokens = num_generated; + + if (stats_callback) + stats_callback(stats_); + + return Error::Ok; +} + +Error UnifiedRunner::generate( + const std::vector &inputs, float temperature, + float topp, int32_t max_new_tokens, + std::function token_callback) { + + ET_CHECK_MSG(multimodal_, + "generate(MultimodalInput) called on a text-only runner. Use " + "generate(string) instead."); + + if (inputs.empty()) { + ET_LOG(Error, "MultimodalInput vector cannot be empty"); + return Error::InvalidArgument; + } + + if (!is_loaded()) + ET_CHECK_OK_OR_RETURN_ERROR(load()); + + stats_.inference_start_ms = llm::time_in_ms(); + + uint64_t prefill_next_token = 0; + for (size_t i = 0; i < inputs.size(); ++i) { + auto prefill_result = mm_prefiller_->prefill(inputs[i], pos_); + if (!prefill_result.ok()) + return prefill_result.error(); + prefill_next_token = prefill_result.get(); + } + + stats_.first_token_ms = llm::time_in_ms(); + stats_.prompt_eval_end_ms = llm::time_in_ms(); + stats_.num_prompt_tokens = pos_; + + auto decode_result = + tokenizer_->decode(prefill_next_token, prefill_next_token); + if (!decode_result.ok()) { + ET_LOG(Error, "Tokenizer decode error %d", + static_cast(decode_result.error())); + return Error::InvalidArgument; + } + const std::string first_piece = std::move(*decode_result); + llm::safe_printf(first_piece.c_str()); + fflush(stdout); + if (token_callback) + token_callback(first_piece); + + int64_t context_len = metadata_.count(kMaxContextLen) + ? metadata_.at(kMaxContextLen) + : metadata_.count(kMaxSeqLen) ? metadata_.at(kMaxSeqLen) + : 2048; + int32_t resolved_max_new = max_new_tokens > 0 + ? 
max_new_tokens + : static_cast(context_len - pos_); + resolved_max_new = std::max(0, resolved_max_new); + + std::vector seed_tokens = {prefill_next_token}; + auto wrapped_callback = [&](const std::string &piece) { + llm::safe_printf(piece.c_str()); + fflush(stdout); + if (token_callback) + token_callback(piece); + }; + + auto generate_result = mm_token_generator_->generate( + seed_tokens, pos_, + static_cast(std::max(0, resolved_max_new - 1)), temperature, + topp, wrapped_callback); + + if (!generate_result.ok()) + return generate_result.error(); + + int64_t num_generated = generate_result.get(); + pos_ += num_generated; + + stats_.inference_end_ms = llm::time_in_ms(); + stats_.num_generated_tokens = num_generated; + + return Error::Ok; +} + +void UnifiedRunner::stop() { + if (multimodal_) { + if (mm_token_generator_) + mm_token_generator_->stop(); + } else { + if (text_token_generator_) + text_token_generator_->stop(); + } +} + +void UnifiedRunner::reset() { + stats_.reset(); + pos_ = 0; +} + +int32_t UnifiedRunner::count_text_tokens(const std::string &text) const { + auto encodeResult = + tokenizer_->encode(text, numOfAddedBoSTokens, numOfAddedEoSTokens); + if (!encodeResult.ok()) { + throw rnexecutorch::RnExecutorchError( + rnexecutorch::RnExecutorchErrorCode::TokenizerError, + "Encoding failed during token count check."); + } + return static_cast(encodeResult.get().size()); +} + +int32_t UnifiedRunner::get_max_context_length() const { + if (!is_loaded()) { + return static_cast(metadata_.at(kMaxContextLen)); + } + return config_.max_context_length; +} + +void UnifiedRunner::set_temperature(float temperature) noexcept { + config_.temperature = temperature; + if (text_decoder_runner_) + text_decoder_runner_->set_temperature(temperature); +} + +void UnifiedRunner::set_topp(float topp) noexcept { + config_.topp = topp; + if (text_decoder_runner_) + text_decoder_runner_->set_topp(topp); +} + +void UnifiedRunner::set_count_interval(size_t count_interval) { + if 
(text_token_generator_) + text_token_generator_->set_count_interval(count_interval); + if (mm_token_generator_) + mm_token_generator_->set_count_interval(count_interval); +} + +void UnifiedRunner::set_time_interval(size_t time_interval) { + if (text_token_generator_) + text_token_generator_->set_time_interval(time_interval); + if (mm_token_generator_) + mm_token_generator_->set_time_interval(time_interval); +} + +int32_t UnifiedRunner::resolve_max_new_tokens(int32_t num_prompt_tokens, + int32_t max_seq_len, + int32_t max_context_len, + int32_t max_new_tokens) const { + int32_t result; + if (max_seq_len == -1 && max_new_tokens == -1) { + result = max_context_len - num_prompt_tokens; + } else if (max_seq_len == -1) { + result = std::min(max_new_tokens, max_context_len - num_prompt_tokens); + } else if (max_new_tokens == -1) { + result = std::min(max_seq_len, max_context_len) - num_prompt_tokens; + } else { + result = + std::min(std::min(max_seq_len, max_context_len) - num_prompt_tokens, + max_new_tokens); + } + return std::max(0, result); +} + +} // namespace example diff --git a/packages/react-native-executorch/common/runner/unified_runner.h b/packages/react-native-executorch/common/runner/unified_runner.h new file mode 100644 index 000000000..9f38fb9e5 --- /dev/null +++ b/packages/react-native-executorch/common/runner/unified_runner.h @@ -0,0 +1,100 @@ +// packages/react-native-executorch/common/runner/unified_runner.h +#pragma once + +#include "multimodal_decoder_runner.h" +#include "multimodal_input.h" +#include "multimodal_prefiller.h" +#include "stats.h" +#include "text_decoder_runner.h" +#include "text_prefiller.h" +#include "text_token_generator.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace example { + +namespace llm = ::executorch::extension::llm; + +class UnifiedRunner { +public: + // module: raw pointer borrowed from BaseModel (text mode uses this) + // owned_module: unique_ptr taken for 
multimodal mode (nullptr in text mode) + // tokenizer_path: path to tokenizer JSON + // config: generation defaults + explicit UnifiedRunner( + ::executorch::extension::Module *module, + std::unique_ptr<::executorch::extension::Module> owned_module, + const std::string &tokenizer_path, + const llm::GenerationConfig &config = {.temperature = 0.8F, + .topp = 0.9F}); + + bool is_multimodal() const noexcept; + bool is_loaded() const; + ::executorch::runtime::Error load(); + + // Text-only generate — mirrors Runner::generate signature + ::executorch::runtime::Error + generate(const std::string &prompt, + const llm::GenerationConfig &generation_config = {}, + std::function token_callback = {}, + std::function stats_callback = {}); + + // Multimodal generate — mirrors MultimodalRunner::generate signature + ::executorch::runtime::Error + generate(const std::vector &inputs, float temperature, + float topp, int32_t max_new_tokens, + std::function token_callback = {}); + + void stop(); + void reset(); + + // Available for both modes + int32_t count_text_tokens(const std::string &text) const; + int32_t get_max_context_length() const; + void set_temperature(float temperature) noexcept; + void set_topp(float topp) noexcept; + void set_count_interval(size_t count_interval); + void set_time_interval(size_t time_interval); + + llm::Stats stats_; + +private: + int32_t resolve_max_new_tokens(int32_t num_prompt_tokens, int32_t max_seq_len, + int32_t max_context_len, + int32_t max_new_tokens = -1) const; + + bool multimodal_{false}; + llm::GenerationConfig config_; + bool shouldStop_{false}; + int64_t pos_{0}; + + // module access — module_ is always a valid raw pointer + // In text mode: points to BaseModel's module_ (borrowed) + // In multimodal mode: points to owned_module_.get() (owned) + ::executorch::extension::Module *module_; + std::unique_ptr<::executorch::extension::Module> owned_module_; + + std::string tokenizer_path_; + std::unique_ptr tokenizer_; + std::unordered_map 
metadata_; + std::unique_ptr io_manager_; + + // Text-only subcomponents (null in multimodal mode) + std::unique_ptr text_decoder_runner_; + std::unique_ptr text_prefiller_; + std::unique_ptr text_token_generator_; + + // Multimodal subcomponents (null in text mode) + std::unique_ptr mm_decoder_runner_; + std::unique_ptr mm_prefiller_; + std::unique_ptr mm_token_generator_; +}; + +} // namespace example diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts index 702a00c45..bd000d270 100644 --- a/packages/react-native-executorch/src/controllers/LLMController.ts +++ b/packages/react-native-executorch/src/controllers/LLMController.ts @@ -24,6 +24,7 @@ export class LLMController { private _isReady = false; private _isGenerating = false; private _messageHistory: Message[] = []; + private isMultimodal_ = false; // User callbacks private tokenCallback: (token: string) => void; @@ -76,11 +77,13 @@ export class LLMController { tokenizerSource, tokenizerConfigSource, onDownloadProgressCallback, + isMultimodal = false, }: { modelSource: ResourceSource; tokenizerSource: ResourceSource; - tokenizerConfigSource: ResourceSource; + tokenizerConfigSource?: ResourceSource; onDownloadProgressCallback?: (downloadProgress: number) => void; + isMultimodal?: boolean; }) { // reset inner state when loading new model this.messageHistoryCallback(this.chatConfig.initialMessageHistory); @@ -88,37 +91,59 @@ export class LLMController { this.isReadyCallback(false); try { - const tokenizersPromise = ResourceFetcher.fetch( - undefined, - tokenizerSource, - tokenizerConfigSource - ); + let tokenizerPath: string | undefined; + let modelPath: string | undefined; + + if (isMultimodal) { + // Multimodal models don't need tokenizer config + const [tokenizerResults, modelResult] = await Promise.all([ + ResourceFetcher.fetch(undefined, tokenizerSource), + ResourceFetcher.fetch(onDownloadProgressCallback, 
modelSource), + ]); + tokenizerPath = tokenizerResults?.[0]; + modelPath = modelResult?.[0]; + + if (!tokenizerPath || !modelPath) { + throw new RnExecutorchError( + RnExecutorchErrorCode.DownloadInterrupted, + 'The download has been interrupted. As a result, not every file was downloaded. Please retry the download.' + ); + } + } else { + const tokenizersPromise = ResourceFetcher.fetch( + undefined, + tokenizerSource, + tokenizerConfigSource! + ); - const modelPromise = ResourceFetcher.fetch( - onDownloadProgressCallback, - modelSource - ); + const modelPromise = ResourceFetcher.fetch( + onDownloadProgressCallback, + modelSource + ); - const [tokenizersResults, modelResult] = await Promise.all([ - tokenizersPromise, - modelPromise, - ]); + const [tokenizersResults, modelResult] = await Promise.all([ + tokenizersPromise, + modelPromise, + ]); - const tokenizerPath = tokenizersResults?.[0]; - const tokenizerConfigPath = tokenizersResults?.[1]; - const modelPath = modelResult?.[0]; + tokenizerPath = tokenizersResults?.[0]; + const tokenizerConfigPath = tokenizersResults?.[1]; + modelPath = modelResult?.[0]; - if (!tokenizerPath || !tokenizerConfigPath || !modelPath) { - throw new RnExecutorchError( - RnExecutorchErrorCode.DownloadInterrupted, - 'The download has been interrupted. As a result, not every file was downloaded. Please retry the download.' + if (!tokenizerPath || !tokenizerConfigPath || !modelPath) { + throw new RnExecutorchError( + RnExecutorchErrorCode.DownloadInterrupted, + 'The download has been interrupted. As a result, not every file was downloaded. Please retry the download.' + ); + } + + this.tokenizerConfig = JSON.parse( + await ResourceFetcher.fs.readAsString(tokenizerConfigPath!) ); } - this.tokenizerConfig = JSON.parse( - await ResourceFetcher.fs.readAsString(tokenizerConfigPath!) 
- ); this.nativeModule = global.loadLLM(modelPath, tokenizerPath); + this.isMultimodal_ = this.nativeModule.isMultimodal(); this.isReadyCallback(true); this.onToken = (data: string) => { if (!data) { @@ -180,6 +205,9 @@ export class LLMController { } private filterSpecialTokens(text: string): string { + if (!this.tokenizerConfig) { + return text; + } let filtered = text; if ( SPECIAL_TOKENS.EOS_TOKEN in this.tokenizerConfig && @@ -237,6 +265,64 @@ export class LLMController { } } + public async generateWithImage( + imagePath: string, + prompt: string + ): Promise { + if (!this._isReady) { + throw new RnExecutorchError( + RnExecutorchErrorCode.ModuleNotLoaded, + 'The model is currently not loaded.' + ); + } + if (!this.isMultimodal_) { + throw new RnExecutorchError( + RnExecutorchErrorCode.InvalidUserInput, + 'generateWithImage() requires a multimodal model. Load with isMultimodal: true.' + ); + } + if (this._isGenerating) { + throw new RnExecutorchError( + RnExecutorchErrorCode.ModelGenerating, + 'The model is currently generating.' 
+ ); + } + try { + this.isGeneratingCallback(true); + this.nativeModule.reset(); + const response = await this.nativeModule.generateWithImage( + imagePath, + prompt, + this.onToken + ); + return response; + } catch (e) { + throw parseUnknownError(e); + } finally { + this.isGeneratingCallback(false); + } + } + + public async sendMessageWithImage( + imagePath: string, + message: string + ): Promise { + const updatedHistory = [ + ...this._messageHistory, + { content: message, role: 'user' as const }, + ]; + this.messageHistoryCallback(updatedHistory); + + const response = await this.generateWithImage(imagePath, message); + + this.messageHistoryCallback([ + ...this._messageHistory, + { content: response, role: 'assistant' }, + ]); + + return response; + } + public interrupt() { if (!this.nativeModule) { throw new RnExecutorchError( diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts index 5578c1de7..2920e1bb5 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts @@ -51,8 +51,9 @@ export const useLLM = ({ model, preventLoad = false }: LLMProps): LLMType => { await controllerInstance.load({ modelSource: model.modelSource, tokenizerSource: model.tokenizerSource, - tokenizerConfigSource: model.tokenizerConfigSource!, + tokenizerConfigSource: model.tokenizerConfigSource, onDownloadProgressCallback: setDownloadProgress, + isMultimodal: model.isMultimodal, }); } catch (e) { setError(parseUnknownError(e)); @@ -69,6 +70,7 @@ export const useLLM = ({ model, preventLoad = false }: LLMProps): LLMType => { model.modelSource, model.tokenizerSource, model.tokenizerConfigSource, + model.isMultimodal, preventLoad, ]); @@ -124,6 +126,14 @@ export const useLLM = ({ model, preventLoad = false }: LLMProps): LLMType => { [controllerInstance] 
); + const sendMessageWithImage = useCallback( + (imagePath: string, message: string) => { + setResponse(''); + return controllerInstance.sendMessageWithImage(imagePath, message); + }, + [controllerInstance] + ); + return { messageHistory, response, @@ -140,5 +150,6 @@ export const useLLM = ({ model, preventLoad = false }: LLMProps): LLMType => { sendMessage: sendMessage, deleteMessage: deleteMessage, interrupt: interrupt, + sendMessageWithImage: sendMessageWithImage, }; }; diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useMultimodalLLM.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useMultimodalLLM.ts deleted file mode 100644 index 0a54239cc..000000000 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useMultimodalLLM.ts +++ /dev/null @@ -1,153 +0,0 @@ -import { useCallback, useEffect, useRef, useState } from 'react'; -import { ResourceSource } from '../../types/common'; -import { ResourceFetcher } from '../../utils/ResourceFetcher'; -import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils'; -import { RnExecutorchErrorCode } from '../../errors/ErrorCodes'; - -export interface MultimodalLLMProps { - model: { - modelSource: ResourceSource; - tokenizerSource: ResourceSource; - }; - preventLoad?: boolean; -} - -export interface MultimodalLLMType { - isReady: boolean; - isGenerating: boolean; - downloadProgress: number; - response: string; - error: RnExecutorchError | null; - generate: (imagePath: string, prompt: string) => Promise; - interrupt: () => void; -} - -/** - * React hook for managing a Multimodal LLM (VLM) instance. - * Uses `loadMultimodalLLM` native global, which wraps a multi-method PTE - * with vision_encoder, token_embedding, and text_decoder methods. 
- * - * @category Hooks - */ -export const useMultimodalLLM = ({ - model, - preventLoad = false, -}: MultimodalLLMProps): MultimodalLLMType => { - const [nativeModule, setNativeModule] = useState(null); - const [isReady, setIsReady] = useState(false); - const [isGenerating, setIsGenerating] = useState(false); - const [downloadProgress, setDownloadProgress] = useState(0); - const [response, setResponse] = useState(''); - const [error, setError] = useState(null); - - useEffect(() => { - setDownloadProgress(0); - setError(null); - setIsReady(false); - - if (preventLoad) return; - - let cancelled = false; - - (async () => { - try { - const [modelResults, tokenizerResults] = await Promise.all([ - ResourceFetcher.fetch(setDownloadProgress, model.modelSource), - ResourceFetcher.fetch(undefined, model.tokenizerSource), - ]); - - if (cancelled) return; - - const modelPath = modelResults?.[0]; - const tokenizerPath = tokenizerResults?.[0]; - - if (!modelPath || !tokenizerPath) { - throw new RnExecutorchError( - RnExecutorchErrorCode.DownloadInterrupted, - 'Download interrupted — not all files were fetched.' - ); - } - - const mod = global.loadMultimodalLLM(modelPath, tokenizerPath); - setNativeModule(mod); - setIsReady(true); - } catch (e) { - if (!cancelled) { - setError(parseUnknownError(e)); - } - } - })(); - - return () => { - cancelled = true; - }; - }, [model.modelSource, model.tokenizerSource, preventLoad]); - - const tokenBufferRef = useRef(''); - const rafRef = useRef | null>(null); - - const generate = useCallback( - async (imagePath: string, prompt: string): Promise => { - if (!nativeModule) { - throw new RnExecutorchError( - RnExecutorchErrorCode.ModuleNotLoaded, - 'Multimodal LLM is not loaded yet.' 
- ); - } - tokenBufferRef.current = ''; - if (rafRef.current !== null) { - cancelAnimationFrame(rafRef.current); - rafRef.current = null; - } - setResponse(''); - setIsGenerating(true); - try { - const result: string = await nativeModule.generate( - imagePath, - prompt, - (token: string) => { - tokenBufferRef.current += token; - if (rafRef.current === null) { - rafRef.current = requestAnimationFrame(() => { - rafRef.current = null; - const buffered = tokenBufferRef.current; - tokenBufferRef.current = ''; - setResponse((prev) => prev + buffered); - }); - } - } - ); - // Flush any remaining buffered tokens after generation completes - if (rafRef.current !== null) { - cancelAnimationFrame(rafRef.current); - rafRef.current = null; - } - if (tokenBufferRef.current) { - const remaining = tokenBufferRef.current; - tokenBufferRef.current = ''; - setResponse((prev) => prev + remaining); - } - return result; - } catch (e) { - throw parseUnknownError(e); - } finally { - setIsGenerating(false); - } - }, - [nativeModule] - ); - - const interrupt = useCallback(() => { - nativeModule?.interrupt(); - }, [nativeModule]); - - return { - isReady, - isGenerating, - downloadProgress, - response, - error, - generate, - interrupt, - }; -}; diff --git a/packages/react-native-executorch/src/index.ts b/packages/react-native-executorch/src/index.ts index e544d9cca..dd7557ca2 100644 --- a/packages/react-native-executorch/src/index.ts +++ b/packages/react-native-executorch/src/index.ts @@ -49,7 +49,6 @@ declare global { var loadVAD: (source: string) => any; var loadTextEmbeddings: (modelSource: string, tokenizerSource: string) => any; var loadLLM: (modelSource: string, tokenizerSource: string) => any; - var loadMultimodalLLM: (modelSource: string, tokenizerSource: string) => any; var loadTextToImage: ( tokenizerSource: string, encoderSource: string, @@ -98,7 +97,6 @@ if ( global.loadImageEmbeddings == null || global.loadVAD == null || global.loadLLM == null || - global.loadMultimodalLLM == 
null || global.loadSpeechToText == null || global.loadTextToSpeechKokoro == null || global.loadOCR == null || @@ -123,7 +121,6 @@ export * from './hooks/computer_vision/useImageEmbeddings'; export * from './hooks/computer_vision/useTextToImage'; export * from './hooks/natural_language_processing/useLLM'; -export * from './hooks/natural_language_processing/useMultimodalLLM'; export * from './hooks/natural_language_processing/useSpeechToText'; export * from './hooks/natural_language_processing/useTextToSpeech'; export * from './hooks/natural_language_processing/useTextEmbeddings'; diff --git a/packages/react-native-executorch/src/types/llm.ts b/packages/react-native-executorch/src/types/llm.ts index 25d87e248..71dfcd3f8 100644 --- a/packages/react-native-executorch/src/types/llm.ts +++ b/packages/react-native-executorch/src/types/llm.ts @@ -20,6 +20,11 @@ export interface LLMProps { * `ResourceSource` pointing to the JSON file which contains the tokenizer config. */ tokenizerConfigSource?: ResourceSource; + /** + * Set to `true` when loading a vision-language (multimodal) model. + * Skips tokenizer config fetching and enables `sendMessageWithImage`. + */ + isMultimodal?: boolean; }; /** * Boolean that can prevent automatic model loading (and downloading the data if you load it for the first time) after running the hook. @@ -123,6 +128,16 @@ export interface LLMType { * Function to interrupt the current inference. */ interrupt: () => void; + + /** + * Send a user message with an image. Updates messageHistory after model responds. + * Only valid for multimodal models (loaded with `isMultimodal: true`). + * + * @param imagePath - Local path to the image file. + * @param message - The text question about the image. + * @returns The model's response as a string. 
+ */ + sendMessageWithImage: (imagePath: string, message: string) => Promise; } /** diff --git a/yarn.lock b/yarn.lock index f839c07a6..c2f2e609c 100644 --- a/yarn.lock +++ b/yarn.lock @@ -8721,6 +8721,15 @@ __metadata: languageName: node linkType: hard +"expo-document-picker@npm:~13.0.3": + version: 13.0.3 + resolution: "expo-document-picker@npm:13.0.3" + peerDependencies: + expo: "*" + checksum: 10/a336310e6327d26f36ac19b5867e2ef453dd59a0e30f7b2854c34bc1f874d967f92ced4e0b5fddc2b193ba1d88059033e6f3b076980c060169b191f4af184f90 + languageName: node + linkType: hard + "expo-file-system@npm:^19.0.20, expo-file-system@npm:~19.0.21": version: 19.0.21 resolution: "expo-file-system@npm:19.0.21" @@ -11450,6 +11459,7 @@ __metadata: expo-brightness: "npm:~14.0.8" expo-calendar: "npm:~15.0.8" expo-constants: "npm:~18.0.11" + expo-document-picker: "npm:~13.0.3" expo-font: "npm:~14.0.10" expo-linking: "npm:~8.0.10" expo-router: "npm:~6.0.17" @@ -11461,6 +11471,7 @@ __metadata: react-native-device-info: "npm:^15.0.2" react-native-executorch: "workspace:*" react-native-gesture-handler: "npm:~2.28.0" + react-native-image-picker: "npm:^7.2.2" react-native-loading-spinner-overlay: "npm:^3.0.1" react-native-markdown-display: "npm:^7.0.2" react-native-reanimated: "npm:~4.1.1" From bf50ae2ea822e390786c79a2cb599221c62f43ff Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Mon, 2 Mar 2026 12:34:47 +0100 Subject: [PATCH 03/46] feat: add conversational VLM demo with multimodal/text-only support and fix token generation bugs Co-Authored-By: Claude Sonnet 4.6 --- apps/llm/app/_layout.tsx | 16 +- apps/llm/app/index.tsx | 24 ++ apps/llm/app/multimodal_llm/index.tsx | 368 +++++++----------- .../common/rnexecutorch/models/llm/LLM.cpp | 6 - .../common/runner/unified_runner.cpp | 39 +- .../common/runner/unified_runner.h | 4 +- .../src/controllers/LLMController.ts | 13 +- 7 files changed, 214 insertions(+), 256 deletions(-) diff --git a/apps/llm/app/_layout.tsx b/apps/llm/app/_layout.tsx 
index 523d3aaf7..4ab010693 100644 --- a/apps/llm/app/_layout.tsx +++ b/apps/llm/app/_layout.tsx @@ -57,38 +57,38 @@ export default function _layout() { headerTitleStyle: { color: ColorPalette.primary }, }} > - {/* */} - {/* + */} - {/* + */} - {/* + */} + /> Select a demo model + router.navigate('llm/')} + > + LLM + + router.navigate('llm_tool_calling/')} + > + LLM Tool Calling + + router.navigate('llm_structured_output/')} + > + LLM Structured Output + + router.navigate('voice_chat/')} + > + Voice Chat + router.navigate('multimodal_llm/')} diff --git a/apps/llm/app/multimodal_llm/index.tsx b/apps/llm/app/multimodal_llm/index.tsx index 990f6cf1a..f5f402183 100644 --- a/apps/llm/app/multimodal_llm/index.tsx +++ b/apps/llm/app/multimodal_llm/index.tsx @@ -4,7 +4,6 @@ import { Keyboard, KeyboardAvoidingView, Platform, - ScrollView, StyleSheet, Text, TextInput, @@ -12,123 +11,42 @@ import { TouchableWithoutFeedback, View, } from 'react-native'; -import * as DocumentPicker from 'expo-document-picker'; import { launchImageLibrary } from 'react-native-image-picker'; import { useIsFocused } from '@react-navigation/native'; import { useLLM } from 'react-native-executorch'; +import SendIcon from '../../assets/icons/send_icon.svg'; +import PauseIcon from '../../assets/icons/pause_icon.svg'; import ColorPalette from '../../colors'; +import Messages from '../../components/Messages'; import Spinner from '../../components/Spinner'; import { GeneratingContext } from '../../context'; +const MODEL_SOURCE = + 'https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/lfm2p5_vl_1.6B_quantized_xnnpack.pte'; +const TOKENIZER_SOURCE = + 'https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_2.5.json'; +const TOKENIZER_CONFIG_SOURCE = + 'https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_config_2_5.json'; + export default function MultimodalLLMScreenWrapper() { const isFocused = useIsFocused(); - return isFocused ? 
: null; -} - -// Outer component: collect model + tokenizer paths before mounting the hook -function MultimodalLLMScreenOuter() { - const [modelUri, setModelUri] = useState(null); - const [tokenizerUri, setTokenizerUri] = useState(null); - const [confirmed, setConfirmed] = useState(false); - - const pickFile = async (setter: (uri: string) => void) => { - const result = await DocumentPicker.getDocumentAsync({ - copyToCacheDirectory: false, - multiple: false, - }); - if (result.canceled) return; - const asset = result.assets[0]; - if (asset?.uri) { - setter(asset.uri); - } - }; - - if (!confirmed) { - return ( - - Select model files - - Pick the .pte model and tokenizer.json from your device storage. - - - pickFile(setModelUri)} - /> - pickFile(setTokenizerUri)} - /> - - setConfirmed(true)} - > - Load model - - - ); - } - - return ( - - ); + return isFocused ? : null; } -function FilePicker({ - label, - uri, - onPick, -}: { - label: string; - uri: string | null; - onPick: () => void; -}) { - const fileName = uri ? (uri.split('/').pop() ?? uri) : null; - return ( - - - {label} - - {fileName ?? 
'Tap to pick file'} - - - - - ); -} - -function MultimodalLLMScreen({ - modelSource, - tokenizerSource, -}: { - modelSource: string; - tokenizerSource: string; -}) { +function MultimodalLLMScreen() { const [imageUri, setImageUri] = useState(null); - const [prompt, setPrompt] = useState(''); + const [userInput, setUserInput] = useState(''); const [isTextInputFocused, setIsTextInputFocused] = useState(false); - const scrollViewRef = useRef(null); + const textInputRef = useRef(null); const { setGlobalGenerating } = useContext(GeneratingContext); const vlm = useLLM({ - model: { modelSource, tokenizerSource, isMultimodal: true }, + model: { + modelSource: MODEL_SOURCE, + tokenizerSource: TOKENIZER_SOURCE, + tokenizerConfigSource: TOKENIZER_CONFIG_SOURCE, + isMultimodal: true, + }, }); useEffect(() => { @@ -136,26 +54,29 @@ function MultimodalLLMScreen({ }, [vlm.isGenerating, setGlobalGenerating]); useEffect(() => { - if (vlm.error) { - console.error('MultimodalLLM error:', vlm.error); - } + if (vlm.error) console.error('MultimodalLLM error:', vlm.error); }, [vlm.error]); const pickImage = async () => { const result = await launchImageLibrary({ mediaType: 'photo' }); if (result.assets && result.assets.length > 0) { const uri = result.assets[0]?.uri; - if (uri) { - setImageUri(uri); - } + if (uri) setImageUri(uri); } }; - const handleGenerate = async () => { - if (!imageUri || !prompt.trim() || !vlm.isReady || vlm.isGenerating) return; + const sendMessage = async () => { + if (!userInput.trim() || vlm.isGenerating) return; + const text = userInput.trim(); + setUserInput(''); + textInputRef.current?.clear(); Keyboard.dismiss(); try { - await vlm.sendMessageWithImage(imageUri, prompt.trim()); + if (imageUri) { + await vlm.sendMessageWithImage(imageUri, text); + } else { + await vlm.sendMessage(text); + } } catch (e) { console.error('Generation error:', e); } @@ -182,79 +103,86 @@ function MultimodalLLMScreen({ behavior={Platform.OS === 'ios' ? 
'padding' : undefined} keyboardVerticalOffset={Platform.OS === 'ios' ? 120 : 40} > - - scrollViewRef.current?.scrollToEnd({ animated: true }) - } - > - {/* Image picker */} - - {imageUri ? ( - + {vlm.messageHistory.length ? ( + + - ) : ( - Tap to pick an image - )} - - - {/* Response area */} - {vlm.response ? ( - - Response: - {vlm.response} - ) : vlm.isGenerating ? ( - - Generating… + ) : ( + + Hello! 👋 + + Pick an image and ask me anything about it. + - ) : null} - + )} - {/* Bottom bar */} - - setIsTextInputFocused(true)} - onBlur={() => setIsTextInputFocused(false)} - style={[ - styles.textInput, - { - borderColor: isTextInputFocused - ? ColorPalette.blueDark - : ColorPalette.blueLight, - }, - ]} - placeholder="Ask about the image…" - placeholderTextColor="#C1C6E5" - multiline - value={prompt} - onChangeText={setPrompt} - /> - {vlm.isGenerating ? ( + {/* Image thumbnail strip */} + {imageUri && ( - Stop + + Tap to change - ) : ( + )} + + + {/* Image picker button */} - Ask + 📷 - )} + + setIsTextInputFocused(true)} + onBlur={() => setIsTextInputFocused(false)} + style={[ + styles.textInput, + { + borderColor: isTextInputFocused + ? ColorPalette.blueDark + : ColorPalette.blueLight, + }, + ]} + placeholder={imageUri ? 
'Ask about the image…' : 'Your message'} + placeholderTextColor="#C1C6E5" + multiline + onChangeText={setUserInput} + /> + + {userInput.trim() && !vlm.isGenerating && ( + + + + )} + {vlm.isGenerating && ( + + + + )} + @@ -318,74 +246,76 @@ const styles = StyleSheet.create({ loadButtonText: { color: '#fff', fontFamily: 'medium', fontSize: 15 }, // Chat phase - container: { flex: 1, backgroundColor: '#fff' }, - scrollView: { flex: 1 }, - scrollContent: { padding: 16, paddingBottom: 8 }, - imagePicker: { + container: { flex: 1 }, + chatContainer: { flex: 10, width: '100%' }, + helloMessageContainer: { + flex: 10, width: '100%', - height: 220, - borderRadius: 12, - borderWidth: 1, - borderColor: ColorPalette.blueLight, - borderStyle: 'dashed', - justifyContent: 'center', alignItems: 'center', - overflow: 'hidden', - marginBottom: 16, + justifyContent: 'center', }, - previewImage: { width: '100%', height: '100%' }, - imagePickerText: { - color: ColorPalette.blueLight, - fontSize: 16, + helloText: { + fontFamily: 'medium', + fontSize: 30, + color: ColorPalette.primary, + }, + bottomHelloText: { fontFamily: 'regular', + fontSize: 20, + lineHeight: 28, + textAlign: 'center', + color: ColorPalette.primary, + paddingHorizontal: 24, }, - responseContainer: { - backgroundColor: ColorPalette.seaBlueLight, + imageThumbnailContainer: { + flexDirection: 'row', + alignItems: 'center', + paddingHorizontal: 16, + paddingVertical: 6, + gap: 8, + }, + imageThumbnail: { + width: 48, + height: 48, borderRadius: 8, - padding: 12, - marginBottom: 8, + borderWidth: 1, + borderColor: ColorPalette.blueLight, }, - responseLabel: { + imageThumbnailHint: { fontSize: 12, - color: ColorPalette.blueDark, - fontFamily: 'medium', - marginBottom: 4, - }, - responseText: { - fontSize: 14, - lineHeight: 20, - color: ColorPalette.primary, fontFamily: 'regular', + color: ColorPalette.blueDark, }, bottomContainer: { + height: 100, + width: '100%', flexDirection: 'row', + justifyContent: 'space-between', 
alignItems: 'center', paddingHorizontal: 16, - paddingVertical: 12, - borderTopWidth: 1, - borderTopColor: ColorPalette.blueLight, - backgroundColor: '#fff', }, + imageButton: { + width: 40, + height: 40, + justifyContent: 'center', + alignItems: 'center', + marginRight: 4, + }, + imageButtonText: { fontSize: 22 }, textInput: { flex: 1, borderWidth: 1, borderRadius: 8, - fontSize: 14, lineHeight: 19.6, fontFamily: 'regular', + fontSize: 14, color: ColorPalette.primary, - padding: 12, - maxHeight: 100, + padding: 16, }, - actionButton: { - marginLeft: 8, - backgroundColor: ColorPalette.strongPrimary, - borderRadius: 8, - paddingHorizontal: 16, - paddingVertical: 12, + sendChatTouchable: { + height: '100%', + width: 48, justifyContent: 'center', - alignItems: 'center', + alignItems: 'flex-end', }, - actionButtonDisabled: { backgroundColor: ColorPalette.blueLight }, - actionButtonText: { color: '#fff', fontFamily: 'medium', fontSize: 14 }, }); diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp index 66b151faa..320e9bebe 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp @@ -77,11 +77,6 @@ std::string LLM::generate(std::string input, throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded, "Runner is not loaded"); } - if (multimodal_) { - throw RnExecutorchError( - RnExecutorchErrorCode::InvalidUserInput, - "This is a multimodal model. 
Call generate(imagePath, prompt, cb)."); - } std::string output; auto nativeCallback = [this, callback, &output](const std::string &token) { @@ -136,7 +131,6 @@ std::string LLM::generate(std::string imagePath, std::string prompt, throw RnExecutorchError(error, "Failed to generate multimodal response"); } - runner_->reset(); return output; } diff --git a/packages/react-native-executorch/common/runner/unified_runner.cpp b/packages/react-native-executorch/common/runner/unified_runner.cpp index 98955d593..a136835a3 100644 --- a/packages/react-native-executorch/common/runner/unified_runner.cpp +++ b/packages/react-native-executorch/common/runner/unified_runner.cpp @@ -131,11 +131,23 @@ Error UnifiedRunner::generate( std::function token_callback, std::function stats_callback) { - ET_CHECK_MSG(!multimodal_, - "generate(prompt) called on a multimodal runner. Use " - "generate(vector) instead."); ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null"); + // In multimodal mode, delegate to the multimodal generate path with + // text-only input (no image). + if (multimodal_) { + std::vector text_inputs = { + llm::make_text_input(prompt)}; + float temp = + generation_config.temperature >= 0.F + ? generation_config.temperature + : (config_.temperature >= 0.F ? config_.temperature : 0.8F); + float topp = generation_config.topp >= 0.F + ? generation_config.topp + : (config_.topp >= 0.F ? 
config_.topp : 0.9F); + return generate(text_inputs, temp, topp, -1, token_callback); + } + if (!is_loaded()) { stats_.model_load_start_ms = llm::time_in_ms(); ET_CHECK_OK_OR_RETURN_ERROR(load()); @@ -248,6 +260,7 @@ Error UnifiedRunner::generate( stats_.inference_start_ms = llm::time_in_ms(); + int64_t pos_before_prefill = pos_; uint64_t prefill_next_token = 0; for (size_t i = 0; i < inputs.size(); ++i) { auto prefill_result = mm_prefiller_->prefill(inputs[i], pos_); @@ -260,26 +273,14 @@ Error UnifiedRunner::generate( stats_.prompt_eval_end_ms = llm::time_in_ms(); stats_.num_prompt_tokens = pos_; - auto decode_result = - tokenizer_->decode(prefill_next_token, prefill_next_token); - if (!decode_result.ok()) { - ET_LOG(Error, "Tokenizer decode error %d", - static_cast(decode_result.error())); - return Error::InvalidArgument; - } - const std::string first_piece = std::move(*decode_result); - llm::safe_printf(first_piece.c_str()); - fflush(stdout); - if (token_callback) - token_callback(first_piece); - int64_t context_len = metadata_.count(kMaxContextLen) ? metadata_.at(kMaxContextLen) : metadata_.count(kMaxSeqLen) ? metadata_.at(kMaxSeqLen) : 2048; - int32_t resolved_max_new = max_new_tokens > 0 - ? max_new_tokens - : static_cast(context_len - pos_); + int32_t resolved_max_new = + max_new_tokens > 0 + ? 
max_new_tokens + : static_cast(context_len - pos_before_prefill); resolved_max_new = std::max(0, resolved_max_new); std::vector seed_tokens = {prefill_next_token}; diff --git a/packages/react-native-executorch/common/runner/unified_runner.h b/packages/react-native-executorch/common/runner/unified_runner.h index 9f38fb9e5..6f003fcc5 100644 --- a/packages/react-native-executorch/common/runner/unified_runner.h +++ b/packages/react-native-executorch/common/runner/unified_runner.h @@ -1,6 +1,7 @@ // packages/react-native-executorch/common/runner/unified_runner.h #pragma once +#include "irunner.h" #include "multimodal_decoder_runner.h" #include "multimodal_input.h" #include "multimodal_prefiller.h" @@ -32,8 +33,7 @@ class UnifiedRunner { ::executorch::extension::Module *module, std::unique_ptr<::executorch::extension::Module> owned_module, const std::string &tokenizer_path, - const llm::GenerationConfig &config = {.temperature = 0.8F, - .topp = 0.9F}); + const llm::GenerationConfig &config = llm::GenerationConfig{}); bool is_multimodal() const noexcept; bool is_loaded() const; diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts index bd000d270..6e024220d 100644 --- a/packages/react-native-executorch/src/controllers/LLMController.ts +++ b/packages/react-native-executorch/src/controllers/LLMController.ts @@ -95,9 +95,12 @@ export class LLMController { let modelPath: string | undefined; if (isMultimodal) { - // Multimodal models don't need tokenizer config const [tokenizerResults, modelResult] = await Promise.all([ - ResourceFetcher.fetch(undefined, tokenizerSource), + ResourceFetcher.fetch( + undefined, + tokenizerSource, + ...(tokenizerConfigSource ? [tokenizerConfigSource] : []) + ), ResourceFetcher.fetch(onDownloadProgressCallback, modelSource), ]); tokenizerPath = tokenizerResults?.[0]; @@ -109,6 +112,12 @@ export class LLMController { 'The download has been interrupted. 
As a result, not every file was downloaded. Please retry the download.' ); } + + if (tokenizerConfigSource && tokenizerResults?.[1]) { + this.tokenizerConfig = JSON.parse( + await ResourceFetcher.fs.readAsString(tokenizerResults[1]) + ); + } } else { const tokenizersPromise = ResourceFetcher.fetch( undefined, From 1695f7e26603e95ede1f93453607382003b6b96a Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Mon, 2 Mar 2026 13:13:39 +0100 Subject: [PATCH 04/46] fix: default UnifiedRunner temperature to 0.8 and topp to 0.9 --- .../react-native-executorch/common/runner/unified_runner.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/react-native-executorch/common/runner/unified_runner.h b/packages/react-native-executorch/common/runner/unified_runner.h index 6f003fcc5..ae7789bbe 100644 --- a/packages/react-native-executorch/common/runner/unified_runner.h +++ b/packages/react-native-executorch/common/runner/unified_runner.h @@ -33,7 +33,8 @@ class UnifiedRunner { ::executorch::extension::Module *module, std::unique_ptr<::executorch::extension::Module> owned_module, const std::string &tokenizer_path, - const llm::GenerationConfig &config = llm::GenerationConfig{}); + const llm::GenerationConfig &config = {.temperature = 0.8F, + .topp = 0.9F}); bool is_multimodal() const noexcept; bool is_loaded() const; From b660b0feaf5e3f57fc7de6dd0ce8ca040114ef90 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Mon, 2 Mar 2026 13:14:37 +0100 Subject: [PATCH 05/46] feat: add NativeMessage struct and JSI conversion for message history --- .../host_objects/JsiConversions.h | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h index df9abbdef..08acf6cff 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h +++ 
b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h @@ -228,6 +228,36 @@ getValue>(const jsi::Value &val, jsi::Runtime &runtime) { return getArrayAsVector(val, runtime); } +struct NativeMessage { + std::string role; // "user" | "assistant" | "system" + std::string content; + std::string mediaPath; // empty string if no media +}; + +template <> +inline std::vector +getValue>(const jsi::Value &val, + jsi::Runtime &runtime) { + jsi::Array array = val.asObject(runtime).asArray(runtime); + size_t length = array.size(runtime); + std::vector result; + result.reserve(length); + for (size_t i = 0; i < length; ++i) { + jsi::Object obj = array.getValueAtIndex(runtime, i).asObject(runtime); + NativeMessage msg; + msg.role = + obj.getProperty(runtime, "role").getString(runtime).utf8(runtime); + msg.content = + obj.getProperty(runtime, "content").getString(runtime).utf8(runtime); + auto mediaProp = obj.getProperty(runtime, "mediaPath"); + if (!mediaProp.isUndefined() && !mediaProp.isNull()) { + msg.mediaPath = mediaProp.getString(runtime).utf8(runtime); + } + result.push_back(std::move(msg)); + } + return result; +} + // Template specializations for std::span types template <> inline std::span getValue>(const jsi::Value &val, From 4331bded465523401a7910438220a69f1d65340a Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Mon, 2 Mar 2026 13:17:14 +0100 Subject: [PATCH 06/46] feat: declare generateMultimodal on LLM and register JSI binding --- .../rnexecutorch/host_objects/ModelHostObject.h | 5 +++++ .../common/rnexecutorch/models/llm/LLM.h | 13 +++++++++++++ 2 files changed, 18 insertions(+) diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h index f41da1e45..334f1f833 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h +++ 
b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h @@ -160,6 +160,11 @@ template class ModelHostObject : public JsiHostObject { &Model::generate)>, "generateWithImage")); + addFunctions( + JSI_EXPORT_FUNCTION(ModelHostObject, + promiseHostFunction<&Model::generateMultimodal>, + "generateMultimodal")); + addFunctions(JSI_EXPORT_FUNCTION( ModelHostObject, synchronousHostFunction<&Model::isMultimodal>, "isMultimodal")); diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h index 3763fe924..8b5684ad5 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h @@ -2,10 +2,14 @@ #include #include +#include +#include #include #include +#include #include +#include #include namespace rnexecutorch { @@ -26,6 +30,11 @@ class LLM : public BaseModel { std::string generate(std::string imagePath, std::string prompt, std::shared_ptr callback); + // Multimodal generate — takes full message history, builds MultimodalInput[] + std::string generateMultimodal( + std::vector messages, + std::shared_ptr callback); + bool isMultimodal() const noexcept; void interrupt(); @@ -46,6 +55,10 @@ class LLM : public BaseModel { bool multimodal_; float temperature_ = 0.8f; float topp_ = 0.9f; + std::unordered_map + imageCache_; + const executorch::extension::llm::Image & + getOrLoadImage(const std::string &path); }; } // namespace models::llm From d6530e4242d1afdf1605f38c36791ae846bc091e Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Mon, 2 Mar 2026 13:19:33 +0100 Subject: [PATCH 07/46] fix: remove redundant unordered_map and vector includes from LLM.h --- .../common/rnexecutorch/models/llm/LLM.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h 
b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h index 8b5684ad5..5c9e1e458 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h @@ -2,8 +2,6 @@ #include #include -#include -#include #include #include From d261a45d1400adb739c55cbb7daf31506e3a12e6 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Mon, 2 Mar 2026 13:21:35 +0100 Subject: [PATCH 08/46] feat: implement generateMultimodal with per-turn chat template and image cache Co-Authored-By: Claude Sonnet 4.6 --- .../common/rnexecutorch/models/llm/LLM.cpp | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp index 320e9bebe..441d20dad 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp @@ -22,6 +22,10 @@ static constexpr int kImageChannels = 3; static constexpr const char *kChatPrefix = "<|startoftext|><|im_start|>user\n"; static constexpr const char *kChatSuffix = "<|im_end|>\n<|im_start|>assistant\n"; +// Separator inserted after each assistant turn in multi-turn conversations +static constexpr const char *kAssistantTurnEnd = "<|im_end|>\n"; +// Prefix for subsequent user turns (no BOS token — only first turn has it) +static constexpr const char *kUserTurnPrefix = "<|im_start|>user\n"; static llm::Image loadImageForVLM(const std::string &imagePath) { cv::Mat mat = image_processing::readImage(imagePath); @@ -39,6 +43,14 @@ static llm::Image loadImageForVLM(const std::string &imagePath) { return llm::Image(std::move(chw), kImageSize, kImageSize, kImageChannels); } +const llm::Image &LLM::getOrLoadImage(const std::string &path) { + auto it = imageCache_.find(path); + if (it != imageCache_.end()) { + return 
it->second; + } + return imageCache_.emplace(path, loadImageForVLM(path)).first->second; +} + LLM::LLM(const std::string &modelSource, const std::string &tokenizerSource, std::shared_ptr callInvoker) : BaseModel(modelSource, callInvoker, Module::LoadMode::File) { @@ -134,6 +146,79 @@ std::string LLM::generate(std::string imagePath, std::string prompt, return output; } +std::string LLM::generateMultimodal( + std::vector messages, + std::shared_ptr callback) { + if (!runner_ || !runner_->is_loaded()) { + throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded, + "Runner is not loaded"); + } + if (!multimodal_) { + throw RnExecutorchError( + RnExecutorchErrorCode::InvalidUserInput, + "This is a text-only model. Use generate(prompt, cb) instead."); + } + + std::vector inputs; + bool isFirst = true; + + for (const auto &msg : messages) { + if (msg.role == "system") { + if (isFirst) { + inputs.push_back(llm::make_text_input(msg.content + "\n")); + } + continue; + } + + if (msg.role == "user") { + if (isFirst) { + inputs.push_back(llm::make_text_input(std::string(kChatPrefix))); + isFirst = false; + } else { + inputs.push_back(llm::make_text_input(std::string(kUserTurnPrefix))); + } + + if (!msg.mediaPath.empty()) { + const llm::Image &img = getOrLoadImage(msg.mediaPath); + inputs.push_back(llm::make_image_input(img)); + } + + if (!msg.content.empty()) { + inputs.push_back(llm::make_text_input(msg.content)); + } + + inputs.push_back(llm::make_text_input(std::string(kChatSuffix))); + } else if (msg.role == "assistant") { + inputs.push_back(llm::make_text_input(msg.content + kAssistantTurnEnd)); + isFirst = false; + } + } + + if (inputs.empty()) { + throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, + "No inputs to generate from"); + } + + std::string output; + auto nativeCallback = [this, &callback, &output](const std::string &token) { + output += token; + if (callback && callInvoker) { + callInvoker->invokeAsync([callback, token](jsi::Runtime 
&runtime) { + callback->call(runtime, jsi::String::createFromUtf8(runtime, token)); + }); + } + }; + + runner_->reset(); + auto error = + runner_->generate(inputs, temperature_, topp_, -1, nativeCallback); + if (error != Error::Ok) { + throw RnExecutorchError(error, "Failed to generate multimodal response"); + } + + return output; +} + void LLM::interrupt() { if (!runner_ || !runner_->is_loaded()) { throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded, @@ -148,6 +233,7 @@ void LLM::reset() { "Can't reset a model that's not loaded"); } runner_->reset(); + imageCache_.clear(); } size_t LLM::getGeneratedTokenCount() const noexcept { From d91a64a60f21929d9bcb982575b1b9cf6cc0c530 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Mon, 2 Mar 2026 13:25:00 +0100 Subject: [PATCH 09/46] feat: add mediaPath to Message, remove sendMessageWithImage from LLMType --- .../react-native-executorch/src/types/llm.ts | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/packages/react-native-executorch/src/types/llm.ts b/packages/react-native-executorch/src/types/llm.ts index 71dfcd3f8..26843db92 100644 --- a/packages/react-native-executorch/src/types/llm.ts +++ b/packages/react-native-executorch/src/types/llm.ts @@ -110,12 +110,14 @@ export interface LLMType { /** * Function to add user message to conversation. - * After model responds, `messageHistory` will be updated with both user message and model response. + * Pass `mediaPath` for a multimodal message (image, audio, etc.). + * After model responds, `messageHistory` will be updated. * * @param message - The message string to send. + * @param mediaPath - Optional local file path to media. * @returns The model's response as a `string`. */ - sendMessage: (message: string) => Promise; + sendMessage: (message: string, mediaPath?: string) => Promise; /** * Deletes all messages starting with message on `index` position. After deletion `messageHistory` will be updated. 
@@ -128,16 +130,6 @@ export interface LLMType { * Function to interrupt the current inference. */ interrupt: () => void; - - /** - * Send a user message with an image. Updates messageHistory after model responds. - * Only valid for multimodal models (loaded with `isMultimodal: true`). - * - * @param imagePath - Local path to the image file. - * @param message - The text question about the image. - * @returns The model's response as a string. - */ - sendMessageWithImage: (imagePath: string, message: string) => Promise; } /** @@ -199,6 +191,11 @@ export type MessageRole = 'user' | 'assistant' | 'system'; export interface Message { role: MessageRole; content: string; + /** + * Optional local file path to media (image, audio, etc.). + * Only valid on `user` messages. + */ + mediaPath?: string; } /** From 49f5af68bdf2a0c47a2b8f7d47f6a5ac6f367d27 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Mon, 2 Mar 2026 14:11:16 +0100 Subject: [PATCH 10/46] feat: replace sendMessageWithImage with sendMessage(msg, mediaPath?) using generateMultimodal Co-Authored-By: Claude Sonnet 4.6 --- apps/llm/app/multimodal_llm/index.tsx | 6 +- .../src/controllers/LLMController.ts | 128 ++++++++---------- .../natural_language_processing/useLLM.ts | 13 +- .../react-native-executorch/src/types/llm.ts | 2 +- 4 files changed, 61 insertions(+), 88 deletions(-) diff --git a/apps/llm/app/multimodal_llm/index.tsx b/apps/llm/app/multimodal_llm/index.tsx index f5f402183..7a3f85671 100644 --- a/apps/llm/app/multimodal_llm/index.tsx +++ b/apps/llm/app/multimodal_llm/index.tsx @@ -72,11 +72,7 @@ function MultimodalLLMScreen() { textInputRef.current?.clear(); Keyboard.dismiss(); try { - if (imageUri) { - await vlm.sendMessageWithImage(imageUri, text); - } else { - await vlm.sendMessage(text); - } + await vlm.sendMessage(text, imageUri ?? 
undefined); } catch (e) { console.error('Generation error:', e); } diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts index 6e024220d..a47c04b9d 100644 --- a/packages/react-native-executorch/src/controllers/LLMController.ts +++ b/packages/react-native-executorch/src/controllers/LLMController.ts @@ -274,10 +274,7 @@ export class LLMController { } } - public async generateWithImage( - imagePath: string, - prompt: string - ): Promise { + private async generateMultimodal(messages: Message[]): Promise { if (!this._isReady) { throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, @@ -287,7 +284,7 @@ export class LLMController { if (!this.isMultimodal_) { throw new RnExecutorchError( RnExecutorchErrorCode.InvalidUserInput, - 'generateWithImage() requires a multimodal model. Load with isMultimodal: true.' + 'generateMultimodal() requires a multimodal model.' ); } if (this._isGenerating) { @@ -299,9 +296,8 @@ export class LLMController { try { this.isGeneratingCallback(true); this.nativeModule.reset(); - const response = await this.nativeModule.generateWithImage( - imagePath, - prompt, + const response = await this.nativeModule.generateMultimodal( + messages, this.onToken ); return response; @@ -312,26 +308,6 @@ export class LLMController { } } - public async sendMessageWithImage( - imagePath: string, - message: string - ): Promise { - const updatedHistory = [ - ...this._messageHistory, - { content: message, role: 'user' as const }, - ]; - this.messageHistoryCallback(updatedHistory); - - const response = await this.generateWithImage(imagePath, message); - - this.messageHistoryCallback([ - ...this._messageHistory, - { content: response, role: 'assistant' }, - ]); - - return response; - } - public interrupt() { if (!this.nativeModule) { throw new RnExecutorchError( @@ -399,36 +375,47 @@ export class LLMController { return await this.forward(renderedChat); } - public 
async sendMessage(message: string): Promise { - const updatedHistory = [ - ...this._messageHistory, - { content: message, role: 'user' as const }, - ]; + public async sendMessage( + message: string, + mediaPath?: string + ): Promise { + const newMessage: Message = { + content: message, + role: 'user', + ...(mediaPath ? { mediaPath } : {}), + }; + const updatedHistory = [...this._messageHistory, newMessage]; this.messageHistoryCallback(updatedHistory); - const countTokensCallback = (messages: Message[]) => { - const rendered = this.applyChatTemplate( - messages, - this.tokenizerConfig, - this.toolsConfig?.tools, - // eslint-disable-next-line camelcase - { tools_in_user_message: false, add_generation_prompt: true } - ); - return this.nativeModule.countTextTokens(rendered); - }; - const maxContextLength = this.nativeModule.getMaxContextLength(); - const messageHistoryWithPrompt = - this.chatConfig.contextStrategy.buildContext( - this.chatConfig.systemPrompt, - updatedHistory, - maxContextLength, - countTokensCallback + let response: string; + + if (mediaPath || this._messageHistory.some((m) => m.mediaPath)) { + // Any message in history has media — use multimodal path + response = await this.generateMultimodal(updatedHistory); + } else { + const countTokensCallback = (messages: Message[]) => { + const rendered = this.applyChatTemplate( + messages, + this.tokenizerConfig, + this.toolsConfig?.tools, + // eslint-disable-next-line camelcase + { tools_in_user_message: false, add_generation_prompt: true } + ); + return this.nativeModule.countTextTokens(rendered); + }; + const maxContextLength = this.nativeModule.getMaxContextLength(); + const messageHistoryWithPrompt = + this.chatConfig.contextStrategy.buildContext( + this.chatConfig.systemPrompt, + updatedHistory, + maxContextLength, + countTokensCallback + ); + response = await this.generate( + messageHistoryWithPrompt, + this.toolsConfig?.tools ); - - const response = await this.generate( - messageHistoryWithPrompt, - 
this.toolsConfig?.tools - ); + } if (!this.toolsConfig || this.toolsConfig.displayToolCalls) { this.messageHistoryCallback([ @@ -436,24 +423,23 @@ export class LLMController { { content: response, role: 'assistant' }, ]); } - if (!this.toolsConfig) { - return response; - } - const toolCalls = parseToolCall(response); - - for (const toolCall of toolCalls) { - this.toolsConfig - .executeToolCallback(toolCall) - .then((toolResponse: string | null) => { - if (toolResponse) { - this.messageHistoryCallback([ - ...this._messageHistory, - { content: toolResponse, role: 'assistant' }, - ]); - } - }); + if (this.toolsConfig) { + const toolCalls = parseToolCall(response); + for (const toolCall of toolCalls) { + this.toolsConfig + .executeToolCallback(toolCall) + .then((toolResponse: string | null) => { + if (toolResponse) { + this.messageHistoryCallback([ + ...this._messageHistory, + { content: toolResponse, role: 'assistant' }, + ]); + } + }); + } } + return response; } diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts index 2920e1bb5..deabbbbb0 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts @@ -94,9 +94,9 @@ export const useLLM = ({ model, preventLoad = false }: LLMProps): LLMType => { ); const sendMessage = useCallback( - (message: string) => { + (message: string, mediaPath?: string) => { setResponse(''); - return controllerInstance.sendMessage(message); + return controllerInstance.sendMessage(message, mediaPath); }, [controllerInstance] ); @@ -126,14 +126,6 @@ export const useLLM = ({ model, preventLoad = false }: LLMProps): LLMType => { [controllerInstance] ); - const sendMessageWithImage = useCallback( - (imagePath: string, message: string) => { - setResponse(''); - return 
controllerInstance.sendMessageWithImage(imagePath, message); - }, - [controllerInstance] - ); - return { messageHistory, response, @@ -150,6 +142,5 @@ export const useLLM = ({ model, preventLoad = false }: LLMProps): LLMType => { sendMessage: sendMessage, deleteMessage: deleteMessage, interrupt: interrupt, - sendMessageWithImage: sendMessageWithImage, }; }; diff --git a/packages/react-native-executorch/src/types/llm.ts b/packages/react-native-executorch/src/types/llm.ts index 26843db92..0c648c25d 100644 --- a/packages/react-native-executorch/src/types/llm.ts +++ b/packages/react-native-executorch/src/types/llm.ts @@ -22,7 +22,7 @@ export interface LLMProps { tokenizerConfigSource?: ResourceSource; /** * Set to `true` when loading a vision-language (multimodal) model. - * Skips tokenizer config fetching and enables `sendMessageWithImage`. + * Skips tokenizer config fetching and enables multimodal message handling via `sendMessage`. */ isMultimodal?: boolean; }; From d07ce65c6cc0834c2157c1a8cc22c6cc17fe5087 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Mon, 2 Mar 2026 15:04:50 +0100 Subject: [PATCH 11/46] fix: use updatedHistory for multimodal routing, remove redundant reset before generateMultimodal --- .../react-native-executorch/src/controllers/LLMController.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts index a47c04b9d..cb854546f 100644 --- a/packages/react-native-executorch/src/controllers/LLMController.ts +++ b/packages/react-native-executorch/src/controllers/LLMController.ts @@ -295,7 +295,6 @@ export class LLMController { } try { this.isGeneratingCallback(true); - this.nativeModule.reset(); const response = await this.nativeModule.generateMultimodal( messages, this.onToken @@ -389,7 +388,7 @@ export class LLMController { let response: string; - if (mediaPath || this._messageHistory.some((m) => 
m.mediaPath)) { + if (updatedHistory.some((m) => m.mediaPath)) { // Any message in history has media — use multimodal path response = await this.generateMultimodal(updatedHistory); } else { From b29f74c1b2f639e6c80258cc88911c7557d12315 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Mon, 2 Mar 2026 15:07:51 +0100 Subject: [PATCH 12/46] fix: skip system messages in generateMultimodal, clear imageUri after send --- apps/llm/app/multimodal_llm/index.tsx | 1 + .../common/rnexecutorch/models/llm/LLM.cpp | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/llm/app/multimodal_llm/index.tsx b/apps/llm/app/multimodal_llm/index.tsx index 7a3f85671..486b4e109 100644 --- a/apps/llm/app/multimodal_llm/index.tsx +++ b/apps/llm/app/multimodal_llm/index.tsx @@ -73,6 +73,7 @@ function MultimodalLLMScreen() { Keyboard.dismiss(); try { await vlm.sendMessage(text, imageUri ?? undefined); + setImageUri(null); } catch (e) { console.error('Generation error:', e); } diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp index 441d20dad..ab6398bc3 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp @@ -164,9 +164,8 @@ std::string LLM::generateMultimodal( for (const auto &msg : messages) { if (msg.role == "system") { - if (isFirst) { - inputs.push_back(llm::make_text_input(msg.content + "\n")); - } + // LFM2-VL has no dedicated system turn — skip silently, consistent + // with the single-turn generate(imagePath, prompt, cb) path. 
continue; } From e1d0f08df3730eae28d703c076c6920f4fe4c94f Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Mon, 2 Mar 2026 15:17:23 +0100 Subject: [PATCH 13/46] feat: show image thumbnail in user message bubble when mediaPath is set --- apps/llm/components/MessageItem.tsx | 50 +++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 14 deletions(-) diff --git a/apps/llm/components/MessageItem.tsx b/apps/llm/components/MessageItem.tsx index c4d7d549e..2a235d459 100644 --- a/apps/llm/components/MessageItem.tsx +++ b/apps/llm/components/MessageItem.tsx @@ -4,6 +4,7 @@ import { StyleSheet, TouchableOpacity, Text, + Image, Platform, } from 'react-native'; import MarkdownComponent from './MarkdownComponent'; @@ -17,19 +18,31 @@ interface MessageItemProps { } const MessageItem = memo(({ message, deleteMessage }: MessageItemProps) => { - return ( - - {message.role === 'assistant' && ( + if (message.role === 'assistant') { + return ( + - )} - + + + + ); + } + + return ( + + + {message.mediaPath && ( + + )} + + ); }); @@ -64,17 +77,26 @@ const styles = StyleSheet.create({ marginVertical: 8, alignItems: 'center', }, - userMessage: { + userMessageWrapper: { flexDirection: 'row-reverse', - paddingHorizontal: 12, - paddingVertical: 8, marginRight: 8, marginVertical: 8, maxWidth: '75%', + alignSelf: 'flex-end', + alignItems: 'flex-start', + }, + userMessageBubble: { + flexDirection: 'column', + paddingHorizontal: 12, + paddingVertical: 8, borderRadius: 8, backgroundColor: ColorPalette.seaBlueLight, - alignSelf: 'flex-end', - alignItems: 'center', + }, + userMessageImage: { + width: 200, + height: 150, + borderRadius: 6, + marginBottom: 6, }, aiMessageIconContainer: { backgroundColor: ColorPalette.seaBlueLight, From 11cab574e756ff3a6e01a90312a37c3f8d23fbc1 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Mon, 2 Mar 2026 15:20:18 +0100 Subject: [PATCH 14/46] fix: use resizeMode contain so full image is always visible in message bubble --- 
apps/llm/components/MessageItem.tsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/llm/components/MessageItem.tsx b/apps/llm/components/MessageItem.tsx index 2a235d459..58da5074c 100644 --- a/apps/llm/components/MessageItem.tsx +++ b/apps/llm/components/MessageItem.tsx @@ -38,7 +38,7 @@ const MessageItem = memo(({ message, deleteMessage }: MessageItemProps) => { )} @@ -94,7 +94,7 @@ const styles = StyleSheet.create({ }, userMessageImage: { width: 200, - height: 150, + height: 200, borderRadius: 6, marginBottom: 6, }, From 9ddd5d75030ee6f0f9d78f5482daf89092cbd576 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Mon, 2 Mar 2026 15:32:31 +0100 Subject: [PATCH 15/46] refactor: derive isMultimodal from load param, unify load branches, remove tokenizerConfig guard Co-Authored-By: Claude Sonnet 4.6 --- .../host_objects/ModelHostObject.h | 4 - .../common/rnexecutorch/models/llm/LLM.cpp | 2 - .../common/rnexecutorch/models/llm/LLM.h | 2 - .../src/controllers/LLMController.ts | 73 +++++-------------- 4 files changed, 18 insertions(+), 63 deletions(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h index 334f1f833..2c7a3e535 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h @@ -164,10 +164,6 @@ template class ModelHostObject : public JsiHostObject { JSI_EXPORT_FUNCTION(ModelHostObject, promiseHostFunction<&Model::generateMultimodal>, "generateMultimodal")); - - addFunctions(JSI_EXPORT_FUNCTION( - ModelHostObject, synchronousHostFunction<&Model::isMultimodal>, - "isMultimodal")); } if constexpr (meta::SameAs) { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp index 
ab6398bc3..2210a5cf8 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp @@ -81,8 +81,6 @@ LLM::LLM(const std::string &modelSource, const std::string &tokenizerSource, fs::file_size(fs::path(tokenizerSource)); } -bool LLM::isMultimodal() const noexcept { return multimodal_; } - std::string LLM::generate(std::string input, std::shared_ptr callback) { if (!runner_ || !runner_->is_loaded()) { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h index 5c9e1e458..11f8c5e06 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h @@ -33,8 +33,6 @@ class LLM : public BaseModel { std::vector messages, std::shared_ptr callback); - bool isMultimodal() const noexcept; - void interrupt(); void reset(); void unload() noexcept; diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts index cb854546f..e9a113459 100644 --- a/packages/react-native-executorch/src/controllers/LLMController.ts +++ b/packages/react-native-executorch/src/controllers/LLMController.ts @@ -91,68 +91,34 @@ export class LLMController { this.isReadyCallback(false); try { - let tokenizerPath: string | undefined; - let modelPath: string | undefined; - - if (isMultimodal) { - const [tokenizerResults, modelResult] = await Promise.all([ - ResourceFetcher.fetch( - undefined, - tokenizerSource, - ...(tokenizerConfigSource ? 
[tokenizerConfigSource] : []) - ), - ResourceFetcher.fetch(onDownloadProgressCallback, modelSource), - ]); - tokenizerPath = tokenizerResults?.[0]; - modelPath = modelResult?.[0]; - - if (!tokenizerPath || !modelPath) { - throw new RnExecutorchError( - RnExecutorchErrorCode.DownloadInterrupted, - 'The download has been interrupted. As a result, not every file was downloaded. Please retry the download.' - ); - } - - if (tokenizerConfigSource && tokenizerResults?.[1]) { - this.tokenizerConfig = JSON.parse( - await ResourceFetcher.fs.readAsString(tokenizerResults[1]) - ); - } - } else { - const tokenizersPromise = ResourceFetcher.fetch( + const [tokenizerResults, modelResult] = await Promise.all([ + ResourceFetcher.fetch( undefined, tokenizerSource, - tokenizerConfigSource! - ); - - const modelPromise = ResourceFetcher.fetch( - onDownloadProgressCallback, - modelSource - ); - - const [tokenizersResults, modelResult] = await Promise.all([ - tokenizersPromise, - modelPromise, - ]); + ...(tokenizerConfigSource ? [tokenizerConfigSource] : []) + ), + ResourceFetcher.fetch(onDownloadProgressCallback, modelSource), + ]); - tokenizerPath = tokenizersResults?.[0]; - const tokenizerConfigPath = tokenizersResults?.[1]; - modelPath = modelResult?.[0]; + const tokenizerPath = tokenizerResults?.[0]; + const tokenizerConfigPath = tokenizerResults?.[1]; + const modelPath = modelResult?.[0]; - if (!tokenizerPath || !tokenizerConfigPath || !modelPath) { - throw new RnExecutorchError( - RnExecutorchErrorCode.DownloadInterrupted, - 'The download has been interrupted. As a result, not every file was downloaded. Please retry the download.' - ); - } + if (!tokenizerPath || !modelPath) { + throw new RnExecutorchError( + RnExecutorchErrorCode.DownloadInterrupted, + 'The download has been interrupted. As a result, not every file was downloaded. Please retry the download.' 
+ ); + } + if (tokenizerConfigPath) { this.tokenizerConfig = JSON.parse( - await ResourceFetcher.fs.readAsString(tokenizerConfigPath!) + await ResourceFetcher.fs.readAsString(tokenizerConfigPath) ); } this.nativeModule = global.loadLLM(modelPath, tokenizerPath); - this.isMultimodal_ = this.nativeModule.isMultimodal(); + this.isMultimodal_ = isMultimodal; this.isReadyCallback(true); this.onToken = (data: string) => { if (!data) { @@ -214,9 +180,6 @@ export class LLMController { } private filterSpecialTokens(text: string): string { - if (!this.tokenizerConfig) { - return text; - } let filtered = text; if ( SPECIAL_TOKENS.EOS_TOKEN in this.tokenizerConfig && From 7d2ce9b3e73f795b905c4d0a6af53e29046b7b14 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Mon, 2 Mar 2026 15:37:47 +0100 Subject: [PATCH 16/46] refactor: remove isMultimodal flag, inline generateMultimodal into sendMessage Co-Authored-By: Claude Sonnet 4.6 --- apps/llm/app/multimodal_llm/index.tsx | 1 - .../src/controllers/LLMController.ts | 86 +++++++------------ .../natural_language_processing/useLLM.ts | 2 - .../react-native-executorch/src/types/llm.ts | 5 -- 4 files changed, 31 insertions(+), 63 deletions(-) diff --git a/apps/llm/app/multimodal_llm/index.tsx b/apps/llm/app/multimodal_llm/index.tsx index 486b4e109..cf9a3f4e6 100644 --- a/apps/llm/app/multimodal_llm/index.tsx +++ b/apps/llm/app/multimodal_llm/index.tsx @@ -45,7 +45,6 @@ function MultimodalLLMScreen() { modelSource: MODEL_SOURCE, tokenizerSource: TOKENIZER_SOURCE, tokenizerConfigSource: TOKENIZER_CONFIG_SOURCE, - isMultimodal: true, }, }); diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts index e9a113459..43852e09d 100644 --- a/packages/react-native-executorch/src/controllers/LLMController.ts +++ b/packages/react-native-executorch/src/controllers/LLMController.ts @@ -24,8 +24,6 @@ export class LLMController { private _isReady = false; 
private _isGenerating = false; private _messageHistory: Message[] = []; - private isMultimodal_ = false; - // User callbacks private tokenCallback: (token: string) => void; private messageHistoryCallback: (messageHistory: Message[]) => void; @@ -77,13 +75,11 @@ export class LLMController { tokenizerSource, tokenizerConfigSource, onDownloadProgressCallback, - isMultimodal = false, }: { modelSource: ResourceSource; tokenizerSource: ResourceSource; tokenizerConfigSource?: ResourceSource; onDownloadProgressCallback?: (downloadProgress: number) => void; - isMultimodal?: boolean; }) { // reset inner state when loading new model this.messageHistoryCallback(this.chatConfig.initialMessageHistory); @@ -91,34 +87,37 @@ export class LLMController { this.isReadyCallback(false); try { - const [tokenizerResults, modelResult] = await Promise.all([ - ResourceFetcher.fetch( - undefined, - tokenizerSource, - ...(tokenizerConfigSource ? [tokenizerConfigSource] : []) - ), - ResourceFetcher.fetch(onDownloadProgressCallback, modelSource), + const tokenizersPromise = ResourceFetcher.fetch( + undefined, + tokenizerSource, + ...(tokenizerConfigSource ? [tokenizerConfigSource] : []) + ); + + const modelPromise = ResourceFetcher.fetch( + onDownloadProgressCallback, + modelSource + ); + + const [tokenizersResults, modelResult] = await Promise.all([ + tokenizersPromise, + modelPromise, ]); - const tokenizerPath = tokenizerResults?.[0]; - const tokenizerConfigPath = tokenizerResults?.[1]; + const tokenizerPath = tokenizersResults?.[0]; + const tokenizerConfigPath = tokenizersResults?.[1]; const modelPath = modelResult?.[0]; - if (!tokenizerPath || !modelPath) { + if (!tokenizerPath || !tokenizerConfigPath || !modelPath) { throw new RnExecutorchError( RnExecutorchErrorCode.DownloadInterrupted, 'The download has been interrupted. As a result, not every file was downloaded. Please retry the download.' 
); } - if (tokenizerConfigPath) { - this.tokenizerConfig = JSON.parse( - await ResourceFetcher.fs.readAsString(tokenizerConfigPath) - ); - } - + this.tokenizerConfig = JSON.parse( + await ResourceFetcher.fs.readAsString(tokenizerConfigPath!) + ); this.nativeModule = global.loadLLM(modelPath, tokenizerPath); - this.isMultimodal_ = isMultimodal; this.isReadyCallback(true); this.onToken = (data: string) => { if (!data) { @@ -237,39 +236,6 @@ export class LLMController { } } - private async generateMultimodal(messages: Message[]): Promise { - if (!this._isReady) { - throw new RnExecutorchError( - RnExecutorchErrorCode.ModuleNotLoaded, - 'The model is currently not loaded.' - ); - } - if (!this.isMultimodal_) { - throw new RnExecutorchError( - RnExecutorchErrorCode.InvalidUserInput, - 'generateMultimodal() requires a multimodal model.' - ); - } - if (this._isGenerating) { - throw new RnExecutorchError( - RnExecutorchErrorCode.ModelGenerating, - 'The model is currently generating.' - ); - } - try { - this.isGeneratingCallback(true); - const response = await this.nativeModule.generateMultimodal( - messages, - this.onToken - ); - return response; - } catch (e) { - throw parseUnknownError(e); - } finally { - this.isGeneratingCallback(false); - } - } - public interrupt() { if (!this.nativeModule) { throw new RnExecutorchError( @@ -353,7 +319,17 @@ export class LLMController { if (updatedHistory.some((m) => m.mediaPath)) { // Any message in history has media — use multimodal path - response = await this.generateMultimodal(updatedHistory); + try { + this.isGeneratingCallback(true); + response = await this.nativeModule.generateMultimodal( + updatedHistory, + this.onToken + ); + } catch (e) { + throw parseUnknownError(e); + } finally { + this.isGeneratingCallback(false); + } } else { const countTokensCallback = (messages: Message[]) => { const rendered = this.applyChatTemplate( diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts 
b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts index deabbbbb0..502b06f1d 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts @@ -53,7 +53,6 @@ export const useLLM = ({ model, preventLoad = false }: LLMProps): LLMType => { tokenizerSource: model.tokenizerSource, tokenizerConfigSource: model.tokenizerConfigSource, onDownloadProgressCallback: setDownloadProgress, - isMultimodal: model.isMultimodal, }); } catch (e) { setError(parseUnknownError(e)); @@ -70,7 +69,6 @@ export const useLLM = ({ model, preventLoad = false }: LLMProps): LLMType => { model.modelSource, model.tokenizerSource, model.tokenizerConfigSource, - model.isMultimodal, preventLoad, ]); diff --git a/packages/react-native-executorch/src/types/llm.ts b/packages/react-native-executorch/src/types/llm.ts index 0c648c25d..df0b3a06d 100644 --- a/packages/react-native-executorch/src/types/llm.ts +++ b/packages/react-native-executorch/src/types/llm.ts @@ -20,11 +20,6 @@ export interface LLMProps { * `ResourceSource` pointing to the JSON file which contains the tokenizer config. */ tokenizerConfigSource?: ResourceSource; - /** - * Set to `true` when loading a vision-language (multimodal) model. - * Skips tokenizer config fetching and enables multimodal message handling via `sendMessage`. - */ - isMultimodal?: boolean; }; /** * Boolean that can prevent automatic model loading (and downloading the data if you load it for the first time) after running the hook. 
From 87fa1f0b4773dc01c08405427e8f5c7ad346f8e9 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Mon, 2 Mar 2026 15:40:12 +0100 Subject: [PATCH 17/46] fix: make tokenizerConfigSource required throughout load pipeline Co-Authored-By: Claude Sonnet 4.6 --- .../react-native-executorch/src/controllers/LLMController.ts | 4 ++-- packages/react-native-executorch/src/types/llm.ts | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts index 43852e09d..925ad2420 100644 --- a/packages/react-native-executorch/src/controllers/LLMController.ts +++ b/packages/react-native-executorch/src/controllers/LLMController.ts @@ -78,7 +78,7 @@ export class LLMController { }: { modelSource: ResourceSource; tokenizerSource: ResourceSource; - tokenizerConfigSource?: ResourceSource; + tokenizerConfigSource: ResourceSource; onDownloadProgressCallback?: (downloadProgress: number) => void; }) { // reset inner state when loading new model @@ -90,7 +90,7 @@ export class LLMController { const tokenizersPromise = ResourceFetcher.fetch( undefined, tokenizerSource, - ...(tokenizerConfigSource ? [tokenizerConfigSource] : []) + tokenizerConfigSource ); const modelPromise = ResourceFetcher.fetch( diff --git a/packages/react-native-executorch/src/types/llm.ts b/packages/react-native-executorch/src/types/llm.ts index df0b3a06d..c660e3a7d 100644 --- a/packages/react-native-executorch/src/types/llm.ts +++ b/packages/react-native-executorch/src/types/llm.ts @@ -19,7 +19,7 @@ export interface LLMProps { /** * `ResourceSource` pointing to the JSON file which contains the tokenizer config. */ - tokenizerConfigSource?: ResourceSource; + tokenizerConfigSource: ResourceSource; }; /** * Boolean that can prevent automatic model loading (and downloading the data if you load it for the first time) after running the hook. 
From b39895288116c3e74182858a075b84ac3ffe61f6 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Mon, 2 Mar 2026 15:41:59 +0100 Subject: [PATCH 18/46] fix: prepend system prompt to multimodal history before generateMultimodal Co-Authored-By: Claude Sonnet 4.6 --- .../src/controllers/LLMController.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts index 925ad2420..bd76f5e4e 100644 --- a/packages/react-native-executorch/src/controllers/LLMController.ts +++ b/packages/react-native-executorch/src/controllers/LLMController.ts @@ -319,10 +319,14 @@ export class LLMController { if (updatedHistory.some((m) => m.mediaPath)) { // Any message in history has media — use multimodal path + const historyWithSystemPrompt = [ + { content: this.chatConfig.systemPrompt, role: 'system' as const }, + ...updatedHistory, + ]; try { this.isGeneratingCallback(true); response = await this.nativeModule.generateMultimodal( - updatedHistory, + historyWithSystemPrompt, this.onToken ); } catch (e) { From a0b80e38ae4937677d87e941b1c02aae15d645ac Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Mon, 2 Mar 2026 15:56:55 +0100 Subject: [PATCH 19/46] =?UTF-8?q?refactor:=20unify=20generate=20=E2=80=94?= =?UTF-8?q?=20Jinja=20renders=20prompt+=20tokens=20in=20JS,=20C++?= =?UTF-8?q?=20splits=20on=20placeholder?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- .../host_objects/JsiConversions.h | 30 ----- .../host_objects/ModelHostObject.h | 11 +- .../common/rnexecutorch/models/llm/LLM.cpp | 103 +++++------------- .../common/rnexecutorch/models/llm/LLM.h | 15 +-- .../src/controllers/LLMController.ts | 95 +++++++++------- 5 files changed, 92 insertions(+), 162 deletions(-) diff --git 
a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h index 08acf6cff..df9abbdef 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h @@ -228,36 +228,6 @@ getValue>(const jsi::Value &val, jsi::Runtime &runtime) { return getArrayAsVector(val, runtime); } -struct NativeMessage { - std::string role; // "user" | "assistant" | "system" - std::string content; - std::string mediaPath; // empty string if no media -}; - -template <> -inline std::vector -getValue>(const jsi::Value &val, - jsi::Runtime &runtime) { - jsi::Array array = val.asObject(runtime).asArray(runtime); - size_t length = array.size(runtime); - std::vector result; - result.reserve(length); - for (size_t i = 0; i < length; ++i) { - jsi::Object obj = array.getValueAtIndex(runtime, i).asObject(runtime); - NativeMessage msg; - msg.role = - obj.getProperty(runtime, "role").getString(runtime).utf8(runtime); - msg.content = - obj.getProperty(runtime, "content").getString(runtime).utf8(runtime); - auto mediaProp = obj.getProperty(runtime, "mediaPath"); - if (!mediaProp.isUndefined() && !mediaProp.isNull()) { - msg.mediaPath = mediaProp.getString(runtime).utf8(runtime); - } - result.push_back(std::move(msg)); - } - return result; -} - // Template specializations for std::span types template <> inline std::span getValue>(const jsi::Value &val, diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h index 2c7a3e535..a4af6eb8f 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h @@ -156,14 +156,9 @@ template class 
ModelHostObject : public JsiHostObject { addFunctions(JSI_EXPORT_FUNCTION( ModelHostObject, promiseHostFunction)>( - &Model::generate)>, - "generateWithImage")); - - addFunctions( - JSI_EXPORT_FUNCTION(ModelHostObject, - promiseHostFunction<&Model::generateMultimodal>, - "generateMultimodal")); + std::string, std::vector, + std::shared_ptr)>(&Model::generate)>, + "generateMultimodal")); } if constexpr (meta::SameAs) { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp index 2210a5cf8..b26cd3b20 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp @@ -18,15 +18,6 @@ using executorch::runtime::Error; static constexpr int kImageSize = 512; static constexpr int kImageChannels = 3; -// LFM2-VL chat template -static constexpr const char *kChatPrefix = "<|startoftext|><|im_start|>user\n"; -static constexpr const char *kChatSuffix = - "<|im_end|>\n<|im_start|>assistant\n"; -// Separator inserted after each assistant turn in multi-turn conversations -static constexpr const char *kAssistantTurnEnd = "<|im_end|>\n"; -// Prefix for subsequent user turns (no BOS token — only first turn has it) -static constexpr const char *kUserTurnPrefix = "<|im_start|>user\n"; - static llm::Image loadImageForVLM(const std::string &imagePath) { cv::Mat mat = image_processing::readImage(imagePath); cv::resize(mat, mat, cv::Size(kImageSize, kImageSize)); @@ -106,7 +97,8 @@ std::string LLM::generate(std::string input, return output; } -std::string LLM::generate(std::string imagePath, std::string prompt, +std::string LLM::generate(std::string prompt, + std::vector imagePaths, std::shared_ptr callback) { if (!runner_ || !runner_->is_loaded()) { throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded, @@ -118,77 +110,34 @@ std::string LLM::generate(std::string imagePath, 
std::string prompt, "This is a text-only model. Call generate(prompt, cb)."); } - llm::Image image = loadImageForVLM(imagePath); - std::vector inputs = { - llm::make_text_input(std::string(kChatPrefix)), - llm::make_image_input(std::move(image)), - llm::make_text_input(prompt + kChatSuffix), - }; - - std::string output; - auto nativeCallback = [this, &callback, &output](const std::string &token) { - output += token; - if (callback && callInvoker) { - callInvoker->invokeAsync([callback, token](jsi::Runtime &runtime) { - callback->call(runtime, jsi::String::createFromUtf8(runtime, token)); - }); - } - }; - - auto error = - runner_->generate(inputs, temperature_, topp_, -1, nativeCallback); - if (error != Error::Ok) { - throw RnExecutorchError(error, "Failed to generate multimodal response"); - } - - return output; -} - -std::string LLM::generateMultimodal( - std::vector messages, - std::shared_ptr callback) { - if (!runner_ || !runner_->is_loaded()) { - throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded, - "Runner is not loaded"); - } - if (!multimodal_) { - throw RnExecutorchError( - RnExecutorchErrorCode::InvalidUserInput, - "This is a text-only model. Use generate(prompt, cb) instead."); - } + // Split rendered prompt on "" placeholders and interleave with images. + static constexpr const char *kImageToken = ""; + static constexpr size_t kImageTokenLen = 7; // strlen("") std::vector inputs; - bool isFirst = true; - - for (const auto &msg : messages) { - if (msg.role == "system") { - // LFM2-VL has no dedicated system turn — skip silently, consistent - // with the single-turn generate(imagePath, prompt, cb) path. 
- continue; - } - - if (msg.role == "user") { - if (isFirst) { - inputs.push_back(llm::make_text_input(std::string(kChatPrefix))); - isFirst = false; - } else { - inputs.push_back(llm::make_text_input(std::string(kUserTurnPrefix))); - } - - if (!msg.mediaPath.empty()) { - const llm::Image &img = getOrLoadImage(msg.mediaPath); - inputs.push_back(llm::make_image_input(img)); + size_t imageIdx = 0; + size_t searchPos = 0; + + while (true) { + size_t found = prompt.find(kImageToken, searchPos); + if (found == std::string::npos) { + // Remaining text after last image (or entire prompt if no images) + if (searchPos < prompt.size()) { + inputs.push_back(llm::make_text_input(prompt.substr(searchPos))); } - - if (!msg.content.empty()) { - inputs.push_back(llm::make_text_input(msg.content)); - } - - inputs.push_back(llm::make_text_input(std::string(kChatSuffix))); - } else if (msg.role == "assistant") { - inputs.push_back(llm::make_text_input(msg.content + kAssistantTurnEnd)); - isFirst = false; + break; + } + // Text segment before this placeholder + if (found > searchPos) { + inputs.push_back( + llm::make_text_input(prompt.substr(searchPos, found - searchPos))); + } + // Image at this position + if (imageIdx < imagePaths.size()) { + const llm::Image &img = getOrLoadImage(imagePaths[imageIdx++]); + inputs.push_back(llm::make_image_input(img)); } + searchPos = found + kImageTokenLen; } if (inputs.empty()) { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h index 11f8c5e06..6e47dbed0 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h @@ -5,7 +5,6 @@ #include #include -#include #include #include #include @@ -20,19 +19,15 @@ class LLM : public BaseModel { const std::string &tokenizerSource, std::shared_ptr callInvoker); - // Text-only generate (existing signature — 
used by LLMController) - std::string generate(std::string input, + // Text-only: pre-rendered prompt string + std::string generate(std::string prompt, std::shared_ptr callback); - // Multimodal generate (image + text prompt) - std::string generate(std::string imagePath, std::string prompt, + // Multimodal: pre-rendered prompt string with placeholders + + // ordered list of image paths (one per placeholder) + std::string generate(std::string prompt, std::vector imagePaths, std::shared_ptr callback); - // Multimodal generate — takes full message history, builds MultimodalInput[] - std::string generateMultimodal( - std::vector messages, - std::shared_ptr callback); - void interrupt(); void reset(); void unload() noexcept; diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts index bd76f5e4e..7062d78cd 100644 --- a/packages/react-native-executorch/src/controllers/LLMController.ts +++ b/packages/react-native-executorch/src/controllers/LLMController.ts @@ -211,7 +211,7 @@ export class LLMController { this.isGeneratingCallback(false); } - public async forward(input: string): Promise { + public async forward(input: string, imagePaths?: string[]): Promise { if (!this._isReady) { throw new RnExecutorchError( RnExecutorchErrorCode.ModuleNotLoaded, @@ -227,7 +227,14 @@ export class LLMController { try { this.isGeneratingCallback(true); this.nativeModule.reset(); - const response = await this.nativeModule.generate(input, this.onToken); + const response = + imagePaths && imagePaths.length > 0 + ? 
await this.nativeModule.generateMultimodal( + input, + imagePaths, + this.onToken + ) + : await this.nativeModule.generate(input, this.onToken); return this.filterSpecialTokens(response); } catch (e) { throw parseUnknownError(e); @@ -317,42 +324,56 @@ export class LLMController { let response: string; - if (updatedHistory.some((m) => m.mediaPath)) { - // Any message in history has media — use multimodal path - const historyWithSystemPrompt = [ - { content: this.chatConfig.systemPrompt, role: 'system' as const }, - ...updatedHistory, - ]; - try { - this.isGeneratingCallback(true); - response = await this.nativeModule.generateMultimodal( - historyWithSystemPrompt, - this.onToken - ); - } catch (e) { - throw parseUnknownError(e); - } finally { - this.isGeneratingCallback(false); - } + const isMultimodal = updatedHistory.some((m) => m.mediaPath); + + // For multimodal messages, convert mediaPath into structured content so + // the chat template emits placeholders in the right position. + const historyForTemplate = isMultimodal + ? updatedHistory.map((m) => + m.mediaPath + ? 
{ + ...m, + content: [ + { type: 'image' }, + { type: 'text', text: m.content }, + ] as any, + } + : m + ) + : updatedHistory; + + const countTokensCallback = (messages: Message[]) => { + const rendered = this.applyChatTemplate( + messages, + this.tokenizerConfig, + this.toolsConfig?.tools, + // eslint-disable-next-line camelcase + { tools_in_user_message: false, add_generation_prompt: true } + ); + return this.nativeModule.countTextTokens(rendered); + }; + const maxContextLength = this.nativeModule.getMaxContextLength(); + const messageHistoryWithPrompt = + this.chatConfig.contextStrategy.buildContext( + this.chatConfig.systemPrompt, + historyForTemplate, + maxContextLength, + countTokensCallback + ); + + if (isMultimodal) { + const renderedPrompt = this.applyChatTemplate( + messageHistoryWithPrompt, + this.tokenizerConfig, + undefined, + // eslint-disable-next-line camelcase + { tools_in_user_message: false, add_generation_prompt: true } + ); + const imagePaths = updatedHistory + .filter((m) => m.mediaPath) + .map((m) => m.mediaPath!); + response = await this.forward(renderedPrompt, imagePaths); } else { - const countTokensCallback = (messages: Message[]) => { - const rendered = this.applyChatTemplate( - messages, - this.tokenizerConfig, - this.toolsConfig?.tools, - // eslint-disable-next-line camelcase - { tools_in_user_message: false, add_generation_prompt: true } - ); - return this.nativeModule.countTextTokens(rendered); - }; - const maxContextLength = this.nativeModule.getMaxContextLength(); - const messageHistoryWithPrompt = - this.chatConfig.contextStrategy.buildContext( - this.chatConfig.systemPrompt, - updatedHistory, - maxContextLength, - countTokensCallback - ); response = await this.generate( messageHistoryWithPrompt, this.toolsConfig?.tools From 13f631e618e1d812c1ca6fd4231f4c389a6b0e22 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Mon, 2 Mar 2026 16:04:19 +0100 Subject: [PATCH 20/46] fix: collect imagePaths from messageHistoryWithPrompt, 
not full history Co-Authored-By: Claude Sonnet 4.6 --- .../react-native-executorch/src/controllers/LLMController.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts index 7062d78cd..7e3b8baf2 100644 --- a/packages/react-native-executorch/src/controllers/LLMController.ts +++ b/packages/react-native-executorch/src/controllers/LLMController.ts @@ -369,7 +369,7 @@ export class LLMController { // eslint-disable-next-line camelcase { tools_in_user_message: false, add_generation_prompt: true } ); - const imagePaths = updatedHistory + const imagePaths = messageHistoryWithPrompt .filter((m) => m.mediaPath) .map((m) => m.mediaPath!); response = await this.forward(renderedPrompt, imagePaths); From 76f9c7c035ed0cc005399bf71e9beffdf48e6f52 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Mon, 2 Mar 2026 16:07:36 +0100 Subject: [PATCH 21/46] fix: typing --- .../src/hooks/natural_language_processing/useLLM.ts | 2 +- packages/react-native-executorch/src/types/llm.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts index 502b06f1d..99210e357 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts @@ -51,7 +51,7 @@ export const useLLM = ({ model, preventLoad = false }: LLMProps): LLMType => { await controllerInstance.load({ modelSource: model.modelSource, tokenizerSource: model.tokenizerSource, - tokenizerConfigSource: model.tokenizerConfigSource, + tokenizerConfigSource: model.tokenizerConfigSource!, onDownloadProgressCallback: setDownloadProgress, }); } catch (e) { diff --git 
a/packages/react-native-executorch/src/types/llm.ts b/packages/react-native-executorch/src/types/llm.ts index c660e3a7d..df0b3a06d 100644 --- a/packages/react-native-executorch/src/types/llm.ts +++ b/packages/react-native-executorch/src/types/llm.ts @@ -19,7 +19,7 @@ export interface LLMProps { /** * `ResourceSource` pointing to the JSON file which contains the tokenizer config. */ - tokenizerConfigSource: ResourceSource; + tokenizerConfigSource?: ResourceSource; }; /** * Boolean that can prevent automatic model loading (and downloading the data if you load it for the first time) after running the hook. From ab8c088dc16cf9b47fff6f9583f4951a88a94d6b Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Mon, 2 Mar 2026 17:00:40 +0100 Subject: [PATCH 22/46] feat: correctly calculate image tokens --- .../common/runner/multimodal_prefiller.cpp | 1 - .../react-native-executorch/src/controllers/LLMController.ts | 5 ++++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp b/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp index c39c7cc0f..098763550 100644 --- a/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp +++ b/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp @@ -11,7 +11,6 @@ #include "multimodal_prefiller.h" #include "constants.h" -#include "util.h" namespace executorch { namespace extension { diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts index 7e3b8baf2..7e9a37c86 100644 --- a/packages/react-native-executorch/src/controllers/LLMController.ts +++ b/packages/react-native-executorch/src/controllers/LLMController.ts @@ -342,6 +342,7 @@ export class LLMController { ) : updatedHistory; + const IMAGE_VISUAL_TOKENS = 256; const countTokensCallback = (messages: Message[]) => { const rendered = this.applyChatTemplate( 
messages, @@ -350,7 +351,9 @@ export class LLMController { // eslint-disable-next-line camelcase { tools_in_user_message: false, add_generation_prompt: true } ); - return this.nativeModule.countTextTokens(rendered); + const textTokens = this.nativeModule.countTextTokens(rendered); + const imageCount = messages.filter((m) => m.mediaPath).length; + return textTokens + imageCount * (IMAGE_VISUAL_TOKENS - 1); }; const maxContextLength = this.nativeModule.getMaxContextLength(); const messageHistoryWithPrompt = From c211ba9519f9cac177bcc0ddf01ca299eeeef14e Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Mon, 2 Mar 2026 17:01:57 +0100 Subject: [PATCH 23/46] fix: add missing import --- .../common/runner/multimodal_prefiller.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp b/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp index 098763550..c39c7cc0f 100644 --- a/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp +++ b/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp @@ -11,6 +11,7 @@ #include "multimodal_prefiller.h" #include "constants.h" +#include "util.h" namespace executorch { namespace extension { From 0e29349e838a349e436a8412ab319ad4a11c8531 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Mon, 2 Mar 2026 17:20:31 +0100 Subject: [PATCH 24/46] fix: fall back to max_seq_len when model doesn't export max_context_len --- .../common/runner/unified_runner.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/packages/react-native-executorch/common/runner/unified_runner.cpp b/packages/react-native-executorch/common/runner/unified_runner.cpp index a136835a3..12b431e8b 100644 --- a/packages/react-native-executorch/common/runner/unified_runner.cpp +++ b/packages/react-native-executorch/common/runner/unified_runner.cpp @@ -73,9 +73,11 @@ Error UnifiedRunner::load() { if (config_.max_seq_len < 0) 
config_.max_seq_len = static_cast(metadata_.at(kMaxSeqLen)); - if (config_.max_context_length < 0) + if (config_.max_context_length < 0) { + auto ctx = metadata_.at(kMaxContextLen); config_.max_context_length = - static_cast(metadata_.at(kMaxContextLen)); + static_cast(ctx > 128 ? ctx : metadata_.at(kMaxSeqLen)); + } if (config_.max_new_tokens < 0) config_.max_new_tokens = std::min(config_.max_seq_len, config_.max_context_length); From 520233f0d889ae69bb9f48ac33a0dd8d961beae8 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Mon, 2 Mar 2026 17:25:31 +0100 Subject: [PATCH 25/46] =?UTF-8?q?fix:=20address=20code=20review=20?= =?UTF-8?q?=E2=80=94=20error=20on=20image/placeholder=20mismatch,=20remove?= =?UTF-8?q?=20double=20reset,=20fix=20max=5Fcontext=5Flen=20fallback,=20re?= =?UTF-8?q?quire=20tokenizerConfigSource,=20pass=20tools=20in=20multimodal?= =?UTF-8?q?=20branch,=20capture=20callback=20by=20value?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../common/rnexecutorch/models/llm/LLM.cpp | 12 +++++++----- .../common/runner/unified_runner.cpp | 5 +++-- .../src/controllers/LLMController.ts | 2 +- packages/react-native-executorch/src/types/llm.ts | 2 +- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp index b26cd3b20..acccedbd0 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp @@ -133,10 +133,13 @@ std::string LLM::generate(std::string prompt, llm::make_text_input(prompt.substr(searchPos, found - searchPos))); } // Image at this position - if (imageIdx < imagePaths.size()) { - const llm::Image &img = getOrLoadImage(imagePaths[imageIdx++]); - inputs.push_back(llm::make_image_input(img)); + if (imageIdx >= imagePaths.size()) { + throw 
RnExecutorchError( + RnExecutorchErrorCode::InvalidUserInput, + "More placeholders in prompt than image paths provided"); } + const llm::Image &img = getOrLoadImage(imagePaths[imageIdx++]); + inputs.push_back(llm::make_image_input(img)); searchPos = found + kImageTokenLen; } @@ -146,7 +149,7 @@ std::string LLM::generate(std::string prompt, } std::string output; - auto nativeCallback = [this, &callback, &output](const std::string &token) { + auto nativeCallback = [this, callback, &output](const std::string &token) { output += token; if (callback && callInvoker) { callInvoker->invokeAsync([callback, token](jsi::Runtime &runtime) { @@ -155,7 +158,6 @@ std::string LLM::generate(std::string prompt, } }; - runner_->reset(); auto error = runner_->generate(inputs, temperature_, topp_, -1, nativeCallback); if (error != Error::Ok) { diff --git a/packages/react-native-executorch/common/runner/unified_runner.cpp b/packages/react-native-executorch/common/runner/unified_runner.cpp index 12b431e8b..6f4238ca6 100644 --- a/packages/react-native-executorch/common/runner/unified_runner.cpp +++ b/packages/react-native-executorch/common/runner/unified_runner.cpp @@ -74,9 +74,10 @@ Error UnifiedRunner::load() { if (config_.max_seq_len < 0) config_.max_seq_len = static_cast(metadata_.at(kMaxSeqLen)); if (config_.max_context_length < 0) { - auto ctx = metadata_.at(kMaxContextLen); config_.max_context_length = - static_cast(ctx > 128 ? ctx : metadata_.at(kMaxSeqLen)); + method_names.count(kMaxContextLen) + ? 
static_cast(metadata_.at(kMaxContextLen)) + : static_cast(metadata_.at(kMaxSeqLen)); } if (config_.max_new_tokens < 0) config_.max_new_tokens = diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts index 7e9a37c86..180be9677 100644 --- a/packages/react-native-executorch/src/controllers/LLMController.ts +++ b/packages/react-native-executorch/src/controllers/LLMController.ts @@ -368,7 +368,7 @@ export class LLMController { const renderedPrompt = this.applyChatTemplate( messageHistoryWithPrompt, this.tokenizerConfig, - undefined, + this.toolsConfig?.tools, // eslint-disable-next-line camelcase { tools_in_user_message: false, add_generation_prompt: true } ); diff --git a/packages/react-native-executorch/src/types/llm.ts b/packages/react-native-executorch/src/types/llm.ts index df0b3a06d..c660e3a7d 100644 --- a/packages/react-native-executorch/src/types/llm.ts +++ b/packages/react-native-executorch/src/types/llm.ts @@ -19,7 +19,7 @@ export interface LLMProps { /** * `ResourceSource` pointing to the JSON file which contains the tokenizer config. */ - tokenizerConfigSource?: ResourceSource; + tokenizerConfigSource: ResourceSource; }; /** * Boolean that can prevent automatic model loading (and downloading the data if you load it for the first time) after running the hook. 
From dfd1a811d38f9dc2acaddbe89b1a4893a668ced9 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Mon, 2 Mar 2026 17:42:04 +0100 Subject: [PATCH 26/46] feat: dynamic sendMessage type based on flag --- apps/llm/app/multimodal_llm/index.tsx | 22 ++++---- .../natural_language_processing/useLLM.ts | 14 ++++- .../react-native-executorch/src/types/llm.ts | 55 ++++++++++++++----- 3 files changed, 63 insertions(+), 28 deletions(-) diff --git a/apps/llm/app/multimodal_llm/index.tsx b/apps/llm/app/multimodal_llm/index.tsx index cf9a3f4e6..3178b602b 100644 --- a/apps/llm/app/multimodal_llm/index.tsx +++ b/apps/llm/app/multimodal_llm/index.tsx @@ -21,13 +21,6 @@ import Messages from '../../components/Messages'; import Spinner from '../../components/Spinner'; import { GeneratingContext } from '../../context'; -const MODEL_SOURCE = - 'https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/lfm2p5_vl_1.6B_quantized_xnnpack.pte'; -const TOKENIZER_SOURCE = - 'https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_2.5.json'; -const TOKENIZER_CONFIG_SOURCE = - 'https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_config_2_5.json'; - export default function MultimodalLLMScreenWrapper() { const isFocused = useIsFocused(); return isFocused ? 
: null; @@ -42,9 +35,13 @@ function MultimodalLLMScreen() { const vlm = useLLM({ model: { - modelSource: MODEL_SOURCE, - tokenizerSource: TOKENIZER_SOURCE, - tokenizerConfigSource: TOKENIZER_CONFIG_SOURCE, + isMultimodal: true, + modelSource: + 'https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/lfm2p5_vl_1.6B_quantized_xnnpack.pte', + tokenizerSource: + 'https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_2.5.json', + tokenizerConfigSource: + 'https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_config_2_5.json', }, }); @@ -70,9 +67,10 @@ function MultimodalLLMScreen() { setUserInput(''); textInputRef.current?.clear(); Keyboard.dismiss(); + const currentImageUri = imageUri; + setImageUri(null); try { - await vlm.sendMessage(text, imageUri ?? undefined); - setImageUri(null); + await vlm.sendMessage(text, currentImageUri ?? undefined); } catch (e) { console.error('Generation error:', e); } diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts index 99210e357..9846bbff7 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts @@ -4,6 +4,7 @@ import { LLMProps, LLMTool, LLMType, + LLMTypeMultimodal, Message, } from '../../types/llm'; import { LLMController } from '../../controllers/LLMController'; @@ -14,9 +15,16 @@ import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils'; * * @category Hooks * @param model - Object containing model, tokenizer, and tokenizer config sources. - * @returns An object implementing the `LLMType` interface for interacting with the LLM. + * @returns An object implementing the `LLMTypeMultimodal` interface when `model.isMultimodal` is `true`, otherwise `LLMType`. 
*/ -export const useLLM = ({ model, preventLoad = false }: LLMProps): LLMType => { +export function useLLM( + props: LLMProps & { model: { isMultimodal: true } } +): LLMTypeMultimodal; +export function useLLM(props: LLMProps): LLMType; +export function useLLM({ + model, + preventLoad = false, +}: LLMProps): LLMType | LLMTypeMultimodal { const [token, setToken] = useState(''); const [response, setResponse] = useState(''); const [messageHistory, setMessageHistory] = useState([]); @@ -141,4 +149,4 @@ export const useLLM = ({ model, preventLoad = false }: LLMProps): LLMType => { deleteMessage: deleteMessage, interrupt: interrupt, }; -}; +} diff --git a/packages/react-native-executorch/src/types/llm.ts b/packages/react-native-executorch/src/types/llm.ts index c660e3a7d..1429b7c84 100644 --- a/packages/react-native-executorch/src/types/llm.ts +++ b/packages/react-native-executorch/src/types/llm.ts @@ -20,6 +20,10 @@ export interface LLMProps { * `ResourceSource` pointing to the JSON file which contains the tokenizer config. */ tokenizerConfigSource: ResourceSource; + /** + * Set to `true` for vision-language models that accept image inputs via `sendMessage`. + */ + isMultimodal?: boolean; }; /** * Boolean that can prevent automatic model loading (and downloading the data if you load it for the first time) after running the hook. @@ -28,11 +32,11 @@ export interface LLMProps { } /** - * React hook for managing a Large Language Model (LLM) instance. + * Base return type for `useLLM`. Contains all fields except `sendMessage`. * * @category Types */ -export interface LLMType { +export interface LLMTypeBase { /** * History containing all messages in conversation. This field is updated after model responds to sendMessage. */ @@ -91,7 +95,7 @@ export interface LLMType { */ generate: (messages: Message[], tools?: LLMTool[]) => Promise; /** - * Returns the number of total tokens from the previous generation.This is a sum of prompt tokens and generated tokens. 
+ * Returns the number of total tokens from the previous generation. This is a sum of prompt tokens and generated tokens. * * @returns The count of prompt and generated tokens. */ @@ -103,28 +107,53 @@ export interface LLMType { */ getPromptTokenCount: () => number; + /** + * Deletes all messages starting with message on `index` position. After deletion `messageHistory` will be updated. + * + * @param index - The index of the message to delete from history. + */ + deleteMessage: (index: number) => void; + + /** + * Function to interrupt the current inference. + */ + interrupt: () => void; +} + +/** + * Return type for `useLLM` when `model.isMultimodal` is `true`. + * `sendMessage` accepts an optional `mediaPath` argument for image inputs. + * + * @category Types + */ +export interface LLMTypeMultimodal extends LLMTypeBase { /** * Function to add user message to conversation. - * Pass `mediaPath` for a multimodal message (image, audio, etc.). + * Pass `mediaPath` with a local image path to send a multimodal message. * After model responds, `messageHistory` will be updated. * * @param message - The message string to send. - * @param mediaPath - Optional local file path to media. + * @param mediaPath - Optional local file path to an image. * @returns The model's response as a `string`. */ sendMessage: (message: string, mediaPath?: string) => Promise; +} +/** + * Return type for `useLLM` when `model.isMultimodal` is absent or `false`. + * `sendMessage` accepts only text. + * + * @category Types + */ +export interface LLMType extends LLMTypeBase { /** - * Deletes all messages starting with message on `index` position. After deletion `messageHistory` will be updated. + * Function to add user message to conversation. + * After model responds, `messageHistory` will be updated. * - * @param index - The index of the message to delete from history. - */ - deleteMessage: (index: number) => void; - - /** - * Function to interrupt the current inference. 
+ * @param message - The message string to send. + * @returns The model's response as a `string`. */ - interrupt: () => void; + sendMessage: (message: string) => Promise; } /** From 3d67b669c610a57d6db6107a7f34515a7bee4bef Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Tue, 3 Mar 2026 10:32:00 +0100 Subject: [PATCH 27/46] fix: model stopping generation in the middle of its answer --- .../common/runner/unified_runner.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/packages/react-native-executorch/common/runner/unified_runner.cpp b/packages/react-native-executorch/common/runner/unified_runner.cpp index 6f4238ca6..5dcdf127c 100644 --- a/packages/react-native-executorch/common/runner/unified_runner.cpp +++ b/packages/react-native-executorch/common/runner/unified_runner.cpp @@ -263,7 +263,6 @@ Error UnifiedRunner::generate( stats_.inference_start_ms = llm::time_in_ms(); - int64_t pos_before_prefill = pos_; uint64_t prefill_next_token = 0; for (size_t i = 0; i < inputs.size(); ++i) { auto prefill_result = mm_prefiller_->prefill(inputs[i], pos_); @@ -276,14 +275,10 @@ Error UnifiedRunner::generate( stats_.prompt_eval_end_ms = llm::time_in_ms(); stats_.num_prompt_tokens = pos_; - int64_t context_len = metadata_.count(kMaxContextLen) - ? metadata_.at(kMaxContextLen) - : metadata_.count(kMaxSeqLen) ? metadata_.at(kMaxSeqLen) - : 2048; int32_t resolved_max_new = max_new_tokens > 0 ? 
max_new_tokens - : static_cast(context_len - pos_before_prefill); + : static_cast(config_.max_context_length - pos_); resolved_max_new = std::max(0, resolved_max_new); std::vector seed_tokens = {prefill_next_token}; From 2b26c5dd152f6b9ee02bc51097e5baea7db24e8d Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Tue, 3 Mar 2026 11:44:12 +0100 Subject: [PATCH 28/46] feat: add LLMCapability type and parameterize LLMTypeMultimodal --- apps/llm/app/multimodal_llm/index.tsx | 7 ++-- .../natural_language_processing/useLLM.ts | 9 ++--- .../react-native-executorch/src/types/llm.ts | 35 ++++++++++++++----- 3 files changed, 36 insertions(+), 15 deletions(-) diff --git a/apps/llm/app/multimodal_llm/index.tsx b/apps/llm/app/multimodal_llm/index.tsx index 3178b602b..542af4740 100644 --- a/apps/llm/app/multimodal_llm/index.tsx +++ b/apps/llm/app/multimodal_llm/index.tsx @@ -35,7 +35,7 @@ function MultimodalLLMScreen() { const vlm = useLLM({ model: { - isMultimodal: true, + capabilities: ['vision'] as const, modelSource: 'https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/lfm2p5_vl_1.6B_quantized_xnnpack.pte', tokenizerSource: @@ -70,7 +70,10 @@ function MultimodalLLMScreen() { const currentImageUri = imageUri; setImageUri(null); try { - await vlm.sendMessage(text, currentImageUri ?? undefined); + await vlm.sendMessage( + text, + currentImageUri ? 
{ imagePath: currentImageUri } : undefined + ); } catch (e) { console.error('Generation error:', e); } diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts index 9846bbff7..570ae640d 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts @@ -1,5 +1,6 @@ import { useCallback, useEffect, useState } from 'react'; import { + LLMCapability, LLMConfig, LLMProps, LLMTool, @@ -15,11 +16,11 @@ import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils'; * * @category Hooks * @param model - Object containing model, tokenizer, and tokenizer config sources. - * @returns An object implementing the `LLMTypeMultimodal` interface when `model.isMultimodal` is `true`, otherwise `LLMType`. + * @returns An object implementing the `LLMTypeMultimodal` interface when `model.capabilities` is provided, otherwise `LLMType`. */ -export function useLLM( - props: LLMProps & { model: { isMultimodal: true } } -): LLMTypeMultimodal; +export function useLLM( + props: LLMProps & { model: { capabilities: C } } +): LLMTypeMultimodal; export function useLLM(props: LLMProps): LLMType; export function useLLM({ model, diff --git a/packages/react-native-executorch/src/types/llm.ts b/packages/react-native-executorch/src/types/llm.ts index 1429b7c84..aea9817bb 100644 --- a/packages/react-native-executorch/src/types/llm.ts +++ b/packages/react-native-executorch/src/types/llm.ts @@ -1,6 +1,20 @@ import { RnExecutorchError } from '../errors/errorUtils'; import { ResourceSource } from './common'; +/** + * Capabilities a multimodal LLM can have. + * @category Types + */ +export type LLMCapability = 'vision' | 'audio'; + +/** + * Derives the media argument shape for `sendMessage` from a capabilities tuple. 
+ * @category Types + */ +export type MediaArg = + ('vision' extends C[number] ? { imagePath?: string } : object) & + ('audio' extends C[number] ? { audioPath?: string } : object); + /** * Properties for initializing and configuring a Large Language Model (LLM) instance. * @@ -21,9 +35,11 @@ export interface LLMProps { */ tokenizerConfigSource: ResourceSource; /** - * Set to `true` for vision-language models that accept image inputs via `sendMessage`. + * Optional list of modality capabilities the model supports. + * Determines the type of the `media` argument in `sendMessage`. + * Example: `['vision']` enables `sendMessage(text, { imagePath })`. */ - isMultimodal?: boolean; + capabilities?: readonly LLMCapability[]; }; /** * Boolean that can prevent automatic model loading (and downloading the data if you load it for the first time) after running the hook. @@ -121,22 +137,23 @@ export interface LLMTypeBase { } /** - * Return type for `useLLM` when `model.isMultimodal` is `true`. - * `sendMessage` accepts an optional `mediaPath` argument for image inputs. - * + * Return type for `useLLM` when `model.capabilities` is provided. + * `sendMessage` accepts a typed `media` object based on declared capabilities. * @category Types */ -export interface LLMTypeMultimodal extends LLMTypeBase { +export interface LLMTypeMultimodal< + C extends readonly LLMCapability[] = readonly LLMCapability[], +> extends LLMTypeBase { /** * Function to add user message to conversation. - * Pass `mediaPath` with a local image path to send a multimodal message. + * Pass a `media` object whose shape is determined by the declared capabilities. * After model responds, `messageHistory` will be updated. * * @param message - The message string to send. - * @param mediaPath - Optional local file path to an image. + * @param media - Optional media object (e.g. `{ imagePath }` for vision, `{ audioPath }` for audio). * @returns The model's response as a `string`. 
*/ - sendMessage: (message: string, mediaPath?: string) => Promise; + sendMessage: (message: string, media?: MediaArg) => Promise; } /** From 8d1b4ebccaabf16e3a3d8b996c187d369d8dd9c7 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Tue, 3 Mar 2026 11:45:49 +0100 Subject: [PATCH 29/46] feat: update sendMessage to accept typed media object --- .../react-native-executorch/src/controllers/LLMController.ts | 3 ++- .../src/hooks/natural_language_processing/useLLM.ts | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts index 180be9677..e09646d08 100644 --- a/packages/react-native-executorch/src/controllers/LLMController.ts +++ b/packages/react-native-executorch/src/controllers/LLMController.ts @@ -312,8 +312,9 @@ export class LLMController { public async sendMessage( message: string, - mediaPath?: string + media?: { imagePath?: string; audioPath?: string } ): Promise { + const mediaPath = media?.imagePath ?? 
media?.audioPath; const newMessage: Message = { content: message, role: 'user', diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts index 570ae640d..f83d39352 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts @@ -101,9 +101,9 @@ export function useLLM({ ); const sendMessage = useCallback( - (message: string, mediaPath?: string) => { + (message: string, media?: { imagePath?: string; audioPath?: string }) => { setResponse(''); - return controllerInstance.sendMessage(message, mediaPath); + return controllerInstance.sendMessage(message, media); }, [controllerInstance] ); From f3edf5d0aafd04c0c7e4122c4fa39ba68971a6a6 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Tue, 3 Mar 2026 11:46:16 +0100 Subject: [PATCH 30/46] feat: add LFM2_VL_1_6B and LFM2_VL_1_6B_QUANTIZED model constants --- .../src/constants/modelUrls.ts | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts index 499abf63a..325a13133 100644 --- a/packages/react-native-executorch/src/constants/modelUrls.ts +++ b/packages/react-native-executorch/src/constants/modelUrls.ts @@ -371,6 +371,32 @@ export const LFM2_5_1_2B_INSTRUCT_QUANTIZED = { tokenizerConfigSource: LFM2_5_1_2B_TOKENIZER_CONFIG, }; +// LFM2.5-VL-1.6B (Vision-Language) +const LFM2_VL_1_6B_MODEL = `https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/lfm2p5_vl_1.6B_xnnpack.pte`; +const LFM2_VL_1_6B_QUANTIZED_MODEL = `https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/lfm2p5_vl_1.6B_quantized_xnnpack.pte`; +const LFM2_VL_TOKENIZER = `https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_2.5.json`; +const 
LFM2_VL_TOKENIZER_CONFIG = `https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_config_2_5.json`; + +/** + * @category Models - VLM + */ +export const LFM2_VL_1_6B = { + capabilities: ['vision'] as const, + modelSource: LFM2_VL_1_6B_MODEL, + tokenizerSource: LFM2_VL_TOKENIZER, + tokenizerConfigSource: LFM2_VL_TOKENIZER_CONFIG, +}; + +/** + * @category Models - VLM + */ +export const LFM2_VL_1_6B_QUANTIZED = { + capabilities: ['vision'] as const, + modelSource: LFM2_VL_1_6B_QUANTIZED_MODEL, + tokenizerSource: LFM2_VL_TOKENIZER, + tokenizerConfigSource: LFM2_VL_TOKENIZER_CONFIG, +}; + // Classification const EFFICIENTNET_V2_S_MODEL = Platform.OS === `ios` From 6eba3f78ea4fce5ff86971c8248db89fbbd4a034 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Tue, 3 Mar 2026 12:55:44 +0100 Subject: [PATCH 31/46] feat: add IEncoder interface and VisionEncoder Co-Authored-By: Claude Sonnet 4.6 --- .../common/rnexecutorch/tests/CMakeLists.txt | 3 +- .../tests/integration/LLMTest.cpp | 15 ++++++ .../common/runner/encoders/iencoder.h | 21 ++++++++ .../common/runner/encoders/vision_encoder.cpp | 49 +++++++++++++++++++ .../common/runner/encoders/vision_encoder.h | 22 +++++++++ 5 files changed, 109 insertions(+), 1 deletion(-) create mode 100644 packages/react-native-executorch/common/runner/encoders/iencoder.h create mode 100644 packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp create mode 100644 packages/react-native-executorch/common/runner/encoders/vision_encoder.h diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt index e2a8c16bf..874b96732 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt +++ b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt @@ -210,11 +210,12 @@ add_rn_test(SpeechToTextTests integration/SpeechToTextTest.cpp add_rn_test(LLMTests 
integration/LLMTest.cpp SOURCES ${RNEXECUTORCH_DIR}/models/llm/LLM.cpp - ${COMMON_DIR}/runner/runner.cpp + ${COMMON_DIR}/runner/runner.cpp # keep until Task 5 ${COMMON_DIR}/runner/text_prefiller.cpp ${COMMON_DIR}/runner/text_decoder_runner.cpp ${COMMON_DIR}/runner/sampler.cpp ${COMMON_DIR}/runner/arange_util.cpp + ${COMMON_DIR}/runner/encoders/vision_encoder.cpp # add this LIBS tokenizers_deps ) diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp index e79294090..f44fff810 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp @@ -6,6 +6,7 @@ #include #include #include +#include using namespace rnexecutorch; using namespace rnexecutorch::models::llm; @@ -153,3 +154,17 @@ TEST_F(LLMTest, EmptyPromptThrows) { LLM model(kValidModelPath, kValidTokenizerPath, mockInvoker_); EXPECT_THROW((void)model.generate("", nullptr), RnExecutorchError); } + +TEST(VisionEncoderTest, LoadFailsWithClearErrorWhenMethodMissing) { + // smolLm2_135M_8da4w.pte has no vision_encoder method + auto module = std::make_unique<::executorch::extension::Module>( + "smolLm2_135M_8da4w.pte", + ::executorch::extension::Module::LoadMode::File); + + auto encoder = + std::make_unique(module.get()); + + EXPECT_THROW( + { ET_CHECK_OK_OR_RETURN_ERROR(encoder->load()); }, + rnexecutorch::RnExecutorchError); +} diff --git a/packages/react-native-executorch/common/runner/encoders/iencoder.h b/packages/react-native-executorch/common/runner/encoders/iencoder.h new file mode 100644 index 000000000..3f46ef775 --- /dev/null +++ b/packages/react-native-executorch/common/runner/encoders/iencoder.h @@ -0,0 +1,21 @@ +// common/runner/encoders/iencoder.h +#pragma once + +#include +#include +#include +#include + +namespace executorch::extension::llm { + +class 
IEncoder { +public: + virtual ~IEncoder() = default; + virtual ::executorch::runtime::Error load() = 0; + virtual bool is_loaded() const = 0; + // Encodes one input segment, returns embeddings EValue + virtual ::executorch::runtime::Result<::executorch::runtime::EValue> + encode(const MultimodalInput &input) = 0; +}; + +} // namespace executorch::extension::llm diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp new file mode 100644 index 000000000..0b7a56bb7 --- /dev/null +++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp @@ -0,0 +1,49 @@ +// common/runner/encoders/vision_encoder.cpp +#include "vision_encoder.h" + +#include +#include +#include + +namespace executorch::extension::llm { + +using ::executorch::runtime::Error; +using ::executorch::runtime::EValue; +using ::executorch::runtime::Result; + +VisionEncoder::VisionEncoder(::executorch::extension::Module *module) + : module_(module) {} + +Error VisionEncoder::load() { + auto method_names_result = module_->method_names(); + if (!method_names_result.ok() || + method_names_result->count(kVisionEncoderMethod) == 0) { + throw rnexecutorch::RnExecutorchError( + rnexecutorch::RnExecutorchErrorCode::InvalidConfig, + "Model does not support vision: 'vision_encoder' method not found. 
" + "Check that the .pte file matches the declared capabilities."); + } + return module_->load_method(kVisionEncoderMethod); +} + +bool VisionEncoder::is_loaded() const { + return module_->is_method_loaded(kVisionEncoderMethod); +} + +Result VisionEncoder::encode(const MultimodalInput &input) { + if (!input.is_image()) { + return Error::InvalidArgument; + } + const Image &image = input.get_image(); + auto image_tensor_result = image.toTensor(/*with_batch=*/true); + if (!image_tensor_result.ok()) { + return image_tensor_result.error(); + } + auto result = module_->execute(kVisionEncoderMethod, *image_tensor_result); + if (!result.ok()) { + return result.error(); + } + return (*result)[0]; +} + +} // namespace executorch::extension::llm diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h new file mode 100644 index 000000000..688b0cf3a --- /dev/null +++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h @@ -0,0 +1,22 @@ +// common/runner/encoders/vision_encoder.h +#pragma once + +#include "iencoder.h" +#include + +namespace executorch::extension::llm { + +class VisionEncoder : public IEncoder { +public: + explicit VisionEncoder(::executorch::extension::Module *module); + + ::executorch::runtime::Error load() override; + bool is_loaded() const override; + ::executorch::runtime::Result<::executorch::runtime::EValue> + encode(const MultimodalInput &input) override; + +private: + ::executorch::extension::Module *module_; +}; + +} // namespace executorch::extension::llm From 0819c20b01f4200c5a62b0efde3d18e4fbe3ea87 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Tue, 3 Mar 2026 13:03:21 +0100 Subject: [PATCH 32/46] fix: address vision_encoder quality review issues --- .../tests/integration/LLMTest.cpp | 4 +-- .../common/runner/encoders/vision_encoder.cpp | 25 ++++++++++++++++--- .../common/runner/encoders/vision_encoder.h | 1 + 3 
files changed, 24 insertions(+), 6 deletions(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp index f44fff810..e3885eba4 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp @@ -164,7 +164,5 @@ TEST(VisionEncoderTest, LoadFailsWithClearErrorWhenMethodMissing) { auto encoder = std::make_unique(module.get()); - EXPECT_THROW( - { ET_CHECK_OK_OR_RETURN_ERROR(encoder->load()); }, - rnexecutorch::RnExecutorchError); + EXPECT_THROW(encoder->load(), rnexecutorch::RnExecutorchError); } diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp index 0b7a56bb7..35ce84c6d 100644 --- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp +++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp @@ -15,9 +15,14 @@ VisionEncoder::VisionEncoder(::executorch::extension::Module *module) : module_(module) {} Error VisionEncoder::load() { + if (is_loaded()) { + return Error::Ok; + } auto method_names_result = module_->method_names(); - if (!method_names_result.ok() || - method_names_result->count(kVisionEncoderMethod) == 0) { + if (!method_names_result.ok()) { + return method_names_result.error(); + } + if (method_names_result->count(kVisionEncoderMethod) == 0) { throw rnexecutorch::RnExecutorchError( rnexecutorch::RnExecutorchErrorCode::InvalidConfig, "Model does not support vision: 'vision_encoder' method not found. 
" @@ -31,11 +36,25 @@ bool VisionEncoder::is_loaded() const { } Result VisionEncoder::encode(const MultimodalInput &input) { + if (!is_loaded()) { + return Error::InvalidState; + } if (!input.is_image()) { return Error::InvalidArgument; } const Image &image = input.get_image(); - auto image_tensor_result = image.toTensor(/*with_batch=*/true); + auto method_meta_result = module_->method_meta(kVisionEncoderMethod); + if (!method_meta_result.ok()) { + return method_meta_result.error(); + } + auto &method_meta = *method_meta_result; + auto input_meta_result = method_meta.input_tensor_meta(0); + if (!input_meta_result.ok()) { + return input_meta_result.error(); + } + auto expected_dims = input_meta_result->sizes(); + auto image_tensor_result = + image.toTensor(/*with_batch=*/expected_dims.size() == 4); if (!image_tensor_result.ok()) { return image_tensor_result.error(); } diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h index 688b0cf3a..5b3dd0aec 100644 --- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h +++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h @@ -3,6 +3,7 @@ #include "iencoder.h" #include +#include namespace executorch::extension::llm { From 1de96bb70f531b0b03d8ee819cf8f31b8576e533 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Tue, 3 Mar 2026 13:08:26 +0100 Subject: [PATCH 33/46] feat: add BaseLLMRunner with shared state and load() Co-Authored-By: Claude Sonnet 4.6 --- .../common/rnexecutorch/tests/CMakeLists.txt | 1 + .../tests/integration/LLMTest.cpp | 40 +++++ .../common/runner/base_llm_runner.cpp | 163 ++++++++++++++++++ .../common/runner/base_llm_runner.h | 88 ++++++++++ 4 files changed, 292 insertions(+) create mode 100644 packages/react-native-executorch/common/runner/base_llm_runner.cpp create mode 100644 packages/react-native-executorch/common/runner/base_llm_runner.h diff 
--git a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt index 874b96732..aebecc1b6 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt +++ b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt @@ -211,6 +211,7 @@ add_rn_test(LLMTests integration/LLMTest.cpp SOURCES ${RNEXECUTORCH_DIR}/models/llm/LLM.cpp ${COMMON_DIR}/runner/runner.cpp # keep until Task 5 + ${COMMON_DIR}/runner/base_llm_runner.cpp ${COMMON_DIR}/runner/text_prefiller.cpp ${COMMON_DIR}/runner/text_decoder_runner.cpp ${COMMON_DIR}/runner/sampler.cpp diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp index e3885eba4..7f35519af 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp @@ -166,3 +166,43 @@ TEST(VisionEncoderTest, LoadFailsWithClearErrorWhenMethodMissing) { EXPECT_THROW(encoder->load(), rnexecutorch::RnExecutorchError); } + +#include + +// Minimal concrete subclass — only used in tests to verify base class behavior +class StubRunner : public example::BaseLLMRunner { +public: + using BaseLLMRunner::BaseLLMRunner; + bool is_loaded() const override { return loaded_; } + ::executorch::runtime::Error load_subcomponents() override { + loaded_ = true; + return ::executorch::runtime::Error::Ok; + } + ::executorch::runtime::Error generate_internal( + const std::vector<::executorch::extension::llm::MultimodalInput> &, + std::function) override { + return ::executorch::runtime::Error::Ok; + } + void stop_impl() override {} + void set_temperature_impl(float t) override { last_temp_ = t; } + void set_topp_impl(float) override {} + void set_count_interval_impl(size_t) override {} + void 
set_time_interval_impl(size_t) override {} + + bool loaded_ = false; + float last_temp_ = -1.f; +}; + +TEST(BaseLLMRunnerTest, SetTemperatureWritesConfigAndCallsImpl) { + StubRunner runner(nullptr, nullptr, "dummy_tokenizer.json"); + runner.set_temperature(0.5f); + EXPECT_FLOAT_EQ(runner.config_.temperature, 0.5f); + EXPECT_FLOAT_EQ(runner.last_temp_, 0.5f); +} + +TEST(BaseLLMRunnerTest, ResetZerosPos) { + StubRunner runner(nullptr, nullptr, "dummy_tokenizer.json"); + runner.pos_ = 42; + runner.reset(); + EXPECT_EQ(runner.pos_, 0); +} diff --git a/packages/react-native-executorch/common/runner/base_llm_runner.cpp b/packages/react-native-executorch/common/runner/base_llm_runner.cpp new file mode 100644 index 000000000..a987528a0 --- /dev/null +++ b/packages/react-native-executorch/common/runner/base_llm_runner.cpp @@ -0,0 +1,163 @@ +// common/runner/base_llm_runner.cpp +#include "base_llm_runner.h" +#include "constants.h" +#include "util.h" +#include +#include + +namespace example { + +using namespace executorch::extension::llm; +using ::executorch::extension::Module; +using ::executorch::runtime::Error; + +BaseLLMRunner::BaseLLMRunner(Module *module, + std::unique_ptr owned_module, + const std::string &tokenizer_path, + const llm::GenerationConfig &config) + : config_(config), module_(owned_module ? 
owned_module.get() : module), + owned_module_(std::move(owned_module)), tokenizer_path_(tokenizer_path), + tokenizer_(std::make_unique()), + metadata_({ + {kEnableDynamicShape, false}, + {kMaxSeqLen, 128}, + {kMaxContextLen, 128}, + {kUseKVCache, true}, + {kUseSDPAWithKVCache, false}, + }) {} + +Error BaseLLMRunner::load() { + if (is_loaded()) + return Error::Ok; + + auto status = tokenizer_->load(tokenizer_path_); + if (status != tokenizers::Error::Ok) { + throw rnexecutorch::RnExecutorchError( + rnexecutorch::RnExecutorchErrorCode::TokenizerError, + "Unexpected issue occurred while loading tokenizer"); + } + + const auto method_names = + ET_UNWRAP(module_->method_names(), "Failed reading method names"); + + metadata_[kVocabSize] = tokenizer_->vocab_size(); + for (auto &pair : metadata_) { + const auto &method_name = pair.first; + auto &value = pair.second; + if (method_names.count(method_name)) { + value = ET_UNWRAP(module_->get(method_name)) + .toScalar() + .to(); + } + ET_LOG(Info, "Metadata: %s = %" PRId64, method_name.c_str(), value); + } + + if (config_.max_seq_len < 0) + config_.max_seq_len = static_cast(metadata_.at(kMaxSeqLen)); + if (config_.max_context_length < 0) { + config_.max_context_length = + method_names.count(kMaxContextLen) + ? 
static_cast(metadata_.at(kMaxContextLen)) + : static_cast(metadata_.at(kMaxSeqLen)); + } + if (config_.max_new_tokens < 0) + config_.max_new_tokens = + std::min(config_.max_seq_len, config_.max_context_length); + if (config_.enable_dynamic_shape) + config_.enable_dynamic_shape = + static_cast(metadata_.at(kEnableDynamicShape)); + if (config_.enable_kv_cache) + config_.enable_kv_cache = static_cast(metadata_.at(kUseKVCache)); + + auto eos_ids = std::make_unique>(); + if (method_names.count(kEosIds)) { + for (const auto &eos_id : ET_UNWRAP(module_->execute(kEosIds))) { + eos_ids->emplace(static_cast(eos_id.toScalar().to())); + } + } + if (eos_ids->empty()) { + eos_ids->emplace(7); // fallback <|im_end|> + } + + io_manager_ = std::make_unique(*module_); + + return load_subcomponents(); +} + +Error BaseLLMRunner::generate( + const std::string &prompt, const llm::GenerationConfig &generation_config, + std::function token_callback, + std::function stats_callback) { + + ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null"); + + std::vector inputs = {llm::make_text_input(prompt)}; + auto err = generate_internal(inputs, token_callback); + + if (stats_callback) + stats_callback(stats_); + + return err; +} + +void BaseLLMRunner::stop() { stop_impl(); } + +void BaseLLMRunner::reset() { + stats_.reset(); + pos_ = 0; +} + +int32_t BaseLLMRunner::count_text_tokens(const std::string &text) const { + auto encodeResult = + tokenizer_->encode(text, numOfAddedBoSTokens, numOfAddedEoSTokens); + if (!encodeResult.ok()) { + throw rnexecutorch::RnExecutorchError( + rnexecutorch::RnExecutorchErrorCode::TokenizerError, + "Encoding failed during token count check."); + } + return static_cast(encodeResult.get().size()); +} + +int32_t BaseLLMRunner::get_max_context_length() const { + if (!is_loaded()) + return static_cast(metadata_.at(kMaxContextLen)); + return config_.max_context_length; +} + +void BaseLLMRunner::set_temperature(float temperature) noexcept { + config_.temperature = 
temperature; + set_temperature_impl(temperature); +} + +void BaseLLMRunner::set_topp(float topp) noexcept { + config_.topp = topp; + set_topp_impl(topp); +} + +void BaseLLMRunner::set_count_interval(size_t count_interval) { + set_count_interval_impl(count_interval); +} + +void BaseLLMRunner::set_time_interval(size_t time_interval) { + set_time_interval_impl(time_interval); +} + +int32_t BaseLLMRunner::resolve_max_new_tokens(int32_t num_prompt_tokens, + int32_t max_seq_len, + int32_t max_context_len, + int32_t max_new_tokens) const { + int32_t result; + if (max_seq_len == -1 && max_new_tokens == -1) + result = max_context_len - num_prompt_tokens; + else if (max_seq_len == -1) + result = std::min(max_new_tokens, max_context_len - num_prompt_tokens); + else if (max_new_tokens == -1) + result = std::min(max_seq_len, max_context_len) - num_prompt_tokens; + else + result = + std::min(std::min(max_seq_len, max_context_len) - num_prompt_tokens, + max_new_tokens); + return std::max(0, result); +} + +} // namespace example diff --git a/packages/react-native-executorch/common/runner/base_llm_runner.h b/packages/react-native-executorch/common/runner/base_llm_runner.h new file mode 100644 index 000000000..7d2eef285 --- /dev/null +++ b/packages/react-native-executorch/common/runner/base_llm_runner.h @@ -0,0 +1,88 @@ +// common/runner/base_llm_runner.h +#pragma once + +#include "io_manager.h" +#include "irunner.h" +#include "multimodal_input.h" +#include "stats.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace example { + +namespace llm = ::executorch::extension::llm; + +class BaseLLMRunner { +public: + explicit BaseLLMRunner( + ::executorch::extension::Module *module, + std::unique_ptr<::executorch::extension::Module> owned_module, + const std::string &tokenizer_path, + const llm::GenerationConfig &config = {.temperature = 0.8F, + .topp = 0.9F}); + + virtual ~BaseLLMRunner() = default; + + virtual bool is_loaded() const = 
0; + + // Loads tokenizer + metadata + eos, then calls load_subcomponents() + virtual ::executorch::runtime::Error load(); + + // Text convenience — wraps string in make_text_input, calls generate_internal + ::executorch::runtime::Error + generate(const std::string &prompt, + const llm::GenerationConfig &generation_config = {}, + std::function token_callback = {}, + std::function stats_callback = {}); + + // Multimodal entry point — subclasses implement this + virtual ::executorch::runtime::Error generate_internal( + const std::vector &inputs, + std::function token_callback) = 0; + + void stop(); + void reset(); + int32_t count_text_tokens(const std::string &text) const; + int32_t get_max_context_length() const; + + // Writes config_ then propagates to subclass impl + void set_temperature(float temperature) noexcept; + void set_topp(float topp) noexcept; + void set_count_interval(size_t count_interval); + void set_time_interval(size_t time_interval); + + llm::Stats stats_; + + // Public for test access + llm::GenerationConfig config_; + int64_t pos_{0}; + +protected: + virtual ::executorch::runtime::Error load_subcomponents() = 0; + virtual void stop_impl() = 0; + virtual void set_temperature_impl(float temperature) = 0; + virtual void set_topp_impl(float topp) = 0; + virtual void set_count_interval_impl(size_t count_interval) = 0; + virtual void set_time_interval_impl(size_t time_interval) = 0; + + int32_t resolve_max_new_tokens(int32_t num_prompt_tokens, int32_t max_seq_len, + int32_t max_context_len, + int32_t max_new_tokens = -1) const; + + ::executorch::extension::Module *module_; + std::unique_ptr<::executorch::extension::Module> owned_module_; + std::string tokenizer_path_; + std::unique_ptr tokenizer_; + std::unordered_map metadata_; + std::unique_ptr io_manager_; + bool shouldStop_{false}; +}; + +} // namespace example From e08b391611e477d720bcab046e6d917063a48320 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Tue, 3 Mar 2026 13:11:20 +0100 
Subject: [PATCH 34/46] feat: add TextRunner Co-Authored-By: Claude Sonnet 4.6 --- .../common/rnexecutorch/tests/CMakeLists.txt | 1 + .../tests/integration/LLMTest.cpp | 24 +++ .../common/runner/text_runner.cpp | 168 ++++++++++++++++++ .../common/runner/text_runner.h | 42 +++++ 4 files changed, 235 insertions(+) create mode 100644 packages/react-native-executorch/common/runner/text_runner.cpp create mode 100644 packages/react-native-executorch/common/runner/text_runner.h diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt index aebecc1b6..66e65cd6d 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt +++ b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt @@ -212,6 +212,7 @@ add_rn_test(LLMTests integration/LLMTest.cpp ${RNEXECUTORCH_DIR}/models/llm/LLM.cpp ${COMMON_DIR}/runner/runner.cpp # keep until Task 5 ${COMMON_DIR}/runner/base_llm_runner.cpp + ${COMMON_DIR}/runner/text_runner.cpp ${COMMON_DIR}/runner/text_prefiller.cpp ${COMMON_DIR}/runner/text_decoder_runner.cpp ${COMMON_DIR}/runner/sampler.cpp diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp index 7f35519af..dd253b487 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp @@ -206,3 +206,27 @@ TEST(BaseLLMRunnerTest, ResetZerosPos) { runner.reset(); EXPECT_EQ(runner.pos_, 0); } + +#include + +TEST(TextRunnerTest, LoadsSuccessfully) { + auto module = std::make_unique<::executorch::extension::Module>( + "smolLm2_135M_8da4w.pte", + ::executorch::extension::Module::LoadMode::File); + + example::TextRunner runner(module.get(), nullptr, "smollm_tokenizer.json"); + auto err = runner.load(); + 
EXPECT_EQ(err, ::executorch::runtime::Error::Ok); + EXPECT_TRUE(runner.is_loaded()); +} + +TEST(TextRunnerTest, SetTemperaturePropagatesToDecoder) { + auto module = std::make_unique<::executorch::extension::Module>( + "smolLm2_135M_8da4w.pte", + ::executorch::extension::Module::LoadMode::File); + + example::TextRunner runner(module.get(), nullptr, "smollm_tokenizer.json"); + runner.load(); + EXPECT_NO_THROW(runner.set_temperature(0.5f)); + EXPECT_FLOAT_EQ(runner.config_.temperature, 0.5f); +} diff --git a/packages/react-native-executorch/common/runner/text_runner.cpp b/packages/react-native-executorch/common/runner/text_runner.cpp new file mode 100644 index 000000000..279244855 --- /dev/null +++ b/packages/react-native-executorch/common/runner/text_runner.cpp @@ -0,0 +1,168 @@ +// common/runner/text_runner.cpp +#include "text_runner.h" +#include "constants.h" +#include "util.h" +#include +#include + +namespace example { + +using namespace executorch::extension::llm; +using ::executorch::extension::Module; +using ::executorch::runtime::Error; + +TextRunner::TextRunner(Module *module, std::unique_ptr owned_module, + const std::string &tokenizer_path, + const llm::GenerationConfig &config) + : BaseLLMRunner(module, std::move(owned_module), tokenizer_path, config) {} + +bool TextRunner::is_loaded() const { + return module_ && module_->is_loaded() && tokenizer_ && + tokenizer_->is_loaded() && text_decoder_runner_ && text_prefiller_ && + text_token_generator_; +} + +Error TextRunner::load_subcomponents() { + ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("forward")); + + // Re-detect eos_ids from the module (base class built them but doesn't pass + // them down yet — reconstruct with the same fallback logic). 
+ auto eos_ids = std::make_unique>(); + const auto method_names = + ET_UNWRAP(module_->method_names(), "Failed reading method names"); + if (method_names.count(kEosIds)) { + for (const auto &eos_id : ET_UNWRAP(module_->execute(kEosIds))) { + eos_ids->emplace(static_cast(eos_id.toScalar().to())); + } + } + if (eos_ids->empty()) { + eos_ids->emplace(7); // fallback <|im_end|> + } + + llm::Stats *stats_ptr = &stats_; + + text_decoder_runner_ = std::make_unique( + module_, io_manager_.get(), config_.temperature, config_.topp); + text_prefiller_ = std::make_unique( + text_decoder_runner_.get(), config_.enable_kv_cache, + config_.enable_dynamic_shape, config_.max_seq_len); + text_token_generator_ = std::make_unique( + tokenizer_.get(), text_decoder_runner_.get(), config_.enable_kv_cache, + std::move(eos_ids), stats_ptr); + + return Error::Ok; +} + +Error TextRunner::generate_internal( + const std::vector &inputs, + std::function token_callback) { + + if (inputs.empty()) { + ET_LOG(Error, "MultimodalInput vector cannot be empty"); + return Error::InvalidArgument; + } + + const std::string &prompt = inputs[0].get_text(); + ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null"); + + if (!is_loaded()) { + stats_.model_load_start_ms = llm::time_in_ms(); + ET_CHECK_OK_OR_RETURN_ERROR(load()); + stats_.model_load_end_ms = llm::time_in_ms(); + } + + std::function wrapped_callback = + [token_callback](const std::string &piece) { + llm::safe_printf(piece.c_str()); + fflush(stdout); + if (token_callback) + token_callback(piece); + }; + + stats_.inference_start_ms = llm::time_in_ms(); + shouldStop_ = false; + + int64_t context_len_left = + static_cast(config_.max_context_length) - pos_; + + auto encodeResult = + tokenizer_->encode(prompt, numOfAddedBoSTokens, numOfAddedEoSTokens); + if (!encodeResult.ok()) { + throw rnexecutorch::RnExecutorchError( + rnexecutorch::RnExecutorchErrorCode::TokenizerError, + "Unexpected issue occurred while encoding: " + + 
std::to_string(static_cast(encodeResult.error()))); + } + std::vector prompt_tokens = encodeResult.get(); + int num_prompt_tokens = prompt_tokens.size(); + + ET_CHECK_OR_RETURN_ERROR(num_prompt_tokens >= 1, InvalidArgument, + "Expected at least 1 prompt token"); + ET_CHECK_OR_RETURN_ERROR(num_prompt_tokens < config_.max_seq_len, + InvalidArgument, + "num_prompt_tokens %d >= max_seq_len %" PRId32, + num_prompt_tokens, config_.max_seq_len); + + int32_t max_new_tokens = resolve_max_new_tokens( + num_prompt_tokens, config_.max_seq_len, + static_cast(context_len_left), config_.max_new_tokens); + + ET_CHECK_OR_RETURN_ERROR(max_new_tokens > 0, InvalidArgument, + "Max new tokens %d is <= 0", max_new_tokens); + + if (config_.echo) + wrapped_callback(prompt); + + auto prefill_res = text_prefiller_->prefill(prompt_tokens, pos_); + stats_.first_token_ms = llm::time_in_ms(); + stats_.prompt_eval_end_ms = llm::time_in_ms(); + ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error()); + + uint64_t cur_token = prefill_res.get(); + auto decodeResult = tokenizer_->decode({cur_token}); + if (!decodeResult.ok()) { + throw rnexecutorch::RnExecutorchError( + rnexecutorch::RnExecutorchErrorCode::TokenizerError, + "Unexpected issue occurred while decoding: " + + std::to_string(static_cast(decodeResult.error()))); + } + + prompt_tokens.push_back(cur_token); + int64_t num_generated = ET_UNWRAP(text_token_generator_->generate( + prompt_tokens, pos_, max_new_tokens - 1, config_.temperature, + config_.topp, wrapped_callback)); + + pos_ += num_generated; + stats_.inference_end_ms = llm::time_in_ms(); + stats_.num_prompt_tokens = num_prompt_tokens; + stats_.num_generated_tokens = num_generated; + + return Error::Ok; +} + +void TextRunner::stop_impl() { + if (text_token_generator_) + text_token_generator_->stop(); +} + +void TextRunner::set_temperature_impl(float temperature) { + if (text_decoder_runner_) + text_decoder_runner_->set_temperature(temperature); +} + +void TextRunner::set_topp_impl(float 
topp) { + if (text_decoder_runner_) + text_decoder_runner_->set_topp(topp); +} + +void TextRunner::set_count_interval_impl(size_t count_interval) { + if (text_token_generator_) + text_token_generator_->set_count_interval(count_interval); +} + +void TextRunner::set_time_interval_impl(size_t time_interval) { + if (text_token_generator_) + text_token_generator_->set_time_interval(time_interval); +} + +} // namespace example diff --git a/packages/react-native-executorch/common/runner/text_runner.h b/packages/react-native-executorch/common/runner/text_runner.h new file mode 100644 index 000000000..e590f4c88 --- /dev/null +++ b/packages/react-native-executorch/common/runner/text_runner.h @@ -0,0 +1,42 @@ +// common/runner/text_runner.h +#pragma once + +#include "base_llm_runner.h" +#include "text_decoder_runner.h" +#include "text_prefiller.h" +#include "text_token_generator.h" + +namespace example { + +class TextRunner : public BaseLLMRunner { +public: + explicit TextRunner( + ::executorch::extension::Module *module, + std::unique_ptr<::executorch::extension::Module> owned_module, + const std::string &tokenizer_path, + const ::executorch::extension::llm::GenerationConfig &config = { + .temperature = 0.8F, .topp = 0.9F}); + + bool is_loaded() const override; + + ::executorch::runtime::Error generate_internal( + const std::vector<::executorch::extension::llm::MultimodalInput> &inputs, + std::function token_callback) override; + +protected: + ::executorch::runtime::Error load_subcomponents() override; + void stop_impl() override; + void set_temperature_impl(float temperature) override; + void set_topp_impl(float topp) override; + void set_count_interval_impl(size_t count_interval) override; + void set_time_interval_impl(size_t time_interval) override; + +private: + std::unique_ptr<::executorch::extension::llm::TextDecoderRunner> + text_decoder_runner_; + std::unique_ptr<::executorch::extension::llm::TextPrefiller> text_prefiller_; + 
std::unique_ptr<::executorch::extension::llm::TextTokenGenerator> + text_token_generator_; +}; + +} // namespace example From 6703559d48ed00cb9858769f8c8a8563b2aec8db Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Tue, 3 Mar 2026 13:14:18 +0100 Subject: [PATCH 35/46] feat: add MultimodalRunner with plug-in encoder map --- .../common/rnexecutorch/tests/CMakeLists.txt | 4 +- .../tests/integration/LLMTest.cpp | 21 +++ .../common/runner/multimodal_runner.cpp | 121 ++++++++++++++++++ .../common/runner/multimodal_runner.h | 57 +++++++++ 4 files changed, 202 insertions(+), 1 deletion(-) create mode 100644 packages/react-native-executorch/common/runner/multimodal_runner.cpp create mode 100644 packages/react-native-executorch/common/runner/multimodal_runner.h diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt index 66e65cd6d..a077a0c5a 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt +++ b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt @@ -213,12 +213,14 @@ add_rn_test(LLMTests integration/LLMTest.cpp ${COMMON_DIR}/runner/runner.cpp # keep until Task 5 ${COMMON_DIR}/runner/base_llm_runner.cpp ${COMMON_DIR}/runner/text_runner.cpp + ${COMMON_DIR}/runner/multimodal_runner.cpp + ${COMMON_DIR}/runner/multimodal_prefiller.cpp ${COMMON_DIR}/runner/text_prefiller.cpp ${COMMON_DIR}/runner/text_decoder_runner.cpp ${COMMON_DIR}/runner/sampler.cpp ${COMMON_DIR}/runner/arange_util.cpp ${COMMON_DIR}/runner/encoders/vision_encoder.cpp # add this - LIBS tokenizers_deps + LIBS tokenizers_deps opencv_deps ) add_rn_test(TextToImageTests integration/TextToImageTest.cpp diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp index dd253b487..9ee8a64b2 100644 --- 
a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp @@ -230,3 +230,24 @@ TEST(TextRunnerTest, SetTemperaturePropagatesToDecoder) { EXPECT_NO_THROW(runner.set_temperature(0.5f)); EXPECT_FLOAT_EQ(runner.config_.temperature, 0.5f); } + +#include + +TEST(MultimodalRunnerTest, LoadFailsWithClearErrorWhenCapabilityMismatch) { + // smolLm2_135M_8da4w.pte is text-only — declaring vision capability should + // throw + auto module = std::make_unique<::executorch::extension::Module>( + "smolLm2_135M_8da4w.pte", + ::executorch::extension::Module::LoadMode::File); + + std::map> + encoders; + encoders[executorch::extension::llm::MultimodalType::Image] = + std::make_unique(module.get()); + + example::MultimodalRunner runner(std::move(module), "smollm_tokenizer.json", + std::move(encoders)); + + EXPECT_THROW(runner.load(), rnexecutorch::RnExecutorchError); +} diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.cpp b/packages/react-native-executorch/common/runner/multimodal_runner.cpp new file mode 100644 index 000000000..363f11d11 --- /dev/null +++ b/packages/react-native-executorch/common/runner/multimodal_runner.cpp @@ -0,0 +1,121 @@ +// common/runner/multimodal_runner.cpp +#include "multimodal_runner.h" +#include "constants.h" +#include "util.h" +#include + +namespace example { + +using namespace executorch::extension::llm; +using ::executorch::extension::Module; +using ::executorch::runtime::Error; + +MultimodalRunner::MultimodalRunner( + std::unique_ptr owned_module, const std::string &tokenizer_path, + std::map> encoders, + const llm::GenerationConfig &config) + : BaseLLMRunner(nullptr, std::move(owned_module), tokenizer_path, config), + encoders_(std::move(encoders)) {} + +bool MultimodalRunner::is_loaded() const { + if (!mm_prefiller_ || !mm_token_generator_) + return false; + if (!mm_prefiller_->is_method_loaded() || 
!mm_token_generator_->is_loaded()) + return false; + for (const auto &[type, encoder] : encoders_) { + if (!encoder->is_loaded()) + return false; + } + return true; +} + +Error MultimodalRunner::load_subcomponents() { + // Load and validate all declared encoders — throws on mismatch + for (auto &[type, encoder] : encoders_) { + encoder->load(); + } + + llm::Stats *stats_ptr = &stats_; + auto eos_ids = std::make_unique>(); + eos_ids->emplace(7); // fallback + + mm_decoder_runner_ = std::make_unique( + module_, io_manager_.get()); + mm_prefiller_ = std::make_unique( + module_, mm_decoder_runner_.get(), tokenizer_.get(), io_manager_.get()); + mm_token_generator_ = std::make_unique( + tokenizer_.get(), mm_decoder_runner_.get(), /*use_kv_cache=*/true, + std::move(eos_ids), stats_ptr); + + ET_CHECK_OK_OR_RETURN_ERROR(mm_prefiller_->load()); + ET_CHECK_OK_OR_RETURN_ERROR(mm_token_generator_->load()); + return Error::Ok; +} + +Error MultimodalRunner::generate_internal( + const std::vector &inputs, + std::function token_callback) { + + if (inputs.empty()) + return Error::InvalidArgument; + if (!is_loaded()) + ET_CHECK_OK_OR_RETURN_ERROR(load()); + + stats_.inference_start_ms = llm::time_in_ms(); + + uint64_t prefill_next_token = 0; + for (const auto &input : inputs) { + auto prefill_result = mm_prefiller_->prefill(input, pos_); + if (!prefill_result.ok()) + return prefill_result.error(); + prefill_next_token = prefill_result.get(); + } + + stats_.first_token_ms = llm::time_in_ms(); + stats_.prompt_eval_end_ms = llm::time_in_ms(); + stats_.num_prompt_tokens = pos_; + + int32_t resolved_max_new = + static_cast(config_.max_context_length - pos_); + resolved_max_new = std::max(0, resolved_max_new); + + std::vector seed_tokens = {prefill_next_token}; + auto wrapped_callback = [&](const std::string &piece) { + llm::safe_printf(piece.c_str()); + fflush(stdout); + if (token_callback) + token_callback(piece); + }; + + auto generate_result = mm_token_generator_->generate( + 
seed_tokens, pos_, + static_cast(std::max(0, resolved_max_new - 1)), + config_.temperature, config_.topp, wrapped_callback); + + if (!generate_result.ok()) + return generate_result.error(); + + int64_t num_generated = generate_result.get(); + pos_ += num_generated; + stats_.inference_end_ms = llm::time_in_ms(); + stats_.num_generated_tokens = num_generated; + + return Error::Ok; +} + +void MultimodalRunner::stop_impl() { + if (mm_token_generator_) + mm_token_generator_->stop(); +} + +void MultimodalRunner::set_count_interval_impl(size_t count_interval) { + if (mm_token_generator_) + mm_token_generator_->set_count_interval(count_interval); +} + +void MultimodalRunner::set_time_interval_impl(size_t time_interval) { + if (mm_token_generator_) + mm_token_generator_->set_time_interval(time_interval); +} + +} // namespace example diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.h b/packages/react-native-executorch/common/runner/multimodal_runner.h new file mode 100644 index 000000000..4190127e6 --- /dev/null +++ b/packages/react-native-executorch/common/runner/multimodal_runner.h @@ -0,0 +1,57 @@ +// common/runner/multimodal_runner.h +#pragma once + +#include "base_llm_runner.h" +#include "encoders/iencoder.h" +#include "multimodal_decoder_runner.h" +#include "multimodal_input.h" +#include "multimodal_prefiller.h" +#include "text_token_generator.h" +#include + +namespace executorch::extension::llm { +// Tag enum for keying encoder map +enum class MultimodalType { Image, Audio }; +} // namespace executorch::extension::llm + +namespace example { + +class MultimodalRunner : public BaseLLMRunner { +public: + explicit MultimodalRunner( + std::unique_ptr<::executorch::extension::Module> owned_module, + const std::string &tokenizer_path, + std::map<::executorch::extension::llm::MultimodalType, + std::unique_ptr<::executorch::extension::llm::IEncoder>> + encoders, + const ::executorch::extension::llm::GenerationConfig &config = { + .temperature = 
0.8F, .topp = 0.9F}); + + bool is_loaded() const override; + + ::executorch::runtime::Error generate_internal( + const std::vector<::executorch::extension::llm::MultimodalInput> &inputs, + std::function token_callback) override; + +protected: + ::executorch::runtime::Error load_subcomponents() override; + void stop_impl() override; + void set_temperature_impl(float) override { + } // config_ already updated by base + void set_topp_impl(float) override {} // config_ already updated by base + void set_count_interval_impl(size_t count_interval) override; + void set_time_interval_impl(size_t time_interval) override; + +private: + std::map<::executorch::extension::llm::MultimodalType, + std::unique_ptr<::executorch::extension::llm::IEncoder>> + encoders_; + std::unique_ptr<::executorch::extension::llm::MultimodalDecoderRunner> + mm_decoder_runner_; + std::unique_ptr<::executorch::extension::llm::MultimodalPrefiller> + mm_prefiller_; + std::unique_ptr<::executorch::extension::llm::TextTokenGenerator> + mm_token_generator_; +}; + +} // namespace example From a1edb3c809bf1a5d91fc373225e1bb4fc1b314a4 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Tue, 3 Mar 2026 13:19:28 +0100 Subject: [PATCH 36/46] feat: wire capabilities through LLM.cpp, delete UnifiedRunner Co-Authored-By: Claude Sonnet 4.6 --- .../common/rnexecutorch/models/llm/LLM.cpp | 38 +- .../common/rnexecutorch/models/llm/LLM.h | 10 +- .../common/rnexecutorch/tests/CMakeLists.txt | 3 +- .../tests/integration/LLMTest.cpp | 36 +- .../common/runner/unified_runner.cpp | 387 ------------------ .../common/runner/unified_runner.h | 101 ----- 6 files changed, 46 insertions(+), 529 deletions(-) delete mode 100644 packages/react-native-executorch/common/runner/unified_runner.cpp delete mode 100644 packages/react-native-executorch/common/runner/unified_runner.h diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp 
b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp index acccedbd0..a23957bb6 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp @@ -2,10 +2,14 @@ #include #include +#include #include #include #include +#include #include +#include +#include namespace rnexecutorch::models::llm { namespace llm = ::executorch::extension::llm; @@ -43,24 +47,23 @@ const llm::Image &LLM::getOrLoadImage(const std::string &path) { } LLM::LLM(const std::string &modelSource, const std::string &tokenizerSource, + std::vector capabilities, std::shared_ptr callInvoker) : BaseModel(modelSource, callInvoker, Module::LoadMode::File) { - // Peek at method names to decide text vs multimodal before constructing - // runner - auto method_names_result = module_->method_names(); - multimodal_ = method_names_result.ok() && - method_names_result->count(llm::kTokenEmbeddingMethod) > 0 && - method_names_result->count(llm::kTextModelMethod) > 0; - - if (multimodal_) { - // Transfer module_ ownership to the runner (same as old MultimodalLLM) - runner_ = std::make_unique( - nullptr, std::move(module_), tokenizerSource); + if (capabilities.empty()) { + runner_ = std::make_unique(module_.get(), nullptr, + tokenizerSource); } else { - // Lend module_ as a raw pointer (same as old LLM) - runner_ = std::make_unique(module_.get(), nullptr, - tokenizerSource); + std::map> encoders; + for (const auto &cap : capabilities) { + if (cap == "vision") { + encoders[llm::MultimodalType::Image] = + std::make_unique(module_.get()); + } + } + runner_ = std::make_unique( + std::move(module_), tokenizerSource, std::move(encoders)); } auto loadResult = runner_->load(); @@ -104,7 +107,7 @@ std::string LLM::generate(std::string prompt, throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded, "Runner is not loaded"); } - if (!multimodal_) { + if (!dynamic_cast(runner_.get())) { throw 
RnExecutorchError( RnExecutorchErrorCode::InvalidUserInput, "This is a text-only model. Call generate(prompt, cb)."); @@ -158,8 +161,7 @@ std::string LLM::generate(std::string prompt, } }; - auto error = - runner_->generate(inputs, temperature_, topp_, -1, nativeCallback); + auto error = runner_->generate_internal(inputs, nativeCallback); if (error != Error::Ok) { throw RnExecutorchError(error, "Failed to generate multimodal response"); } @@ -242,7 +244,6 @@ void LLM::setTemperature(float temperature) { throw RnExecutorchError(RnExecutorchErrorCode::InvalidConfig, "Temperature must be non-negative"); } - temperature_ = temperature; runner_->set_temperature(temperature); } @@ -255,7 +256,6 @@ void LLM::setTopp(float topp) { throw RnExecutorchError(RnExecutorchErrorCode::InvalidConfig, "Top-p must be between 0.0 and 1.0"); } - topp_ = topp; runner_->set_topp(topp); } diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h index 6e47dbed0..60c8bc148 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h @@ -2,12 +2,13 @@ #include #include +#include #include #include #include +#include #include -#include namespace rnexecutorch { namespace models::llm { @@ -17,6 +18,7 @@ class LLM : public BaseModel { public: explicit LLM(const std::string &modelSource, const std::string &tokenizerSource, + std::vector capabilities, std::shared_ptr callInvoker); // Text-only: pre-rendered prompt string @@ -42,10 +44,7 @@ class LLM : public BaseModel { int32_t getMaxContextLength() const; private: - std::unique_ptr runner_; - bool multimodal_; - float temperature_ = 0.8f; - float topp_ = 0.9f; + std::unique_ptr runner_; std::unordered_map imageCache_; const executorch::extension::llm::Image & @@ -54,5 +53,6 @@ class LLM : public BaseModel { } // namespace models::llm 
REGISTER_CONSTRUCTOR(models::llm::LLM, std::string, std::string, + std::vector, std::shared_ptr); } // namespace rnexecutorch diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt index a077a0c5a..56b640cc0 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt +++ b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt @@ -210,7 +210,6 @@ add_rn_test(SpeechToTextTests integration/SpeechToTextTest.cpp add_rn_test(LLMTests integration/LLMTest.cpp SOURCES ${RNEXECUTORCH_DIR}/models/llm/LLM.cpp - ${COMMON_DIR}/runner/runner.cpp # keep until Task 5 ${COMMON_DIR}/runner/base_llm_runner.cpp ${COMMON_DIR}/runner/text_runner.cpp ${COMMON_DIR}/runner/multimodal_runner.cpp @@ -219,7 +218,7 @@ add_rn_test(LLMTests integration/LLMTest.cpp ${COMMON_DIR}/runner/text_decoder_runner.cpp ${COMMON_DIR}/runner/sampler.cpp ${COMMON_DIR}/runner/arange_util.cpp - ${COMMON_DIR}/runner/encoders/vision_encoder.cpp # add this + ${COMMON_DIR}/runner/encoders/vision_encoder.cpp LIBS tokenizers_deps opencv_deps ) diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp index 9ee8a64b2..cad94fa10 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp @@ -38,12 +38,12 @@ template <> struct ModelTraits { using ModelType = LLM; static ModelType createValid() { - return ModelType(kValidModelPath, kValidTokenizerPath, + return ModelType(kValidModelPath, kValidTokenizerPath, {}, rnexecutorch::createMockCallInvoker()); } static ModelType createInvalid() { - return ModelType("nonexistent.pte", kValidTokenizerPath, + return ModelType("nonexistent.pte", kValidTokenizerPath, {}, 
rnexecutorch::createMockCallInvoker()); } @@ -68,18 +68,24 @@ class LLMTest : public ::testing::Test { }; TEST(LLMCtorTests, InvalidTokenizerPathThrows) { - EXPECT_THROW(LLM(kValidModelPath, "nonexistent_tokenizer.json", + EXPECT_THROW(LLM(kValidModelPath, "nonexistent_tokenizer.json", {}, createMockCallInvoker()), RnExecutorchError); } +TEST(LLMCtorTests, WrongCapabilitiesThrowsClearError) { + EXPECT_THROW(LLM(kValidModelPath, kValidTokenizerPath, {"vision"}, + createMockCallInvoker()), + rnexecutorch::RnExecutorchError); +} + TEST_F(LLMTest, GetGeneratedTokenCountInitiallyZero) { - LLM model(kValidModelPath, kValidTokenizerPath, mockInvoker_); + LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_); EXPECT_EQ(model.getGeneratedTokenCount(), 0); } TEST_F(LLMTest, SetTemperature) { - LLM model(kValidModelPath, kValidTokenizerPath, mockInvoker_); + LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_); // Should not throw for valid values EXPECT_NO_THROW(model.setTemperature(0.5f)); EXPECT_NO_THROW(model.setTemperature(1.0f)); @@ -87,43 +93,43 @@ TEST_F(LLMTest, SetTemperature) { } TEST_F(LLMTest, SetTemperatureNegativeThrows) { - LLM model(kValidModelPath, kValidTokenizerPath, mockInvoker_); + LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_); EXPECT_THROW(model.setTemperature(-0.1f), RnExecutorchError); } TEST_F(LLMTest, SetTopp) { - LLM model(kValidModelPath, kValidTokenizerPath, mockInvoker_); + LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_); EXPECT_NO_THROW(model.setTopp(0.9f)); EXPECT_NO_THROW(model.setTopp(0.5f)); EXPECT_NO_THROW(model.setTopp(1.0f)); } TEST_F(LLMTest, SetToppInvalidThrows) { - LLM model(kValidModelPath, kValidTokenizerPath, mockInvoker_); + LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_); EXPECT_THROW(model.setTopp(-0.1f), RnExecutorchError); EXPECT_THROW(model.setTopp(1.1f), RnExecutorchError); } TEST_F(LLMTest, SetCountInterval) { - LLM model(kValidModelPath, 
kValidTokenizerPath, mockInvoker_); + LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_); EXPECT_NO_THROW(model.setCountInterval(5)); EXPECT_NO_THROW(model.setCountInterval(10)); } TEST_F(LLMTest, SetTimeInterval) { - LLM model(kValidModelPath, kValidTokenizerPath, mockInvoker_); + LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_); EXPECT_NO_THROW(model.setTimeInterval(100)); EXPECT_NO_THROW(model.setTimeInterval(500)); } TEST_F(LLMTest, InterruptThrowsWhenUnloaded) { - LLM model(kValidModelPath, kValidTokenizerPath, mockInvoker_); + LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_); model.unload(); EXPECT_THROW(model.interrupt(), RnExecutorchError); } TEST_F(LLMTest, SettersThrowWhenUnloaded) { - LLM model(kValidModelPath, kValidTokenizerPath, mockInvoker_); + LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_); model.unload(); // All setters should throw when model is unloaded EXPECT_THROW(model.setTemperature(0.5f), RnExecutorchError); @@ -133,7 +139,7 @@ TEST_F(LLMTest, SettersThrowWhenUnloaded) { } TEST_F(LLMTest, GenerateProducesValidOutput) { - LLM model(kValidModelPath, kValidTokenizerPath, mockInvoker_); + LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_); model.setTemperature(0.0f); std::string prompt = formatChatML(kSystemPrompt, "Repeat exactly this: `naszponcilem testy`"); @@ -142,7 +148,7 @@ TEST_F(LLMTest, GenerateProducesValidOutput) { } TEST_F(LLMTest, GenerateUpdatesTokenCount) { - LLM model(kValidModelPath, kValidTokenizerPath, mockInvoker_); + LLM model(kValidModelPath, kValidTokenizerPath, {}, mockInvoker_); EXPECT_EQ(model.getGeneratedTokenCount(), 0); std::string prompt = formatChatML(kSystemPrompt, "Repeat exactly this: 'naszponcilem testy'"); @@ -151,7 +157,7 @@ TEST_F(LLMTest, GenerateUpdatesTokenCount) { } TEST_F(LLMTest, EmptyPromptThrows) { - LLM model(kValidModelPath, kValidTokenizerPath, mockInvoker_); + LLM model(kValidModelPath, kValidTokenizerPath, {}, 
mockInvoker_); EXPECT_THROW((void)model.generate("", nullptr), RnExecutorchError); } diff --git a/packages/react-native-executorch/common/runner/unified_runner.cpp b/packages/react-native-executorch/common/runner/unified_runner.cpp deleted file mode 100644 index 5dcdf127c..000000000 --- a/packages/react-native-executorch/common/runner/unified_runner.cpp +++ /dev/null @@ -1,387 +0,0 @@ -// packages/react-native-executorch/common/runner/unified_runner.cpp -#include "unified_runner.h" -#include "constants.h" -#include "util.h" -#include -#include -#include - -namespace example { - -using namespace executorch::extension::llm; -using ::executorch::extension::Module; -using ::executorch::runtime::Error; -using ::executorch::runtime::Result; - -UnifiedRunner::UnifiedRunner(Module *module, - std::unique_ptr owned_module, - const std::string &tokenizer_path, - const llm::GenerationConfig &config) - : config_(config), module_(owned_module ? owned_module.get() : module), - owned_module_(std::move(owned_module)), tokenizer_path_(tokenizer_path), - tokenizer_(std::make_unique()), - metadata_({ - {kEnableDynamicShape, false}, - {kMaxSeqLen, 128}, - {kMaxContextLen, 128}, - {kUseKVCache, true}, - {kUseSDPAWithKVCache, false}, - }) {} - -bool UnifiedRunner::is_multimodal() const noexcept { return multimodal_; } - -bool UnifiedRunner::is_loaded() const { - if (multimodal_) { - return mm_prefiller_ && mm_prefiller_->is_method_loaded() && - mm_token_generator_ && mm_token_generator_->is_loaded(); - } - return module_->is_loaded() && tokenizer_->is_loaded() && - text_decoder_runner_ && text_prefiller_ && text_token_generator_; -} - -Error UnifiedRunner::load() { - if (is_loaded()) { - return Error::Ok; - } - - auto status = tokenizer_->load(tokenizer_path_); - if (status != tokenizers::Error::Ok) { - throw rnexecutorch::RnExecutorchError( - rnexecutorch::RnExecutorchErrorCode::TokenizerError, - "Unexpected issue occurred while loading tokenizer"); - } - - // Detect mode by inspecting 
method names - const auto method_names = - ET_UNWRAP(module_->method_names(), "Failed reading method names"); - - multimodal_ = method_names.count(kTokenEmbeddingMethod) > 0 && - method_names.count(kTextModelMethod) > 0; - - // Load metadata - metadata_[kVocabSize] = tokenizer_->vocab_size(); - for (auto &pair : metadata_) { - const auto &method_name = pair.first; - auto &value = pair.second; - if (method_names.count(method_name)) { - value = ET_UNWRAP(module_->get(method_name)) - .toScalar() - .to(); - } - ET_LOG(Info, "Metadata: %s = %" PRId64, method_name.c_str(), value); - } - - if (config_.max_seq_len < 0) - config_.max_seq_len = static_cast(metadata_.at(kMaxSeqLen)); - if (config_.max_context_length < 0) { - config_.max_context_length = - method_names.count(kMaxContextLen) - ? static_cast(metadata_.at(kMaxContextLen)) - : static_cast(metadata_.at(kMaxSeqLen)); - } - if (config_.max_new_tokens < 0) - config_.max_new_tokens = - std::min(config_.max_seq_len, config_.max_context_length); - if (config_.enable_dynamic_shape) - config_.enable_dynamic_shape = - static_cast(metadata_.at(kEnableDynamicShape)); - if (config_.enable_kv_cache) - config_.enable_kv_cache = static_cast(metadata_.at(kUseKVCache)); - - // Load EOS ids - auto eos_ids = std::make_unique>(); - if (method_names.count(kEosIds)) { - for (const auto &eos_id : ET_UNWRAP(module_->execute(kEosIds))) { - eos_ids->emplace(static_cast(eos_id.toScalar().to())); - } - } - if (eos_ids->empty()) { - eos_ids->emplace(7); // fallback <|im_end|> - } - - io_manager_ = std::make_unique(*module_); - llm::Stats *stats_ptr = &stats_; - - if (multimodal_) { - mm_decoder_runner_ = std::make_unique( - module_, io_manager_.get()); - mm_prefiller_ = std::make_unique( - module_, mm_decoder_runner_.get(), tokenizer_.get(), io_manager_.get()); - mm_token_generator_ = std::make_unique( - tokenizer_.get(), mm_decoder_runner_.get(), /*use_kv_cache=*/true, - std::move(eos_ids), stats_ptr); - - 
ET_CHECK_OK_OR_RETURN_ERROR(mm_prefiller_->load()); - ET_CHECK_OK_OR_RETURN_ERROR(mm_token_generator_->load()); - } else { - ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("forward")); - - text_decoder_runner_ = std::make_unique( - module_, io_manager_.get(), config_.temperature, config_.topp); - text_prefiller_ = std::make_unique( - text_decoder_runner_.get(), config_.enable_kv_cache, - config_.enable_dynamic_shape, config_.max_seq_len); - text_token_generator_ = std::make_unique( - tokenizer_.get(), text_decoder_runner_.get(), config_.enable_kv_cache, - std::move(eos_ids), stats_ptr); - } - - return Error::Ok; -} - -Error UnifiedRunner::generate( - const std::string &prompt, const llm::GenerationConfig &generation_config, - std::function token_callback, - std::function stats_callback) { - - ET_CHECK_MSG(!prompt.empty(), "Prompt cannot be null"); - - // In multimodal mode, delegate to the multimodal generate path with - // text-only input (no image). - if (multimodal_) { - std::vector text_inputs = { - llm::make_text_input(prompt)}; - float temp = - generation_config.temperature >= 0.F - ? generation_config.temperature - : (config_.temperature >= 0.F ? config_.temperature : 0.8F); - float topp = generation_config.topp >= 0.F - ? generation_config.topp - : (config_.topp >= 0.F ? config_.topp : 0.9F); - return generate(text_inputs, temp, topp, -1, token_callback); - } - - if (!is_loaded()) { - stats_.model_load_start_ms = llm::time_in_ms(); - ET_CHECK_OK_OR_RETURN_ERROR(load()); - stats_.model_load_end_ms = llm::time_in_ms(); - } - - std::function wrapped_callback = - [token_callback, &generation_config](const std::string &piece) { - if (!generation_config.warming) { - llm::safe_printf(piece.c_str()); - fflush(stdout); - } - if (token_callback) - token_callback(piece); - }; - - stats_.inference_start_ms = llm::time_in_ms(); - shouldStop_ = false; - - int32_t max_seq_len = generation_config.max_seq_len >= 0 - ? 
generation_config.max_seq_len - : config_.max_seq_len; - int32_t max_context_length = generation_config.max_context_length >= 0 - ? generation_config.max_context_length - : config_.max_context_length; - int32_t new_tokens_limit = generation_config.max_new_tokens >= 0 - ? generation_config.max_new_tokens - : config_.max_new_tokens; - float temperature = generation_config.temperature >= 0.F - ? generation_config.temperature - : config_.temperature; - float topp = - generation_config.topp >= 0.F ? generation_config.topp : config_.topp; - - int64_t context_len_left = static_cast(max_context_length) - pos_; - - auto encodeResult = - tokenizer_->encode(prompt, numOfAddedBoSTokens, numOfAddedEoSTokens); - if (!encodeResult.ok()) { - throw rnexecutorch::RnExecutorchError( - rnexecutorch::RnExecutorchErrorCode::TokenizerError, - "Unexpected issue occurred while encoding: " + - std::to_string(static_cast(encodeResult.error()))); - } - std::vector prompt_tokens = encodeResult.get(); - int num_prompt_tokens = prompt_tokens.size(); - - ET_CHECK_OR_RETURN_ERROR(num_prompt_tokens >= 1, InvalidArgument, - "Expected at least 1 prompt token"); - ET_CHECK_OR_RETURN_ERROR(num_prompt_tokens < max_seq_len, InvalidArgument, - "num_prompt_tokens %d >= max_seq_len %" PRId32, - num_prompt_tokens, max_seq_len); - - int32_t max_new_tokens = resolve_max_new_tokens( - num_prompt_tokens, max_seq_len, static_cast(context_len_left), - new_tokens_limit); - - ET_CHECK_OR_RETURN_ERROR(max_new_tokens > 0, InvalidArgument, - "Max new tokens %d is <= 0", max_new_tokens); - - if (generation_config.echo) - wrapped_callback(prompt); - - auto prefill_res = text_prefiller_->prefill(prompt_tokens, pos_); - stats_.first_token_ms = llm::time_in_ms(); - stats_.prompt_eval_end_ms = llm::time_in_ms(); - ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error()); - - uint64_t cur_token = prefill_res.get(); - auto decodeResult = tokenizer_->decode({cur_token}); - if (!decodeResult.ok()) { - throw 
rnexecutorch::RnExecutorchError( - rnexecutorch::RnExecutorchErrorCode::TokenizerError, - "Unexpected issue occurred while decoding: " + - std::to_string(static_cast(decodeResult.error()))); - } - - prompt_tokens.push_back(cur_token); - int64_t num_generated = ET_UNWRAP( - text_token_generator_->generate(prompt_tokens, pos_, max_new_tokens - 1, - temperature, topp, wrapped_callback)); - - pos_ += num_generated; - stats_.inference_end_ms = llm::time_in_ms(); - stats_.num_prompt_tokens = num_prompt_tokens; - stats_.num_generated_tokens = num_generated; - - if (stats_callback) - stats_callback(stats_); - - return Error::Ok; -} - -Error UnifiedRunner::generate( - const std::vector &inputs, float temperature, - float topp, int32_t max_new_tokens, - std::function token_callback) { - - ET_CHECK_MSG(multimodal_, - "generate(MultimodalInput) called on a text-only runner. Use " - "generate(string) instead."); - - if (inputs.empty()) { - ET_LOG(Error, "MultimodalInput vector cannot be empty"); - return Error::InvalidArgument; - } - - if (!is_loaded()) - ET_CHECK_OK_OR_RETURN_ERROR(load()); - - stats_.inference_start_ms = llm::time_in_ms(); - - uint64_t prefill_next_token = 0; - for (size_t i = 0; i < inputs.size(); ++i) { - auto prefill_result = mm_prefiller_->prefill(inputs[i], pos_); - if (!prefill_result.ok()) - return prefill_result.error(); - prefill_next_token = prefill_result.get(); - } - - stats_.first_token_ms = llm::time_in_ms(); - stats_.prompt_eval_end_ms = llm::time_in_ms(); - stats_.num_prompt_tokens = pos_; - - int32_t resolved_max_new = - max_new_tokens > 0 - ? 
max_new_tokens - : static_cast(config_.max_context_length - pos_); - resolved_max_new = std::max(0, resolved_max_new); - - std::vector seed_tokens = {prefill_next_token}; - auto wrapped_callback = [&](const std::string &piece) { - llm::safe_printf(piece.c_str()); - fflush(stdout); - if (token_callback) - token_callback(piece); - }; - - auto generate_result = mm_token_generator_->generate( - seed_tokens, pos_, - static_cast(std::max(0, resolved_max_new - 1)), temperature, - topp, wrapped_callback); - - if (!generate_result.ok()) - return generate_result.error(); - - int64_t num_generated = generate_result.get(); - pos_ += num_generated; - - stats_.inference_end_ms = llm::time_in_ms(); - stats_.num_generated_tokens = num_generated; - - return Error::Ok; -} - -void UnifiedRunner::stop() { - if (multimodal_) { - if (mm_token_generator_) - mm_token_generator_->stop(); - } else { - if (text_token_generator_) - text_token_generator_->stop(); - } -} - -void UnifiedRunner::reset() { - stats_.reset(); - pos_ = 0; -} - -int32_t UnifiedRunner::count_text_tokens(const std::string &text) const { - auto encodeResult = - tokenizer_->encode(text, numOfAddedBoSTokens, numOfAddedEoSTokens); - if (!encodeResult.ok()) { - throw rnexecutorch::RnExecutorchError( - rnexecutorch::RnExecutorchErrorCode::TokenizerError, - "Encoding failed during token count check."); - } - return static_cast(encodeResult.get().size()); -} - -int32_t UnifiedRunner::get_max_context_length() const { - if (!is_loaded()) { - return static_cast(metadata_.at(kMaxContextLen)); - } - return config_.max_context_length; -} - -void UnifiedRunner::set_temperature(float temperature) noexcept { - config_.temperature = temperature; - if (text_decoder_runner_) - text_decoder_runner_->set_temperature(temperature); -} - -void UnifiedRunner::set_topp(float topp) noexcept { - config_.topp = topp; - if (text_decoder_runner_) - text_decoder_runner_->set_topp(topp); -} - -void UnifiedRunner::set_count_interval(size_t 
count_interval) { - if (text_token_generator_) - text_token_generator_->set_count_interval(count_interval); - if (mm_token_generator_) - mm_token_generator_->set_count_interval(count_interval); -} - -void UnifiedRunner::set_time_interval(size_t time_interval) { - if (text_token_generator_) - text_token_generator_->set_time_interval(time_interval); - if (mm_token_generator_) - mm_token_generator_->set_time_interval(time_interval); -} - -int32_t UnifiedRunner::resolve_max_new_tokens(int32_t num_prompt_tokens, - int32_t max_seq_len, - int32_t max_context_len, - int32_t max_new_tokens) const { - int32_t result; - if (max_seq_len == -1 && max_new_tokens == -1) { - result = max_context_len - num_prompt_tokens; - } else if (max_seq_len == -1) { - result = std::min(max_new_tokens, max_context_len - num_prompt_tokens); - } else if (max_new_tokens == -1) { - result = std::min(max_seq_len, max_context_len) - num_prompt_tokens; - } else { - result = - std::min(std::min(max_seq_len, max_context_len) - num_prompt_tokens, - max_new_tokens); - } - return std::max(0, result); -} - -} // namespace example diff --git a/packages/react-native-executorch/common/runner/unified_runner.h b/packages/react-native-executorch/common/runner/unified_runner.h deleted file mode 100644 index ae7789bbe..000000000 --- a/packages/react-native-executorch/common/runner/unified_runner.h +++ /dev/null @@ -1,101 +0,0 @@ -// packages/react-native-executorch/common/runner/unified_runner.h -#pragma once - -#include "irunner.h" -#include "multimodal_decoder_runner.h" -#include "multimodal_input.h" -#include "multimodal_prefiller.h" -#include "stats.h" -#include "text_decoder_runner.h" -#include "text_prefiller.h" -#include "text_token_generator.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace example { - -namespace llm = ::executorch::extension::llm; - -class UnifiedRunner { -public: - // module: raw pointer borrowed from BaseModel (text mode uses this) 
- // owned_module: unique_ptr taken for multimodal mode (nullptr in text mode) - // tokenizer_path: path to tokenizer JSON - // config: generation defaults - explicit UnifiedRunner( - ::executorch::extension::Module *module, - std::unique_ptr<::executorch::extension::Module> owned_module, - const std::string &tokenizer_path, - const llm::GenerationConfig &config = {.temperature = 0.8F, - .topp = 0.9F}); - - bool is_multimodal() const noexcept; - bool is_loaded() const; - ::executorch::runtime::Error load(); - - // Text-only generate — mirrors Runner::generate signature - ::executorch::runtime::Error - generate(const std::string &prompt, - const llm::GenerationConfig &generation_config = {}, - std::function token_callback = {}, - std::function stats_callback = {}); - - // Multimodal generate — mirrors MultimodalRunner::generate signature - ::executorch::runtime::Error - generate(const std::vector &inputs, float temperature, - float topp, int32_t max_new_tokens, - std::function token_callback = {}); - - void stop(); - void reset(); - - // Available for both modes - int32_t count_text_tokens(const std::string &text) const; - int32_t get_max_context_length() const; - void set_temperature(float temperature) noexcept; - void set_topp(float topp) noexcept; - void set_count_interval(size_t count_interval); - void set_time_interval(size_t time_interval); - - llm::Stats stats_; - -private: - int32_t resolve_max_new_tokens(int32_t num_prompt_tokens, int32_t max_seq_len, - int32_t max_context_len, - int32_t max_new_tokens = -1) const; - - bool multimodal_{false}; - llm::GenerationConfig config_; - bool shouldStop_{false}; - int64_t pos_{0}; - - // module access — module_ is always a valid raw pointer - // In text mode: points to BaseModel's module_ (borrowed) - // In multimodal mode: points to owned_module_.get() (owned) - ::executorch::extension::Module *module_; - std::unique_ptr<::executorch::extension::Module> owned_module_; - - std::string tokenizer_path_; - 
std::unique_ptr tokenizer_; - std::unordered_map metadata_; - std::unique_ptr io_manager_; - - // Text-only subcomponents (null in multimodal mode) - std::unique_ptr text_decoder_runner_; - std::unique_ptr text_prefiller_; - std::unique_ptr text_token_generator_; - - // Multimodal subcomponents (null in text mode) - std::unique_ptr mm_decoder_runner_; - std::unique_ptr mm_prefiller_; - std::unique_ptr mm_token_generator_; -}; - -} // namespace example From 7076a9f403b0fe30cbcdf1b37df8835433e8fec8 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Tue, 3 Mar 2026 13:21:57 +0100 Subject: [PATCH 37/46] feat: forward capabilities from LLMController to native --- .../src/controllers/LLMController.ts | 7 ++++++- .../src/hooks/natural_language_processing/useLLM.ts | 2 ++ packages/react-native-executorch/src/index.ts | 6 +++++- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts index e09646d08..c52e537e4 100644 --- a/packages/react-native-executorch/src/controllers/LLMController.ts +++ b/packages/react-native-executorch/src/controllers/LLMController.ts @@ -5,6 +5,7 @@ import { DEFAULT_CHAT_CONFIG } from '../constants/llmDefaults'; import { ChatConfig, GenerationConfig, + LLMCapability, LLMTool, Message, SPECIAL_TOKENS, @@ -74,11 +75,13 @@ export class LLMController { modelSource, tokenizerSource, tokenizerConfigSource, + capabilities, onDownloadProgressCallback, }: { modelSource: ResourceSource; tokenizerSource: ResourceSource; tokenizerConfigSource: ResourceSource; + capabilities?: readonly LLMCapability[]; onDownloadProgressCallback?: (downloadProgress: number) => void; }) { // reset inner state when loading new model @@ -117,7 +120,9 @@ export class LLMController { this.tokenizerConfig = JSON.parse( await ResourceFetcher.fs.readAsString(tokenizerConfigPath!) 
); - this.nativeModule = global.loadLLM(modelPath, tokenizerPath); + this.nativeModule = global.loadLLM(modelPath, tokenizerPath, [ + ...(capabilities ?? []), + ]); this.isReadyCallback(true); this.onToken = (data: string) => { if (!data) { diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts index f83d39352..c2fcd01bc 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts @@ -61,6 +61,7 @@ export function useLLM({ modelSource: model.modelSource, tokenizerSource: model.tokenizerSource, tokenizerConfigSource: model.tokenizerConfigSource!, + capabilities: model.capabilities, onDownloadProgressCallback: setDownloadProgress, }); } catch (e) { @@ -78,6 +79,7 @@ export function useLLM({ model.modelSource, model.tokenizerSource, model.tokenizerConfigSource, + model.capabilities, preventLoad, ]); diff --git a/packages/react-native-executorch/src/index.ts b/packages/react-native-executorch/src/index.ts index dd7557ca2..3e6723b55 100644 --- a/packages/react-native-executorch/src/index.ts +++ b/packages/react-native-executorch/src/index.ts @@ -48,7 +48,11 @@ declare global { var loadImageEmbeddings: (source: string) => any; var loadVAD: (source: string) => any; var loadTextEmbeddings: (modelSource: string, tokenizerSource: string) => any; - var loadLLM: (modelSource: string, tokenizerSource: string) => any; + var loadLLM: ( + modelSource: string, + tokenizerSource: string, + capabilities: string[] + ) => any; var loadTextToImage: ( tokenizerSource: string, encoderSource: string, From 96525bcfe208ebbe34bd715895d866fd6a96c69e Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Thu, 5 Mar 2026 13:34:54 +0100 Subject: [PATCH 38/46] feat: add logging, fix metadata application, fix module ownership and EOS IDs Co-Authored-By: 
Claude Sonnet 4.6 --- apps/llm/app/multimodal_llm/index.tsx | 12 ++----- .../common/rnexecutorch/models/llm/LLM.cpp | 2 +- .../common/rnexecutorch/tests/CMakeLists.txt | 1 + .../common/runner/base_llm_runner.cpp | 13 ++++---- .../common/runner/constants.h | 1 - .../common/runner/encoders/vision_encoder.cpp | 20 ++++++++++++ .../common/runner/multimodal_runner.cpp | 32 ++++++++++++++++++- .../common/runner/text_runner.cpp | 11 +++++-- .../common/runner/text_runner.h | 1 - .../src/constants/modelUrls.ts | 13 +------- .../natural_language_processing/useLLM.ts | 4 ++- 11 files changed, 74 insertions(+), 36 deletions(-) diff --git a/apps/llm/app/multimodal_llm/index.tsx b/apps/llm/app/multimodal_llm/index.tsx index 542af4740..1781684a0 100644 --- a/apps/llm/app/multimodal_llm/index.tsx +++ b/apps/llm/app/multimodal_llm/index.tsx @@ -13,7 +13,7 @@ import { } from 'react-native'; import { launchImageLibrary } from 'react-native-image-picker'; import { useIsFocused } from '@react-navigation/native'; -import { useLLM } from 'react-native-executorch'; +import { useLLM, LFM2_VL_1_6B_QUANTIZED } from 'react-native-executorch'; import SendIcon from '../../assets/icons/send_icon.svg'; import PauseIcon from '../../assets/icons/pause_icon.svg'; import ColorPalette from '../../colors'; @@ -34,15 +34,7 @@ function MultimodalLLMScreen() { const { setGlobalGenerating } = useContext(GeneratingContext); const vlm = useLLM({ - model: { - capabilities: ['vision'] as const, - modelSource: - 'https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/lfm2p5_vl_1.6B_quantized_xnnpack.pte', - tokenizerSource: - 'https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_2.5.json', - tokenizerConfigSource: - 'https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_config_2_5.json', - }, + model: LFM2_VL_1_6B_QUANTIZED, }); useEffect(() => { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp 
b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp index a23957bb6..8d046d6e9 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp @@ -52,7 +52,7 @@ LLM::LLM(const std::string &modelSource, const std::string &tokenizerSource, : BaseModel(modelSource, callInvoker, Module::LoadMode::File) { if (capabilities.empty()) { - runner_ = std::make_unique(module_.get(), nullptr, + runner_ = std::make_unique(std::move(module_), tokenizerSource); } else { std::map> encoders; diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt index 56b640cc0..159f00159 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt +++ b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt @@ -219,6 +219,7 @@ add_rn_test(LLMTests integration/LLMTest.cpp ${COMMON_DIR}/runner/sampler.cpp ${COMMON_DIR}/runner/arange_util.cpp ${COMMON_DIR}/runner/encoders/vision_encoder.cpp + ${IMAGE_UTILS_SOURCES} LIBS tokenizers_deps opencv_deps ) diff --git a/packages/react-native-executorch/common/runner/base_llm_runner.cpp b/packages/react-native-executorch/common/runner/base_llm_runner.cpp index a987528a0..fcb647ec5 100644 --- a/packages/react-native-executorch/common/runner/base_llm_runner.cpp +++ b/packages/react-native-executorch/common/runner/base_llm_runner.cpp @@ -4,6 +4,7 @@ #include "util.h" #include #include +#include namespace example { @@ -23,7 +24,6 @@ BaseLLMRunner::BaseLLMRunner(Module *module, {kMaxSeqLen, 128}, {kMaxContextLen, 128}, {kUseKVCache, true}, - {kUseSDPAWithKVCache, false}, }) {} Error BaseLLMRunner::load() { @@ -49,7 +49,8 @@ Error BaseLLMRunner::load() { .toScalar() .to(); } - ET_LOG(Info, "Metadata: %s = %" PRId64, method_name.c_str(), value); + rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info, + 
"[BaseLLMRunner] Metadata:", method_name, "=", value); } if (config_.max_seq_len < 0) @@ -63,11 +64,9 @@ Error BaseLLMRunner::load() { if (config_.max_new_tokens < 0) config_.max_new_tokens = std::min(config_.max_seq_len, config_.max_context_length); - if (config_.enable_dynamic_shape) - config_.enable_dynamic_shape = - static_cast(metadata_.at(kEnableDynamicShape)); - if (config_.enable_kv_cache) - config_.enable_kv_cache = static_cast(metadata_.at(kUseKVCache)); + config_.enable_dynamic_shape = + static_cast(metadata_.at(kEnableDynamicShape)); + config_.enable_kv_cache = static_cast(metadata_.at(kUseKVCache)); auto eos_ids = std::make_unique>(); if (method_names.count(kEosIds)) { diff --git a/packages/react-native-executorch/common/runner/constants.h b/packages/react-native-executorch/common/runner/constants.h index e75466829..f1fee2347 100644 --- a/packages/react-native-executorch/common/runner/constants.h +++ b/packages/react-native-executorch/common/runner/constants.h @@ -17,7 +17,6 @@ inline constexpr auto kMaxSeqLen = "get_max_seq_len"; inline constexpr auto kMaxContextLen = "get_max_context_len"; inline constexpr auto kVocabSize = "get_vocab_size"; inline constexpr auto kUseKVCache = "use_kv_cache"; -inline constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache"; // Multimodal method name conventions inline constexpr auto kVisionEncoderMethod = "vision_encoder"; diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp index 35ce84c6d..0e49abe5a 100644 --- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp +++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp @@ -2,6 +2,7 @@ #include "vision_encoder.h" #include +#include #include #include @@ -22,12 +23,26 @@ Error VisionEncoder::load() { if (!method_names_result.ok()) { return method_names_result.error(); } + 
rnexecutorch::log(rnexecutorch::LOG_LEVEL::Debug, + "[VisionEncoder] Available methods:"); + for (const auto &name : *method_names_result) { + auto val = module_->get(name); + if (val.ok()) { + rnexecutorch::log(rnexecutorch::LOG_LEVEL::Debug, " -", name, "=", + val->toScalar().to()); + } else { + rnexecutorch::log(rnexecutorch::LOG_LEVEL::Debug, " -", name); + } + } + if (method_names_result->count(kVisionEncoderMethod) == 0) { throw rnexecutorch::RnExecutorchError( rnexecutorch::RnExecutorchErrorCode::InvalidConfig, "Model does not support vision: 'vision_encoder' method not found. " "Check that the .pte file matches the declared capabilities."); } + rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info, + "[VisionEncoder] Loading method:", kVisionEncoderMethod); return module_->load_method(kVisionEncoderMethod); } @@ -53,11 +68,16 @@ Result VisionEncoder::encode(const MultimodalInput &input) { return input_meta_result.error(); } auto expected_dims = input_meta_result->sizes(); + rnexecutorch::log( + rnexecutorch::LOG_LEVEL::Debug, "[VisionEncoder] Expected input dims:", + std::vector(expected_dims.begin(), expected_dims.end())); auto image_tensor_result = image.toTensor(/*with_batch=*/expected_dims.size() == 4); if (!image_tensor_result.ok()) { return image_tensor_result.error(); } + rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info, + "[VisionEncoder] Running encode"); auto result = module_->execute(kVisionEncoderMethod, *image_tensor_result); if (!result.ok()) { return result.error(); diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.cpp b/packages/react-native-executorch/common/runner/multimodal_runner.cpp index 363f11d11..4960f7845 100644 --- a/packages/react-native-executorch/common/runner/multimodal_runner.cpp +++ b/packages/react-native-executorch/common/runner/multimodal_runner.cpp @@ -3,6 +3,7 @@ #include "constants.h" #include "util.h" #include +#include namespace example { @@ -30,14 +31,37 @@ bool MultimodalRunner::is_loaded() 
const { } Error MultimodalRunner::load_subcomponents() { + rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info, "[MultimodalRunner] Loading", + encoders_.size(), "encoder(s)"); // Load and validate all declared encoders — throws on mismatch for (auto &[type, encoder] : encoders_) { + rnexecutorch::log( + rnexecutorch::LOG_LEVEL::Debug, + "[MultimodalRunner] Loading encoder type:", static_cast(type)); encoder->load(); + rnexecutorch::log( + rnexecutorch::LOG_LEVEL::Info, + "[MultimodalRunner] Encoder loaded, type:", static_cast(type)); } llm::Stats *stats_ptr = &stats_; auto eos_ids = std::make_unique>(); - eos_ids->emplace(7); // fallback + const auto method_names = + ET_UNWRAP(module_->method_names(), "Failed reading method names"); + if (method_names.count(kEosIds)) { + for (const auto &eos_id : ET_UNWRAP(module_->execute(kEosIds))) { + eos_ids->emplace(static_cast(eos_id.toScalar().to())); + } + } + if (eos_ids->empty()) { + rnexecutorch::log(rnexecutorch::LOG_LEVEL::Warn, + "[MultimodalRunner] get_eos_ids not found in model, " + "falling back to {7}"); + eos_ids->emplace(7); + } else { + rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info, + "[MultimodalRunner] EOS IDs loaded:", *eos_ids); + } mm_decoder_runner_ = std::make_unique( module_, io_manager_.get()); @@ -49,6 +73,8 @@ Error MultimodalRunner::load_subcomponents() { ET_CHECK_OK_OR_RETURN_ERROR(mm_prefiller_->load()); ET_CHECK_OK_OR_RETURN_ERROR(mm_token_generator_->load()); + rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info, + "[MultimodalRunner] All subcomponents loaded successfully"); return Error::Ok; } @@ -74,6 +100,10 @@ Error MultimodalRunner::generate_internal( stats_.first_token_ms = llm::time_in_ms(); stats_.prompt_eval_end_ms = llm::time_in_ms(); stats_.num_prompt_tokens = pos_; + rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info, + "[MultimodalRunner] Prefill took", + stats_.prompt_eval_end_ms - stats_.inference_start_ms, + "ms for", pos_, "tokens"); int32_t resolved_max_new = 
static_cast(config_.max_context_length - pos_); diff --git a/packages/react-native-executorch/common/runner/text_runner.cpp b/packages/react-native-executorch/common/runner/text_runner.cpp index 279244855..fa2225f3d 100644 --- a/packages/react-native-executorch/common/runner/text_runner.cpp +++ b/packages/react-native-executorch/common/runner/text_runner.cpp @@ -4,6 +4,7 @@ #include "util.h" #include #include +#include namespace example { @@ -11,10 +12,10 @@ using namespace executorch::extension::llm; using ::executorch::extension::Module; using ::executorch::runtime::Error; -TextRunner::TextRunner(Module *module, std::unique_ptr owned_module, +TextRunner::TextRunner(std::unique_ptr owned_module, const std::string &tokenizer_path, const llm::GenerationConfig &config) - : BaseLLMRunner(module, std::move(owned_module), tokenizer_path, config) {} + : BaseLLMRunner(nullptr, std::move(owned_module), tokenizer_path, config) {} bool TextRunner::is_loaded() const { return module_ && module_->is_loaded() && tokenizer_ && @@ -43,6 +44,9 @@ Error TextRunner::load_subcomponents() { text_decoder_runner_ = std::make_unique( module_, io_manager_.get(), config_.temperature, config_.topp); + rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info, + "[TextRunner] Parallel prefill (enable_dynamic_shape):", + config_.enable_dynamic_shape); text_prefiller_ = std::make_unique( text_decoder_runner_.get(), config_.enable_kv_cache, config_.enable_dynamic_shape, config_.max_seq_len); @@ -116,6 +120,9 @@ Error TextRunner::generate_internal( auto prefill_res = text_prefiller_->prefill(prompt_tokens, pos_); stats_.first_token_ms = llm::time_in_ms(); stats_.prompt_eval_end_ms = llm::time_in_ms(); + rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info, "[TextRunner] Prefill took", + stats_.prompt_eval_end_ms - stats_.inference_start_ms, + "ms for", num_prompt_tokens, "tokens"); ET_CHECK_OK_OR_RETURN_ERROR(prefill_res.error()); uint64_t cur_token = prefill_res.get(); diff --git 
a/packages/react-native-executorch/common/runner/text_runner.h b/packages/react-native-executorch/common/runner/text_runner.h index e590f4c88..17394ee3f 100644 --- a/packages/react-native-executorch/common/runner/text_runner.h +++ b/packages/react-native-executorch/common/runner/text_runner.h @@ -11,7 +11,6 @@ namespace example { class TextRunner : public BaseLLMRunner { public: explicit TextRunner( - ::executorch::extension::Module *module, std::unique_ptr<::executorch::extension::Module> owned_module, const std::string &tokenizer_path, const ::executorch::extension::llm::GenerationConfig &config = { diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts index 325a13133..0dec9a6b0 100644 --- a/packages/react-native-executorch/src/constants/modelUrls.ts +++ b/packages/react-native-executorch/src/constants/modelUrls.ts @@ -372,21 +372,10 @@ export const LFM2_5_1_2B_INSTRUCT_QUANTIZED = { }; // LFM2.5-VL-1.6B (Vision-Language) -const LFM2_VL_1_6B_MODEL = `https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/lfm2p5_vl_1.6B_xnnpack.pte`; -const LFM2_VL_1_6B_QUANTIZED_MODEL = `https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/lfm2p5_vl_1.6B_quantized_xnnpack.pte`; +const LFM2_VL_1_6B_QUANTIZED_MODEL = `https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/lfm2_5_vl_quantized_xnnpack_v2.pte`; const LFM2_VL_TOKENIZER = `https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_2.5.json`; const LFM2_VL_TOKENIZER_CONFIG = `https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_config_2_5.json`; -/** - * @category Models - VLM - */ -export const LFM2_VL_1_6B = { - capabilities: ['vision'] as const, - modelSource: LFM2_VL_1_6B_MODEL, - tokenizerSource: LFM2_VL_TOKENIZER, - tokenizerConfigSource: LFM2_VL_TOKENIZER_CONFIG, -}; - /** * @category Models - VLM */ diff --git 
a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts index c2fcd01bc..877f3a02d 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts @@ -33,6 +33,7 @@ export function useLLM({ const [isGenerating, setIsGenerating] = useState(false); const [downloadProgress, setDownloadProgress] = useState(0); const [error, setError] = useState(null); + const capabilitiesKey = model.capabilities?.join(',') ?? ''; const tokenCallback = useCallback((newToken: string) => { setToken(newToken); @@ -74,12 +75,13 @@ export function useLLM({ controllerInstance.delete(); } }; + // eslint-disable-next-line react-hooks/exhaustive-deps }, [ controllerInstance, model.modelSource, model.tokenizerSource, model.tokenizerConfigSource, - model.capabilities, + capabilitiesKey, // intentional: serialized string to avoid array reference re-runs preventLoad, ]); From b3ce27eaae5771c17a62af2205fec2710fb59bdb Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Thu, 5 Mar 2026 14:51:19 +0100 Subject: [PATCH 39/46] refactor: replace Image class with ImagePath + VisionEncoder embedding cache Co-Authored-By: Claude Sonnet 4.6 --- .../common/rnexecutorch/models/llm/LLM.cpp | 34 +------ .../common/rnexecutorch/models/llm/LLM.h | 5 -- .../common/runner/encoders/vision_encoder.cpp | 65 +++++++++++--- .../common/runner/encoders/vision_encoder.h | 5 ++ .../common/runner/image.h | 88 ------------------- .../common/runner/multimodal_input.h | 25 +++--- .../common/runner/multimodal_prefiller.cpp | 41 ++------- .../common/runner/multimodal_prefiller.h | 5 +- .../common/runner/multimodal_runner.cpp | 8 +- 9 files changed, 90 insertions(+), 186 deletions(-) delete mode 100644 packages/react-native-executorch/common/runner/image.h diff --git 
a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp index 8d046d6e9..8cdeffbe0 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp @@ -4,10 +4,8 @@ #include #include #include -#include #include #include -#include #include #include @@ -18,34 +16,6 @@ using namespace facebook; using executorch::extension::module::Module; using executorch::runtime::Error; -// LFM2-VL vision encoder expects [1, 3, 512, 512] NCHW float32, values [0,255] -static constexpr int kImageSize = 512; -static constexpr int kImageChannels = 3; - -static llm::Image loadImageForVLM(const std::string &imagePath) { - cv::Mat mat = image_processing::readImage(imagePath); - cv::resize(mat, mat, cv::Size(kImageSize, kImageSize)); - cv::cvtColor(mat, mat, cv::COLOR_BGR2RGB); - - std::vector chw(kImageChannels * kImageSize * kImageSize); - const int pixelCount = kImageSize * kImageSize; - for (int i = 0; i < pixelCount; ++i) { - cv::Vec3b px = mat.at(i / kImageSize, i % kImageSize); - for (int c = 0; c < kImageChannels; ++c) { - chw[c * pixelCount + i] = static_cast(px[c]); - } - } - return llm::Image(std::move(chw), kImageSize, kImageSize, kImageChannels); -} - -const llm::Image &LLM::getOrLoadImage(const std::string &path) { - auto it = imageCache_.find(path); - if (it != imageCache_.end()) { - return it->second; - } - return imageCache_.emplace(path, loadImageForVLM(path)).first->second; -} - LLM::LLM(const std::string &modelSource, const std::string &tokenizerSource, std::vector capabilities, std::shared_ptr callInvoker) @@ -141,8 +111,7 @@ std::string LLM::generate(std::string prompt, RnExecutorchErrorCode::InvalidUserInput, "More placeholders in prompt than image paths provided"); } - const llm::Image &img = getOrLoadImage(imagePaths[imageIdx++]); - inputs.push_back(llm::make_image_input(img)); + 
inputs.push_back(llm::make_image_input(imagePaths[imageIdx++])); searchPos = found + kImageTokenLen; } @@ -183,7 +152,6 @@ void LLM::reset() { "Can't reset a model that's not loaded"); } runner_->reset(); - imageCache_.clear(); } size_t LLM::getGeneratedTokenCount() const noexcept { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h index 60c8bc148..b341e3811 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h @@ -8,7 +8,6 @@ #include #include #include -#include namespace rnexecutorch { namespace models::llm { @@ -45,10 +44,6 @@ class LLM : public BaseModel { private: std::unique_ptr runner_; - std::unordered_map - imageCache_; - const executorch::extension::llm::Image & - getOrLoadImage(const std::string &path); }; } // namespace models::llm diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp index 0e49abe5a..44beb2a7c 100644 --- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp +++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp @@ -3,8 +3,11 @@ #include #include +#include #include -#include + +#include +#include namespace executorch::extension::llm { @@ -12,6 +15,10 @@ using ::executorch::runtime::Error; using ::executorch::runtime::EValue; using ::executorch::runtime::Result; +// LFM2-VL vision encoder expects [1, 3, H, W] NCHW float32, values [0, 255] +static constexpr int kImageSize = 512; +static constexpr int kImageChannels = 3; + VisionEncoder::VisionEncoder(::executorch::extension::Module *module) : module_(module) {} @@ -57,32 +64,62 @@ Result VisionEncoder::encode(const MultimodalInput &input) { if (!input.is_image()) { return Error::InvalidArgument; } - const Image &image = 
input.get_image(); + + const std::string &path = input.get_image_path(); + + // Return cached embedding if available + auto it = embedding_cache_.find(path); + if (it != embedding_cache_.end()) { + rnexecutorch::log(rnexecutorch::LOG_LEVEL::Debug, + "[VisionEncoder] Cache hit for:", path); + return it->second; + } + + // Load and preprocess image: resize → BGR→RGB → HWC uint8 → CHW float32 + cv::Mat mat = rnexecutorch::image_processing::readImage(path); + cv::resize(mat, mat, cv::Size(kImageSize, kImageSize)); + cv::cvtColor(mat, mat, cv::COLOR_BGR2RGB); + + std::vector chw(kImageChannels * kImageSize * kImageSize); + const int pixelCount = kImageSize * kImageSize; + for (int i = 0; i < pixelCount; ++i) { + cv::Vec3b px = mat.at(i / kImageSize, i % kImageSize); + for (int c = 0; c < kImageChannels; ++c) { + chw[c * pixelCount + i] = static_cast(px[c]); + } + } + + // Determine expected input shape (with or without batch dim) auto method_meta_result = module_->method_meta(kVisionEncoderMethod); if (!method_meta_result.ok()) { return method_meta_result.error(); } - auto &method_meta = *method_meta_result; - auto input_meta_result = method_meta.input_tensor_meta(0); + auto input_meta_result = method_meta_result->input_tensor_meta(0); if (!input_meta_result.ok()) { return input_meta_result.error(); } auto expected_dims = input_meta_result->sizes(); - rnexecutorch::log( - rnexecutorch::LOG_LEVEL::Debug, "[VisionEncoder] Expected input dims:", - std::vector(expected_dims.begin(), expected_dims.end())); - auto image_tensor_result = - image.toTensor(/*with_batch=*/expected_dims.size() == 4); - if (!image_tensor_result.ok()) { - return image_tensor_result.error(); + const bool with_batch = expected_dims.size() == 4; + + std::vector<::executorch::aten::SizesType> sizes = {kImageChannels, + kImageSize, kImageSize}; + if (with_batch) { + sizes.insert(sizes.begin(), 1); } + + auto image_tensor = ::executorch::extension::from_blob( + chw.data(), sizes, 
::executorch::aten::ScalarType::Float); + rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info, - "[VisionEncoder] Running encode"); - auto result = module_->execute(kVisionEncoderMethod, *image_tensor_result); + "[VisionEncoder] Running encode for:", path); + auto result = module_->execute(kVisionEncoderMethod, image_tensor); if (!result.ok()) { return result.error(); } - return (*result)[0]; + + EValue embedding = (*result)[0]; + embedding_cache_.emplace(path, embedding); + return embedding; } } // namespace executorch::extension::llm diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h index 5b3dd0aec..5af0491bd 100644 --- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h +++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h @@ -3,7 +3,10 @@ #include "iencoder.h" #include +#include #include +#include +#include namespace executorch::extension::llm { @@ -18,6 +21,8 @@ class VisionEncoder : public IEncoder { private: ::executorch::extension::Module *module_; + std::unordered_map + embedding_cache_; }; } // namespace executorch::extension::llm diff --git a/packages/react-native-executorch/common/runner/image.h b/packages/react-native-executorch/common/runner/image.h deleted file mode 100644 index 86373ca91..000000000 --- a/packages/react-native-executorch/common/runner/image.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -// Ported from executorch/extension/llm/runner/image.h - -#pragma once - -#include -#include -#include - -#include -#include - -namespace executorch { -namespace extension { -namespace llm { - -class Image { -public: - Image() : width_(0), height_(0), channels_(0) {} - - Image(std::vector &&data, int32_t width, int32_t height, - int32_t channels) - : data_(std::move(data)), width_(width), height_(height), - channels_(channels) {} - - Image(std::vector &&data, int32_t width, int32_t height, - int32_t channels) - : data_(std::move(data)), width_(width), height_(height), - channels_(channels) {} - - int32_t width() const { return width_; } - int32_t height() const { return height_; } - int32_t channels() const { return channels_; } - - bool is_uint8() const { - return std::holds_alternative>(data_); - } - bool is_float() const { - return std::holds_alternative>(data_); - } - - const std::vector &get_uint8_data() const & { - return std::get>(data_); - } - const std::vector &get_float_data() const & { - return std::get>(data_); - } - std::vector &get_float_data() & { - return std::get>(data_); - } - - ::executorch::runtime::Result<::executorch::extension::TensorPtr> - toTensor(bool with_batch = false) const { - std::vector<::executorch::aten::SizesType> sizes = {channels(), height(), - width()}; - if (with_batch) { - sizes.insert(sizes.begin(), 1); - } - if (is_float()) { - return ::executorch::extension::from_blob( - const_cast(get_float_data().data()), sizes, - ::executorch::aten::ScalarType::Float); - } else if (is_uint8()) { - return ::executorch::extension::from_blob( - const_cast(get_uint8_data().data()), sizes, - ::executorch::aten::ScalarType::Byte); - } - ET_LOG(Error, "Image data is not initialized."); - return ::executorch::runtime::Error::NotSupported; - } - -private: - std::variant, std::vector> data_; - int32_t width_; - int32_t height_; - int32_t channels_; -}; - -} // namespace llm -} // namespace extension -} // namespace executorch diff --git 
a/packages/react-native-executorch/common/runner/multimodal_input.h b/packages/react-native-executorch/common/runner/multimodal_input.h index 4ce588db6..1d56f5f5a 100644 --- a/packages/react-native-executorch/common/runner/multimodal_input.h +++ b/packages/react-native-executorch/common/runner/multimodal_input.h @@ -11,7 +11,6 @@ #pragma once -#include #include #include #include @@ -20,6 +19,11 @@ namespace executorch { namespace extension { namespace llm { +// Tagged struct to distinguish image paths from text strings in the variant. +struct ImagePath { + std::string path; +}; + class MultimodalInput { public: explicit MultimodalInput(const std::string &text) : data_(text) {} @@ -28,8 +32,8 @@ class MultimodalInput { : data_(tokens) {} explicit MultimodalInput(std::vector &&tokens) : data_(std::move(tokens)) {} - explicit MultimodalInput(const Image &image) : data_(image) {} - explicit MultimodalInput(Image &&image) : data_(std::move(image)) {} + explicit MultimodalInput(ImagePath image_path) + : data_(std::move(image_path)) {} MultimodalInput(const MultimodalInput &) = default; MultimodalInput &operator=(const MultimodalInput &) = default; @@ -43,17 +47,19 @@ class MultimodalInput { return std::holds_alternative>(data_); } bool is_image() const noexcept { - return std::holds_alternative(data_); + return std::holds_alternative(data_); } const std::string &get_text() const & { return std::get(data_); } const std::vector &get_tokens() const & { return std::get>(data_); } - const Image &get_image() const & { return std::get(data_); } + const std::string &get_image_path() const & { + return std::get(data_).path; + } private: - std::variant, Image> data_; + std::variant, ImagePath> data_; }; inline MultimodalInput make_text_input(const std::string &text) noexcept { @@ -62,11 +68,8 @@ inline MultimodalInput make_text_input(const std::string &text) noexcept { inline MultimodalInput make_text_input(std::string &&text) noexcept { return MultimodalInput(std::move(text)); } 
-inline MultimodalInput make_image_input(const Image &image) noexcept { - return MultimodalInput(image); -} -inline MultimodalInput make_image_input(Image &&image) noexcept { - return MultimodalInput(std::move(image)); +inline MultimodalInput make_image_input(std::string path) noexcept { + return MultimodalInput(ImagePath{std::move(path)}); } } // namespace llm diff --git a/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp b/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp index c39c7cc0f..a9a4715a7 100644 --- a/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp +++ b/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp @@ -24,9 +24,10 @@ using ::executorch::runtime::Result; MultimodalPrefiller::MultimodalPrefiller( Module *module, MultimodalDecoderRunner *decoder_runner, - tokenizers::HFTokenizer *tokenizer, IOManager *io_manager) + tokenizers::HFTokenizer *tokenizer, IOManager *io_manager, + IEncoder *image_encoder) : module_(module), decoder_runner_(decoder_runner), tokenizer_(tokenizer), - io_manager_(io_manager) {} + io_manager_(io_manager), image_encoder_(image_encoder) {} Result MultimodalPrefiller::prefill(const MultimodalInput &input, int64_t &start_pos) { @@ -36,37 +37,11 @@ Result MultimodalPrefiller::prefill(const MultimodalInput &input, TensorPtr sliced_embed_storage; if (input.is_image()) { - const Image &image = input.get_image(); - - // Query input dtype expected by vision_encoder. 
- auto method_meta_result = module_->method_meta(kVisionEncoderMethod); - ET_CHECK_OK_OR_RETURN_ERROR(method_meta_result.error(), - "Failed to get method_meta for %s", - kVisionEncoderMethod); - auto &method_meta = *method_meta_result; - - ET_CHECK_OR_RETURN_ERROR(method_meta.num_inputs() > 0, InvalidArgument, - "vision_encoder has no inputs"); - auto input_meta_result = method_meta.input_tensor_meta(0); - ET_CHECK_OK_OR_RETURN_ERROR(input_meta_result.error(), - "Cannot get vision_encoder input meta at 0"); - auto expected_dtype = input_meta_result->scalar_type(); - - ET_CHECK_OR_RETURN_ERROR( - expected_dtype == ::executorch::aten::ScalarType::Float && - image.is_float(), - InvalidArgument, "vision_encoder expects float32 image data"); - - auto expected_dims = input_meta_result->sizes(); - auto image_tensor_result = - image.toTensor(/*with_batch=*/expected_dims.size() == 4); - ET_CHECK_OK_OR_RETURN_ERROR(image_tensor_result.error(), - "Failed to convert image to tensor"); - - auto image_encoder_result = - module_->execute(kVisionEncoderMethod, *image_tensor_result); - ET_CHECK_OK_OR_RETURN_ERROR(image_encoder_result.error()); - encoder_output = (*image_encoder_result)[0]; + ET_CHECK_OR_RETURN_ERROR(image_encoder_ != nullptr, InvalidState, + "No image encoder registered"); + auto encode_result = image_encoder_->encode(input); + ET_CHECK_OK_OR_RETURN_ERROR(encode_result.error(), "Image encoding failed"); + encoder_output = *encode_result; } else if (input.is_text() || input.is_tokens()) { std::vector tokens; diff --git a/packages/react-native-executorch/common/runner/multimodal_prefiller.h b/packages/react-native-executorch/common/runner/multimodal_prefiller.h index ee0f99a5b..4effee7b7 100644 --- a/packages/react-native-executorch/common/runner/multimodal_prefiller.h +++ b/packages/react-native-executorch/common/runner/multimodal_prefiller.h @@ -14,6 +14,7 @@ #include "multimodal_input.h" #include #include +#include namespace executorch { namespace extension { @@ 
-26,7 +27,8 @@ class MultimodalPrefiller { explicit MultimodalPrefiller(Module *module, MultimodalDecoderRunner *decoder_runner, tokenizers::HFTokenizer *tokenizer, - IOManager *io_manager); + IOManager *io_manager, + IEncoder *image_encoder = nullptr); // Prefill one input segment. Updates start_pos in-place. // Returns the first predicted token after this segment. @@ -41,6 +43,7 @@ class MultimodalPrefiller { MultimodalDecoderRunner *decoder_runner_; tokenizers::HFTokenizer *tokenizer_; IOManager *io_manager_; + IEncoder *image_encoder_; }; } // namespace llm diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.cpp b/packages/react-native-executorch/common/runner/multimodal_runner.cpp index 4960f7845..3d70e04b5 100644 --- a/packages/react-native-executorch/common/runner/multimodal_runner.cpp +++ b/packages/react-native-executorch/common/runner/multimodal_runner.cpp @@ -65,8 +65,14 @@ Error MultimodalRunner::load_subcomponents() { mm_decoder_runner_ = std::make_unique( module_, io_manager_.get()); + llm::IEncoder *image_encoder = nullptr; + auto enc_it = encoders_.find(llm::MultimodalType::Image); + if (enc_it != encoders_.end()) { + image_encoder = enc_it->second.get(); + } mm_prefiller_ = std::make_unique( - module_, mm_decoder_runner_.get(), tokenizer_.get(), io_manager_.get()); + module_, mm_decoder_runner_.get(), tokenizer_.get(), io_manager_.get(), + image_encoder); mm_token_generator_ = std::make_unique( tokenizer_.get(), mm_decoder_runner_.get(), /*use_kv_cache=*/true, std::move(eos_ids), stats_ptr); From ce6856d6f4ad4f26795ab6e3be3399c8667f4883 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Thu, 5 Mar 2026 14:54:57 +0100 Subject: [PATCH 40/46] test: add TextRunnerTests and VLMTests suites, register in CMake and run_tests.sh --- .../common/rnexecutorch/tests/CMakeLists.txt | 24 ++++ .../tests/integration/LLMTest.cpp | 45 ------- .../integration/MultimodalRunnerTest.cpp | 118 ++++++++++++++++++ 
.../tests/integration/TextRunnerTest.cpp | 101 +++++++++++++++ .../common/rnexecutorch/tests/run_tests.sh | 4 + 5 files changed, 247 insertions(+), 45 deletions(-) create mode 100644 packages/react-native-executorch/common/rnexecutorch/tests/integration/MultimodalRunnerTest.cpp create mode 100644 packages/react-native-executorch/common/rnexecutorch/tests/integration/TextRunnerTest.cpp diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt index 159f00159..ebf390691 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt +++ b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt @@ -223,6 +223,30 @@ add_rn_test(LLMTests integration/LLMTest.cpp LIBS tokenizers_deps opencv_deps ) +add_rn_test(TextRunnerTests integration/TextRunnerTest.cpp + SOURCES + ${COMMON_DIR}/runner/base_llm_runner.cpp + ${COMMON_DIR}/runner/text_runner.cpp + ${COMMON_DIR}/runner/text_prefiller.cpp + ${COMMON_DIR}/runner/text_decoder_runner.cpp + ${COMMON_DIR}/runner/sampler.cpp + ${COMMON_DIR}/runner/arange_util.cpp + LIBS tokenizers_deps +) + +add_rn_test(VLMTests integration/MultimodalRunnerTest.cpp + SOURCES + ${COMMON_DIR}/runner/base_llm_runner.cpp + ${COMMON_DIR}/runner/multimodal_runner.cpp + ${COMMON_DIR}/runner/multimodal_prefiller.cpp + ${COMMON_DIR}/runner/text_decoder_runner.cpp + ${COMMON_DIR}/runner/sampler.cpp + ${COMMON_DIR}/runner/arange_util.cpp + ${COMMON_DIR}/runner/encoders/vision_encoder.cpp + ${IMAGE_UTILS_SOURCES} + LIBS tokenizers_deps opencv_deps +) + add_rn_test(TextToImageTests integration/TextToImageTest.cpp SOURCES ${RNEXECUTORCH_DIR}/models/text_to_image/TextToImage.cpp diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp index cad94fa10..65bd1917a 100644 --- 
a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp @@ -212,48 +212,3 @@ TEST(BaseLLMRunnerTest, ResetZerosPos) { runner.reset(); EXPECT_EQ(runner.pos_, 0); } - -#include - -TEST(TextRunnerTest, LoadsSuccessfully) { - auto module = std::make_unique<::executorch::extension::Module>( - "smolLm2_135M_8da4w.pte", - ::executorch::extension::Module::LoadMode::File); - - example::TextRunner runner(module.get(), nullptr, "smollm_tokenizer.json"); - auto err = runner.load(); - EXPECT_EQ(err, ::executorch::runtime::Error::Ok); - EXPECT_TRUE(runner.is_loaded()); -} - -TEST(TextRunnerTest, SetTemperaturePropagatesToDecoder) { - auto module = std::make_unique<::executorch::extension::Module>( - "smolLm2_135M_8da4w.pte", - ::executorch::extension::Module::LoadMode::File); - - example::TextRunner runner(module.get(), nullptr, "smollm_tokenizer.json"); - runner.load(); - EXPECT_NO_THROW(runner.set_temperature(0.5f)); - EXPECT_FLOAT_EQ(runner.config_.temperature, 0.5f); -} - -#include - -TEST(MultimodalRunnerTest, LoadFailsWithClearErrorWhenCapabilityMismatch) { - // smolLm2_135M_8da4w.pte is text-only — declaring vision capability should - // throw - auto module = std::make_unique<::executorch::extension::Module>( - "smolLm2_135M_8da4w.pte", - ::executorch::extension::Module::LoadMode::File); - - std::map> - encoders; - encoders[executorch::extension::llm::MultimodalType::Image] = - std::make_unique(module.get()); - - example::MultimodalRunner runner(std::move(module), "smollm_tokenizer.json", - std::move(encoders)); - - EXPECT_THROW(runner.load(), rnexecutorch::RnExecutorchError); -} diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/MultimodalRunnerTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/MultimodalRunnerTest.cpp new file mode 100644 index 000000000..fbd9da03c --- /dev/null +++ 
b/packages/react-native-executorch/common/rnexecutorch/tests/integration/MultimodalRunnerTest.cpp @@ -0,0 +1,118 @@ +#include +#include +#include + +#include +#include +#include +#include +#include + +using ::executorch::extension::Module; +using ::executorch::extension::llm::MultimodalType; +using ::executorch::extension::llm::VisionEncoder; +using ::executorch::runtime::Error; + +constexpr auto kTextModel = "smolLm2_135M_8da4w.pte"; +constexpr auto kTextTokenizer = "smollm_tokenizer.json"; +constexpr auto kVLMModel = "lfm2_5_vl_quantized_xnnpack_v2.pte"; +constexpr auto kVLMTokenizer = "tokenizer_2.5.json"; +constexpr auto kTestImage = "test_image.jpg"; + +static std::map> +makeVisionEncoders(Module *module) { + std::map> + encoders; + encoders[MultimodalType::Image] = std::make_unique(module); + return encoders; +} + +// ============================================================================ +// Error-path tests (text-only SmolLM2 — no vision_encoder method) +// ============================================================================ + +TEST(MultimodalRunnerTest, LoadFailsWhenVisionEncoderMissing) { + auto module = std::make_unique(kTextModel, Module::LoadMode::File); + auto encoders = makeVisionEncoders(module.get()); + example::MultimodalRunner runner(std::move(module), kTextTokenizer, + std::move(encoders)); + EXPECT_THROW(runner.load(), rnexecutorch::RnExecutorchError); +} + +TEST(MultimodalRunnerTest, IsLoadedReturnsFalseBeforeLoad) { + auto module = std::make_unique(kTextModel, Module::LoadMode::File); + auto encoders = makeVisionEncoders(module.get()); + example::MultimodalRunner runner(std::move(module), kTextTokenizer, + std::move(encoders)); + EXPECT_FALSE(runner.is_loaded()); +} + +// ============================================================================ +// Integration tests (require VLM .pte) +// ============================================================================ + +class VLMTest : public ::testing::Test { +protected: + 
std::unique_ptr runner_; + + void SetUp() override { + auto module = std::make_unique(kVLMModel, Module::LoadMode::File); + auto encoders = makeVisionEncoders(module.get()); + runner_ = std::make_unique( + std::move(module), kVLMTokenizer, std::move(encoders)); + auto err = runner_->load(); + ASSERT_EQ(err, Error::Ok) << "VLM model load failed"; + } +}; + +TEST_F(VLMTest, LoadSucceedsWithRealVLMModel) { + EXPECT_TRUE(runner_->is_loaded()); +} + +TEST_F(VLMTest, MetadataApplied_KVCache) { + EXPECT_TRUE(runner_->config_.enable_kv_cache); +} + +TEST_F(VLMTest, GenerateTextOnlyInputWorks) { + runner_->set_temperature(0.0f); + auto err = runner_->generate( + "<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\n"); + EXPECT_EQ(err, Error::Ok); + EXPECT_GT(runner_->pos_, 0); +} + +TEST_F(VLMTest, GenerateWithImageProducesTokens) { + runner_->set_temperature(0.0f); + + std::vector<::executorch::extension::llm::MultimodalInput> inputs = { + ::executorch::extension::llm::make_image_input(kTestImage), + ::executorch::extension::llm::make_text_input( + "<|im_start|>user\nDescribe this image briefly." 
+ "<|im_end|>\n<|im_start|>assistant\n"), + }; + + auto err = runner_->generate_internal(inputs, nullptr); + EXPECT_EQ(err, Error::Ok); + EXPECT_GT(runner_->pos_, 0); +} + +TEST_F(VLMTest, EmbeddingCacheHitOnRepeatedImage) { + runner_->set_temperature(0.0f); + + // First call — cache miss, runs vision_encoder + std::vector<::executorch::extension::llm::MultimodalInput> inputs = { + ::executorch::extension::llm::make_image_input(kTestImage), + ::executorch::extension::llm::make_text_input( + "<|im_start|>user\nWhat is this?<|im_end|>\n<|im_start|>assistant\n"), + }; + runner_->generate_internal(inputs, nullptr); + runner_->reset(); + + // Second call — same image path, should hit cache + // (no functional assertion possible without instrumenting the encoder, + // but this at least verifies it doesn't crash or error) + auto err = runner_->generate_internal(inputs, nullptr); + EXPECT_EQ(err, Error::Ok); +} diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextRunnerTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextRunnerTest.cpp new file mode 100644 index 000000000..3253758cc --- /dev/null +++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextRunnerTest.cpp @@ -0,0 +1,101 @@ +#include +#include + +#include +#include +#include + +using ::executorch::extension::Module; +using ::executorch::runtime::Error; + +constexpr auto kTextModel = "smolLm2_135M_8da4w.pte"; +constexpr auto kTextTokenizer = "smollm_tokenizer.json"; +constexpr auto kSystemPrompt = "You are a helpful assistant. 
Assist the user " + "to the best of your abilities."; + +static std::string formatChatML(const std::string &systemPrompt, + const std::string &userMessage) { + return "<|im_start|>system\n" + systemPrompt + "<|im_end|>\n" + + "<|im_start|>user\n" + userMessage + "<|im_end|>\n" + + "<|im_start|>assistant\n"; +} + +TEST(TextRunnerTest, ConstructorAndLoadSucceeds) { + auto module = std::make_unique(kTextModel, Module::LoadMode::File); + example::TextRunner runner(std::move(module), kTextTokenizer); + auto err = runner.load(); + EXPECT_EQ(err, Error::Ok); + EXPECT_TRUE(runner.is_loaded()); +} + +TEST(TextRunnerTest, MetadataApplied_EnableDynamicShape) { + // SmolLM2-135M exports enable_dynamic_shape = 1 + // After load(), config_.enable_dynamic_shape must be true (our fix) + auto module = std::make_unique(kTextModel, Module::LoadMode::File); + example::TextRunner runner(std::move(module), kTextTokenizer); + runner.load(); + EXPECT_TRUE(runner.config_.enable_dynamic_shape); +} + +TEST(TextRunnerTest, MetadataApplied_KVCache) { + // SmolLM2-135M exports use_kv_cache = 1 + auto module = std::make_unique(kTextModel, Module::LoadMode::File); + example::TextRunner runner(std::move(module), kTextTokenizer); + runner.load(); + EXPECT_TRUE(runner.config_.enable_kv_cache); +} + +TEST(TextRunnerTest, SetTemperaturePropagatesAfterLoad) { + auto module = std::make_unique(kTextModel, Module::LoadMode::File); + example::TextRunner runner(std::move(module), kTextTokenizer); + runner.load(); + runner.set_temperature(0.3f); + EXPECT_FLOAT_EQ(runner.config_.temperature, 0.3f); +} + +TEST(TextRunnerTest, ResetZerosPos) { + auto module = std::make_unique(kTextModel, Module::LoadMode::File); + example::TextRunner runner(std::move(module), kTextTokenizer); + runner.pos_ = 42; + runner.reset(); + EXPECT_EQ(runner.pos_, 0); +} + +TEST(TextRunnerTest, GenerateProducesTokens) { + auto module = std::make_unique(kTextModel, Module::LoadMode::File); + example::TextRunner runner(std::move(module), 
kTextTokenizer); + runner.load(); + runner.set_temperature(0.0f); + + std::string prompt = formatChatML(kSystemPrompt, "Say: hello"); + auto err = runner.generate(prompt); + EXPECT_EQ(err, Error::Ok); + EXPECT_GT(runner.pos_, 0); +} + +TEST(TextRunnerTest, ParallelPrefillEnabled) { + // Confirms the fix: enable_dynamic_shape from metadata now unconditionally + // applied + auto module = std::make_unique(kTextModel, Module::LoadMode::File); + example::TextRunner runner(std::move(module), kTextTokenizer); + runner.load(); + EXPECT_TRUE(runner.config_.enable_dynamic_shape); +} + +TEST(TextRunnerTest, StopHaltsGeneration) { + auto module = std::make_unique(kTextModel, Module::LoadMode::File); + example::TextRunner runner(std::move(module), kTextTokenizer); + runner.load(); + runner.set_temperature(0.0f); + + int token_count = 0; + std::string prompt = formatChatML(kSystemPrompt, "Count to one hundred"); + runner.generate(prompt, {}, [&](const std::string &) { + token_count++; + if (token_count >= 3) { + runner.stop(); + } + }); + EXPECT_GT(token_count, 0); + EXPECT_LE(token_count, 5); // stopped early +} diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh index 360aa9d11..324841d9b 100755 --- a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh +++ b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh @@ -29,6 +29,8 @@ TEST_EXECUTABLES=( "TokenizerModuleTests" "SpeechToTextTests" "LLMTests" + "TextRunnerTests" + "VLMTests" "ImageSegmentationTests" "TextToImageTests" "OCRTests" @@ -60,6 +62,8 @@ MODELS=( "whisper_tokenizer.json|https://huggingface.co/software-mansion/react-native-executorch-whisper-tiny.en/resolve/v0.6.0/tokenizer.json" "smolLm2_135M_8da4w.pte|https://huggingface.co/software-mansion/react-native-executorch-smolLm-2/resolve/v0.6.0/smolLm-2-135M/quantized/smolLm2_135M_8da4w.pte" 
"smollm_tokenizer.json|https://huggingface.co/software-mansion/react-native-executorch-smolLm-2/resolve/v0.6.0/tokenizer.json" + "lfm2_5_vl_quantized_xnnpack_v2.pte|https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/lfm2_5_vl_quantized_xnnpack_v2.pte" + "tokenizer_2.5.json|https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_2.5.json" "deeplabV3_xnnpack_fp32.pte|https://huggingface.co/software-mansion/react-native-executorch-deeplab-v3/resolve/v0.6.0/xnnpack/deeplabV3_xnnpack_fp32.pte" "xnnpack_crnn_english.pte|https://huggingface.co/software-mansion/react-native-executorch-recognizer-crnn.en/resolve/v0.7.0/xnnpack/english/xnnpack_crnn_english.pte" "xnnpack_craft_quantized.pte|https://huggingface.co/software-mansion/react-native-executorch-detector-craft/resolve/v0.7.0/xnnpack/xnnpack_craft.pte" From 4184bb3dbf132d7946643e030f6df01c3400f201 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Thu, 5 Mar 2026 15:44:42 +0100 Subject: [PATCH 41/46] refactor: unify multimodal/text paths in sendMessage, add getVisualTokenCount JSI Co-Authored-By: Claude Sonnet 4.6 --- .../host_objects/ModelHostObject.h | 5 ++ .../common/rnexecutorch/models/llm/LLM.cpp | 7 ++ .../common/rnexecutorch/models/llm/LLM.h | 1 + .../common/runner/base_llm_runner.h | 1 + .../common/runner/encoders/iencoder.h | 4 ++ .../common/runner/encoders/vision_encoder.cpp | 20 ++++++ .../common/runner/encoders/vision_encoder.h | 1 + .../common/runner/multimodal_runner.cpp | 8 +++ .../common/runner/multimodal_runner.h | 3 +- .../src/controllers/LLMController.ts | 68 ++++++++----------- .../react-native-executorch/src/types/llm.ts | 6 +- 11 files changed, 81 insertions(+), 43 deletions(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h index a4af6eb8f..35b34ed56 100644 --- 
a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h @@ -159,6 +159,11 @@ template class ModelHostObject : public JsiHostObject { std::string, std::vector, std::shared_ptr)>(&Model::generate)>, "generateMultimodal")); + + addFunctions(JSI_EXPORT_FUNCTION( + ModelHostObject, + synchronousHostFunction<&Model::getVisualTokenCount>, + "getVisualTokenCount")); } if constexpr (meta::SameAs) { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp index 8cdeffbe0..a634b372b 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp @@ -166,6 +166,13 @@ size_t LLM::getPromptTokenCount() const noexcept { return runner_->stats_.num_prompt_tokens; } +int32_t LLM::getVisualTokenCount() const { + if (!runner_ || !runner_->is_loaded()) { + return 0; + } + return runner_->get_visual_token_count(); +} + int32_t LLM::countTextTokens(std::string text) const { if (!runner_ || !runner_->is_loaded()) { throw RnExecutorchError( diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h index b341e3811..e73b7771d 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h @@ -35,6 +35,7 @@ class LLM : public BaseModel { size_t getGeneratedTokenCount() const noexcept; size_t getPromptTokenCount() const noexcept; int32_t countTextTokens(std::string text) const; + int32_t getVisualTokenCount() const; size_t getMemoryLowerBound() const noexcept; void setCountInterval(size_t countInterval); void setTemperature(float temperature); diff --git 
a/packages/react-native-executorch/common/runner/base_llm_runner.h b/packages/react-native-executorch/common/runner/base_llm_runner.h index 7d2eef285..d888256ec 100644 --- a/packages/react-native-executorch/common/runner/base_llm_runner.h +++ b/packages/react-native-executorch/common/runner/base_llm_runner.h @@ -51,6 +51,7 @@ class BaseLLMRunner { void reset(); int32_t count_text_tokens(const std::string &text) const; int32_t get_max_context_length() const; + virtual int32_t get_visual_token_count() const { return 0; } // Writes config_ then propagates to subclass impl void set_temperature(float temperature) noexcept; diff --git a/packages/react-native-executorch/common/runner/encoders/iencoder.h b/packages/react-native-executorch/common/runner/encoders/iencoder.h index 3f46ef775..78abe80ce 100644 --- a/packages/react-native-executorch/common/runner/encoders/iencoder.h +++ b/packages/react-native-executorch/common/runner/encoders/iencoder.h @@ -16,6 +16,10 @@ class IEncoder { // Encodes one input segment, returns embeddings EValue virtual ::executorch::runtime::Result<::executorch::runtime::EValue> encode(const MultimodalInput &input) = 0; + + // Returns the number of tokens produced per encoded input (e.g. visual + // tokens per image). Returns 0 if not loaded or unknown. 
+ virtual int32_t encoderTokenCount() const { return 0; } }; } // namespace executorch::extension::llm diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp index 44beb2a7c..191182b12 100644 --- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp +++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp @@ -57,6 +57,26 @@ bool VisionEncoder::is_loaded() const { return module_->is_method_loaded(kVisionEncoderMethod); } +int32_t VisionEncoder::encoderTokenCount() const { + if (!is_loaded()) { + return 0; + } + auto meta_result = module_->method_meta(kVisionEncoderMethod); + if (!meta_result.ok()) { + return 0; + } + auto output_meta = meta_result->output_tensor_meta(0); + if (!output_meta.ok()) { + return 0; + } + // Output shape is [1, num_visual_tokens, embed_dim] + auto sizes = output_meta->sizes(); + if (sizes.size() < 2) { + return 0; + } + return static_cast(sizes[1]); +} + Result VisionEncoder::encode(const MultimodalInput &input) { if (!is_loaded()) { return Error::InvalidState; diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h index 5af0491bd..c7adb118a 100644 --- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h +++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h @@ -18,6 +18,7 @@ class VisionEncoder : public IEncoder { bool is_loaded() const override; ::executorch::runtime::Result<::executorch::runtime::EValue> encode(const MultimodalInput &input) override; + int32_t encoderTokenCount() const override; private: ::executorch::extension::Module *module_; diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.cpp b/packages/react-native-executorch/common/runner/multimodal_runner.cpp index 3d70e04b5..f0b836248 100644 --- 
a/packages/react-native-executorch/common/runner/multimodal_runner.cpp +++ b/packages/react-native-executorch/common/runner/multimodal_runner.cpp @@ -18,6 +18,14 @@ MultimodalRunner::MultimodalRunner( : BaseLLMRunner(nullptr, std::move(owned_module), tokenizer_path, config), encoders_(std::move(encoders)) {} +int32_t MultimodalRunner::get_visual_token_count() const { + auto it = encoders_.find(llm::MultimodalType::Image); + if (it == encoders_.end()) { + return 0; + } + return it->second->encoderTokenCount(); +} + bool MultimodalRunner::is_loaded() const { if (!mm_prefiller_ || !mm_token_generator_) return false; diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.h b/packages/react-native-executorch/common/runner/multimodal_runner.h index 4190127e6..6139c0fc2 100644 --- a/packages/react-native-executorch/common/runner/multimodal_runner.h +++ b/packages/react-native-executorch/common/runner/multimodal_runner.h @@ -28,6 +28,7 @@ class MultimodalRunner : public BaseLLMRunner { .temperature = 0.8F, .topp = 0.9F}); bool is_loaded() const override; + int32_t get_visual_token_count() const override; ::executorch::runtime::Error generate_internal( const std::vector<::executorch::extension::llm::MultimodalInput> &inputs, @@ -37,7 +38,7 @@ class MultimodalRunner : public BaseLLMRunner { ::executorch::runtime::Error load_subcomponents() override; void stop_impl() override; void set_temperature_impl(float) override { - } // config_ already updated by base + } // config_ already updated by base void set_topp_impl(float) override {} // config_ already updated by base void set_count_interval_impl(size_t count_interval) override; void set_time_interval_impl(size_t time_interval) override; diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts index c52e537e4..0a4629a0b 100644 --- a/packages/react-native-executorch/src/controllers/LLMController.ts +++ 
b/packages/react-native-executorch/src/controllers/LLMController.ts @@ -284,7 +284,8 @@ export class LLMController { public async generate( messages: Message[], - tools?: LLMTool[] + tools?: LLMTool[], + imagePaths?: string[] ): Promise { if (!this._isReady) { throw new RnExecutorchError( @@ -312,7 +313,7 @@ export class LLMController { { tools_in_user_message: false, add_generation_prompt: true } ); - return await this.forward(renderedChat); + return await this.forward(renderedChat, imagePaths); } public async sendMessage( @@ -328,27 +329,21 @@ export class LLMController { const updatedHistory = [...this._messageHistory, newMessage]; this.messageHistoryCallback(updatedHistory); - let response: string; - - const isMultimodal = updatedHistory.some((m) => m.mediaPath); - - // For multimodal messages, convert mediaPath into structured content so + // For messages with images, convert mediaPath into structured content so // the chat template emits placeholders in the right position. - const historyForTemplate = isMultimodal - ? updatedHistory.map((m) => - m.mediaPath - ? { - ...m, - content: [ - { type: 'image' }, - { type: 'text', text: m.content }, - ] as any, - } - : m - ) - : updatedHistory; - - const IMAGE_VISUAL_TOKENS = 256; + const historyForTemplate = updatedHistory.map((m) => + m.mediaPath + ? 
{ + ...m, + content: [ + { type: 'image' }, + { type: 'text', text: m.content }, + ] as any, + } + : m + ); + + const visualTokenCount = this.nativeModule.getVisualTokenCount(); const countTokensCallback = (messages: Message[]) => { const rendered = this.applyChatTemplate( messages, @@ -359,7 +354,7 @@ export class LLMController { ); const textTokens = this.nativeModule.countTextTokens(rendered); const imageCount = messages.filter((m) => m.mediaPath).length; - return textTokens + imageCount * (IMAGE_VISUAL_TOKENS - 1); + return textTokens + imageCount * (visualTokenCount - 1); }; const maxContextLength = this.nativeModule.getMaxContextLength(); const messageHistoryWithPrompt = @@ -370,24 +365,15 @@ export class LLMController { countTokensCallback ); - if (isMultimodal) { - const renderedPrompt = this.applyChatTemplate( - messageHistoryWithPrompt, - this.tokenizerConfig, - this.toolsConfig?.tools, - // eslint-disable-next-line camelcase - { tools_in_user_message: false, add_generation_prompt: true } - ); - const imagePaths = messageHistoryWithPrompt - .filter((m) => m.mediaPath) - .map((m) => m.mediaPath!); - response = await this.forward(renderedPrompt, imagePaths); - } else { - response = await this.generate( - messageHistoryWithPrompt, - this.toolsConfig?.tools - ); - } + const imagePaths = messageHistoryWithPrompt + .filter((m) => m.mediaPath) + .map((m) => m.mediaPath!); + + const response = await this.generate( + messageHistoryWithPrompt, + this.toolsConfig?.tools, + imagePaths.length > 0 ? 
imagePaths : undefined + ); if (!this.toolsConfig || this.toolsConfig.displayToolCalls) { this.messageHistoryCallback([ diff --git a/packages/react-native-executorch/src/types/llm.ts b/packages/react-native-executorch/src/types/llm.ts index aea9817bb..f906f8b3f 100644 --- a/packages/react-native-executorch/src/types/llm.ts +++ b/packages/react-native-executorch/src/types/llm.ts @@ -109,7 +109,11 @@ export interface LLMTypeBase { * @param tools - Optional array of tools that can be used during generation. * @returns The generated tokens as `string`. */ - generate: (messages: Message[], tools?: LLMTool[]) => Promise; + generate: ( + messages: Message[], + tools?: LLMTool[], + imagePaths?: string[] + ) => Promise; /** * Returns the number of total tokens from the previous generation. This is a sum of prompt tokens and generated tokens. * From c88d97c30c01fda4d5b4dfd53c398b038c273dc3 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Thu, 5 Mar 2026 16:02:43 +0100 Subject: [PATCH 42/46] refactor: replace example namespace with rnexecutorch::llm::runner in runner classes Co-Authored-By: Claude Sonnet 4.6 --- .../common/rnexecutorch/models/llm/LLM.cpp | 14 +++++------ .../common/rnexecutorch/models/llm/LLM.h | 2 +- .../tests/integration/LLMTest.cpp | 2 +- .../integration/MultimodalRunnerTest.cpp | 14 +++++------ .../tests/integration/TextRunnerTest.cpp | 24 ++++++++++++------- .../common/runner/base_llm_runner.cpp | 4 ++-- .../common/runner/base_llm_runner.h | 4 ++-- .../common/runner/multimodal_runner.cpp | 8 +++---- .../common/runner/multimodal_runner.h | 14 +++++------ .../common/runner/text_runner.cpp | 4 ++-- .../common/runner/text_runner.h | 4 ++-- 11 files changed, 50 insertions(+), 44 deletions(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp index a634b372b..94d4aa1ec 100644 --- 
a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp @@ -11,6 +11,7 @@ namespace rnexecutorch::models::llm { namespace llm = ::executorch::extension::llm; +namespace runner = ::rnexecutorch::llm::runner; namespace fs = std::filesystem; using namespace facebook; using executorch::extension::module::Module; @@ -22,17 +23,17 @@ LLM::LLM(const std::string &modelSource, const std::string &tokenizerSource, : BaseModel(modelSource, callInvoker, Module::LoadMode::File) { if (capabilities.empty()) { - runner_ = std::make_unique(std::move(module_), - tokenizerSource); + runner_ = std::make_unique(std::move(module_), + tokenizerSource); } else { - std::map> encoders; + std::map> encoders; for (const auto &cap : capabilities) { if (cap == "vision") { - encoders[llm::MultimodalType::Image] = + encoders[runner::MultimodalType::Image] = std::make_unique(module_.get()); } } - runner_ = std::make_unique( + runner_ = std::make_unique( std::move(module_), tokenizerSource, std::move(encoders)); } @@ -51,7 +52,6 @@ std::string LLM::generate(std::string input, throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded, "Runner is not loaded"); } - std::string output; auto nativeCallback = [this, callback, &output](const std::string &token) { output += token; @@ -77,7 +77,7 @@ std::string LLM::generate(std::string prompt, throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded, "Runner is not loaded"); } - if (!dynamic_cast(runner_.get())) { + if (!dynamic_cast(runner_.get())) { throw RnExecutorchError( RnExecutorchErrorCode::InvalidUserInput, "This is a text-only model. 
Call generate(prompt, cb)."); diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h index e73b7771d..514760908 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h @@ -44,7 +44,7 @@ class LLM : public BaseModel { int32_t getMaxContextLength() const; private: - std::unique_ptr runner_; + std::unique_ptr<::rnexecutorch::llm::runner::BaseLLMRunner> runner_; }; } // namespace models::llm diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp index 65bd1917a..acd667118 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp @@ -176,7 +176,7 @@ TEST(VisionEncoderTest, LoadFailsWithClearErrorWhenMethodMissing) { #include // Minimal concrete subclass — only used in tests to verify base class behavior -class StubRunner : public example::BaseLLMRunner { +class StubRunner : public rnexecutorch::llm::runner::BaseLLMRunner { public: using BaseLLMRunner::BaseLLMRunner; bool is_loaded() const override { return loaded_; } diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/MultimodalRunnerTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/MultimodalRunnerTest.cpp index fbd9da03c..038fa7f6e 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/MultimodalRunnerTest.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/MultimodalRunnerTest.cpp @@ -9,9 +9,9 @@ #include using ::executorch::extension::Module; -using ::executorch::extension::llm::MultimodalType; using ::executorch::extension::llm::VisionEncoder; 
using ::executorch::runtime::Error; +using ::rnexecutorch::llm::runner::MultimodalType; constexpr auto kTextModel = "smolLm2_135M_8da4w.pte"; constexpr auto kTextTokenizer = "smollm_tokenizer.json"; @@ -36,16 +36,16 @@ makeVisionEncoders(Module *module) { TEST(MultimodalRunnerTest, LoadFailsWhenVisionEncoderMissing) { auto module = std::make_unique(kTextModel, Module::LoadMode::File); auto encoders = makeVisionEncoders(module.get()); - example::MultimodalRunner runner(std::move(module), kTextTokenizer, - std::move(encoders)); + rnexecutorch::llm::runner::MultimodalRunner runner( + std::move(module), kTextTokenizer, std::move(encoders)); EXPECT_THROW(runner.load(), rnexecutorch::RnExecutorchError); } TEST(MultimodalRunnerTest, IsLoadedReturnsFalseBeforeLoad) { auto module = std::make_unique(kTextModel, Module::LoadMode::File); auto encoders = makeVisionEncoders(module.get()); - example::MultimodalRunner runner(std::move(module), kTextTokenizer, - std::move(encoders)); + rnexecutorch::llm::runner::MultimodalRunner runner( + std::move(module), kTextTokenizer, std::move(encoders)); EXPECT_FALSE(runner.is_loaded()); } @@ -55,12 +55,12 @@ TEST(MultimodalRunnerTest, IsLoadedReturnsFalseBeforeLoad) { class VLMTest : public ::testing::Test { protected: - std::unique_ptr runner_; + std::unique_ptr runner_; void SetUp() override { auto module = std::make_unique(kVLMModel, Module::LoadMode::File); auto encoders = makeVisionEncoders(module.get()); - runner_ = std::make_unique( + runner_ = std::make_unique( std::move(module), kVLMTokenizer, std::move(encoders)); auto err = runner_->load(); ASSERT_EQ(err, Error::Ok) << "VLM model load failed"; diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextRunnerTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextRunnerTest.cpp index 3253758cc..169310ed3 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextRunnerTest.cpp +++ 
b/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextRunnerTest.cpp @@ -22,7 +22,8 @@ static std::string formatChatML(const std::string &systemPrompt, TEST(TextRunnerTest, ConstructorAndLoadSucceeds) { auto module = std::make_unique(kTextModel, Module::LoadMode::File); - example::TextRunner runner(std::move(module), kTextTokenizer); + rnexecutorch::llm::runner::TextRunner runner(std::move(module), + kTextTokenizer); auto err = runner.load(); EXPECT_EQ(err, Error::Ok); EXPECT_TRUE(runner.is_loaded()); @@ -32,7 +33,8 @@ TEST(TextRunnerTest, MetadataApplied_EnableDynamicShape) { // SmolLM2-135M exports enable_dynamic_shape = 1 // After load(), config_.enable_dynamic_shape must be true (our fix) auto module = std::make_unique(kTextModel, Module::LoadMode::File); - example::TextRunner runner(std::move(module), kTextTokenizer); + rnexecutorch::llm::runner::TextRunner runner(std::move(module), + kTextTokenizer); runner.load(); EXPECT_TRUE(runner.config_.enable_dynamic_shape); } @@ -40,14 +42,16 @@ TEST(TextRunnerTest, MetadataApplied_EnableDynamicShape) { TEST(TextRunnerTest, MetadataApplied_KVCache) { // SmolLM2-135M exports use_kv_cache = 1 auto module = std::make_unique(kTextModel, Module::LoadMode::File); - example::TextRunner runner(std::move(module), kTextTokenizer); + rnexecutorch::llm::runner::TextRunner runner(std::move(module), + kTextTokenizer); runner.load(); EXPECT_TRUE(runner.config_.enable_kv_cache); } TEST(TextRunnerTest, SetTemperaturePropagatesAfterLoad) { auto module = std::make_unique(kTextModel, Module::LoadMode::File); - example::TextRunner runner(std::move(module), kTextTokenizer); + rnexecutorch::llm::runner::TextRunner runner(std::move(module), + kTextTokenizer); runner.load(); runner.set_temperature(0.3f); EXPECT_FLOAT_EQ(runner.config_.temperature, 0.3f); @@ -55,7 +59,8 @@ TEST(TextRunnerTest, SetTemperaturePropagatesAfterLoad) { TEST(TextRunnerTest, ResetZerosPos) { auto module = std::make_unique(kTextModel, 
Module::LoadMode::File); - example::TextRunner runner(std::move(module), kTextTokenizer); + rnexecutorch::llm::runner::TextRunner runner(std::move(module), + kTextTokenizer); runner.pos_ = 42; runner.reset(); EXPECT_EQ(runner.pos_, 0); @@ -63,7 +68,8 @@ TEST(TextRunnerTest, ResetZerosPos) { TEST(TextRunnerTest, GenerateProducesTokens) { auto module = std::make_unique(kTextModel, Module::LoadMode::File); - example::TextRunner runner(std::move(module), kTextTokenizer); + rnexecutorch::llm::runner::TextRunner runner(std::move(module), + kTextTokenizer); runner.load(); runner.set_temperature(0.0f); @@ -77,14 +83,16 @@ TEST(TextRunnerTest, ParallelPrefillEnabled) { // Confirms the fix: enable_dynamic_shape from metadata now unconditionally // applied auto module = std::make_unique(kTextModel, Module::LoadMode::File); - example::TextRunner runner(std::move(module), kTextTokenizer); + rnexecutorch::llm::runner::TextRunner runner(std::move(module), + kTextTokenizer); runner.load(); EXPECT_TRUE(runner.config_.enable_dynamic_shape); } TEST(TextRunnerTest, StopHaltsGeneration) { auto module = std::make_unique(kTextModel, Module::LoadMode::File); - example::TextRunner runner(std::move(module), kTextTokenizer); + rnexecutorch::llm::runner::TextRunner runner(std::move(module), + kTextTokenizer); runner.load(); runner.set_temperature(0.0f); diff --git a/packages/react-native-executorch/common/runner/base_llm_runner.cpp b/packages/react-native-executorch/common/runner/base_llm_runner.cpp index fcb647ec5..37adde77a 100644 --- a/packages/react-native-executorch/common/runner/base_llm_runner.cpp +++ b/packages/react-native-executorch/common/runner/base_llm_runner.cpp @@ -6,7 +6,7 @@ #include #include -namespace example { +namespace rnexecutorch::llm::runner { using namespace executorch::extension::llm; using ::executorch::extension::Module; @@ -159,4 +159,4 @@ int32_t BaseLLMRunner::resolve_max_new_tokens(int32_t num_prompt_tokens, return std::max(0, result); } -} // namespace 
example +} // namespace rnexecutorch::llm::runner diff --git a/packages/react-native-executorch/common/runner/base_llm_runner.h b/packages/react-native-executorch/common/runner/base_llm_runner.h index d888256ec..161463580 100644 --- a/packages/react-native-executorch/common/runner/base_llm_runner.h +++ b/packages/react-native-executorch/common/runner/base_llm_runner.h @@ -15,7 +15,7 @@ #include #include -namespace example { +namespace rnexecutorch::llm::runner { namespace llm = ::executorch::extension::llm; @@ -86,4 +86,4 @@ class BaseLLMRunner { bool shouldStop_{false}; }; -} // namespace example +} // namespace rnexecutorch::llm::runner diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.cpp b/packages/react-native-executorch/common/runner/multimodal_runner.cpp index f0b836248..c211ee922 100644 --- a/packages/react-native-executorch/common/runner/multimodal_runner.cpp +++ b/packages/react-native-executorch/common/runner/multimodal_runner.cpp @@ -5,7 +5,7 @@ #include #include -namespace example { +namespace rnexecutorch::llm::runner { using namespace executorch::extension::llm; using ::executorch::extension::Module; @@ -19,7 +19,7 @@ MultimodalRunner::MultimodalRunner( encoders_(std::move(encoders)) {} int32_t MultimodalRunner::get_visual_token_count() const { - auto it = encoders_.find(llm::MultimodalType::Image); + auto it = encoders_.find(MultimodalType::Image); if (it == encoders_.end()) { return 0; } @@ -74,7 +74,7 @@ Error MultimodalRunner::load_subcomponents() { mm_decoder_runner_ = std::make_unique( module_, io_manager_.get()); llm::IEncoder *image_encoder = nullptr; - auto enc_it = encoders_.find(llm::MultimodalType::Image); + auto enc_it = encoders_.find(MultimodalType::Image); if (enc_it != encoders_.end()) { image_encoder = enc_it->second.get(); } @@ -162,4 +162,4 @@ void MultimodalRunner::set_time_interval_impl(size_t time_interval) { mm_token_generator_->set_time_interval(time_interval); } -} // namespace example +} // 
namespace rnexecutorch::llm::runner diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.h b/packages/react-native-executorch/common/runner/multimodal_runner.h index 6139c0fc2..58e676a94 100644 --- a/packages/react-native-executorch/common/runner/multimodal_runner.h +++ b/packages/react-native-executorch/common/runner/multimodal_runner.h @@ -9,19 +9,17 @@ #include "text_token_generator.h" #include -namespace executorch::extension::llm { -// Tag enum for keying encoder map -enum class MultimodalType { Image, Audio }; -} // namespace executorch::extension::llm +namespace rnexecutorch::llm::runner { -namespace example { +// Tag enum for keying encoder map +enum class MultimodalType { Image }; class MultimodalRunner : public BaseLLMRunner { public: explicit MultimodalRunner( std::unique_ptr<::executorch::extension::Module> owned_module, const std::string &tokenizer_path, - std::map<::executorch::extension::llm::MultimodalType, + std::map> encoders, const ::executorch::extension::llm::GenerationConfig &config = { @@ -44,7 +42,7 @@ class MultimodalRunner : public BaseLLMRunner { void set_time_interval_impl(size_t time_interval) override; private: - std::map<::executorch::extension::llm::MultimodalType, + std::map> encoders_; std::unique_ptr<::executorch::extension::llm::MultimodalDecoderRunner> @@ -55,4 +53,4 @@ class MultimodalRunner : public BaseLLMRunner { mm_token_generator_; }; -} // namespace example +} // namespace rnexecutorch::llm::runner diff --git a/packages/react-native-executorch/common/runner/text_runner.cpp b/packages/react-native-executorch/common/runner/text_runner.cpp index fa2225f3d..d61d70c41 100644 --- a/packages/react-native-executorch/common/runner/text_runner.cpp +++ b/packages/react-native-executorch/common/runner/text_runner.cpp @@ -6,7 +6,7 @@ #include #include -namespace example { +namespace rnexecutorch::llm::runner { using namespace executorch::extension::llm; using ::executorch::extension::Module; @@ -172,4 +172,4 
@@ void TextRunner::set_time_interval_impl(size_t time_interval) { text_token_generator_->set_time_interval(time_interval); } -} // namespace example +} // namespace rnexecutorch::llm::runner diff --git a/packages/react-native-executorch/common/runner/text_runner.h b/packages/react-native-executorch/common/runner/text_runner.h index 17394ee3f..857cf452f 100644 --- a/packages/react-native-executorch/common/runner/text_runner.h +++ b/packages/react-native-executorch/common/runner/text_runner.h @@ -6,7 +6,7 @@ #include "text_prefiller.h" #include "text_token_generator.h" -namespace example { +namespace rnexecutorch::llm::runner { class TextRunner : public BaseLLMRunner { public: @@ -38,4 +38,4 @@ class TextRunner : public BaseLLMRunner { text_token_generator_; }; -} // namespace example +} // namespace rnexecutorch::llm::runner From c7357d3b679c1f1f8ca550fb7e98c70774dcfa07 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Thu, 5 Mar 2026 16:54:13 +0100 Subject: [PATCH 43/46] refactor: collapse BaseLLMRunner constructor, deduplicate eos_ids, read image shape from model metadata Co-Authored-By: Claude Sonnet 4.6 --- .../common/rnexecutorch/models/llm/LLM.cpp | 4 +- .../tests/integration/LLMTest.cpp | 4 +- .../common/runner/base_llm_runner.cpp | 29 ++++-- .../common/runner/base_llm_runner.h | 17 ++-- .../common/runner/encoders/vision_encoder.cpp | 93 ++++++++----------- .../common/runner/encoders/vision_encoder.h | 9 ++ .../common/runner/multimodal_runner.cpp | 43 ++------- .../common/runner/multimodal_runner.h | 9 +- .../common/runner/text_runner.cpp | 23 +---- .../common/runner/text_runner.h | 9 +- 10 files changed, 100 insertions(+), 140 deletions(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp index 94d4aa1ec..2dd342702 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp +++ 
b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp @@ -77,7 +77,7 @@ std::string LLM::generate(std::string prompt, throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded, "Runner is not loaded"); } - if (!dynamic_cast(runner_.get())) { + if (!runner_->is_multimodal()) { throw RnExecutorchError( RnExecutorchErrorCode::InvalidUserInput, "This is a text-only model. Call generate(prompt, cb)."); @@ -130,7 +130,7 @@ std::string LLM::generate(std::string prompt, } }; - auto error = runner_->generate_internal(inputs, nativeCallback); + auto error = runner_->generate(inputs, nativeCallback); if (error != Error::Ok) { throw RnExecutorchError(error, "Failed to generate multimodal response"); } diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp index acd667118..5ebb96fbe 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/tests/integration/LLMTest.cpp @@ -200,14 +200,14 @@ class StubRunner : public rnexecutorch::llm::runner::BaseLLMRunner { }; TEST(BaseLLMRunnerTest, SetTemperatureWritesConfigAndCallsImpl) { - StubRunner runner(nullptr, nullptr, "dummy_tokenizer.json"); + StubRunner runner(nullptr, "dummy_tokenizer.json"); runner.set_temperature(0.5f); EXPECT_FLOAT_EQ(runner.config_.temperature, 0.5f); EXPECT_FLOAT_EQ(runner.last_temp_, 0.5f); } TEST(BaseLLMRunnerTest, ResetZerosPos) { - StubRunner runner(nullptr, nullptr, "dummy_tokenizer.json"); + StubRunner runner(nullptr, "dummy_tokenizer.json"); runner.pos_ = 42; runner.reset(); EXPECT_EQ(runner.pos_, 0); diff --git a/packages/react-native-executorch/common/runner/base_llm_runner.cpp b/packages/react-native-executorch/common/runner/base_llm_runner.cpp index 37adde77a..4a382a530 100644 --- a/packages/react-native-executorch/common/runner/base_llm_runner.cpp +++ 
b/packages/react-native-executorch/common/runner/base_llm_runner.cpp @@ -1,7 +1,6 @@ // common/runner/base_llm_runner.cpp #include "base_llm_runner.h" #include "constants.h" -#include "util.h" #include #include #include @@ -12,12 +11,11 @@ using namespace executorch::extension::llm; using ::executorch::extension::Module; using ::executorch::runtime::Error; -BaseLLMRunner::BaseLLMRunner(Module *module, - std::unique_ptr owned_module, +BaseLLMRunner::BaseLLMRunner(std::unique_ptr module, const std::string &tokenizer_path, const llm::GenerationConfig &config) - : config_(config), module_(owned_module ? owned_module.get() : module), - owned_module_(std::move(owned_module)), tokenizer_path_(tokenizer_path), + : config_(config), module_(std::move(module)), + tokenizer_path_(tokenizer_path), tokenizer_(std::make_unique()), metadata_({ {kEnableDynamicShape, false}, @@ -68,14 +66,14 @@ Error BaseLLMRunner::load() { static_cast(metadata_.at(kEnableDynamicShape)); config_.enable_kv_cache = static_cast(metadata_.at(kUseKVCache)); - auto eos_ids = std::make_unique>(); + eos_ids_ = std::make_unique>(); if (method_names.count(kEosIds)) { for (const auto &eos_id : ET_UNWRAP(module_->execute(kEosIds))) { - eos_ids->emplace(static_cast(eos_id.toScalar().to())); + eos_ids_->emplace(static_cast(eos_id.toScalar().to())); } } - if (eos_ids->empty()) { - eos_ids->emplace(7); // fallback <|im_end|> + if (eos_ids_->empty()) { + eos_ids_->emplace(7); // fallback <|im_end|> } io_manager_ = std::make_unique(*module_); @@ -99,6 +97,19 @@ Error BaseLLMRunner::generate( return err; } +Error BaseLLMRunner::generate( + const std::vector &inputs, + std::function token_callback, + std::function stats_callback) { + + auto err = generate_internal(inputs, token_callback); + + if (stats_callback) + stats_callback(stats_); + + return err; +} + void BaseLLMRunner::stop() { stop_impl(); } void BaseLLMRunner::reset() { diff --git a/packages/react-native-executorch/common/runner/base_llm_runner.h 
b/packages/react-native-executorch/common/runner/base_llm_runner.h index 161463580..c26a3c2d3 100644 --- a/packages/react-native-executorch/common/runner/base_llm_runner.h +++ b/packages/react-native-executorch/common/runner/base_llm_runner.h @@ -22,8 +22,7 @@ namespace llm = ::executorch::extension::llm; class BaseLLMRunner { public: explicit BaseLLMRunner( - ::executorch::extension::Module *module, - std::unique_ptr<::executorch::extension::Module> owned_module, + std::unique_ptr<::executorch::extension::Module> module, const std::string &tokenizer_path, const llm::GenerationConfig &config = {.temperature = 0.8F, .topp = 0.9F}); @@ -32,17 +31,19 @@ class BaseLLMRunner { virtual bool is_loaded() const = 0; - // Loads tokenizer + metadata + eos, then calls load_subcomponents() virtual ::executorch::runtime::Error load(); - // Text convenience — wraps string in make_text_input, calls generate_internal ::executorch::runtime::Error generate(const std::string &prompt, const llm::GenerationConfig &generation_config = {}, std::function token_callback = {}, std::function stats_callback = {}); - // Multimodal entry point — subclasses implement this + ::executorch::runtime::Error + generate(const std::vector &inputs, + std::function token_callback = {}, + std::function stats_callback = {}); + virtual ::executorch::runtime::Error generate_internal( const std::vector &inputs, std::function token_callback) = 0; @@ -51,9 +52,9 @@ class BaseLLMRunner { void reset(); int32_t count_text_tokens(const std::string &text) const; int32_t get_max_context_length() const; + virtual bool is_multimodal() const { return false; } virtual int32_t get_visual_token_count() const { return 0; } - // Writes config_ then propagates to subclass impl void set_temperature(float temperature) noexcept; void set_topp(float topp) noexcept; void set_count_interval(size_t count_interval); @@ -77,12 +78,12 @@ class BaseLLMRunner { int32_t max_context_len, int32_t max_new_tokens = -1) const; - 
::executorch::extension::Module *module_; - std::unique_ptr<::executorch::extension::Module> owned_module_; + std::unique_ptr<::executorch::extension::Module> module_; std::string tokenizer_path_; std::unique_ptr tokenizer_; std::unordered_map metadata_; std::unique_ptr io_manager_; + std::unique_ptr> eos_ids_; bool shouldStop_{false}; }; diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp index 191182b12..800c76cab 100644 --- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp +++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.cpp @@ -15,10 +15,6 @@ using ::executorch::runtime::Error; using ::executorch::runtime::EValue; using ::executorch::runtime::Result; -// LFM2-VL vision encoder expects [1, 3, H, W] NCHW float32, values [0, 255] -static constexpr int kImageSize = 512; -static constexpr int kImageChannels = 3; - VisionEncoder::VisionEncoder(::executorch::extension::Module *module) : module_(module) {} @@ -30,17 +26,6 @@ Error VisionEncoder::load() { if (!method_names_result.ok()) { return method_names_result.error(); } - rnexecutorch::log(rnexecutorch::LOG_LEVEL::Debug, - "[VisionEncoder] Available methods:"); - for (const auto &name : *method_names_result) { - auto val = module_->get(name); - if (val.ok()) { - rnexecutorch::log(rnexecutorch::LOG_LEVEL::Debug, " -", name, "=", - val->toScalar().to()); - } else { - rnexecutorch::log(rnexecutorch::LOG_LEVEL::Debug, " -", name); - } - } if (method_names_result->count(kVisionEncoderMethod) == 0) { throw rnexecutorch::RnExecutorchError( @@ -77,6 +62,38 @@ int32_t VisionEncoder::encoderTokenCount() const { return static_cast(sizes[1]); } +Result VisionEncoder::getInputShape() const { + auto method_meta = ET_UNWRAP(module_->method_meta(kVisionEncoderMethod)); + auto input_meta = ET_UNWRAP(method_meta.input_tensor_meta(0)); + auto dims = input_meta.sizes(); 
+ const bool with_batch = dims.size() == 4; + const int32_t offset = with_batch ? 1 : 0; + return ImageShape{ + .channels = static_cast(dims[offset]), + .height = static_cast(dims[offset + 1]), + .width = static_cast(dims[offset + 2]), + .with_batch = with_batch, + }; +} + +std::vector +VisionEncoder::preprocessImage(const std::string &path, + const ImageShape &shape) const { + cv::Mat mat = rnexecutorch::image_processing::readImage(path); + cv::resize(mat, mat, cv::Size(shape.width, shape.height)); + cv::cvtColor(mat, mat, cv::COLOR_BGR2RGB); + + const int32_t pixelCount = shape.height * shape.width; + std::vector chw(shape.channels * pixelCount); + for (int32_t i = 0; i < pixelCount; ++i) { + cv::Vec3b px = mat.at(i / shape.width, i % shape.width); + for (int32_t c = 0; c < shape.channels; ++c) { + chw[c * pixelCount + i] = static_cast(px[c]); + } + } + return chw; +} + Result VisionEncoder::encode(const MultimodalInput &input) { if (!is_loaded()) { return Error::InvalidState; @@ -87,57 +104,25 @@ Result VisionEncoder::encode(const MultimodalInput &input) { const std::string &path = input.get_image_path(); - // Return cached embedding if available auto it = embedding_cache_.find(path); if (it != embedding_cache_.end()) { - rnexecutorch::log(rnexecutorch::LOG_LEVEL::Debug, - "[VisionEncoder] Cache hit for:", path); return it->second; } - // Load and preprocess image: resize → BGR→RGB → HWC uint8 → CHW float32 - cv::Mat mat = rnexecutorch::image_processing::readImage(path); - cv::resize(mat, mat, cv::Size(kImageSize, kImageSize)); - cv::cvtColor(mat, mat, cv::COLOR_BGR2RGB); - - std::vector chw(kImageChannels * kImageSize * kImageSize); - const int pixelCount = kImageSize * kImageSize; - for (int i = 0; i < pixelCount; ++i) { - cv::Vec3b px = mat.at(i / kImageSize, i % kImageSize); - for (int c = 0; c < kImageChannels; ++c) { - chw[c * pixelCount + i] = static_cast(px[c]); - } - } + auto shape = ET_UNWRAP(getInputShape()); + auto chw = preprocessImage(path, shape); 
- // Determine expected input shape (with or without batch dim) - auto method_meta_result = module_->method_meta(kVisionEncoderMethod); - if (!method_meta_result.ok()) { - return method_meta_result.error(); - } - auto input_meta_result = method_meta_result->input_tensor_meta(0); - if (!input_meta_result.ok()) { - return input_meta_result.error(); - } - auto expected_dims = input_meta_result->sizes(); - const bool with_batch = expected_dims.size() == 4; - - std::vector<::executorch::aten::SizesType> sizes = {kImageChannels, - kImageSize, kImageSize}; - if (with_batch) { + std::vector<::executorch::aten::SizesType> sizes = { + shape.channels, shape.height, shape.width}; + if (shape.with_batch) { sizes.insert(sizes.begin(), 1); } auto image_tensor = ::executorch::extension::from_blob( chw.data(), sizes, ::executorch::aten::ScalarType::Float); - rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info, - "[VisionEncoder] Running encode for:", path); - auto result = module_->execute(kVisionEncoderMethod, image_tensor); - if (!result.ok()) { - return result.error(); - } - - EValue embedding = (*result)[0]; + auto result = ET_UNWRAP(module_->execute(kVisionEncoderMethod, image_tensor)); + EValue embedding = result[0]; embedding_cache_.emplace(path, embedding); return embedding; } diff --git a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h index c7adb118a..8a54bfb6b 100644 --- a/packages/react-native-executorch/common/runner/encoders/vision_encoder.h +++ b/packages/react-native-executorch/common/runner/encoders/vision_encoder.h @@ -21,6 +21,15 @@ class VisionEncoder : public IEncoder { int32_t encoderTokenCount() const override; private: + struct ImageShape { + int32_t channels, height, width; + bool with_batch; + }; + + ::executorch::runtime::Result getInputShape() const; + std::vector preprocessImage(const std::string &path, + const ImageShape &shape) const; + 
::executorch::extension::Module *module_; std::unordered_map embedding_cache_; diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.cpp b/packages/react-native-executorch/common/runner/multimodal_runner.cpp index c211ee922..7eda70870 100644 --- a/packages/react-native-executorch/common/runner/multimodal_runner.cpp +++ b/packages/react-native-executorch/common/runner/multimodal_runner.cpp @@ -12,10 +12,10 @@ using ::executorch::extension::Module; using ::executorch::runtime::Error; MultimodalRunner::MultimodalRunner( - std::unique_ptr owned_module, const std::string &tokenizer_path, + std::unique_ptr module, const std::string &tokenizer_path, std::map> encoders, const llm::GenerationConfig &config) - : BaseLLMRunner(nullptr, std::move(owned_module), tokenizer_path, config), + : BaseLLMRunner(std::move(module), tokenizer_path, config), encoders_(std::move(encoders)) {} int32_t MultimodalRunner::get_visual_token_count() const { @@ -41,54 +41,29 @@ bool MultimodalRunner::is_loaded() const { Error MultimodalRunner::load_subcomponents() { rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info, "[MultimodalRunner] Loading", encoders_.size(), "encoder(s)"); - // Load and validate all declared encoders — throws on mismatch for (auto &[type, encoder] : encoders_) { - rnexecutorch::log( - rnexecutorch::LOG_LEVEL::Debug, - "[MultimodalRunner] Loading encoder type:", static_cast(type)); encoder->load(); - rnexecutorch::log( - rnexecutorch::LOG_LEVEL::Info, - "[MultimodalRunner] Encoder loaded, type:", static_cast(type)); } llm::Stats *stats_ptr = &stats_; - auto eos_ids = std::make_unique>(); - const auto method_names = - ET_UNWRAP(module_->method_names(), "Failed reading method names"); - if (method_names.count(kEosIds)) { - for (const auto &eos_id : ET_UNWRAP(module_->execute(kEosIds))) { - eos_ids->emplace(static_cast(eos_id.toScalar().to())); - } - } - if (eos_ids->empty()) { - rnexecutorch::log(rnexecutorch::LOG_LEVEL::Warn, - "[MultimodalRunner] 
get_eos_ids not found in model, " - "falling back to {7}"); - eos_ids->emplace(7); - } else { - rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info, - "[MultimodalRunner] EOS IDs loaded:", *eos_ids); - } mm_decoder_runner_ = std::make_unique( - module_, io_manager_.get()); + module_.get(), io_manager_.get()); llm::IEncoder *image_encoder = nullptr; auto enc_it = encoders_.find(MultimodalType::Image); if (enc_it != encoders_.end()) { image_encoder = enc_it->second.get(); } mm_prefiller_ = std::make_unique( - module_, mm_decoder_runner_.get(), tokenizer_.get(), io_manager_.get(), - image_encoder); + module_.get(), mm_decoder_runner_.get(), tokenizer_.get(), + io_manager_.get(), image_encoder); mm_token_generator_ = std::make_unique( tokenizer_.get(), mm_decoder_runner_.get(), /*use_kv_cache=*/true, - std::move(eos_ids), stats_ptr); + std::move(eos_ids_), stats_ptr); ET_CHECK_OK_OR_RETURN_ERROR(mm_prefiller_->load()); ET_CHECK_OK_OR_RETURN_ERROR(mm_token_generator_->load()); - rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info, - "[MultimodalRunner] All subcomponents loaded successfully"); + return Error::Ok; } @@ -114,10 +89,6 @@ Error MultimodalRunner::generate_internal( stats_.first_token_ms = llm::time_in_ms(); stats_.prompt_eval_end_ms = llm::time_in_ms(); stats_.num_prompt_tokens = pos_; - rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info, - "[MultimodalRunner] Prefill took", - stats_.prompt_eval_end_ms - stats_.inference_start_ms, - "ms for", pos_, "tokens"); int32_t resolved_max_new = static_cast(config_.max_context_length - pos_); diff --git a/packages/react-native-executorch/common/runner/multimodal_runner.h b/packages/react-native-executorch/common/runner/multimodal_runner.h index 58e676a94..f96916de3 100644 --- a/packages/react-native-executorch/common/runner/multimodal_runner.h +++ b/packages/react-native-executorch/common/runner/multimodal_runner.h @@ -11,13 +11,12 @@ namespace rnexecutorch::llm::runner { -// Tag enum for keying encoder map enum class 
MultimodalType { Image }; class MultimodalRunner : public BaseLLMRunner { public: explicit MultimodalRunner( - std::unique_ptr<::executorch::extension::Module> owned_module, + std::unique_ptr<::executorch::extension::Module> module, const std::string &tokenizer_path, std::map> @@ -26,6 +25,7 @@ class MultimodalRunner : public BaseLLMRunner { .temperature = 0.8F, .topp = 0.9F}); bool is_loaded() const override; + bool is_multimodal() const override { return true; } int32_t get_visual_token_count() const override; ::executorch::runtime::Error generate_internal( @@ -35,9 +35,8 @@ class MultimodalRunner : public BaseLLMRunner { protected: ::executorch::runtime::Error load_subcomponents() override; void stop_impl() override; - void set_temperature_impl(float) override { - } // config_ already updated by base - void set_topp_impl(float) override {} // config_ already updated by base + void set_temperature_impl(float) override {} + void set_topp_impl(float) override {} void set_count_interval_impl(size_t count_interval) override; void set_time_interval_impl(size_t time_interval) override; diff --git a/packages/react-native-executorch/common/runner/text_runner.cpp b/packages/react-native-executorch/common/runner/text_runner.cpp index d61d70c41..063775be4 100644 --- a/packages/react-native-executorch/common/runner/text_runner.cpp +++ b/packages/react-native-executorch/common/runner/text_runner.cpp @@ -12,10 +12,10 @@ using namespace executorch::extension::llm; using ::executorch::extension::Module; using ::executorch::runtime::Error; -TextRunner::TextRunner(std::unique_ptr owned_module, +TextRunner::TextRunner(std::unique_ptr module, const std::string &tokenizer_path, const llm::GenerationConfig &config) - : BaseLLMRunner(nullptr, std::move(owned_module), tokenizer_path, config) {} + : BaseLLMRunner(std::move(module), tokenizer_path, config) {} bool TextRunner::is_loaded() const { return module_ && module_->is_loaded() && tokenizer_ && @@ -26,24 +26,10 @@ bool 
TextRunner::is_loaded() const { Error TextRunner::load_subcomponents() { ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method("forward")); - // Re-detect eos_ids from the module (base class built them but doesn't pass - // them down yet — reconstruct with the same fallback logic). - auto eos_ids = std::make_unique>(); - const auto method_names = - ET_UNWRAP(module_->method_names(), "Failed reading method names"); - if (method_names.count(kEosIds)) { - for (const auto &eos_id : ET_UNWRAP(module_->execute(kEosIds))) { - eos_ids->emplace(static_cast(eos_id.toScalar().to())); - } - } - if (eos_ids->empty()) { - eos_ids->emplace(7); // fallback <|im_end|> - } - llm::Stats *stats_ptr = &stats_; text_decoder_runner_ = std::make_unique( - module_, io_manager_.get(), config_.temperature, config_.topp); + module_.get(), io_manager_.get(), config_.temperature, config_.topp); rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info, "[TextRunner] Parallel prefill (enable_dynamic_shape):", config_.enable_dynamic_shape); @@ -52,7 +38,7 @@ Error TextRunner::load_subcomponents() { config_.enable_dynamic_shape, config_.max_seq_len); text_token_generator_ = std::make_unique( tokenizer_.get(), text_decoder_runner_.get(), config_.enable_kv_cache, - std::move(eos_ids), stats_ptr); + std::move(eos_ids_), stats_ptr); return Error::Ok; } @@ -62,7 +48,6 @@ Error TextRunner::generate_internal( std::function token_callback) { if (inputs.empty()) { - ET_LOG(Error, "MultimodalInput vector cannot be empty"); return Error::InvalidArgument; } diff --git a/packages/react-native-executorch/common/runner/text_runner.h b/packages/react-native-executorch/common/runner/text_runner.h index 857cf452f..5944943b9 100644 --- a/packages/react-native-executorch/common/runner/text_runner.h +++ b/packages/react-native-executorch/common/runner/text_runner.h @@ -10,11 +10,10 @@ namespace rnexecutorch::llm::runner { class TextRunner : public BaseLLMRunner { public: - explicit TextRunner( - 
std::unique_ptr<::executorch::extension::Module> owned_module, - const std::string &tokenizer_path, - const ::executorch::extension::llm::GenerationConfig &config = { - .temperature = 0.8F, .topp = 0.9F}); + explicit TextRunner(std::unique_ptr<::executorch::extension::Module> module, + const std::string &tokenizer_path, + const ::executorch::extension::llm::GenerationConfig + &config = {.temperature = 0.8F, .topp = 0.9F}); bool is_loaded() const override; From 69d454b21a2f963e612c5be832867fa05962c308 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Thu, 5 Mar 2026 16:55:02 +0100 Subject: [PATCH 44/46] refactor: comments etc. --- .../common/runner/encoders/iencoder.h | 2 +- .../common/runner/multimodal_decoder_runner.h | 14 ++------------ .../common/runner/multimodal_input.h | 1 - .../common/runner/multimodal_prefiller.cpp | 1 - .../common/runner/multimodal_prefiller.h | 10 ++-------- .../src/controllers/LLMController.ts | 2 -- packages/react-native-executorch/src/types/llm.ts | 5 ++--- 7 files changed, 7 insertions(+), 28 deletions(-) diff --git a/packages/react-native-executorch/common/runner/encoders/iencoder.h b/packages/react-native-executorch/common/runner/encoders/iencoder.h index 78abe80ce..8a6bf7e51 100644 --- a/packages/react-native-executorch/common/runner/encoders/iencoder.h +++ b/packages/react-native-executorch/common/runner/encoders/iencoder.h @@ -13,7 +13,7 @@ class IEncoder { virtual ~IEncoder() = default; virtual ::executorch::runtime::Error load() = 0; virtual bool is_loaded() const = 0; - // Encodes one input segment, returns embeddings EValue + virtual ::executorch::runtime::Result<::executorch::runtime::EValue> encode(const MultimodalInput &input) = 0; diff --git a/packages/react-native-executorch/common/runner/multimodal_decoder_runner.h b/packages/react-native-executorch/common/runner/multimodal_decoder_runner.h index 2eafe3901..3b6fe4660 100644 --- a/packages/react-native-executorch/common/runner/multimodal_decoder_runner.h +++ 
b/packages/react-native-executorch/common/runner/multimodal_decoder_runner.h @@ -13,19 +13,12 @@ #include "constants.h" #include "text_decoder_runner.h" -namespace executorch { -namespace extension { -namespace llm { - -// Extends TextDecoderRunner to use the multi-method PTE layout: -// token_embedding method → embeddings -// text_decoder method → logits +namespace executorch::extension::llm { class MultimodalDecoderRunner : public TextDecoderRunner { public: explicit MultimodalDecoderRunner(Module *module, IOManager *io_manager) : TextDecoderRunner(module, io_manager) {} - // Step: embed single token, then decode. inline ::executorch::runtime::Result<::executorch::aten::Tensor> step(TensorPtr &tokens, int64_t start_pos) override { auto embed_result = module_->execute(kTokenEmbeddingMethod, tokens); @@ -35,7 +28,6 @@ class MultimodalDecoderRunner : public TextDecoderRunner { return decode((*embed_result)[0], start_pos); } - // Decode an embedding EValue to logits. inline ::executorch::runtime::Result<::executorch::aten::Tensor> decode(const ::executorch::runtime::EValue &embeddings, int64_t start_pos) { auto start_pos_tensor = ::executorch::extension::from_blob( @@ -68,6 +60,4 @@ class MultimodalDecoderRunner : public TextDecoderRunner { } }; -} // namespace llm -} // namespace extension -} // namespace executorch +} // namespace executorch::extension::llm diff --git a/packages/react-native-executorch/common/runner/multimodal_input.h b/packages/react-native-executorch/common/runner/multimodal_input.h index 1d56f5f5a..e7d28bdd6 100644 --- a/packages/react-native-executorch/common/runner/multimodal_input.h +++ b/packages/react-native-executorch/common/runner/multimodal_input.h @@ -19,7 +19,6 @@ namespace executorch { namespace extension { namespace llm { -// Tagged struct to distinguish image paths from text strings in the variant. 
struct ImagePath { std::string path; }; diff --git a/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp b/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp index a9a4715a7..f2bce9bc5 100644 --- a/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp +++ b/packages/react-native-executorch/common/runner/multimodal_prefiller.cpp @@ -31,7 +31,6 @@ MultimodalPrefiller::MultimodalPrefiller( Result MultimodalPrefiller::prefill(const MultimodalInput &input, int64_t &start_pos) { - // Keep backing storage alive for the duration of the prefill call. EValue encoder_output; std::vector padded_tokens_storage; TensorPtr sliced_embed_storage; diff --git a/packages/react-native-executorch/common/runner/multimodal_prefiller.h b/packages/react-native-executorch/common/runner/multimodal_prefiller.h index 4effee7b7..5f1978943 100644 --- a/packages/react-native-executorch/common/runner/multimodal_prefiller.h +++ b/packages/react-native-executorch/common/runner/multimodal_prefiller.h @@ -16,12 +16,8 @@ #include #include -namespace executorch { -namespace extension { -namespace llm { +namespace executorch::extension::llm { -// Prefills all multimodal inputs (image + text segments) into the KV cache. -// Implements the same padding logic as the ET repo's multimodal_prefiller.cpp. 
class MultimodalPrefiller { public: explicit MultimodalPrefiller(Module *module, @@ -46,6 +42,4 @@ class MultimodalPrefiller { IEncoder *image_encoder_; }; -} // namespace llm -} // namespace extension -} // namespace executorch +} // namespace executorch::extension::llm diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts index 0a4629a0b..378817833 100644 --- a/packages/react-native-executorch/src/controllers/LLMController.ts +++ b/packages/react-native-executorch/src/controllers/LLMController.ts @@ -329,8 +329,6 @@ export class LLMController { const updatedHistory = [...this._messageHistory, newMessage]; this.messageHistoryCallback(updatedHistory); - // For messages with images, convert mediaPath into structured content so - // the chat template emits placeholders in the right position. const historyForTemplate = updatedHistory.map((m) => m.mediaPath ? { diff --git a/packages/react-native-executorch/src/types/llm.ts b/packages/react-native-executorch/src/types/llm.ts index f906f8b3f..15b070bc5 100644 --- a/packages/react-native-executorch/src/types/llm.ts +++ b/packages/react-native-executorch/src/types/llm.ts @@ -12,8 +12,7 @@ export type LLMCapability = 'vision' | 'audio'; * @category Types */ export type MediaArg = - ('vision' extends C[number] ? { imagePath?: string } : object) & - ('audio' extends C[number] ? { audioPath?: string } : object); + 'vision' extends C[number] ? { imagePath?: string } : object; /** * Properties for initializing and configuring a Large Language Model (LLM) instance. @@ -154,7 +153,7 @@ export interface LLMTypeMultimodal< * After model responds, `messageHistory` will be updated. * * @param message - The message string to send. - * @param media - Optional media object (e.g. `{ imagePath }` for vision, `{ audioPath }` for audio). + * @param media - Optional media object (e.g. `{ imagePath }` for vision).
* @returns The model's response as a `string`. */ sendMessage: (message: string, media?: MediaArg) => Promise; From 6a3857b5d3985c66becef9166fe017c18beb2726 Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Thu, 5 Mar 2026 17:12:14 +0100 Subject: [PATCH 45/46] fix: cap VLM generation tokens, propagate encoder load errors, pass image_token from config Co-Authored-By: Claude Sonnet 4.6 --- .../rnexecutorch/host_objects/ModelHostObject.h | 2 +- .../common/rnexecutorch/models/llm/LLM.cpp | 12 ++++++++---- .../common/rnexecutorch/models/llm/LLM.h | 3 ++- .../common/rnexecutorch/tests/run_tests.sh | 2 +- .../common/runner/base_llm_runner.h | 1 - .../common/runner/multimodal_runner.cpp | 8 ++++---- .../common/runner/text_runner.cpp | 1 - .../src/controllers/LLMController.ts | 1 + .../src/hooks/natural_language_processing/useLLM.ts | 4 ++-- packages/react-native-executorch/src/types/llm.ts | 2 +- 10 files changed, 20 insertions(+), 16 deletions(-) diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h index 35b34ed56..88a0e0dd3 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h @@ -156,7 +156,7 @@ template class ModelHostObject : public JsiHostObject { addFunctions(JSI_EXPORT_FUNCTION( ModelHostObject, promiseHostFunction, + std::string, std::vector, std::string, std::shared_ptr)>(&Model::generate)>, "generateMultimodal")); diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp index 2dd342702..e929ead9b 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.cpp @@ -72,6 +72,7 @@ std::string 
LLM::generate(std::string input, std::string LLM::generate(std::string prompt, std::vector imagePaths, + std::string imageToken, std::shared_ptr callback) { if (!runner_ || !runner_->is_loaded()) { throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded, @@ -82,17 +83,20 @@ std::string LLM::generate(std::string prompt, RnExecutorchErrorCode::InvalidUserInput, "This is a text-only model. Call generate(prompt, cb)."); } + if (imageToken.empty()) { + imageToken = ""; + } - // Split rendered prompt on "" placeholders and interleave with images. - static constexpr const char *kImageToken = ""; - static constexpr size_t kImageTokenLen = 7; // strlen("") + // Split rendered prompt on imageToken placeholders and interleave with + // images. + const size_t kImageTokenLen = imageToken.size(); std::vector inputs; size_t imageIdx = 0; size_t searchPos = 0; while (true) { - size_t found = prompt.find(kImageToken, searchPos); + size_t found = prompt.find(imageToken, searchPos); if (found == std::string::npos) { // Remaining text after last image (or entire prompt if no images) if (searchPos < prompt.size()) { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h index 514760908..d4e44ec8d 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/llm/LLM.h @@ -24,9 +24,10 @@ class LLM : public BaseModel { std::string generate(std::string prompt, std::shared_ptr callback); - // Multimodal: pre-rendered prompt string with placeholders + + // Multimodal: pre-rendered prompt string with imageToken placeholders + // ordered list of image paths (one per placeholder) std::string generate(std::string prompt, std::vector imagePaths, + std::string imageToken, std::shared_ptr callback); void interrupt(); diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh 
b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh index 324841d9b..941885e54 100755 --- a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh +++ b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh @@ -62,7 +62,7 @@ MODELS=( "whisper_tokenizer.json|https://huggingface.co/software-mansion/react-native-executorch-whisper-tiny.en/resolve/v0.6.0/tokenizer.json" "smolLm2_135M_8da4w.pte|https://huggingface.co/software-mansion/react-native-executorch-smolLm-2/resolve/v0.6.0/smolLm-2-135M/quantized/smolLm2_135M_8da4w.pte" "smollm_tokenizer.json|https://huggingface.co/software-mansion/react-native-executorch-smolLm-2/resolve/v0.6.0/tokenizer.json" - "lfm2_5_vl_quantized_xnnpack_v2.pte|https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/lfm2_5_vl_quantized_xnnpack_v2.pte" + "lfm2_5_vl_quantized_xnnpack_v2.pte|https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/lfm2_5_vl_quantized_xnnpack_latest.pte" "tokenizer_2.5.json|https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_2.5.json" "deeplabV3_xnnpack_fp32.pte|https://huggingface.co/software-mansion/react-native-executorch-deeplab-v3/resolve/v0.6.0/xnnpack/deeplabV3_xnnpack_fp32.pte" "xnnpack_crnn_english.pte|https://huggingface.co/software-mansion/react-native-executorch-recognizer-crnn.en/resolve/v0.7.0/xnnpack/english/xnnpack_crnn_english.pte" diff --git a/packages/react-native-executorch/common/runner/base_llm_runner.h b/packages/react-native-executorch/common/runner/base_llm_runner.h index c26a3c2d3..2dd929be7 100644 --- a/packages/react-native-executorch/common/runner/base_llm_runner.h +++ b/packages/react-native-executorch/common/runner/base_llm_runner.h @@ -84,7 +84,6 @@ class BaseLLMRunner { std::unordered_map metadata_; std::unique_ptr io_manager_; std::unique_ptr> eos_ids_; - bool shouldStop_{false}; }; } // namespace rnexecutorch::llm::runner diff --git 
a/packages/react-native-executorch/common/runner/multimodal_runner.cpp b/packages/react-native-executorch/common/runner/multimodal_runner.cpp index 7eda70870..c2aa69204 100644 --- a/packages/react-native-executorch/common/runner/multimodal_runner.cpp +++ b/packages/react-native-executorch/common/runner/multimodal_runner.cpp @@ -42,7 +42,7 @@ Error MultimodalRunner::load_subcomponents() { rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info, "[MultimodalRunner] Loading", encoders_.size(), "encoder(s)"); for (auto &[type, encoder] : encoders_) { - encoder->load(); + ET_CHECK_OK_OR_RETURN_ERROR(encoder->load()); } llm::Stats *stats_ptr = &stats_; @@ -90,9 +90,9 @@ Error MultimodalRunner::generate_internal( stats_.prompt_eval_end_ms = llm::time_in_ms(); stats_.num_prompt_tokens = pos_; - int32_t resolved_max_new = - static_cast(config_.max_context_length - pos_); - resolved_max_new = std::max(0, resolved_max_new); + int32_t resolved_max_new = resolve_max_new_tokens( + static_cast(pos_), config_.max_seq_len, + config_.max_context_length, config_.max_new_tokens); std::vector seed_tokens = {prefill_next_token}; auto wrapped_callback = [&](const std::string &piece) { diff --git a/packages/react-native-executorch/common/runner/text_runner.cpp b/packages/react-native-executorch/common/runner/text_runner.cpp index 063775be4..d535dba6c 100644 --- a/packages/react-native-executorch/common/runner/text_runner.cpp +++ b/packages/react-native-executorch/common/runner/text_runner.cpp @@ -69,7 +69,6 @@ Error TextRunner::generate_internal( }; stats_.inference_start_ms = llm::time_in_ms(); - shouldStop_ = false; int64_t context_len_left = static_cast(config_.max_context_length) - pos_; diff --git a/packages/react-native-executorch/src/controllers/LLMController.ts b/packages/react-native-executorch/src/controllers/LLMController.ts index 378817833..13a6c4c34 100644 --- a/packages/react-native-executorch/src/controllers/LLMController.ts +++ 
b/packages/react-native-executorch/src/controllers/LLMController.ts @@ -237,6 +237,7 @@ export class LLMController { ? await this.nativeModule.generateMultimodal( input, imagePaths, + this.tokenizerConfig?.image_token ?? '', this.onToken ) : await this.nativeModule.generate(input, this.onToken); diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts index 877f3a02d..72c7f4d96 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useLLM.ts @@ -97,9 +97,9 @@ export function useLLM({ ); const generate = useCallback( - (messages: Message[], tools?: LLMTool[]) => { + (messages: Message[], tools?: LLMTool[], imagePaths?: string[]) => { setResponse(''); - return controllerInstance.generate(messages, tools); + return controllerInstance.generate(messages, tools, imagePaths); }, [controllerInstance] ); diff --git a/packages/react-native-executorch/src/types/llm.ts b/packages/react-native-executorch/src/types/llm.ts index 15b070bc5..ac57355f1 100644 --- a/packages/react-native-executorch/src/types/llm.ts +++ b/packages/react-native-executorch/src/types/llm.ts @@ -5,7 +5,7 @@ import { ResourceSource } from './common'; * Capabilities a multimodal LLM can have. * @category Types */ -export type LLMCapability = 'vision' | 'audio'; +export type LLMCapability = 'vision'; /** * Derives the media argument shape for `sendMessage` from a capabilities tuple. 
From 551a30656326663e0f1904f2b4730946ca8bf8ce Mon Sep 17 00:00:00 2001 From: Norbert Klockiewicz Date: Thu, 5 Mar 2026 17:16:15 +0100 Subject: [PATCH 46/46] revert: remove TextRunnerTests and VLMTests suites Co-Authored-By: Claude Sonnet 4.6 --- .../common/rnexecutorch/tests/CMakeLists.txt | 24 ---- .../integration/MultimodalRunnerTest.cpp | 118 ------------------ .../tests/integration/TextRunnerTest.cpp | 109 ---------------- .../common/rnexecutorch/tests/run_tests.sh | 4 - 4 files changed, 255 deletions(-) delete mode 100644 packages/react-native-executorch/common/rnexecutorch/tests/integration/MultimodalRunnerTest.cpp delete mode 100644 packages/react-native-executorch/common/rnexecutorch/tests/integration/TextRunnerTest.cpp diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt index ebf390691..159f00159 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt +++ b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt @@ -223,30 +223,6 @@ add_rn_test(LLMTests integration/LLMTest.cpp LIBS tokenizers_deps opencv_deps ) -add_rn_test(TextRunnerTests integration/TextRunnerTest.cpp - SOURCES - ${COMMON_DIR}/runner/base_llm_runner.cpp - ${COMMON_DIR}/runner/text_runner.cpp - ${COMMON_DIR}/runner/text_prefiller.cpp - ${COMMON_DIR}/runner/text_decoder_runner.cpp - ${COMMON_DIR}/runner/sampler.cpp - ${COMMON_DIR}/runner/arange_util.cpp - LIBS tokenizers_deps -) - -add_rn_test(VLMTests integration/MultimodalRunnerTest.cpp - SOURCES - ${COMMON_DIR}/runner/base_llm_runner.cpp - ${COMMON_DIR}/runner/multimodal_runner.cpp - ${COMMON_DIR}/runner/multimodal_prefiller.cpp - ${COMMON_DIR}/runner/text_decoder_runner.cpp - ${COMMON_DIR}/runner/sampler.cpp - ${COMMON_DIR}/runner/arange_util.cpp - ${COMMON_DIR}/runner/encoders/vision_encoder.cpp - ${IMAGE_UTILS_SOURCES} - LIBS tokenizers_deps opencv_deps -) - 
add_rn_test(TextToImageTests integration/TextToImageTest.cpp SOURCES ${RNEXECUTORCH_DIR}/models/text_to_image/TextToImage.cpp diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/MultimodalRunnerTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/MultimodalRunnerTest.cpp deleted file mode 100644 index 038fa7f6e..000000000 --- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/MultimodalRunnerTest.cpp +++ /dev/null @@ -1,118 +0,0 @@ -#include -#include -#include - -#include -#include -#include -#include -#include - -using ::executorch::extension::Module; -using ::executorch::extension::llm::VisionEncoder; -using ::executorch::runtime::Error; -using ::rnexecutorch::llm::runner::MultimodalType; - -constexpr auto kTextModel = "smolLm2_135M_8da4w.pte"; -constexpr auto kTextTokenizer = "smollm_tokenizer.json"; -constexpr auto kVLMModel = "lfm2_5_vl_quantized_xnnpack_v2.pte"; -constexpr auto kVLMTokenizer = "tokenizer_2.5.json"; -constexpr auto kTestImage = "test_image.jpg"; - -static std::map> -makeVisionEncoders(Module *module) { - std::map> - encoders; - encoders[MultimodalType::Image] = std::make_unique(module); - return encoders; -} - -// ============================================================================ -// Error-path tests (text-only SmolLM2 — no vision_encoder method) -// ============================================================================ - -TEST(MultimodalRunnerTest, LoadFailsWhenVisionEncoderMissing) { - auto module = std::make_unique(kTextModel, Module::LoadMode::File); - auto encoders = makeVisionEncoders(module.get()); - rnexecutorch::llm::runner::MultimodalRunner runner( - std::move(module), kTextTokenizer, std::move(encoders)); - EXPECT_THROW(runner.load(), rnexecutorch::RnExecutorchError); -} - -TEST(MultimodalRunnerTest, IsLoadedReturnsFalseBeforeLoad) { - auto module = std::make_unique(kTextModel, Module::LoadMode::File); - auto encoders = 
makeVisionEncoders(module.get()); - rnexecutorch::llm::runner::MultimodalRunner runner( - std::move(module), kTextTokenizer, std::move(encoders)); - EXPECT_FALSE(runner.is_loaded()); -} - -// ============================================================================ -// Integration tests (require VLM .pte) -// ============================================================================ - -class VLMTest : public ::testing::Test { -protected: - std::unique_ptr runner_; - - void SetUp() override { - auto module = std::make_unique(kVLMModel, Module::LoadMode::File); - auto encoders = makeVisionEncoders(module.get()); - runner_ = std::make_unique( - std::move(module), kVLMTokenizer, std::move(encoders)); - auto err = runner_->load(); - ASSERT_EQ(err, Error::Ok) << "VLM model load failed"; - } -}; - -TEST_F(VLMTest, LoadSucceedsWithRealVLMModel) { - EXPECT_TRUE(runner_->is_loaded()); -} - -TEST_F(VLMTest, MetadataApplied_KVCache) { - EXPECT_TRUE(runner_->config_.enable_kv_cache); -} - -TEST_F(VLMTest, GenerateTextOnlyInputWorks) { - runner_->set_temperature(0.0f); - auto err = runner_->generate( - "<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\n"); - EXPECT_EQ(err, Error::Ok); - EXPECT_GT(runner_->pos_, 0); -} - -TEST_F(VLMTest, GenerateWithImageProducesTokens) { - runner_->set_temperature(0.0f); - - std::vector<::executorch::extension::llm::MultimodalInput> inputs = { - ::executorch::extension::llm::make_image_input(kTestImage), - ::executorch::extension::llm::make_text_input( - "<|im_start|>user\nDescribe this image briefly." 
- "<|im_end|>\n<|im_start|>assistant\n"), - }; - - auto err = runner_->generate_internal(inputs, nullptr); - EXPECT_EQ(err, Error::Ok); - EXPECT_GT(runner_->pos_, 0); -} - -TEST_F(VLMTest, EmbeddingCacheHitOnRepeatedImage) { - runner_->set_temperature(0.0f); - - // First call — cache miss, runs vision_encoder - std::vector<::executorch::extension::llm::MultimodalInput> inputs = { - ::executorch::extension::llm::make_image_input(kTestImage), - ::executorch::extension::llm::make_text_input( - "<|im_start|>user\nWhat is this?<|im_end|>\n<|im_start|>assistant\n"), - }; - runner_->generate_internal(inputs, nullptr); - runner_->reset(); - - // Second call — same image path, should hit cache - // (no functional assertion possible without instrumenting the encoder, - // but this at least verifies it doesn't crash or error) - auto err = runner_->generate_internal(inputs, nullptr); - EXPECT_EQ(err, Error::Ok); -} diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextRunnerTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextRunnerTest.cpp deleted file mode 100644 index 169310ed3..000000000 --- a/packages/react-native-executorch/common/rnexecutorch/tests/integration/TextRunnerTest.cpp +++ /dev/null @@ -1,109 +0,0 @@ -#include -#include - -#include -#include -#include - -using ::executorch::extension::Module; -using ::executorch::runtime::Error; - -constexpr auto kTextModel = "smolLm2_135M_8da4w.pte"; -constexpr auto kTextTokenizer = "smollm_tokenizer.json"; -constexpr auto kSystemPrompt = "You are a helpful assistant. 
Assist the user " - "to the best of your abilities."; - -static std::string formatChatML(const std::string &systemPrompt, - const std::string &userMessage) { - return "<|im_start|>system\n" + systemPrompt + "<|im_end|>\n" + - "<|im_start|>user\n" + userMessage + "<|im_end|>\n" + - "<|im_start|>assistant\n"; -} - -TEST(TextRunnerTest, ConstructorAndLoadSucceeds) { - auto module = std::make_unique(kTextModel, Module::LoadMode::File); - rnexecutorch::llm::runner::TextRunner runner(std::move(module), - kTextTokenizer); - auto err = runner.load(); - EXPECT_EQ(err, Error::Ok); - EXPECT_TRUE(runner.is_loaded()); -} - -TEST(TextRunnerTest, MetadataApplied_EnableDynamicShape) { - // SmolLM2-135M exports enable_dynamic_shape = 1 - // After load(), config_.enable_dynamic_shape must be true (our fix) - auto module = std::make_unique(kTextModel, Module::LoadMode::File); - rnexecutorch::llm::runner::TextRunner runner(std::move(module), - kTextTokenizer); - runner.load(); - EXPECT_TRUE(runner.config_.enable_dynamic_shape); -} - -TEST(TextRunnerTest, MetadataApplied_KVCache) { - // SmolLM2-135M exports use_kv_cache = 1 - auto module = std::make_unique(kTextModel, Module::LoadMode::File); - rnexecutorch::llm::runner::TextRunner runner(std::move(module), - kTextTokenizer); - runner.load(); - EXPECT_TRUE(runner.config_.enable_kv_cache); -} - -TEST(TextRunnerTest, SetTemperaturePropagatesAfterLoad) { - auto module = std::make_unique(kTextModel, Module::LoadMode::File); - rnexecutorch::llm::runner::TextRunner runner(std::move(module), - kTextTokenizer); - runner.load(); - runner.set_temperature(0.3f); - EXPECT_FLOAT_EQ(runner.config_.temperature, 0.3f); -} - -TEST(TextRunnerTest, ResetZerosPos) { - auto module = std::make_unique(kTextModel, Module::LoadMode::File); - rnexecutorch::llm::runner::TextRunner runner(std::move(module), - kTextTokenizer); - runner.pos_ = 42; - runner.reset(); - EXPECT_EQ(runner.pos_, 0); -} - -TEST(TextRunnerTest, GenerateProducesTokens) { - auto module = 
std::make_unique(kTextModel, Module::LoadMode::File); - rnexecutorch::llm::runner::TextRunner runner(std::move(module), - kTextTokenizer); - runner.load(); - runner.set_temperature(0.0f); - - std::string prompt = formatChatML(kSystemPrompt, "Say: hello"); - auto err = runner.generate(prompt); - EXPECT_EQ(err, Error::Ok); - EXPECT_GT(runner.pos_, 0); -} - -TEST(TextRunnerTest, ParallelPrefillEnabled) { - // Confirms the fix: enable_dynamic_shape from metadata now unconditionally - // applied - auto module = std::make_unique(kTextModel, Module::LoadMode::File); - rnexecutorch::llm::runner::TextRunner runner(std::move(module), - kTextTokenizer); - runner.load(); - EXPECT_TRUE(runner.config_.enable_dynamic_shape); -} - -TEST(TextRunnerTest, StopHaltsGeneration) { - auto module = std::make_unique(kTextModel, Module::LoadMode::File); - rnexecutorch::llm::runner::TextRunner runner(std::move(module), - kTextTokenizer); - runner.load(); - runner.set_temperature(0.0f); - - int token_count = 0; - std::string prompt = formatChatML(kSystemPrompt, "Count to one hundred"); - runner.generate(prompt, {}, [&](const std::string &) { - token_count++; - if (token_count >= 3) { - runner.stop(); - } - }); - EXPECT_GT(token_count, 0); - EXPECT_LE(token_count, 5); // stopped early -} diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh index 941885e54..360aa9d11 100755 --- a/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh +++ b/packages/react-native-executorch/common/rnexecutorch/tests/run_tests.sh @@ -29,8 +29,6 @@ TEST_EXECUTABLES=( "TokenizerModuleTests" "SpeechToTextTests" "LLMTests" - "TextRunnerTests" - "VLMTests" "ImageSegmentationTests" "TextToImageTests" "OCRTests" @@ -62,8 +60,6 @@ MODELS=( "whisper_tokenizer.json|https://huggingface.co/software-mansion/react-native-executorch-whisper-tiny.en/resolve/v0.6.0/tokenizer.json" 
"smolLm2_135M_8da4w.pte|https://huggingface.co/software-mansion/react-native-executorch-smolLm-2/resolve/v0.6.0/smolLm-2-135M/quantized/smolLm2_135M_8da4w.pte" "smollm_tokenizer.json|https://huggingface.co/software-mansion/react-native-executorch-smolLm-2/resolve/v0.6.0/tokenizer.json" - "lfm2_5_vl_quantized_xnnpack_v2.pte|https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/lfm2_5_vl_quantized_xnnpack_latest.pte" - "tokenizer_2.5.json|https://huggingface.co/nklockiewicz/lfm2-vl-et/resolve/main/tokenizer_2.5.json" "deeplabV3_xnnpack_fp32.pte|https://huggingface.co/software-mansion/react-native-executorch-deeplab-v3/resolve/v0.6.0/xnnpack/deeplabV3_xnnpack_fp32.pte" "xnnpack_crnn_english.pte|https://huggingface.co/software-mansion/react-native-executorch-recognizer-crnn.en/resolve/v0.7.0/xnnpack/english/xnnpack_crnn_english.pte" "xnnpack_craft_quantized.pte|https://huggingface.co/software-mansion/react-native-executorch-detector-craft/resolve/v0.7.0/xnnpack/xnnpack_craft.pte"