diff --git a/CMakeLists.txt b/CMakeLists.txt
index 00f5224..e9515d9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -46,6 +46,10 @@ set(LLAMA_BUILD_TESTS OFF CACHE BOOL "" FORCE)
set(LLAMA_BUILD_SERVER OFF CACHE BOOL "" FORCE)
add_subdirectory(deps/llama.cpp ${CMAKE_BINARY_DIR}/llama.cpp EXCLUDE_FROM_ALL)
+# --- libmtmd (multimodal/vision support from llama.cpp) ---
+set(LLAMA_INSTALL_VERSION "0.0.0" CACHE STRING "" FORCE)
+add_subdirectory(deps/llama.cpp/tools/mtmd ${CMAKE_BINARY_DIR}/mtmd EXCLUDE_FROM_ALL)
+
# --- sherpa-onnx (STT + TTS + VAD) ---
set(SHERPA_ONNX_ENABLE_C_API ON CACHE BOOL "Enable C API" FORCE)
set(SHERPA_ONNX_ENABLE_BINARY OFF CACHE BOOL "" FORCE)
@@ -99,8 +103,11 @@ add_library(rcli STATIC
src/engines/metalrt_engine.cpp
src/engines/metalrt_stt_engine.cpp
src/engines/metalrt_tts_engine.cpp
+ src/engines/vlm_engine.cpp
src/audio/audio_io.cpp
src/audio/mic_permission.mm
+ src/audio/camera_capture.mm
+ src/audio/screen_capture.mm
src/pipeline/orchestrator.cpp
src/pipeline/sentence_detector.cpp
src/tools/tool_engine.cpp
@@ -133,13 +140,14 @@ add_library(rcli STATIC
src/api/rcli_api.cpp
)
-set_source_files_properties(src/audio/mic_permission.mm
+set_source_files_properties(src/audio/mic_permission.mm src/audio/camera_capture.mm src/audio/screen_capture.mm
PROPERTIES LANGUAGE CXX)
target_include_directories(rcli PUBLIC
${CMAKE_CURRENT_SOURCE_DIR}/src
${CMAKE_CURRENT_SOURCE_DIR}/deps/llama.cpp/include
${CMAKE_CURRENT_SOURCE_DIR}/deps/llama.cpp/ggml/include
+ ${CMAKE_CURRENT_SOURCE_DIR}/deps/llama.cpp/tools/mtmd
${CMAKE_CURRENT_SOURCE_DIR}/deps/sherpa-onnx/sherpa-onnx/c-api
${usearch_SOURCE_DIR}/include
)
@@ -147,12 +155,18 @@ target_include_directories(rcli PUBLIC
target_link_libraries(rcli PUBLIC
llama
ggml
+ mtmd
sherpa-onnx-c-api
"-framework CoreAudio"
"-framework AudioToolbox"
"-framework AudioUnit"
"-framework Foundation"
"-framework AVFoundation"
+ "-framework AppKit"
+ "-framework CoreImage"
+ "-framework CoreMedia"
+ "-framework CoreVideo"
+ "-framework CoreGraphics"
"-framework IOKit"
)
@@ -186,6 +200,27 @@ target_compile_definitions(rcli_cli PRIVATE
RCLI_VERSION="${PROJECT_VERSION}"
)
+# =============================================================================
+# rcli_overlay — standalone Cocoa helper for visual overlay window
+# =============================================================================
+add_executable(rcli_overlay
+ src/audio/rcli_overlay.m
+)
+
+set_source_files_properties(src/audio/rcli_overlay.m PROPERTIES LANGUAGE CXX)
+
+target_compile_options(rcli_overlay PRIVATE -x objective-c++)
+
+target_link_libraries(rcli_overlay PRIVATE
+ "-framework AppKit"
+ "-framework CoreGraphics"
+)
+
+set_target_properties(rcli_overlay PROPERTIES
+ OUTPUT_NAME "rcli_overlay"
+ RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+)
+
# =============================================================================
# rcli_test — test executable
# =============================================================================
diff --git a/README.md b/README.md
index dcefc11..972342a 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@
-**RCLI** is an on-device voice AI for macOS. A complete STT + LLM + TTS pipeline running natively on Apple Silicon — 38 macOS actions via voice, local RAG over your documents, sub-200ms end-to-end latency. No cloud, no API keys.
+**RCLI** is an on-device voice AI for macOS. A complete STT + LLM + TTS + VLM pipeline running natively on Apple Silicon — 40 macOS actions via voice, local RAG over your documents, on-device vision (camera & screen analysis), sub-200ms end-to-end latency. No cloud, no API keys.
Powered by [MetalRT](#metalrt-gpu-engine), a proprietary GPU inference engine built by [RunAnywhere, Inc.](https://runanywhere.ai) specifically for Apple Silicon.
@@ -112,6 +112,9 @@ rcli # interactive TUI (push-to-talk + text)
rcli listen # continuous voice mode
rcli ask "open Safari" # one-shot command
rcli ask "play some jazz on Spotify"
+rcli vlm photo.jpg "what's in this image?" # vision analysis
+rcli camera # live camera VLM
+rcli screen # screen capture VLM
rcli metalrt # MetalRT GPU engine management
rcli llamacpp # llama.cpp engine management
```
@@ -149,7 +152,18 @@ A full STT + LLM + TTS pipeline running on Metal GPU with three concurrent threa
- **Tool Calling** — LLM-native tool call formats (Qwen3, LFM2, etc.)
- **Multi-turn Memory** — Sliding window conversation history with token-budget trimming
-### 38 macOS Actions
+### Vision (VLM)
+
+Analyze images, camera captures, and screen regions using on-device vision-language models. VLM runs on the llama.cpp engine via Metal GPU — no cloud.
+
+- **Image Analysis** — `rcli vlm photo.jpg "describe this"` for single-image queries
+- **Camera** — Press **V** in the TUI or run `rcli camera` for live camera analysis
+- **Screen Capture** — Press **S** in the TUI or run `rcli screen` to analyze screen regions
+- **Models** — Qwen3 VL 2B, Liquid LFM2 VL 1.6B, SmolVLM 500M — download on demand via `rcli models vlm`
+
+> **Note:** VLM is currently available on the llama.cpp engine. MetalRT VLM support is coming soon.
+
+### 40 macOS Actions
Control your Mac by voice or text. The LLM routes intent to actions executed locally via AppleScript and shell commands.
@@ -161,7 +175,7 @@ Control your Mac by voice or text. The LLM routes intent to actions executed loc
| **System** | `open_app`, `quit_app`, `set_volume`, `toggle_dark_mode`, `screenshot`, `lock_screen` |
| **Web** | `search_web`, `search_youtube`, `open_url`, `open_maps` |
-Run `rcli actions` to see all 38, or toggle them on/off in the TUI Actions panel.
+Run `rcli actions` to see all 40, or toggle them on/off in the TUI Actions panel.
> **Tip:** If tool calling feels unreliable, press **X** in the TUI to clear the conversation and reset context. With small LLMs, accumulated context can degrade tool-calling accuracy — a fresh context often fixes it.
@@ -181,7 +195,9 @@ A terminal dashboard with push-to-talk, live hardware monitoring, model manageme
| Key | Action |
|-----|--------|
| **SPACE** | Push-to-talk |
-| **M** | Models — browse, download, hot-swap LLM/STT/TTS |
+| **V** | Camera — capture and analyze with VLM |
+| **S** | Screen — capture and analyze a screen region with VLM |
+| **M** | Models — browse, download, hot-swap LLM/STT/TTS/VLM |
| **A** | Actions — browse, enable/disable macOS actions |
| **R** | RAG — ingest documents |
| **X** | Clear conversation and reset context |
@@ -207,7 +223,7 @@ MetalRT is distributed under a [proprietary license](https://github.com/Runanywh
## Supported Models
-RCLI supports 20+ models across LLM, STT, TTS, VAD, and embeddings. All run locally on Apple Silicon. Use `rcli models` to browse, download, or switch.
+RCLI supports 20+ models across LLM, STT, TTS, VLM, VAD, and embeddings. All run locally on Apple Silicon. Use `rcli models` to browse, download, or switch.
**LLM:** LFM2 1.2B (default), LFM2 350M, LFM2.5 1.2B, LFM2 2.6B, Qwen3 0.6B, Qwen3.5 0.8B/2B/4B, Qwen3 4B
@@ -215,10 +231,13 @@ RCLI supports 20+ models across LLM, STT, TTS, VAD, and embeddings. All run loca
**TTS:** Piper Lessac/Amy, KittenTTS Nano, Matcha LJSpeech, Kokoro English/Multi-lang
-**Default install** (`rcli setup`): ~1GB — LFM2 1.2B + Whisper + Piper + Silero VAD + Snowflake embeddings.
+**VLM:** Qwen3 VL 2B, Liquid LFM2 VL 1.6B, SmolVLM 500M — on-demand download via `rcli models vlm` (llama.cpp engine only)
+
+**Default install** (`rcli setup`): ~1GB — LFM2 1.2B + Whisper + Piper + Silero VAD + Snowflake embeddings. VLM models are downloaded on demand.
```bash
rcli models # interactive model management
+rcli models vlm # download/manage VLM models
rcli upgrade-llm # guided LLM upgrade
rcli voices # browse and switch TTS voices
rcli cleanup # remove unused models
@@ -247,10 +266,13 @@ All dependencies are vendored or CMake-fetched. Requires CMake 3.15+ and Apple C
rcli Interactive TUI (push-to-talk + text + trace)
rcli listen Continuous voice mode
rcli ask One-shot text command
+rcli vlm [prompt] Analyze an image with VLM
+rcli camera [prompt] Live camera capture + VLM analysis
+rcli screen [prompt] Screen capture + VLM analysis
rcli actions [name] List actions or show detail
rcli rag ingest Index documents for RAG
rcli rag query Query indexed documents
-rcli models [llm|stt|tts] Manage AI models
+rcli models [llm|stt|tts|vlm] Manage AI models
rcli voices Manage TTS voices
rcli metalrt MetalRT GPU engine management
rcli llamacpp llama.cpp engine management
diff --git a/src/api/rcli_api.cpp b/src/api/rcli_api.cpp
index 8baa3ef..f292c78 100644
--- a/src/api/rcli_api.cpp
+++ b/src/api/rcli_api.cpp
@@ -16,6 +16,7 @@
#include "rag/index_builder.h"
#include "pipeline/text_sanitizer.h"
#include "pipeline/sentence_detector.h"
+#include "audio/screen_capture.h"
#include
#include
#include
@@ -32,9 +33,15 @@
#include
#include
#include
+#include <spawn.h>
+#include <sys/wait.h>
+
+extern char** environ;
#include "actions/action_registry.h"
#include "actions/macos_actions.h"
+#include "engines/vlm_engine.h"
+#include "models/vlm_model_registry.h"
using namespace rastack;
@@ -109,6 +116,13 @@ struct RCLIEngine {
// so the context gauge shows stable, meaningful usage.
int ctx_main_prompt_tokens = 0;
+ // VLM (Vision Language Model) subsystem
+ VlmEngine vlm_engine;
+ bool vlm_initialized = false;
+ std::string last_vlm_response;
+ std::string vlm_backend_name; // "llama.cpp (Metal GPU)" or "MetalRT"
+ std::string vlm_model_name; // e.g. "Qwen3 VL 2B"
+
std::mutex mutex;
bool initialized = false;
};
@@ -969,6 +983,113 @@ static std::vector try_parse_bare_tool_calls(
return calls;
}
+// Forward declaration (defined later in VLM section)
+static int vlm_init_locked(RCLIEngine* engine);
+
+// =============================================================================
+// Screen intent detection — intercept voice commands about the user's screen
+// =============================================================================
+
+static bool has_word(const std::string& text, const char* word) {
+ return text.find(word) != std::string::npos;
+}
+
+static bool is_screen_intent(const std::string& input) {
+ // Normalize to lowercase for matching
+ std::string lower = input;
+ for (auto& c : lower) c = (char)std::tolower((unsigned char)c);
+
+ // --- Tier 1: explicit screenshot keywords (always trigger) ---
+ if (has_word(lower, "screenshot") || has_word(lower, "screen capture") ||
+ has_word(lower, "screen shot"))
+ return true;
+
+ // --- Tier 2: "screen" + any vision/action verb ---
+ bool has_screen = has_word(lower, "screen");
+ if (has_screen) {
+ static const char* screen_verbs[] = {
+ "look", "see", "show", "what", "tell", "describe", "explain",
+ "check", "analyze", "read", "capture", "going on", "happening",
+ };
+ for (const auto* v : screen_verbs) {
+ if (has_word(lower, v)) return true;
+ }
+ }
+
+ // --- Tier 3: visual context phrases (no "screen" needed) ---
+ // "does this look good/right/ok", "how does this look", etc.
+ if (has_word(lower, "does this look") || has_word(lower, "how does this look"))
+ return true;
+ // "what am I looking at"
+ if (has_word(lower, "looking at") && has_word(lower, "what"))
+ return true;
+ // "can you see this/that", "what do you see", "what can you see"
+ if ((has_word(lower, "can you see") || has_word(lower, "do you see")) &&
+ !has_word(lower, "file") && !has_word(lower, "code") && !has_word(lower, "error"))
+ return true;
+ // "what's happening here", "explain what's happening"
+ if (has_word(lower, "happening here") || has_word(lower, "happening on"))
+ return true;
+
+ return false;
+}
+
+// Capture active window + analyze with VLM. Returns response or empty on failure.
+// Caller must hold engine->mutex.
+static std::string handle_screen_intent(RCLIEngine* engine, const std::string& user_text) {
+ // Generate a temp path
+ auto ts = std::chrono::system_clock::now().time_since_epoch().count();
+ std::string path = "/tmp/rcli_screen_" + std::to_string(ts) + ".jpg";
+
+ int rc;
+ const char* capture_source;
+ if (screen_capture_overlay_active()) {
+ // Visual mode: capture the overlay region
+ capture_source = "visual frame";
+ rc = screen_capture_overlay_region(path.c_str());
+ } else {
+ // Fallback: capture the previously active app's window
+ char target_app[256];
+ screen_capture_target_app_name(target_app, sizeof(target_app));
+ capture_source = target_app;
+ rc = screen_capture_behind_terminal(path.c_str());
+ }
+ LOG_INFO("RCLI", "[screen_intent] Capturing %s → %s", capture_source, path.c_str());
+ if (rc != 0) {
+ LOG_ERROR("RCLI", "[screen_intent] Screen capture failed");
+ return "I couldn't capture your screen. Please check screen recording permissions "
+ "in System Settings > Privacy & Security > Screen Recording.";
+ }
+
+ // Initialize VLM if needed
+ if (!engine->vlm_initialized) {
+ if (vlm_init_locked(engine) != 0) {
+ return "I can see you're asking about your screen, but VLM isn't available. "
+ "It requires the llama.cpp engine and a VLM model. "
+ "Switch with: rcli engine llamacpp, then download a model: rcli models vlm";
+ }
+ }
+
+ // Build a natural prompt from the user's words
+ std::string vlm_prompt = user_text;
+ if (vlm_prompt.empty()) {
+ vlm_prompt = "Describe what you see on this screen in detail.";
+ }
+
+ std::string result = engine->vlm_engine.analyze_image(path, vlm_prompt, nullptr);
+
+ if (result.empty()) {
+ return "I captured your screen but the analysis failed. Please try again.";
+ }
+
+ // Prepend which app was captured so the user knows
+ std::string prefixed = "[Captured: " + std::string(capture_source) + "]\n" + result;
+
+ // Store for stats retrieval
+ engine->last_vlm_response = prefixed;
+ return prefixed;
+}
+
// =============================================================================
// Process command entry points
// =============================================================================
@@ -984,6 +1105,14 @@ const char* rcli_process_command(RCLIHandle handle, const char* text) {
LOG_TRACE("RCLI", "[process_command] engine->mutex acquired, input='%.40s'", text);
std::string input(text);
+ // --- Screen intent intercept: capture active window + VLM ---
+ if (is_screen_intent(input)) {
+ engine->last_response = handle_screen_intent(engine, input);
+ engine->conversation_history.emplace_back("user", input);
+ engine->conversation_history.emplace_back("assistant", engine->last_response);
+ return engine->last_response.c_str();
+ }
+
// --- MetalRT path: tool-aware inference via generate_raw (pre-formatted prompt) ---
if (engine->pipeline.using_metalrt()) {
auto& mrt = engine->pipeline.metalrt_llm();
@@ -1027,19 +1156,12 @@ const char* rcli_process_command(RCLIHandle handle, const char* text) {
full_prompt.compare(0, cached.size(), cached) == 0) {
std::string full_continuation = full_prompt.substr(cached.size());
- if (engine->metalrt_kv_continuation_len > 0 &&
- engine->metalrt_kv_continuation_len < full_continuation.size()) {
- std::string new_part = full_continuation.substr(engine->metalrt_kv_continuation_len);
- LOG_TRACE("RCLI", "[process_command] incremental continue "
- "(new=%zu chars, skip=%zu already in KV)",
- new_part.size(), engine->metalrt_kv_continuation_len);
- raw_output = mrt.generate_raw_continue(new_part, nullptr, false);
- } else {
- LOG_TRACE("RCLI", "[process_command] full continue "
- "(continuation=%zu chars)", full_continuation.size());
- raw_output = mrt.generate_raw_continue(full_continuation, nullptr, true);
- }
- engine->metalrt_kv_continuation_len = full_continuation.size();
+ // Always re-prefill full continuation from cached system prompt.
+ // Incremental continue (reset_cache=false) is unsafe because the KV
+ // cache includes generated tokens not tracked by continuation_len.
+ LOG_TRACE("RCLI", "[process_command] full continue "
+ "(continuation=%zu chars)", full_continuation.size());
+ raw_output = mrt.generate_raw_continue(full_continuation, nullptr, true);
} else {
LOG_TRACE("RCLI", "[process_command] calling mrt.generate_raw() ...");
raw_output = mrt.generate_raw(full_prompt);
@@ -1499,6 +1621,92 @@ const char* rcli_process_and_speak(RCLIHandle handle, const char* text,
engine->streaming_cancelled.store(false, std::memory_order_release);
std::string input(text);
+ // --- Screen intent intercept: capture + VLM + sentence-streamed TTS ---
+ if (is_screen_intent(input)) {
+ auto t_start_screen = std::chrono::steady_clock::now();
+ std::string response = handle_screen_intent(engine, input);
+ engine->last_response = response;
+ engine->conversation_history.emplace_back("user", input);
+ engine->conversation_history.emplace_back("assistant", response);
+
+ // Fire "response" callback so TUI displays the text
+ if (callback) {
+ callback("response", response.c_str(), user_data);
+ }
+
+ // Sentence-streamed TTS (same pattern as LLM path for low TTFA)
+ std::string clean_text = rastack::sanitize_for_tts(response);
+ if (!clean_text.empty()) {
+ if (!engine->pipeline.audio().is_running()) {
+ engine->pipeline.audio().start();
+ }
+ auto* rb = engine->pipeline.playback_ring_buffer();
+ if (rb) {
+ rb->clear();
+
+ // Split into sentences and synthesize each one
+ std::vector<std::string> sentences;
+ rastack::SentenceDetector splitter([&](const std::string& s) {
+ sentences.push_back(s);
+ }, /*min_words=*/3);
+ // Feed the entire text token-by-token (word by word)
+ for (size_t i = 0; i < clean_text.size(); ) {
+ size_t end = clean_text.find(' ', i);
+ if (end == std::string::npos) end = clean_text.size();
+ else end++; // include space
+ splitter.feed(clean_text.substr(i, end - i));
+ i = end;
+ }
+ splitter.flush();
+
+ bool first_audio = false;
+ for (auto& sentence : sentences) {
+ if (engine->streaming_cancelled.load(std::memory_order_acquire)) break;
+
+ std::vector<float> samples;
+ if (engine->pipeline.using_metalrt_tts()) {
+ samples = engine->pipeline.metalrt_tts().synthesize(sentence);
+ } else {
+ samples = engine->pipeline.tts().synthesize(sentence);
+ }
+
+ // Write with backpressure
+ size_t offset = 0;
+ while (offset < samples.size() &&
+ !engine->streaming_cancelled.load(std::memory_order_acquire)) {
+ size_t written = rb->write(samples.data() + offset, samples.size() - offset);
+ offset += written;
+ if (offset < samples.size()) {
+ std::this_thread::sleep_for(std::chrono::milliseconds(10));
+ }
+ }
+
+ if (!first_audio) {
+ first_audio = true;
+ if (callback) {
+ auto now = std::chrono::steady_clock::now();
+ double ttfa_ms = std::chrono::duration(now - t_start_screen).count();
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%.1f", ttfa_ms);
+ callback("first_audio", buf, user_data);
+ }
+ }
+ }
+
+ // Wait for playback to drain
+ size_t samples_per_frame = 256;
+ while (rb->available_read() > samples_per_frame &&
+ !engine->streaming_cancelled.load(std::memory_order_acquire)) {
+ std::this_thread::sleep_for(std::chrono::milliseconds(20));
+ }
+ std::this_thread::sleep_for(std::chrono::milliseconds(200));
+ }
+ }
+
+ if (callback) callback("complete", "{}", user_data);
+ return engine->last_response.c_str();
+ }
+
auto t_start = std::chrono::steady_clock::now();
// --- TTS worker thread (sentence queue → ring buffer → CoreAudio) ---
@@ -1711,19 +1919,14 @@ const char* rcli_process_and_speak(RCLIHandle handle, const char* text,
full_continuation.size(),
engine->metalrt_kv_continuation_len);
- if (engine->metalrt_kv_continuation_len > 0 &&
- engine->metalrt_kv_continuation_len < full_continuation.size()) {
- std::string new_part = full_continuation.substr(engine->metalrt_kv_continuation_len);
- LOG_DEBUG("RCLI", "[speak] incremental continue "
- "(new=%zu chars, skip=%zu already in KV)",
- new_part.size(), engine->metalrt_kv_continuation_len);
- response = mrt.generate_raw_continue(new_part, streaming_cb, false);
- } else {
- LOG_DEBUG("RCLI", "[speak] full continue "
- "(continuation=%zu chars)", full_continuation.size());
- response = mrt.generate_raw_continue(full_continuation, streaming_cb, true);
- }
- engine->metalrt_kv_continuation_len = full_continuation.size();
+ // Always truncate to cached system prompt and re-prefill the full
+ // continuation. The incremental path (reset_cache=false) is unsafe
+ // because the KV cache also contains generated-response tokens that
+ // metalrt_kv_continuation_len does not account for, which causes
+ // duplicate content in the KV and corrupts multi-turn attention.
+ LOG_DEBUG("RCLI", "[speak] full continue "
+ "(continuation=%zu chars)", full_continuation.size());
+ response = mrt.generate_raw_continue(full_continuation, streaming_cb, true);
} else {
LOG_DEBUG("RCLI", "[speak] cache MISS path — calling generate_raw() "
"(has_cache=%d prefix_match=%d)",
@@ -2745,6 +2948,243 @@ void rcli_get_context_info(RCLIHandle handle, int* out_prompt_tokens, int* out_c
}
}
+// =============================================================================
+// VLM (Vision Language Model)
+// =============================================================================
+
+// Recursively create directories (like mkdir -p)
+static bool mkdirs(const std::string& path) {
+ struct stat st;
+ if (stat(path.c_str(), &st) == 0) return S_ISDIR(st.st_mode);
+ // Recurse to create parent
+ auto slash = path.rfind('/');
+ if (slash != std::string::npos && slash > 0) {
+ if (!mkdirs(path.substr(0, slash))) return false;
+ }
+ return mkdir(path.c_str(), 0755) == 0 || errno == EEXIST;
+}
+
+// Download a file using fork/exec to avoid shell injection
+static bool safe_download(const std::string& url, const std::string& dest) {
+ pid_t pid;
+ const char* argv[] = {
+ "curl", "-L", "--progress-bar", "-o", dest.c_str(), url.c_str(), nullptr
+ };
+ int status = 0;
+ posix_spawn_file_actions_t actions;
+ posix_spawn_file_actions_init(&actions);
+ if (posix_spawnp(&pid, "curl", &actions, nullptr,
+ const_cast<char* const*>(argv), environ) != 0) {
+ posix_spawn_file_actions_destroy(&actions);
+ return false;
+ }
+ posix_spawn_file_actions_destroy(&actions);
+ waitpid(pid, &status, 0);
+ return WIFEXITED(status) && WEXITSTATUS(status) == 0;
+}
+
+// Internal init (caller must hold engine->mutex)
+// VLM is only available on the llama.cpp engine. MetalRT VLM support coming soon.
+static int vlm_init_locked(RCLIEngine* engine) {
+ if (engine->vlm_initialized) return 0;
+
+ if (engine->models_dir.empty()) {
+ if (const char* home = getenv("HOME"))
+ engine->models_dir = std::string(home) + "/Library/RCLI/models";
+ else
+ engine->models_dir = "./models";
+ }
+
+ // VLM requires the llama.cpp engine
+ if (engine->initialized && engine->pipeline.using_metalrt()) {
+ LOG_ERROR("VLM", "VLM is currently available with the llama.cpp engine. Switch with: rcli engine llamacpp");
+ return -1;
+ }
+
+ // Check if any VLM model is installed (on-demand, no auto-download)
+ auto vlm_models = rcli::all_vlm_models();
+ rcli::VlmModelDef model_def;
+ bool found = false;
+
+ for (auto& m : vlm_models) {
+ if (rcli::is_vlm_model_installed(engine->models_dir, m)) {
+ model_def = m;
+ found = true;
+ break;
+ }
+ }
+
+ if (!found) {
+ LOG_ERROR("VLM", "No VLM model installed. Download one with: rcli models vlm");
+ return -1;
+ }
+
+ // Initialize VLM engine with the installed model
+ VlmConfig config;
+ config.model_path = engine->models_dir + "/" + model_def.model_filename;
+ config.mmproj_path = engine->models_dir + "/" + model_def.mmproj_filename;
+ config.n_gpu_layers = 99;
+ config.n_ctx = 4096;
+ config.n_batch = 512;
+ config.n_threads = 1;
+ config.n_threads_batch = 8;
+ config.flash_attn = true;
+
+ if (!engine->vlm_engine.init(config)) {
+ LOG_ERROR("VLM", "Failed to initialize VLM engine");
+ return -1;
+ }
+
+ engine->vlm_initialized = true;
+ engine->vlm_backend_name = "llama.cpp (Metal GPU)";
+ engine->vlm_model_name = model_def.name;
+ LOG_INFO("VLM", "VLM engine ready — %s via llama.cpp (Metal GPU)", model_def.name.c_str());
+ return 0;
+}
+
+int rcli_vlm_init(RCLIHandle handle) {
+ if (!handle) return -1;
+ auto* engine = static_cast<RCLIEngine*>(handle);
+ std::lock_guard lock(engine->mutex);
+ return vlm_init_locked(engine);
+}
+
+const char* rcli_vlm_analyze(RCLIHandle handle, const char* image_path, const char* prompt) {
+ if (!handle || !image_path) return "";
+ auto* engine = static_cast<RCLIEngine*>(handle);
+ std::lock_guard lock(engine->mutex);
+
+ if (!engine->vlm_initialized) {
+ if (vlm_init_locked(engine) != 0) {
+ engine->last_vlm_response = "VLM not available. Requires llama.cpp engine (rcli engine llamacpp) and a VLM model (rcli models vlm).";
+ return engine->last_vlm_response.c_str();
+ }
+ }
+
+ std::string text_prompt = prompt && prompt[0]
+ ? std::string(prompt)
+ : "Describe this image in detail.";
+
+ {
+ std::string result = engine->vlm_engine.analyze_image(
+ std::string(image_path), text_prompt, nullptr);
+
+ if (result.empty()) {
+ engine->last_vlm_response = "Error: Failed to analyze image.";
+ } else {
+ engine->last_vlm_response = result;
+ }
+ }
+ return engine->last_vlm_response.c_str();
+}
+
+int rcli_vlm_is_ready(RCLIHandle handle) {
+ if (!handle) return 0;
+ auto* engine = static_cast<RCLIEngine*>(handle);
+ return engine->vlm_initialized ? 1 : 0;
+}
+
+const char* rcli_vlm_backend_name(RCLIHandle handle) {
+ if (!handle) return "";
+ auto* engine = static_cast<RCLIEngine*>(handle);
+ return engine->vlm_backend_name.c_str();
+}
+
+const char* rcli_vlm_model_name(RCLIHandle handle) {
+ if (!handle) return "";
+ auto* engine = static_cast<RCLIEngine*>(handle);
+ return engine->vlm_model_name.c_str();
+}
+
+int rcli_vlm_get_stats(RCLIHandle handle, RCLIVlmStats* out_stats) {
+ if (!handle || !out_stats) return -1;
+ auto* engine = static_cast<RCLIEngine*>(handle);
+ if (!engine->vlm_initialized) return -1;
+
+ auto& s = engine->vlm_engine.last_stats();
+ out_stats->gen_tok_per_sec = s.gen_tps();
+ out_stats->generated_tokens = static_cast<int>(s.generated_tokens);
+ out_stats->total_time_sec = (s.image_encode_us + s.generation_us) / 1e6;
+ out_stats->image_encode_ms = s.image_encode_us / 1000.0;
+ out_stats->first_token_ms = s.first_token_us / 1000.0;
+ return 0;
+}
+
+// =============================================================================
+// VLM GPU swap: enter/exit visual mode by swapping LLM ↔ VLM on GPU
+// =============================================================================
+
+int rcli_vlm_enter(RCLIHandle handle) {
+ if (!handle) return -1;
+ auto* engine = static_cast<RCLIEngine*>(handle);
+ std::lock_guard lock(engine->mutex);
+
+ if (engine->vlm_initialized) return 0;
+ return vlm_init_locked(engine);
+}
+
+int rcli_vlm_exit(RCLIHandle handle) {
+ if (!handle) return -1;
+ auto* engine = static_cast<RCLIEngine*>(handle);
+ std::lock_guard lock(engine->mutex);
+
+ if (engine->vlm_engine.is_initialized()) {
+ engine->vlm_engine.shutdown();
+ }
+
+ engine->vlm_initialized = false;
+ engine->vlm_backend_name.clear();
+ engine->vlm_model_name.clear();
+ LOG_INFO("VLM", "VLM unloaded");
+ return 0;
+}
+
+int rcli_vlm_analyze_stream(RCLIHandle handle, const char* image_path,
+ const char* prompt,
+ RCLIEventCallback callback, void* user_data) {
+ if (!handle || !image_path) return -1;
+ auto* engine = static_cast<RCLIEngine*>(handle);
+ std::lock_guard lock(engine->mutex);
+
+ // Lazy-init VLM if not yet loaded
+ if (!engine->vlm_initialized) {
+ if (vlm_init_locked(engine) != 0) {
+ LOG_ERROR("VLM", "Failed to initialize VLM engine for streaming");
+ return -1;
+ }
+ }
+
+ std::string text_prompt = (prompt && prompt[0])
+ ? std::string(prompt) : "Describe this image in detail.";
+
+ // llama.cpp VLM streaming path
+ rastack::TokenCallback token_cb = nullptr;
+ if (callback) {
+ token_cb = [callback, user_data](const rastack::TokenOutput& tok) {
+ if (!tok.text.empty()) {
+ callback("token", tok.text.c_str(), user_data);
+ }
+ };
+ }
+
+ std::string result = engine->vlm_engine.analyze_image(
+ std::string(image_path), text_prompt, token_cb);
+
+ engine->last_vlm_response = result.empty() ? "Error: Failed to analyze image." : result;
+
+ if (callback) {
+ callback("response", engine->last_vlm_response.c_str(), user_data);
+ auto& s = engine->vlm_engine.last_stats();
+ char stats_buf[256];
+ snprintf(stats_buf, sizeof(stats_buf),
+ "{\"tps\":%.1f,\"tokens\":%lld,\"vision_encode_ms\":%.1f}",
+ s.gen_tps(), s.generated_tokens, s.image_encode_us / 1000.0);
+ callback("stats", stats_buf, user_data);
+ }
+
+ return engine->last_vlm_response.find("Error:") == 0 ? -1 : 0;
+}
+
} // extern "C"
std::vector rcli_get_all_action_defs(RCLIHandle handle) {
diff --git a/src/api/rcli_api.h b/src/api/rcli_api.h
index 5a0e2d3..e6906d1 100644
--- a/src/api/rcli_api.h
+++ b/src/api/rcli_api.h
@@ -262,6 +262,60 @@ const char* rcli_get_stt_model(RCLIHandle handle);
// Both output pointers are optional (pass NULL to skip).
void rcli_get_context_info(RCLIHandle handle, int* out_prompt_tokens, int* out_ctx_size);
+// --- VLM (Vision Language Model) ---
+
+// Initialize the VLM engine with the default VLM model.
+// Lazily downloads the model if not present. Thread-safe.
+// Returns 0 on success, -1 on failure.
+int rcli_vlm_init(RCLIHandle handle);
+
+// Analyze an image with an optional text prompt.
+// image_path: absolute path to an image file (jpg, png, bmp, gif, webp, tga).
+// prompt: text prompt (e.g. "Describe this image"). NULL defaults to "Describe this image in detail."
+// Returns the analysis text. Caller must NOT free the returned pointer.
+const char* rcli_vlm_analyze(RCLIHandle handle, const char* image_path, const char* prompt);
+
+// Check if the VLM engine is initialized and ready for image analysis.
+// Returns 1 if ready, 0 if not.
+int rcli_vlm_is_ready(RCLIHandle handle);
+
+// Get the name of the active VLM backend (e.g. "llama.cpp (Metal GPU)" or "MetalRT").
+// Returns "" if VLM is not initialized.
+const char* rcli_vlm_backend_name(RCLIHandle handle);
+
+// Get the name of the active VLM model (e.g. "Qwen3 VL 2B Instruct").
+// Returns "" if VLM is not initialized.
+const char* rcli_vlm_model_name(RCLIHandle handle);
+
+// VLM performance stats from the last analysis call.
+typedef struct {
+ double gen_tok_per_sec; // Generation tokens/second
+ int generated_tokens; // Total tokens generated
+ double total_time_sec; // Total wall time (image encode + prompt eval + generation)
+ double image_encode_ms; // Time to encode image through vision projector
+ double first_token_ms; // Time-to-first-token (prompt eval + image encode)
+} RCLIVlmStats;
+
+// Get stats from the last VLM analysis. Returns 0 on success.
+int rcli_vlm_get_stats(RCLIHandle handle, RCLIVlmStats* out_stats);
+
+// Swap MetalRT LLM out and VLM in on the GPU (for visual mode).
+// Unloads the LLM model, loads the MetalRT VLM model.
+// Returns 0 on success, -1 on failure.
+int rcli_vlm_enter(RCLIHandle handle);
+
+// Swap MetalRT VLM out and LLM back in on the GPU (exit visual mode).
+// Unloads the VLM model, reloads the LLM and re-caches the system prompt.
+// Returns 0 on success, -1 on failure.
+int rcli_vlm_exit(RCLIHandle handle);
+
+// Streaming VLM image analysis (use after rcli_vlm_enter).
+// Fires callback with events: "token", "response", "stats".
+// Returns 0 on success, -1 on failure.
+int rcli_vlm_analyze_stream(RCLIHandle handle, const char* image_path,
+ const char* prompt,
+ RCLIEventCallback callback, void* user_data);
+
#ifdef __cplusplus
}
#endif
diff --git a/src/audio/camera_capture.h b/src/audio/camera_capture.h
new file mode 100644
index 0000000..1d5ade4
--- /dev/null
+++ b/src/audio/camera_capture.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Capture a single frame from the default camera and save as JPEG.
+// output_path: where to save the JPEG (e.g. "/tmp/rcli_camera.jpg").
+// Returns 0 on success, -1 on failure.
+int camera_capture_photo(const char* output_path);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/audio/camera_capture.mm b/src/audio/camera_capture.mm
new file mode 100644
index 0000000..a4cdf8b
--- /dev/null
+++ b/src/audio/camera_capture.mm
@@ -0,0 +1,142 @@
+// Frameworks: AVFoundation (capture session/device), AppKit (NSImage/JPEG
+// encoding), CoreImage (CIImage from the pixel buffer), CoreMedia
+// (CMSampleBuffer access). The stripped header names are restored here.
+#import <AVFoundation/AVFoundation.h>
+#import <AppKit/AppKit.h>
+#import <CoreImage/CoreImage.h>
+#import <CoreMedia/CoreMedia.h>
+#include "camera_capture.h"
+#include <stdio.h>
+
+// Delegate that skips warmup frames then captures one properly-exposed frame.
+// Must conform to AVCaptureVideoDataOutputSampleBufferDelegate so it can be
+// installed via -[AVCaptureVideoDataOutput setSampleBufferDelegate:queue:]
+// (the protocol conformance was stripped from the original line).
+@interface RCLISingleFrameCapture : NSObject <AVCaptureVideoDataOutputSampleBufferDelegate>
+@property (nonatomic, strong) NSString *outputPath;           // destination JPEG path
+@property (nonatomic, assign) BOOL captured;                  // set once a frame is handled
+@property (nonatomic, strong) dispatch_semaphore_t semaphore; // signaled when done (or failed)
+@property (nonatomic, assign) int frameCount;                 // frames seen so far
+@property (nonatomic, assign) int framesToSkip;               // warmup frames to discard
+@end
+
+@implementation RCLISingleFrameCapture
+
+// Capture-queue callback, invoked for every frame the camera delivers.
+// Discards the first `framesToSkip` frames, then encodes exactly one frame
+// to JPEG at `outputPath` and signals the semaphore so the waiting caller
+// in camera_capture_photo can proceed.
+- (void)captureOutput:(AVCaptureOutput *)output
+didOutputSampleBuffer:(CMSampleBufferRef)sampleBuffer
+       fromConnection:(AVCaptureConnection *)connection {
+    if (self.captured) return;
+
+    // Skip initial frames to let auto-exposure/white-balance stabilize
+    self.frameCount++;
+    if (self.frameCount < self.framesToSkip) return;
+
+    self.captured = YES;
+
+    CVImageBufferRef imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer);
+    if (!imageBuffer) {
+        // No pixel data — wake the waiter; the caller detects failure by the
+        // absence of the output file.
+        dispatch_semaphore_signal(self.semaphore);
+        return;
+    }
+
+    // Pixel buffer → CIImage → NSImage representation chain for encoding.
+    CIImage *ciImage = [CIImage imageWithCVImageBuffer:imageBuffer];
+    NSCIImageRep *rep = [NSCIImageRep imageRepWithCIImage:ciImage];
+    NSImage *nsImage = [[NSImage alloc] initWithSize:rep.size];
+    [nsImage addRepresentation:rep];
+
+    // Convert to JPEG at high quality. If any step yields nil, the
+    // writeToFile: message is a no-op and the caller sees a missing file.
+    NSData *tiffData = [nsImage TIFFRepresentation];
+    NSBitmapImageRep *bitmapRep = [NSBitmapImageRep imageRepWithData:tiffData];
+    NSData *jpegData = [bitmapRep representationUsingType:NSBitmapImageFileTypeJPEG
+                                               properties:@{NSImageCompressionFactor: @0.92}];
+    [jpegData writeToFile:self.outputPath atomically:YES];
+
+    dispatch_semaphore_signal(self.semaphore);
+}
+
+@end
+
+// Capture one frame from the default camera and write it to `output_path`
+// as JPEG. Blocks the calling thread for the permission prompt (if not yet
+// determined), the auto-exposure warmup, and up to a 10s capture timeout.
+// Returns 0 on success, -1 on failure.
+int camera_capture_photo(const char* output_path) {
+    @autoreleasepool {
+        // Check camera permission; bail early if the user already refused.
+        AVAuthorizationStatus status = [AVCaptureDevice authorizationStatusForMediaType:AVMediaTypeVideo];
+        if (status == AVAuthorizationStatusDenied || status == AVAuthorizationStatusRestricted) {
+            return -1;
+        }
+        if (status == AVAuthorizationStatusNotDetermined) {
+            // First use: trigger the system prompt and wait for the answer.
+            dispatch_semaphore_t perm_sem = dispatch_semaphore_create(0);
+            __block BOOL granted = NO;
+            [AVCaptureDevice requestAccessForMediaType:AVMediaTypeVideo completionHandler:^(BOOL g) {
+                granted = g;
+                dispatch_semaphore_signal(perm_sem);
+            }];
+            dispatch_semaphore_wait(perm_sem, DISPATCH_TIME_FOREVER);
+            if (!granted) return -1;
+        }
+
+        // Find default camera
+        AVCaptureDevice *device = [AVCaptureDevice defaultDeviceWithMediaType:AVMediaTypeVideo];
+        if (!device) return -1;
+
+        // Configure camera for best quality and let auto-exposure do its
+        // thing. Configuration failure is non-fatal — device defaults apply.
+        NSError *error = nil;
+        if ([device lockForConfiguration:&error]) {
+            // Enable continuous auto-exposure and white balance
+            if ([device isExposureModeSupported:AVCaptureExposureModeContinuousAutoExposure]) {
+                device.exposureMode = AVCaptureExposureModeContinuousAutoExposure;
+            }
+            if ([device isWhiteBalanceModeSupported:AVCaptureWhiteBalanceModeContinuousAutoWhiteBalance]) {
+                device.whiteBalanceMode = AVCaptureWhiteBalanceModeContinuousAutoWhiteBalance;
+            }
+            if ([device isFocusModeSupported:AVCaptureFocusModeContinuousAutoFocus]) {
+                device.focusMode = AVCaptureFocusModeContinuousAutoFocus;
+            }
+            [device unlockForConfiguration];
+        }
+
+        AVCaptureDeviceInput *input = [AVCaptureDeviceInput deviceInputWithDevice:device error:&error];
+        if (!input) return -1;
+
+        AVCaptureSession *session = [[AVCaptureSession alloc] init];
+        // Use Photo preset for highest quality, degrading gracefully.
+        if ([session canSetSessionPreset:AVCaptureSessionPresetPhoto]) {
+            session.sessionPreset = AVCaptureSessionPresetPhoto;
+        } else if ([session canSetSessionPreset:AVCaptureSessionPresetHigh]) {
+            session.sessionPreset = AVCaptureSessionPresetHigh;
+        } else {
+            session.sessionPreset = AVCaptureSessionPresetMedium;
+        }
+
+        if (![session canAddInput:input]) return -1;
+        [session addInput:input];
+
+        AVCaptureVideoDataOutput *videoOutput = [[AVCaptureVideoDataOutput alloc] init];
+        videoOutput.videoSettings = @{(NSString *)kCVPixelBufferPixelFormatTypeKey: @(kCVPixelFormatType_32BGRA)};
+        videoOutput.alwaysDiscardsLateVideoFrames = YES;
+
+        RCLISingleFrameCapture *delegate = [[RCLISingleFrameCapture alloc] init];
+        delegate.outputPath = [NSString stringWithUTF8String:output_path];
+        delegate.captured = NO;
+        delegate.semaphore = dispatch_semaphore_create(0);
+        delegate.frameCount = 0;
+        // Skip ~60 frames (~2 seconds at 30fps) to let auto-exposure fully stabilize
+        delegate.framesToSkip = 60;
+
+        dispatch_queue_t queue = dispatch_queue_create("com.rcli.camera", DISPATCH_QUEUE_SERIAL);
+        [videoOutput setSampleBufferDelegate:delegate queue:queue];
+
+        if (![session canAddOutput:videoOutput]) return -1;
+        [session addOutput:videoOutput];
+
+        // Start capture — delegate will skip first 60 frames for AE stabilization
+        [session startRunning];
+
+        // Wait for frame capture (timeout 10 seconds — allows for warmup + capture)
+        long result = dispatch_semaphore_wait(delegate.semaphore,
+                                              dispatch_time(DISPATCH_TIME_NOW, 10 * NSEC_PER_SEC));
+
+        [session stopRunning];
+
+        if (result != 0) return -1; // timeout
+
+        // Verify the file was written (the delegate signals even when the
+        // frame could not be encoded — the missing file is the failure signal).
+        NSFileManager *fm = [NSFileManager defaultManager];
+        if (![fm fileExistsAtPath:delegate.outputPath]) return -1;
+
+        return 0;
+    }
+}
diff --git a/src/audio/rcli_overlay.m b/src/audio/rcli_overlay.m
new file mode 100644
index 0000000..274a3fc
--- /dev/null
+++ b/src/audio/rcli_overlay.m
@@ -0,0 +1,191 @@
+// rcli_overlay — standalone Cocoa app showing a draggable/resizable overlay
+// frame for screen capture. Communicates with parent RCLI via stdin/stdout.
+//
+// Commands (one per line on stdin):
+// frame → replies "x,y,w,h\n" (screen coords, top-left origin)
+// hide → sets alpha to 0 (for capture)
+// show → restores alpha to 1
+// quit → exits
+
+// Cocoa pulls in AppKit + Foundation — everything this helper app needs.
+// (The stripped header name is restored here.)
+#import <Cocoa/Cocoa.h>
+
+static const CGFloat kBorder   = 6.0;   // border stroke width
+static const CGFloat kRadius   = 12.0;  // corner radius of the frame
+static const CGFloat kHandle   = 18.0;  // corner handle size
+static const CGFloat kEdgeGrab = 14.0;  // invisible edge grab zone
+
+// ── Custom view: bold border + corner handles + label pill ─────────────
+@interface OverlayView : NSView
+@end
+
+@implementation OverlayView
+
+// Draws the capture frame: translucent outer glow, solid green rounded
+// border, four corner handles with white dots, and a "RCLI Visual Mode"
+// pill centered on the top edge. The background stays clear so the screen
+// content underneath remains visible.
+- (void)drawRect:(NSRect)dirtyRect {
+    [[NSColor clearColor] set];
+    NSRectFill(dirtyRect);
+
+    // Inset so the stroke stays fully inside the window bounds.
+    NSRect inner = NSInsetRect(self.bounds, kBorder, kBorder);
+    NSColor *green = [NSColor colorWithRed:0.15 green:0.9 blue:0.45 alpha:0.92];
+
+    // Outer glow
+    NSBezierPath *glow = [NSBezierPath bezierPathWithRoundedRect:inner
+                                                         xRadius:kRadius yRadius:kRadius];
+    [glow setLineWidth:kBorder + 6];
+    [[green colorWithAlphaComponent:0.12] set];
+    [glow stroke];
+
+    // Main border — solid, thick, rounded
+    NSBezierPath *border = [NSBezierPath bezierPathWithRoundedRect:inner
+                                                           xRadius:kRadius yRadius:kRadius];
+    [border setLineWidth:kBorder];
+    [green set];
+    [border stroke];
+
+    // Corner handles — filled rounded squares with white dot
+    CGFloat hs = kHandle;
+    CGFloat off = kBorder / 2;
+    NSRect corners[4] = {
+        NSMakeRect(NSMinX(inner) - off, NSMinY(inner) - off, hs, hs),
+        NSMakeRect(NSMaxX(inner) + off - hs, NSMinY(inner) - off, hs, hs),
+        NSMakeRect(NSMinX(inner) - off, NSMaxY(inner) + off - hs, hs, hs),
+        NSMakeRect(NSMaxX(inner) + off - hs, NSMaxY(inner) + off - hs, hs, hs),
+    };
+    for (int i = 0; i < 4; i++) {
+        NSBezierPath *h = [NSBezierPath bezierPathWithRoundedRect:corners[i]
+                                                          xRadius:4 yRadius:4];
+        [green set];
+        [h fill];
+        // White center dot
+        NSRect dot = NSInsetRect(corners[i], 5, 5);
+        [[NSColor colorWithWhite:1.0 alpha:0.85] set];
+        [[NSBezierPath bezierPathWithOvalInRect:dot] fill];
+    }
+
+    // Label pill — centered at top
+    NSString *label = @" RCLI Visual Mode ";
+    NSDictionary *attrs = @{
+        NSFontAttributeName: [NSFont systemFontOfSize:11 weight:NSFontWeightBold],
+        NSForegroundColorAttributeName: [NSColor blackColor],
+    };
+    NSSize sz = [label sizeWithAttributes:attrs];
+    CGFloat px = NSMidX(self.bounds) - sz.width / 2 - 6;
+    CGFloat py = NSMaxY(inner) - 2;
+    NSRect pill = NSMakeRect(px, py, sz.width + 12, sz.height + 6);
+    NSBezierPath *pillPath = [NSBezierPath bezierPathWithRoundedRect:pill
+                                                             xRadius:10 yRadius:10];
+    [green set];
+    [pillPath fill];
+    [label drawAtPoint:NSMakePoint(px + 6, py + 3) withAttributes:attrs];
+}
+
+// Respond to the very first click even when the window isn't key,
+// so a single click can start a drag.
+- (BOOL)acceptsFirstMouse:(NSEvent *)e { return YES; }
+@end
+
+// ── Custom window: borderless, transparent, floating, draggable ───────
+@interface OverlayWindow : NSWindow
+@end
+
+@implementation OverlayWindow
+// Builds a borderless, resizable, fully transparent window that floats
+// above normal windows, joins all Spaces, and can be dragged by its
+// (clear) background. Hosts an OverlayView as its content view.
+- (instancetype)initWithRect:(NSRect)rect {
+    self = [super initWithContentRect:rect
+                            styleMask:NSWindowStyleMaskBorderless |
+                                      NSWindowStyleMaskResizable
+                              backing:NSBackingStoreBuffered
+                                defer:NO];
+    if (self) {
+        self.opaque = NO;
+        self.backgroundColor = [NSColor clearColor];
+        self.level = NSFloatingWindowLevel;   // stay above normal windows
+        self.hasShadow = NO;                  // shadow would pollute captures
+        self.movableByWindowBackground = YES;
+        self.contentView = [[OverlayView alloc] initWithFrame:rect];
+        self.collectionBehavior = NSWindowCollectionBehaviorCanJoinAllSpaces |
+                                  NSWindowCollectionBehaviorStationary;
+        self.minSize = NSMakeSize(120, 80);
+    }
+    return self;
+}
+// Borderless windows refuse key status by default — opt in so the overlay
+// can receive events; never become main (this is an accessory window).
+- (BOOL)canBecomeKeyWindow { return YES; }
+- (BOOL)canBecomeMainWindow { return NO; }
+@end
+
+// ── Stdin reader (runs on a background thread) ────────────────────────
+// Reads newline-delimited commands from stdin and dispatches them to the
+// main thread (AppKit is main-thread-only). EOF on stdin — parent closed
+// the pipe or died — terminates the app so the overlay never outlives RCLI.
+@interface StdinReader : NSObject
+@property (nonatomic, strong) OverlayWindow *window;
+- (void)startReading;
+- (void)handleCommand:(NSString *)cmd;
+@end
+
+@implementation StdinReader
+
+- (void)startReading {
+    dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
+        char buf[256];
+        while (fgets(buf, sizeof(buf), stdin)) {
+            NSString *cmd = [[NSString stringWithUTF8String:buf]
+                stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]];
+            if (cmd.length == 0) continue;
+            // Commands must run on the main thread; wait so replies are
+            // written back in request order.
+            [self performSelectorOnMainThread:@selector(handleCommand:)
+                                   withObject:cmd
+                                waitUntilDone:YES];
+        }
+        dispatch_async(dispatch_get_main_queue(), ^{
+            [NSApp terminate:nil];
+        });
+    });
+}
+
+- (void)handleCommand:(NSString *)cmd {
+    if ([cmd isEqualToString:@"frame"]) {
+        // Convert the window frame from Cocoa coordinates (origin at the
+        // bottom-left of the PRIMARY screen) to the top-left-origin global
+        // coordinates `screencapture -R` expects. Use the primary screen
+        // ([NSScreen screens][0]) — NOT mainScreen, which is the screen of
+        // the key window and yields wrong offsets on multi-display setups.
+        NSRect f = self.window.frame;
+        CGFloat screenH = [[NSScreen screens] firstObject].frame.size.height;
+        int x = (int)f.origin.x;
+        int y = (int)(screenH - f.origin.y - f.size.height);
+        int w = (int)f.size.width;
+        int h = (int)f.size.height;
+        printf("%d,%d,%d,%d\n", x, y, w, h);
+        fflush(stdout);
+    } else if ([cmd isEqualToString:@"hide"]) {
+        [self.window setAlphaValue:0.0];
+        // Give the window server a beat to actually remove the overlay
+        // before the parent grabs the screen.
+        [NSThread sleepForTimeInterval:0.05];
+        printf("ok\n");
+        fflush(stdout);
+    } else if ([cmd isEqualToString:@"show"]) {
+        [self.window setAlphaValue:1.0];
+        printf("ok\n");
+        fflush(stdout);
+    } else if ([cmd isEqualToString:@"quit"]) {
+        [NSApp terminate:nil];
+    }
+}
+
+@end
+
+// ── Main ──────────────────────────────────────────────────────────────
+// Entry point: creates an 800x600 overlay centered on the main screen,
+// starts the stdin command reader, prints the "ready" handshake for the
+// parent process, then hands control to the AppKit run loop.
+int main(int argc, const char *argv[]) {
+    @autoreleasepool {
+        NSApplication *app = [NSApplication sharedApplication];
+        // Accessory policy: no Dock icon, no menu bar takeover.
+        [app setActivationPolicy:NSApplicationActivationPolicyAccessory];
+
+        NSScreen *scr = [NSScreen mainScreen];
+        NSRect sf = scr.frame;
+        CGFloat w = 800, h = 600;
+        CGFloat x = (sf.size.width - w) / 2;
+        CGFloat y = (sf.size.height - h) / 2;
+
+        OverlayWindow *win = [[OverlayWindow alloc]
+            initWithRect:NSMakeRect(x, y, w, h)];
+        [win makeKeyAndOrderFront:nil];
+        [app activateIgnoringOtherApps:YES];
+
+        StdinReader *reader = [[StdinReader alloc] init];
+        reader.window = win;
+        [reader startReading];
+
+        // Handshake: the parent waits for this line before sending commands.
+        printf("ready\n");
+        fflush(stdout);
+
+        [app run];  // blocks until "quit" or stdin EOF terminates the app
+    }
+    return 0;
+}
diff --git a/src/audio/screen_capture.h b/src/audio/screen_capture.h
new file mode 100644
index 0000000..0cc5421
--- /dev/null
+++ b/src/audio/screen_capture.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// --- Visual Mode (overlay frame) ---
+// NOTE(review): the overlay functions share process-global state (the
+// helper's pid/pipes) with no locking — confirm single-threaded use.
+
+// Show the visual overlay window. User can drag/resize it over content.
+// Spawns the rcli_overlay helper process; no-op if already shown.
+// x, y, w, h: initial position and size in screen coordinates (0 = defaults).
+void screen_capture_show_overlay(int x, int y, int w, int h);
+
+// Hide the visual overlay window (terminates the helper process).
+void screen_capture_hide_overlay(void);
+
+// Returns 1 if the overlay is currently visible.
+int screen_capture_overlay_active(void);
+
+// Capture the screen region behind the overlay (hides overlay briefly).
+// Returns 0 on success, -1 on failure.
+int screen_capture_overlay_region(const char* output_path);
+
+// --- Legacy capture functions ---
+// All capture functions write a JPEG (downscaled to at most 2048px on the
+// long side for the VLM) and return 0 on success, -1 on failure.
+
+// Capture the frontmost/active window and save as JPEG.
+int screen_capture_active_window(const char* output_path);
+
+// Capture the window behind our own terminal (for voice triggers).
+int screen_capture_behind_terminal(const char* output_path);
+
+// Capture the entire main display and save as JPEG (fallback).
+int screen_capture_full_screen(const char* output_path);
+
+// Convenience: tries overlay if active, then active window, then full screen.
+int screen_capture_screenshot(const char* output_path);
+
+// Get the name of the app targeted by screen_capture_behind_terminal.
+// Copies into buf (truncated to buf_size) and returns buf.
+const char* screen_capture_target_app_name(char* buf, int buf_size);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/audio/screen_capture.mm b/src/audio/screen_capture.mm
new file mode 100644
index 0000000..e2f3ea8
--- /dev/null
+++ b/src/audio/screen_capture.mm
@@ -0,0 +1,425 @@
+// Frameworks: AppKit (NSWorkspace/NSImage/NSBitmapImageRep), CoreGraphics
+// (CGWindowList APIs). The stripped header names below are reconstructed
+// from the symbols this file uses.
+#import <AppKit/AppKit.h>
+#import <CoreGraphics/CoreGraphics.h>
+#include "screen_capture.h"
+#include <atomic>       // std::atomic overlay/tracker state
+#include <mutex>        // std::mutex, std::lock_guard
+#include <string>       // std::string
+#include <thread>       // std::thread poller
+#include <cmath>        // fmax
+#include <cstdio>       // fprintf, snprintf, FILE
+#include <cstring>      // strlen, strlcpy (macOS exposes strlcpy via string.h)
+#include <fcntl.h>      // open, O_WRONLY
+#include <spawn.h>      // posix_spawnp
+#include <unistd.h>     // fork, pipe, dup2, access, usleep, getpid, getppid
+#include <sys/stat.h>   // stat
+#include <sys/sysctl.h> // sysctl, kinfo_proc
+#include <sys/wait.h>   // waitpid
+#include <mach-o/dyld.h> // _NSGetExecutablePath
+
+extern char** environ;
+
+// ---------------------------------------------------------------------------
+// Helper: downscale a JPEG on disk if it exceeds max dimension (for VLM)
+// ---------------------------------------------------------------------------
+// Downscale the JPEG at `path` in place so its longest side is <= max_dim.
+// No-op when the file is unreadable, not an image, or already small enough.
+static void downscale_jpeg_if_needed(const char* path, int max_dim) {
+    @autoreleasepool {
+        NSString *nsPath = [NSString stringWithUTF8String:path];
+        NSData *data = [NSData dataWithContentsOfFile:nsPath];
+        if (!data) return;   // unreadable — leave the file untouched
+
+        NSBitmapImageRep *srcRep = [NSBitmapImageRep imageRepWithData:data];
+        if (!srcRep) return; // not decodable as an image
+
+        NSInteger w = srcRep.pixelsWide;
+        NSInteger h = srcRep.pixelsHigh;
+        if (w <= max_dim && h <= max_dim) return;  // already small enough
+
+        // Uniform scale so the longest side becomes max_dim.
+        CGFloat scale = (CGFloat)max_dim / fmax((CGFloat)w, (CGFloat)h);
+        NSInteger nw = (NSInteger)floor(w * scale);
+        NSInteger nh = (NSInteger)floor(h * scale);
+
+        // Offscreen RGBA bitmap to draw the scaled image into.
+        NSBitmapImageRep *dstRep = [[NSBitmapImageRep alloc]
+            initWithBitmapDataPlanes:NULL
+                          pixelsWide:nw
+                          pixelsHigh:nh
+                       bitsPerSample:8
+                     samplesPerPixel:4
+                            hasAlpha:YES
+                            isPlanar:NO
+                      colorSpaceName:NSCalibratedRGBColorSpace
+                         bytesPerRow:0
+                        bitsPerPixel:0];
+
+        [NSGraphicsContext saveGraphicsState];
+        NSGraphicsContext *ctx = [NSGraphicsContext graphicsContextWithBitmapImageRep:dstRep];
+        [NSGraphicsContext setCurrentContext:ctx];
+        [ctx setImageInterpolation:NSImageInterpolationHigh];
+
+        NSImage *nsImage = [[NSImage alloc] initWithSize:NSMakeSize((CGFloat)w, (CGFloat)h)];
+        [nsImage addRepresentation:srcRep];
+        [nsImage drawInRect:NSMakeRect(0, 0, (CGFloat)nw, (CGFloat)nh)
+                   fromRect:NSZeroRect
+                  operation:NSCompositingOperationCopy
+                   fraction:1.0];
+
+        [NSGraphicsContext restoreGraphicsState];
+
+        // Re-encode as JPEG and overwrite the original atomically.
+        NSData *jpegData = [dstRep representationUsingType:NSBitmapImageFileTypeJPEG
+                                                properties:@{NSImageCompressionFactor: @0.85}];
+        if (jpegData && jpegData.length > 0) {
+            [jpegData writeToFile:nsPath atomically:YES];
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Helper: run screencapture with given args, verify output
+// ---------------------------------------------------------------------------
+// Run the macOS `screencapture` CLI with the given argv, wait for it to
+// exit cleanly, and verify that output_path exists and is non-empty.
+// Downscales the result to at most 2048px on the long side for the VLM.
+// Returns 0 on success, -1 on failure.
+static int run_screencapture(const char* const argv[], const char* output_path) {
+    pid_t pid;
+    int status = 0;
+    // posix_spawnp takes `char *const argv[]`; the strings are not modified,
+    // so casting away the inner const is safe. (The template argument was
+    // stripped from the original `const_cast`, which did not compile.)
+    if (posix_spawnp(&pid, "screencapture", nullptr, nullptr,
+                     const_cast<char* const*>(argv), environ) != 0) {
+        return -1;
+    }
+    waitpid(pid, &status, 0);
+    if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) return -1;
+
+    struct stat st;
+    if (stat(output_path, &st) != 0 || st.st_size == 0) return -1;
+
+    downscale_jpeg_if_needed(output_path, 2048);
+    return 0;
+}
+
+// ===========================================================================
+// Visual overlay — spawns rcli_overlay helper process (separate Cocoa app)
+// because AppKit window management requires the main thread, which FTXUI owns.
+// Communication via stdin/stdout pipes.
+// ===========================================================================
+
+static pid_t g_overlay_pid = 0;              // helper process (0 = not running)
+static FILE *g_overlay_stdin  = nullptr;     // we write commands here
+static FILE *g_overlay_stdout = nullptr;     // we read responses here
+static std::atomic<bool> g_overlay_visible{false};  // template arg restored
+
+// Find rcli_overlay binary next to the rcli binary.
+// Returns an absolute path when an executable sibling exists; otherwise the
+// bare name "rcli_overlay" so exec falls back to a PATH lookup.
+static std::string find_overlay_binary() {
+    // Try next to our own executable
+    char path[1024];
+    uint32_t size = sizeof(path);
+    if (_NSGetExecutablePath(path, &size) == 0) {
+        std::string dir(path);
+        auto slash = dir.rfind('/');
+        if (slash != std::string::npos) {
+            std::string candidate = dir.substr(0, slash + 1) + "rcli_overlay";
+            if (access(candidate.c_str(), X_OK) == 0) return candidate;
+        }
+    }
+    // Fallback: try PATH
+    return "rcli_overlay";
+}
+
+// Send a command to the overlay process and read the single response line.
+// Blocking: waits on the helper's stdout. Returns the reply with its
+// trailing newline stripped, or "" if the pipes are gone or the read fails.
+static std::string overlay_cmd(const char* cmd) {
+    if (!g_overlay_stdin || !g_overlay_stdout) return "";
+    fprintf(g_overlay_stdin, "%s\n", cmd);
+    fflush(g_overlay_stdin);
+    char buf[256] = {0};
+    if (fgets(buf, sizeof(buf), g_overlay_stdout)) {
+        // Strip trailing newline
+        size_t len = strlen(buf);
+        if (len > 0 && buf[len-1] == '\n') buf[len-1] = '\0';
+        return std::string(buf);
+    }
+    return "";
+}
+
+// Launch the rcli_overlay helper with stdin/stdout pipes and wait for its
+// "ready" handshake. Idempotent: returns immediately if already running.
+// Fixes over the original: pipe and fork failures now release any file
+// descriptors already created instead of leaking them (the original fell
+// through into the parent path with pid == -1 on fork failure).
+void screen_capture_show_overlay(int x, int y, int w, int h) {
+    (void)x; (void)y; (void)w; (void)h; // TODO: pass initial rect to helper
+
+    if (g_overlay_pid > 0) {
+        // Already running — just return
+        return;
+    }
+
+    std::string binary = find_overlay_binary();
+
+    // Create pipes: parent→child stdin, child→parent stdout
+    int pipe_in[2], pipe_out[2];
+    if (pipe(pipe_in) != 0) return;
+    if (pipe(pipe_out) != 0) {
+        close(pipe_in[0]); close(pipe_in[1]);
+        return;
+    }
+
+    pid_t pid = fork();
+    if (pid < 0) {
+        // fork failed — release both pipes so retries don't leak fds.
+        close(pipe_in[0]);  close(pipe_in[1]);
+        close(pipe_out[0]); close(pipe_out[1]);
+        return;
+    }
+    if (pid == 0) {
+        // Child: wire up pipes
+        close(pipe_in[1]);  // close write end of stdin pipe
+        close(pipe_out[0]); // close read end of stdout pipe
+        dup2(pipe_in[0], STDIN_FILENO);
+        dup2(pipe_out[1], STDOUT_FILENO);
+        close(pipe_in[0]);
+        close(pipe_out[1]);
+        // Redirect stderr to /dev/null to keep terminal clean
+        int devnull = open("/dev/null", O_WRONLY);
+        if (devnull >= 0) { dup2(devnull, STDERR_FILENO); close(devnull); }
+        execl(binary.c_str(), "rcli_overlay", nullptr);
+        _exit(1);
+    }
+
+    // Parent
+    close(pipe_in[0]);
+    close(pipe_out[1]);
+    g_overlay_pid = pid;
+    g_overlay_stdin = fdopen(pipe_in[1], "w");
+    g_overlay_stdout = fdopen(pipe_out[0], "r");
+
+    // Only mark visible after the child's "ready" handshake arrives.
+    char buf[64] = {0};
+    if (g_overlay_stdout && fgets(buf, sizeof(buf), g_overlay_stdout)) {
+        g_overlay_visible.store(true);
+    }
+}
+
+// Tear down the overlay: ask the helper to quit, close both pipes, and
+// reap the child so it doesn't linger as a zombie.
+void screen_capture_hide_overlay(void) {
+    if (g_overlay_pid <= 0) return;
+
+    overlay_cmd("quit");
+
+    // Clean up
+    if (g_overlay_stdin) { fclose(g_overlay_stdin); g_overlay_stdin = nullptr; }
+    if (g_overlay_stdout) { fclose(g_overlay_stdout); g_overlay_stdout = nullptr; }
+    int status;
+    waitpid(g_overlay_pid, &status, 0);
+    g_overlay_pid = 0;
+    g_overlay_visible.store(false);
+}
+
+// Returns 1 while the overlay helper is up (set after its "ready" handshake).
+int screen_capture_overlay_active(void) {
+    return g_overlay_visible.load() ? 1 : 0;
+}
+
+// Capture the screen region currently framed by the overlay.
+// Asks the helper for its frame ("x,y,w,h", top-left origin — the format
+// `screencapture -R` expects), hides the overlay so it doesn't appear in
+// the shot, captures, then shows it again. Returns 0 on success.
+int screen_capture_overlay_region(const char* output_path) {
+    if (!g_overlay_visible.load() || g_overlay_pid <= 0) return -1;
+
+    // Get frame coordinates (top-left origin)
+    std::string frame_str = overlay_cmd("frame");
+    if (frame_str.empty()) return -1;
+
+    // Hide overlay for capture
+    overlay_cmd("hide");
+
+    // Capture the region
+    char region[128];
+    strlcpy(region, frame_str.c_str(), sizeof(region));
+    const char* argv[] = {
+        "screencapture", "-x", "-t", "jpg", "-R", region, output_path, nullptr
+    };
+    int result = run_screencapture(argv, output_path);
+
+    // Show overlay again (even if the capture failed)
+    overlay_cmd("show");
+
+    return result;
+}
+
+// ---------------------------------------------------------------------------
+// Track the previously active app (before our terminal got focus)
+// Polls frontmostApplication every 200ms on a background thread.
+// NSWorkspace notifications don't work in CLI apps (no NSApplication run loop).
+// ---------------------------------------------------------------------------
+
+static std::atomic<pid_t> g_prev_active_pid{0};  // template arg restored
+static pid_t g_our_terminal_pid = 0;   // PID of the app that owns our terminal
+static char g_prev_app_name[256] = {0}; // guarded by g_name_mutex
+static std::mutex g_name_mutex;
+
+// Walk up the process tree to find which ancestor owns an on-screen window —
+// that ancestor is taken to be our terminal app. Falls back to the
+// great-grandparent (or getppid()) if no window-owning ancestor is found.
+static pid_t find_terminal_pid() {
+    @autoreleasepool {
+        // Collect up to 8 ancestors of our PID via sysctl(KERN_PROC_PID).
+        pid_t cur = getpid();
+        pid_t ancestors[8];
+        int n = 0;
+        while (cur > 1 && n < 8) {
+            ancestors[n++] = cur;
+            struct kinfo_proc kp;
+            size_t length = sizeof(kp);
+            int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PID, cur };
+            if (sysctl(mib, 4, &kp, &length, NULL, 0) != 0) break;
+            pid_t ppid = kp.kp_eproc.e_ppid;
+            if (ppid == cur) break;  // guard against self-parenting loops
+            cur = ppid;
+        }
+
+        // Check which ancestor owns on-screen windows — that's the terminal.
+        // Scanned outermost-first so the top-level app (not a shell) wins.
+        #pragma clang diagnostic push
+        #pragma clang diagnostic ignored "-Wdeprecated-declarations"
+        CFArrayRef windowList = CGWindowListCopyWindowInfo(
+            kCGWindowListOptionOnScreenOnly | kCGWindowListExcludeDesktopElements,
+            kCGNullWindowID);
+        #pragma clang diagnostic pop
+        if (windowList) {
+            NSArray *windows = CFBridgingRelease(windowList);
+            for (int i = n - 1; i >= 0; i--) {
+                for (NSDictionary *info in windows) {
+                    pid_t ownerPid = [[info objectForKey:(NSString *)kCGWindowOwnerPID] intValue];
+                    if (ownerPid == ancestors[i]) {
+                        return ancestors[i];
+                    }
+                }
+            }
+        }
+        // No window-owning ancestor found — guess: self → shell → terminal.
+        return (n >= 3) ? ancestors[2] : getppid();
+    }
+}
+
+// Background poller — tracks which non-terminal app is frontmost.
+// Runs at load time (constructor attribute): seeds the state with the
+// current frontmost app, then polls NSWorkspace every 200ms on a detached
+// thread. Polling is used because NSWorkspace notifications don't fire in
+// CLI apps (no NSApplication run loop). The stripped std::lock_guard
+// template arguments are restored below.
+__attribute__((constructor))
+static void start_app_tracking() {
+    @autoreleasepool {
+        g_our_terminal_pid = find_terminal_pid();
+
+        // Seed with current frontmost app if it's not our terminal
+        NSRunningApplication *front = [[NSWorkspace sharedWorkspace] frontmostApplication];
+        if (front && front.processIdentifier != g_our_terminal_pid) {
+            g_prev_active_pid.store(front.processIdentifier, std::memory_order_relaxed);
+            NSString *name = front.localizedName ?: @"unknown";
+            std::lock_guard<std::mutex> lock(g_name_mutex);
+            strlcpy(g_prev_app_name, [name UTF8String], sizeof(g_prev_app_name));
+        }
+
+        // Poll frontmostApplication every 200ms on a background thread
+        std::thread([]() {
+            pthread_setname_np("rcli.app_tracker");
+            pid_t last_seen_pid = 0;
+            while (true) {
+                @autoreleasepool {
+                    NSRunningApplication *front =
+                        [[NSWorkspace sharedWorkspace] frontmostApplication];
+                    if (front) {
+                        pid_t pid = front.processIdentifier;
+                        // If a non-terminal app is frontmost and it changed, record it
+                        if (pid != g_our_terminal_pid && pid != last_seen_pid) {
+                            last_seen_pid = pid;
+                            g_prev_active_pid.store(pid, std::memory_order_relaxed);
+                            NSString *name = front.localizedName ?: @"unknown";
+                            std::lock_guard<std::mutex> lock(g_name_mutex);
+                            strlcpy(g_prev_app_name, [name UTF8String],
+                                    sizeof(g_prev_app_name));
+                        }
+                    }
+                }
+                usleep(200000); // 200ms
+            }
+        }).detach();
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Window lookup helpers
+// ---------------------------------------------------------------------------
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+
+// Heuristic filter for "real" app windows: must report bounds and be at
+// least 100x100 (skips menu-bar extras, tooltips, and other window chrome).
+static bool is_normal_window(NSDictionary *info) {
+    NSDictionary *bounds = [info objectForKey:(NSString *)kCGWindowBounds];
+    if (!bounds) return false;
+    CGFloat w = [[bounds objectForKey:@"Width"] floatValue];
+    CGFloat h = [[bounds objectForKey:@"Height"] floatValue];
+    return (w >= 100 && h >= 100);
+}
+
+// Find a normal window belonging to a specific PID.
+// Returns the first qualifying on-screen window in the order reported by
+// CGWindowListCopyWindowInfo (front-to-back), or kCGNullWindowID if none.
+static CGWindowID find_window_for_pid(pid_t target_pid) {
+    CFArrayRef windowList = CGWindowListCopyWindowInfo(
+        kCGWindowListOptionOnScreenOnly | kCGWindowListExcludeDesktopElements,
+        kCGNullWindowID);
+    if (!windowList) return kCGNullWindowID;
+
+    NSArray *windows = CFBridgingRelease(windowList);
+    for (NSDictionary *info in windows) {
+        pid_t ownerPid = [[info objectForKey:(NSString *)kCGWindowOwnerPID] intValue];
+        if (ownerPid != target_pid) continue;
+        if (!is_normal_window(info)) continue;
+        return [[info objectForKey:(NSString *)kCGWindowNumber] unsignedIntValue];
+    }
+    return kCGNullWindowID;
+}
+
+// Find the frontmost normal window of the frontmost app.
+// Returns kCGNullWindowID when there is no frontmost app or no such window.
+static CGWindowID get_frontmost_window_id() {
+    @autoreleasepool {
+        NSRunningApplication *frontApp = [[NSWorkspace sharedWorkspace] frontmostApplication];
+        if (!frontApp) return kCGNullWindowID;
+        return find_window_for_pid(frontApp.processIdentifier);
+    }
+}
+
+// Find the window of the previously active app (before terminal got focus),
+// as recorded by the background poller. kCGNullWindowID if none tracked yet.
+static CGWindowID get_previous_app_window_id() {
+    @autoreleasepool {
+        pid_t prev_pid = g_prev_active_pid.load(std::memory_order_relaxed);
+        if (prev_pid <= 0) return kCGNullWindowID;
+        return find_window_for_pid(prev_pid);
+    }
+}
+
+#pragma clang diagnostic pop
+
+// ---------------------------------------------------------------------------
+// Public API
+// ---------------------------------------------------------------------------
+
+// Capture a single window by CGWindowID via `screencapture -l`.
+// Returns 0 on success, -1 on failure (including a null window id).
+static int capture_window_id(CGWindowID wid, const char* output_path) {
+    if (wid == kCGNullWindowID) return -1;
+    char wid_str[32];
+    snprintf(wid_str, sizeof(wid_str), "%u", wid);
+    const char* argv[] = {
+        "screencapture", "-x", "-t", "jpg", "-l", wid_str, output_path, nullptr
+    };
+    return run_screencapture(argv, output_path);
+}
+
+// Capture the frontmost app's window; falls back to a full-screen grab
+// when no suitable window is found.
+int screen_capture_active_window(const char* output_path) {
+    CGWindowID wid = get_frontmost_window_id();
+    if (wid == kCGNullWindowID) {
+        return screen_capture_full_screen(output_path);
+    }
+    return capture_window_id(wid, output_path);
+}
+
+// Capture the window of the app that was frontmost before our terminal took
+// focus (tracked by the background poller). Falls back to a full-screen
+// grab if no such window exists. Returns 0 on success, -1 on failure.
+// (The stripped std::lock_guard template argument is restored.)
+int screen_capture_behind_terminal(const char* output_path) {
+    // Log which app we are targeting (name + PID) for debuggability.
+    {
+        std::lock_guard<std::mutex> lock(g_name_mutex);
+        pid_t prev = g_prev_active_pid.load(std::memory_order_relaxed);
+        fprintf(stderr, "[Screen] Targeting: %s (PID %d)\n",
+                g_prev_app_name[0] ? g_prev_app_name : "none", prev);
+    }
+    CGWindowID wid = get_previous_app_window_id();
+    if (wid == kCGNullWindowID) {
+        fprintf(stderr, "[Screen] No previous app window found, falling back to full screen\n");
+        return screen_capture_full_screen(output_path);
+    }
+    return capture_window_id(wid, output_path);
+}
+
+// Capture the entire main display with `screencapture` (last-resort path).
+int screen_capture_full_screen(const char* output_path) {
+    const char* argv[] = {
+        "screencapture", "-x", "-t", "jpg", output_path, nullptr
+    };
+    return run_screencapture(argv, output_path);
+}
+
+// Convenience capture implementing the fallback chain documented in
+// screen_capture.h: overlay region (if visual mode is active), then the
+// active window, then full screen. The original returned the overlay
+// failure directly instead of falling through; this restores the
+// documented chain.
+int screen_capture_screenshot(const char* output_path) {
+    if (screen_capture_overlay_active()) {
+        if (screen_capture_overlay_region(output_path) == 0) return 0;
+        // Overlay capture failed — fall through to the window path
+        // (which itself falls back to full screen).
+    }
+    return screen_capture_active_window(output_path);
+}
+
+// Copy the name of the app targeted by screen_capture_behind_terminal into
+// buf (NUL-terminated, truncated to buf_size) and return buf. Falls back to
+// "unknown" when no previous app has been observed yet. Guards against a
+// null/zero-size buffer; the stripped std::lock_guard template argument is
+// restored.
+const char* screen_capture_target_app_name(char* buf, int buf_size) {
+    if (!buf || buf_size <= 0) return buf;  // nothing we can safely write
+    std::lock_guard<std::mutex> lock(g_name_mutex);
+    if (g_prev_app_name[0]) {
+        strlcpy(buf, g_prev_app_name, (size_t)buf_size);
+    } else {
+        strlcpy(buf, "unknown", (size_t)buf_size);
+    }
+    return buf;
+}
diff --git a/src/cli/help.h b/src/cli/help.h
index bb9b37a..9ecca9b 100644
--- a/src/cli/help.h
+++ b/src/cli/help.h
@@ -19,6 +19,8 @@ inline void print_usage(const char* argv0) {
" %sask%s One-shot text command\n"
" %sactions%s [name] List all actions, or show detail for one\n"
" %saction%s [json] Execute a named action directly\n"
+    "  %svlm%s <image> [prompt]   Analyze image with Vision Language Model\n"
+ " %sscreen%s [prompt] Capture screenshot & analyze with VLM\n"
" %srag%s RAG: ingest docs, query, status\n"
" %ssetup%s Download AI models (~1GB)\n"
" %smodels%s Manage all AI models (LLM, STT, TTS)\n"
@@ -45,6 +47,10 @@ inline void print_usage(const char* argv0) {
" rcli ask \"open Safari\" # one-shot command\n"
" rcli ask \"create a note called Ideas\" # triggers action\n"
" rcli actions # see all actions\n"
+ " rcli vlm photo.jpg # analyze an image\n"
+ " rcli vlm photo.jpg \"What is this?\" # image with custom prompt\n"
+ " rcli screen # capture & analyze screen\n"
+ " rcli screen \"What app is open?\" # screen with custom prompt\n"
" rcli actions create_note # action detail\n"
" rcli setup # download models\n\n",
color::bold, color::orange, color::reset,
@@ -69,6 +75,8 @@ inline void print_usage(const char* argv0) {
color::green, color::reset,
color::green, color::reset,
color::green, color::reset,
+ color::green, color::reset,
+ color::green, color::reset,
color::dim, color::reset,
color::dim, color::reset);
}
@@ -130,7 +138,13 @@ inline void print_help_interactive() {
fprintf(stderr, " %sdo [text]%s execute action directly (no JSON needed)\n", color::bold, color::reset);
fprintf(stderr, " %srag status%s show indexed documents\n", color::bold, color::reset);
+    fprintf(stderr, "  %srag ingest <path>%s  index docs for Q&A\n", color::bold, color::reset);
+ fprintf(stderr, " %scamera%s capture photo from webcam & analyze\n", color::bold, color::reset);
+ fprintf(stderr, " %sscreen%s capture screenshot & analyze\n", color::bold, color::reset);
fprintf(stderr, " %squit%s exit\n\n", color::bold, color::reset);
+ fprintf(stderr, " %s%s Vision:%s\n", color::bold, color::orange, color::reset);
+ fprintf(stderr, " Drag & drop an image file to analyze it with the VLM.\n");
+ fprintf(stderr, " Type %scamera%s to capture a photo from your webcam.\n", color::bold, color::reset);
+ fprintf(stderr, " Type %sscreen%s to capture and analyze your screen.\n\n", color::bold, color::reset);
fprintf(stderr, " %s%s Try:%s\n", color::bold, color::orange, color::reset);
fprintf(stderr, " %s\"Open Safari\" \"What's on my calendar?\" \"Set volume to 50\"%s\n\n",
color::dim, color::reset);
diff --git a/src/cli/main.cpp b/src/cli/main.cpp
index 4f49472..58cd4e1 100644
--- a/src/cli/main.cpp
+++ b/src/cli/main.cpp
@@ -27,6 +27,13 @@
#include "audio/mic_permission.h"
#include "core/personality.h"
#include "llama.h"
+#include "mtmd.h"
+#include "mtmd-helper.h"
+#include "audio/camera_capture.h"
+#include "audio/screen_capture.h"
+#include <sys/stat.h>
+
+extern char** environ;
// Defined in cli_common.h as a forward declaration; implemented here because
// it depends on the Objective-C mic_permission bridge compiled into this TU.
@@ -427,6 +434,229 @@ static int cmd_ask(const Args& args) {
return 0;
}
+// =============================================================================
+// VLM subcommand
+// =============================================================================
+
+static int cmd_vlm(const Args& args) {
+ if (args.arg1.empty() || args.help) {
+        fprintf(stderr, "\n  Usage: rcli vlm <image> [prompt]\n\n");
+ fprintf(stderr, " Analyze an image using a Vision Language Model.\n\n");
+ fprintf(stderr, " Examples:\n");
+ fprintf(stderr, " rcli vlm photo.jpg\n");
+ fprintf(stderr, " rcli vlm screenshot.png \"What text do you see?\"\n");
+ fprintf(stderr, " rcli vlm diagram.jpg \"Explain this diagram\"\n\n");
+ return args.help ? 0 : 1;
+ }
+
+ // Resolve image path
+ std::string image_path = args.arg1;
+ if (!image_path.empty() && image_path[0] == '~') {
+ if (const char* home = getenv("HOME"))
+ image_path = std::string(home) + image_path.substr(1);
+ }
+ // Make relative paths absolute
+ if (!image_path.empty() && image_path[0] != '/') {
+ char cwd[4096];
+ if (getcwd(cwd, sizeof(cwd)))
+ image_path = std::string(cwd) + "/" + image_path;
+ }
+
+ struct stat st;
+ if (stat(image_path.c_str(), &st) != 0) {
+ fprintf(stderr, "%s%sError: Image not found: %s%s\n",
+ color::bold, color::red, image_path.c_str(), color::reset);
+ return 1;
+ }
+
+ if (!rastack::VlmEngine::is_supported_image(image_path)) {
+ fprintf(stderr, "%s%sError: Unsupported image format. Supported: jpg, png, bmp, gif, webp, tga%s\n",
+ color::bold, color::red, color::reset);
+ return 1;
+ }
+
+ std::string prompt = args.arg2.empty() ? "Describe this image in detail." : args.arg2;
+
+ // Create engine with models_dir set (we only need VLM, not the full pipeline)
+ std::string config_json = "{\"models_dir\": \"" + args.models_dir + "\"}";
+ g_engine = rcli_create(config_json.c_str());
+ if (!g_engine) return 1;
+
+ // Initialize VLM
+ fprintf(stderr, "%sInitializing VLM...%s\n", color::dim, color::reset);
+ if (rcli_vlm_init(g_engine) != 0) {
+ fprintf(stderr, "\n%s%s VLM not available.%s\n\n", color::bold, color::red, color::reset);
+ fprintf(stderr, " VLM requires the llama.cpp engine and a VLM model.\n");
+ fprintf(stderr, " Switch engine: %srcli engine llamacpp%s\n", color::bold, color::reset);
+ fprintf(stderr, " Download model: %srcli models vlm%s\n\n", color::bold, color::reset);
+ rcli_destroy(g_engine);
+ return 1;
+ }
+
+ // Show which VLM backend is active
+ const char* backend = rcli_vlm_backend_name(g_engine);
+ const char* model = rcli_vlm_model_name(g_engine);
+ if (backend && backend[0]) {
+ fprintf(stderr, "%s VLM: %s%s%s via %s%s%s%s\n",
+ color::dim, color::reset, color::bold, model,
+ color::reset, color::dim, backend, color::reset);
+ }
+
+ fprintf(stderr, "%sAnalyzing image: %s%s\n", color::dim, image_path.c_str(), color::reset);
+
+ const char* response = rcli_vlm_analyze(g_engine, image_path.c_str(), prompt.c_str());
+ if (response && response[0]) {
+ fprintf(stdout, "%s\n", response);
+ RCLIVlmStats stats;
+ if (rcli_vlm_get_stats(g_engine, &stats) == 0) {
+ fprintf(stderr, "\n%s⚡ %.1f tok/s (%d tokens, %.1fs total, first token %.0fms)%s\n",
+ color::dim, stats.gen_tok_per_sec, stats.generated_tokens,
+ stats.total_time_sec, stats.first_token_ms, color::reset);
+ }
+ } else {
+ fprintf(stderr, "%s%sError: VLM analysis failed%s\n",
+ color::bold, color::red, color::reset);
+ rcli_destroy(g_engine);
+ return 1;
+ }
+
+ rcli_destroy(g_engine);
+ return 0;
+}
+
+// =============================================================================
+// Camera subcommand — capture + analyze
+// =============================================================================
+
+static int cmd_camera(const Args& args) {
+ std::string prompt = args.arg1.empty() ? "Describe what you see in this photo in detail." : args.arg1;
+
+ fprintf(stderr, "%sCapturing photo from camera...%s\n", color::dim, color::reset);
+ std::string photo_path = "/tmp/rcli_camera.jpg";
+
+ int rc = camera_capture_photo(photo_path.c_str());
+ if (rc != 0) {
+ fprintf(stderr, "%s%sError: Camera capture failed. Check camera permissions.%s\n",
+ color::bold, color::red, color::reset);
+ return 1;
+ }
+ fprintf(stderr, "%sPhoto captured! Analyzing with VLM...%s\n", color::dim, color::reset);
+
+ std::string config_json = "{\"models_dir\": \"" + args.models_dir + "\"}";
+ g_engine = rcli_create(config_json.c_str());
+ if (!g_engine) return 1;
+
+ if (rcli_vlm_init(g_engine) != 0) {
+ fprintf(stderr, "\n%s%s VLM not available.%s\n\n", color::bold, color::red, color::reset);
+ fprintf(stderr, " VLM requires the llama.cpp engine and a VLM model.\n");
+ fprintf(stderr, " Switch engine: %srcli engine llamacpp%s\n", color::bold, color::reset);
+ fprintf(stderr, " Download model: %srcli models vlm%s\n\n", color::bold, color::reset);
+ rcli_destroy(g_engine);
+ return 1;
+ }
+
+ const char* backend = rcli_vlm_backend_name(g_engine);
+ const char* model = rcli_vlm_model_name(g_engine);
+ if (backend && backend[0]) {
+ fprintf(stderr, "%s VLM: %s%s%s via %s%s%s%s\n",
+ color::dim, color::reset, color::bold, model,
+ color::reset, color::dim, backend, color::reset);
+ }
+
+ const char* response = rcli_vlm_analyze(g_engine, photo_path.c_str(), prompt.c_str());
+ if (response && response[0]) {
+ fprintf(stdout, "%s\n", response);
+ if (!args.no_speak) {
+ rcli_init(g_engine, args.models_dir.c_str(), args.gpu_layers);
+ rcli_speak(g_engine, response);
+ }
+ RCLIVlmStats stats;
+ if (rcli_vlm_get_stats(g_engine, &stats) == 0) {
+ fprintf(stderr, "\n%s⚡ %.1f tok/s (%d tokens, %.1fs total, first token %.0fms)%s\n",
+ color::dim, stats.gen_tok_per_sec, stats.generated_tokens,
+ stats.total_time_sec, stats.first_token_ms, color::reset);
+ }
+ {
+ pid_t pid;
+ const char* argv[] = {"open", photo_path.c_str(), nullptr};
+ posix_spawnp(&pid, "open", nullptr, nullptr,
+                     const_cast<char* const*>(argv), environ);
+ }
+ } else {
+ fprintf(stderr, "%s%sError: VLM analysis failed%s\n",
+ color::bold, color::red, color::reset);
+ rcli_destroy(g_engine);
+ return 1;
+ }
+
+ rcli_destroy(g_engine);
+ return 0;
+}
+
+// =============================================================================
+// Screen subcommand — screenshot + analyze
+// =============================================================================
+
+static int cmd_screen(const Args& args) {
+ std::string prompt = args.arg1.empty()
+ ? "Describe what you see on this screen in detail." : args.arg1;
+
+ fprintf(stderr, "%sCapturing screenshot...%s\n", color::dim, color::reset);
+ std::string screen_path = "/tmp/rcli_screen.jpg";
+
+ int rc = screen_capture_screenshot(screen_path.c_str());
+ if (rc != 0) {
+ fprintf(stderr, "%s%sError: Screen capture failed. Check screen recording permissions.%s\n",
+ color::bold, color::red, color::reset);
+ return 1;
+ }
+ fprintf(stderr, "%sScreenshot captured! Analyzing with VLM...%s\n", color::dim, color::reset);
+
+ std::string config_json = "{\"models_dir\": \"" + args.models_dir + "\"}";
+ g_engine = rcli_create(config_json.c_str());
+ if (!g_engine) return 1;
+
+ if (rcli_vlm_init(g_engine) != 0) {
+ fprintf(stderr, "\n%s%s VLM not available.%s\n\n", color::bold, color::red, color::reset);
+ fprintf(stderr, " VLM requires the llama.cpp engine and a VLM model.\n");
+ fprintf(stderr, " Switch engine: %srcli engine llamacpp%s\n", color::bold, color::reset);
+ fprintf(stderr, " Download model: %srcli models vlm%s\n\n", color::bold, color::reset);
+ rcli_destroy(g_engine);
+ return 1;
+ }
+
+ const char* backend = rcli_vlm_backend_name(g_engine);
+ const char* model = rcli_vlm_model_name(g_engine);
+ if (backend && backend[0]) {
+ fprintf(stderr, "%s VLM: %s%s%s via %s%s%s%s\n",
+ color::dim, color::reset, color::bold, model,
+ color::reset, color::dim, backend, color::reset);
+ }
+
+ const char* response = rcli_vlm_analyze(g_engine, screen_path.c_str(), prompt.c_str());
+ if (response && response[0]) {
+ fprintf(stdout, "%s\n", response);
+ if (!args.no_speak) {
+ rcli_init(g_engine, args.models_dir.c_str(), args.gpu_layers);
+ rcli_speak(g_engine, response);
+ }
+ RCLIVlmStats stats;
+ if (rcli_vlm_get_stats(g_engine, &stats) == 0) {
+ fprintf(stderr, "\n%s⚡ %.1f tok/s (%d tokens, %.1fs total, first token %.0fms)%s\n",
+ color::dim, stats.gen_tok_per_sec, stats.generated_tokens,
+ stats.total_time_sec, stats.first_token_ms, color::reset);
+ }
+ } else {
+ fprintf(stderr, "%s%sError: VLM analysis failed%s\n",
+ color::bold, color::red, color::reset);
+ rcli_destroy(g_engine);
+ return 1;
+ }
+
+ rcli_destroy(g_engine);
+ return 0;
+}
+
// =============================================================================
// RAG subcommands
// =============================================================================
@@ -654,16 +884,17 @@ static int cmd_metalrt(const Args& args) {
inst ? color::reset : "");
}
- // STT/TTS component models
+ // STT/TTS/VLM component models
size_t offset = mrt_models.size();
- fprintf(stderr, "\n %s— STT/TTS Components —%s\n", color::bold, color::reset);
+ fprintf(stderr, "\n %s— STT/TTS/VLM Components —%s\n", color::bold, color::reset);
fprintf(stderr, " %s# %-28s %-8s %-5s Status%s\n",
color::bold, "Model", "Size", "Type", color::reset);
for (size_t i = 0; i < comp_models.size(); i++) {
auto& cm = comp_models[i];
bool inst = rcli::is_metalrt_component_installed(cm);
- std::string type_label = (cm.component == "stt") ? "STT" : "TTS";
+ std::string type_label = (cm.component == "stt") ? "STT"
+ : (cm.component == "vlm") ? "VLM" : "TTS";
fprintf(stderr, " %s%zu%s %-28s %-8s %-5s %s%s%s\n",
color::bold, offset + i + 1, color::reset,
cm.name.c_str(),
@@ -917,6 +1148,7 @@ int main(int argc, char** argv) {
if (!args.verbose) {
llama_log_set([](enum ggml_log_level, const char*, void*) {}, nullptr);
+ mtmd_helper_log_set([](enum ggml_log_level, const char*, void*) {}, nullptr);
}
if (args.command.empty()) {
@@ -930,6 +1162,9 @@ int main(int argc, char** argv) {
if (args.command == "actions") return cmd_actions(args);
if (args.command == "action") return cmd_action(args);
if (args.command == "rag") return cmd_rag(args);
+ if (args.command == "vlm") return cmd_vlm(args);
+ if (args.command == "camera") return cmd_camera(args);
+ if (args.command == "screen") return cmd_screen(args);
if (args.command == "setup") return cmd_setup(args);
if (args.command == "models") return cmd_models(args);
if (args.command == "voices") return cmd_voices(args);
diff --git a/src/cli/model_pickers.h b/src/cli/model_pickers.h
index 949e25b..ec0b847 100644
--- a/src/cli/model_pickers.h
+++ b/src/cli/model_pickers.h
@@ -12,6 +12,7 @@
#include "models/model_registry.h"
#include "models/tts_model_registry.h"
#include "models/stt_model_registry.h"
+#include "models/vlm_model_registry.h"
#include "engines/metalrt_loader.h"
// =============================================================================
@@ -407,6 +408,83 @@ inline int pick_metalrt_stt() {
return 0;
}
+// =============================================================================
+// VLM picker
+// =============================================================================
+
+inline int pick_vlm(const std::string& models_dir) {
+ auto all = rcli::all_vlm_models();
+
+ fprintf(stderr, "\n %s%s VLM Models (Vision \xC2\xB7 llama.cpp)%s\n\n", color::bold, color::orange, color::reset);
+
+ fprintf(stderr, " %s# %-30s %-12s %s%s\n",
+ color::bold, "Model", "Size", "Status", color::reset);
+ fprintf(stderr, " %s── %-30s %-12s %s%s\n",
+ color::dim, "──────────────────────────────", "────────────", "──────────", color::reset);
+
+ for (size_t i = 0; i < all.size(); i++) {
+ auto& m = all[i];
+ bool installed = rcli::is_vlm_model_installed(models_dir, m);
+ std::string status;
+ if (installed) status = "\033[32minstalled\033[0m";
+ else status = "\033[2mnot installed\033[0m";
+ std::string label = m.name;
+ if (m.is_default) label += " (default)";
+ char size_str[32];
+ int total_mb = m.model_size_mb + m.mmproj_size_mb;
+ if (total_mb >= 1024)
+ snprintf(size_str, sizeof(size_str), "%.1f GB", total_mb / 1024.0);
+ else
+ snprintf(size_str, sizeof(size_str), "%d MB", total_mb);
+ fprintf(stderr, " %s%-2zu%s %-30s %-12s %s\n",
+ installed ? "\033[32m" : "", i + 1, installed ? "\033[0m" : "",
+ label.c_str(), size_str, status.c_str());
+ }
+ fprintf(stderr, "\n %sCommands:%s [1-%zu] download/select | q cancel\n Choice: ",
+ color::bold, color::reset, all.size());
+ fflush(stderr);
+
+ int choice = read_picker_choice();
+ if (choice == 0 || choice == -1) { picker_no_changes(); return 0; }
+ if (choice < 1 || choice > (int)all.size()) { fprintf(stderr, "\n Invalid choice.\n\n"); return 1; }
+
+ auto& sel = all[choice - 1];
+ bool installed = rcli::is_vlm_model_installed(models_dir, sel);
+ if (installed) {
+ fprintf(stderr, "\n %s%s%s is already installed.%s\n\n",
+ color::bold, color::green, sel.name.c_str(), color::reset);
+ return 0;
+ }
+
+ int total_mb = sel.model_size_mb + sel.mmproj_size_mb;
+ char size_str[32];
+ if (total_mb >= 1024)
+ snprintf(size_str, sizeof(size_str), "%.1f GB", total_mb / 1024.0);
+ else
+ snprintf(size_str, sizeof(size_str), "%d MB", total_mb);
+ fprintf(stderr, "\n %s%s%s%s is not installed (%s). Download? [Y/n]: ",
+ color::bold, color::yellow, sel.name.c_str(), color::reset, size_str);
+ fflush(stderr);
+ if (!confirm_download()) { picker_cancelled(); return 0; }
+
+ std::string model_path = models_dir + "/" + sel.model_filename;
+ std::string mmproj_path = models_dir + "/" + sel.mmproj_filename;
+ std::string cmd = "bash -c '"
+ "set -e; echo \" Downloading " + sel.name + " model...\"; echo \"\"; "
+ "curl -L -# -o \"" + model_path + "\" \"" + sel.model_url + "\"; "
+ "echo \"\"; echo \" Downloading vision projector...\"; echo \"\"; "
+ "curl -L -# -o \"" + mmproj_path + "\" \"" + sel.mmproj_url + "\"; "
+ "echo \"\"; echo \" Done!\"; '";
+ fprintf(stderr, "\n");
+ if (system(cmd.c_str()) != 0) {
+ fprintf(stderr, "\n %s%sDownload failed.%s\n\n", color::bold, color::red, color::reset);
+ return 1;
+ }
+    fprintf(stderr, "\n  %s%sInstalled: %s%s\n  Use: rcli vlm <image> [prompt]\n\n",
+ color::bold, color::green, sel.name.c_str(), color::reset);
+ return 0;
+}
+
// =============================================================================
// Unified models dashboard
// =============================================================================
@@ -417,6 +495,7 @@ inline int cmd_models(const Args& args) {
if (args.arg1 == "llm") return pick_llm(models_dir);
if (args.arg1 == "stt") return pick_stt(models_dir);
if (args.arg1 == "tts") return pick_tts(models_dir);
+ if (args.arg1 == "vlm") return pick_vlm(models_dir);
if (args.arg1 == "metalrt-stt" || args.arg1 == "whisper") return pick_metalrt_stt();
if (args.help) {
@@ -426,12 +505,14 @@ inline int cmd_models(const Args& args) {
" models Unified model dashboard\n"
" models llm LLM model picker\n"
" models stt STT model picker\n"
- " models tts TTS voice picker\n\n"
+ " models tts TTS voice picker\n"
+ " models vlm VLM (vision) model picker\n\n"
" %sEXAMPLES%s\n"
" rcli models # dashboard — pick a modality\n"
" rcli models llm # switch LLM directly\n"
" rcli models stt # switch offline STT directly\n"
- " rcli models tts # switch TTS voice directly\n\n",
+ " rcli models tts # switch TTS voice directly\n"
+ " rcli models vlm # manage VLM models for image analysis\n\n",
color::bold, color::orange, color::reset,
color::bold, color::reset,
color::bold, color::reset);
@@ -483,6 +564,21 @@ inline int cmd_models(const Args& args) {
color::green, tts_name.c_str(), color::reset,
tts_inst, tts_all.size());
+ // VLM row
+ auto vlm_all = rcli::all_vlm_models();
+ int vlm_inst = 0;
+ std::string vlm_name = "not installed";
+ for (auto& m : vlm_all) {
+ if (rcli::is_vlm_model_installed(models_dir, m)) {
+ vlm_inst++;
+ if (vlm_name == "not installed") vlm_name = m.name;
+ }
+ }
+ fprintf(stderr, " %s4%s %sVLM (vision)%s %s%-28s%s %d / %zu\n",
+ color::green, color::reset, color::bold, color::reset,
+ vlm_inst > 0 ? color::green : color::dim, vlm_name.c_str(), color::reset,
+ vlm_inst, vlm_all.size());
+
// MetalRT Whisper row
auto mrt_comps = rcli::metalrt_component_models();
std::string mrt_stt_pref = rcli::read_selected_metalrt_stt_id();
@@ -498,7 +594,7 @@ inline int cmd_models(const Args& args) {
}
}
if (mrt_stt_pref.empty() && mrt_stt_inst > 0) mrt_stt_name = "auto (first installed)";
- fprintf(stderr, " %s4%s %sMetalRT STT%s %s%-28s%s %d / %d\n",
+ fprintf(stderr, " %s5%s %sMetalRT STT%s %s%-28s%s %d / %d\n",
color::green, color::reset, color::bold, color::reset,
color::green, mrt_stt_name.c_str(), color::reset,
mrt_stt_inst, mrt_stt_total);
@@ -521,7 +617,7 @@ inline int cmd_models(const Args& args) {
}
fprintf(stderr, " %sNote: STT streaming (Zipformer) is always active for live mic.%s\n\n",
color::dim, color::reset);
- fprintf(stderr, " %sSelect modality:%s 1 LLM | 2 STT | 3 TTS | 4 MetalRT STT | q cancel\n Choice: ",
+ fprintf(stderr, " %sSelect modality:%s 1 LLM | 2 STT | 3 TTS | 4 VLM | 5 MetalRT STT | q cancel\n Choice: ",
color::bold, color::reset);
fflush(stderr);
@@ -530,7 +626,8 @@ inline int cmd_models(const Args& args) {
if (choice == 1 || choice == -2) return pick_llm(models_dir); // -2 (a) → LLM as first
if (choice == 2) return pick_stt(models_dir);
if (choice == 3) return pick_tts(models_dir);
- if (choice == 4) return pick_metalrt_stt();
+ if (choice == 4) return pick_vlm(models_dir);
+ if (choice == 5) return pick_metalrt_stt();
fprintf(stderr, "\n Invalid choice.\n\n");
return 1;
@@ -595,10 +692,20 @@ inline int cmd_info() {
? "MetalRT (Metal GPU — LLM, STT, TTS on-device)"
: "llama.cpp + sherpa-onnx (ONNX Runtime)";
+ auto vlm_all_info = rcli::all_vlm_models();
+ auto [vlm_found, vlm_def] = rcli::find_installed_vlm(models_dir);
+ std::string vlm_info;
+ if (vlm_found) {
+ vlm_info = vlm_def.name + " (llama.cpp, Metal GPU)";
+ } else {
+ vlm_info = "not installed — run: rcli models vlm";
+ }
+
fprintf(stdout,
"\n%s%s RCLI%s %s%s%s\n\n"
" %sEngine:%s %s\n"
" %sLLM:%s %s\n"
+ " %sVLM:%s %s\n"
" %sSTT:%s %s\n"
" %sTTS:%s %s\n"
" %sVAD:%s Silero VAD\n"
@@ -610,6 +717,7 @@ inline int cmd_info() {
color::dim, RA_VERSION, color::reset,
color::bold, color::reset, engine_info.c_str(),
color::bold, color::reset, llm_info.c_str(),
+ color::bold, color::reset, vlm_info.c_str(),
color::bold, color::reset, stt_info.c_str(),
color::bold, color::reset, tts_info.c_str(),
color::bold, color::reset,
@@ -677,5 +785,24 @@ inline int cmd_info() {
if (!any_tts) fprintf(stdout, " (none — run: rcli setup)\n");
fprintf(stdout, "\n");
+ // Installed VLM
+ fprintf(stdout, " %sInstalled VLM:%s\n", color::bold, color::reset);
+ bool any_vlm = false;
+ for (auto& m : vlm_all_info) {
+ if (rcli::is_vlm_model_installed(models_dir, m)) {
+ char size_str[32];
+ int total_mb = m.model_size_mb + m.mmproj_size_mb;
+ if (total_mb >= 1024)
+ snprintf(size_str, sizeof(size_str), "%.1f GB", total_mb / 1024.0);
+ else
+ snprintf(size_str, sizeof(size_str), "%d MB", total_mb);
+ fprintf(stdout, " %-28s %-7s installed\n",
+ m.name.c_str(), size_str);
+ any_vlm = true;
+ }
+ }
+ if (!any_vlm) fprintf(stdout, " (none — run: rcli models vlm)\n");
+ fprintf(stdout, "\n");
+
return 0;
}
diff --git a/src/cli/setup_cmds.h b/src/cli/setup_cmds.h
index f33dcc7..b5f85fb 100644
--- a/src/cli/setup_cmds.h
+++ b/src/cli/setup_cmds.h
@@ -178,13 +178,15 @@ inline int cmd_setup(const Args& args) {
if (!cm.default_install) continue;
std::string cm_dir = rcli::metalrt_models_dir() + "/" + cm.dir_name;
if (rcli::is_metalrt_component_installed(cm)) {
- std::string skip_label = (cm.component == "stt") ? "STT" : "TTS";
+ std::string skip_label = (cm.component == "stt") ? "STT"
+ : (cm.component == "vlm") ? "VLM" : "TTS";
fprintf(stderr, " %s%sMetalRT %s already installed:%s %s\n",
color::bold, color::green, skip_label.c_str(), color::reset, cm.name.c_str());
continue;
}
- std::string type_label = (cm.component == "stt") ? "STT" : "TTS";
+ std::string type_label = (cm.component == "stt") ? "STT"
+ : (cm.component == "vlm") ? "VLM" : "TTS";
fprintf(stderr, " %sDownloading MetalRT %s: %s (~%s)...%s\n",
color::dim, type_label.c_str(), cm.name.c_str(),
rcli::format_size(cm.size_mb).c_str(), color::reset);
diff --git a/src/cli/tui_app.h b/src/cli/tui_app.h
index 6ec4ed1..7b01d1e 100644
--- a/src/cli/tui_app.h
+++ b/src/cli/tui_app.h
@@ -12,8 +12,15 @@
#include "models/stt_model_registry.h"
#include "actions/action_registry.h"
#include "engines/metalrt_loader.h"
+#include "engines/vlm_engine.h"
+#include "audio/camera_capture.h"
+#include "audio/screen_capture.h"
+#include "models/vlm_model_registry.h"
#include "core/log.h"
#include "core/personality.h"
+#include <spawn.h>
+
+extern char** environ;
#include
#include
@@ -432,7 +439,43 @@ class TuiApp {
if (c == "r" || c == "R") { enter_rag_mode(); return true; }
if (c == "d" || c == "D") { close_all_panels(); enter_cleanup_mode(); return true; }
if (c == "p" || c == "P") { enter_personality_mode(); return true; }
- // V key: voice mode removed — push-to-talk via SPACE is always active
+ // V key: capture photo from camera and analyze with VLM
+ if (c == "v" || c == "V") {
+ run_camera_vlm("Describe what you see in this photo in detail.");
+ return true;
+ }
+ // S key: toggle visual mode (VLM only on llama.cpp engine)
+ if (c == "s" || c == "S") {
+ if (screen_capture_overlay_active()) {
+ screen_capture_hide_overlay();
+ add_system_message("Exiting visual mode...");
+ screen_->Post(Event::Custom);
+ std::thread([this]() {
+ rcli_vlm_exit(engine_);
+ add_system_message("Visual mode OFF");
+ screen_->Post(Event::Custom);
+ }).detach();
+ } else {
+ add_system_message("Entering visual mode, loading VLM...");
+ screen_->Post(Event::Custom);
+ std::thread([this]() {
+ if (rcli_vlm_init(engine_) == 0) {
+ const char* vbe = rcli_vlm_backend_name(engine_);
+ const char* vmodel = rcli_vlm_model_name(engine_);
+ screen_capture_show_overlay(0, 0, 0, 0);
+ std::string msg = "Visual mode ON";
+ if (vbe && vbe[0])
+ msg += std::string(" — ") + vmodel + " via " + vbe;
+ msg += ". Drag/resize the green frame, then ask a question";
+ add_system_message(msg);
+ } else {
+ add_system_message("VLM requires the llama.cpp engine. Switch with: rcli engine llamacpp, then download a model via [M] \xe2\x86\x92 VLM Models");
+ }
+ screen_->Post(Event::Custom);
+ }).detach();
+ }
+ return true;
+ }
if (c == "t" || c == "T") {
tool_trace_enabled_ = !tool_trace_enabled_.load(std::memory_order_relaxed);
add_system_message(tool_trace_enabled_ ? "Tool call trace: ON" : "Tool call trace: OFF");
@@ -538,6 +581,11 @@ class TuiApp {
std::string user_text = transcript;
add_user_message(user_text);
+ // Visual mode: route voice to VLM screen analysis instead of LLM
+ if (screen_capture_overlay_active()) {
+ run_screen_vlm(user_text);
+ return;
+ }
voice_state_ = VoiceState::THINKING;
screen_->Post(Event::Custom);
@@ -1069,6 +1117,11 @@ class TuiApp {
else
right.push_back(text("[A] actions ") | dim);
right.push_back(text("[C] convo ") | dim);
+ right.push_back(text("[V] camera ") | dim);
+ if (screen_capture_overlay_active())
+ right.push_back(text("[S] visual ● ") | ftxui::color(ftxui::Color::Green));
+ else
+ right.push_back(text("[S] visual ") | dim);
right.push_back(text("[R] RAG ") | dim);
right.push_back(text("[P] personality ") | dim);
right.push_back(text("[D] cleanup ") | dim);
@@ -1458,6 +1511,7 @@ class TuiApp {
e.is_archive = false;
models_entries_.push_back(e);
}
+
} else {
// ---- llama.cpp engine: show GGUF models only ----
const auto* llm_active = rcli::resolve_active_model(dir, llm_all);
@@ -1501,6 +1555,21 @@ class TuiApp {
e.archive_dir = v.archive_dir;
models_entries_.push_back(e);
}
+
+ // VLM models (vision)
+ auto vlm_all = rcli::all_vlm_models();
+ { ModelEntry h; h.name = "VLM Models (Vision \xC2\xB7 llama.cpp)"; h.is_header = true; models_entries_.push_back(h); }
+ for (auto& m : vlm_all) {
+ ModelEntry e;
+ e.name = m.name; e.id = m.id; e.modality = "VLM";
+ e.size_mb = m.model_size_mb + m.mmproj_size_mb;
+ e.installed = rcli::is_vlm_model_installed(dir, m);
+ e.is_active = false; // VLM is lazy-loaded, no "active" concept
+ e.is_default = m.is_default; e.is_recommended = m.is_default;
+ e.description = m.description;
+ e.url = m.model_url; e.filename = m.model_filename; e.is_archive = false;
+ models_entries_.push_back(e);
+ }
}
for (int i = 0; i < (int)models_entries_.size(); i++) {
@@ -1666,7 +1735,20 @@ class TuiApp {
bool archive = e.is_archive;
std::string archive_dir_name = e.archive_dir;
- std::thread([this, idx, dir, url, fname, mod, id, nm, archive, archive_dir_name]() {
+ // For VLM, also capture the mmproj URL
+ std::string vlm_mmproj_url, vlm_mmproj_fname;
+ if (mod == "VLM") {
+ auto vlm_models = rcli::all_vlm_models();
+ for (auto& vm : vlm_models) {
+ if (vm.id == id) {
+ vlm_mmproj_url = vm.mmproj_url;
+ vlm_mmproj_fname = vm.mmproj_filename;
+ break;
+ }
+ }
+ }
+ std::thread([this, idx, dir, url, fname, mod, id, nm, archive, archive_dir_name,
+ vlm_mmproj_url, vlm_mmproj_fname]() {
int rc;
if (archive) {
rc = system(("curl -sL '" + url + "' | tar xj -C '" + dir + "' 2>/dev/null").c_str());
@@ -1677,6 +1759,12 @@ class TuiApp {
if (stat(src.c_str(), &st) == 0 && stat(dst.c_str(), &st) != 0)
rename(src.c_str(), dst.c_str());
}
+ } else if (mod == "VLM" && !vlm_mmproj_url.empty()) {
+ // VLM needs two files: language model + mmproj
+ rc = system(("curl -sL -o '" + dir + "/" + fname + "' '" + url + "' 2>/dev/null").c_str());
+ if (rc == 0) {
+ rc = system(("curl -sL -o '" + dir + "/" + vlm_mmproj_fname + "' '" + vlm_mmproj_url + "' 2>/dev/null").c_str());
+ }
} else {
rc = system(("curl -sL -o '" + dir + "/" + fname + "' '" + url + "' 2>/dev/null").c_str());
}
@@ -1698,6 +1786,9 @@ class TuiApp {
} else {
if (mod == "STT") rcli::write_selected_stt_id(id);
else if (mod == "TTS") rcli::write_selected_tts_id(id);
+ else if (mod == "VLM") {
+ // VLM doesn't need selection — just mark installed
+ }
models_message_ = "Downloaded & selected: " + nm + ". Restart RCLI to apply.";
models_msg_color_ = theme_.success;
}
@@ -2143,6 +2234,117 @@ class TuiApp {
// process_input
// ====================================================================
+ void run_camera_vlm(const std::string& prompt) {
+ add_system_message("Capturing photo from camera...");
+ voice_state_ = VoiceState::THINKING;
+ std::string prompt_copy = prompt;
+ std::thread([this, prompt_copy]() {
+ std::string photo_path = "/tmp/rcli_camera_" +
+ std::to_string(std::chrono::system_clock::now().time_since_epoch().count()) + ".jpg";
+ int rc = camera_capture_photo(photo_path.c_str());
+ if (rc != 0) {
+ add_response("(Camera capture failed. Check camera permissions in System Settings > Privacy & Security > Camera.)", "");
+ voice_state_ = VoiceState::IDLE;
+ screen_->Post(Event::Custom);
+ return;
+ }
+ add_system_message("Photo captured! Loading VLM...");
+ screen_->Post(Event::Custom);
+
+ const char* response = rcli_vlm_analyze(
+ engine_, photo_path.c_str(), prompt_copy.c_str());
+
+ // Show which backend handled it
+ const char* vbe = rcli_vlm_backend_name(engine_);
+ const char* vmodel = rcli_vlm_model_name(engine_);
+ if (vbe && vbe[0]) {
+ add_system_message(std::string("VLM: ") + vmodel + " via " + vbe);
+ screen_->Post(Event::Custom);
+ }
+
+ if (response && response[0]) {
+ add_response(response, "VLM");
+ voice_state_ = VoiceState::SPEAKING;
+ screen_->Post(Event::Custom);
+ rcli_speak(engine_, response);
+ RCLIVlmStats stats;
+ if (rcli_vlm_get_stats(engine_, &stats) == 0) {
+ char buf[128];
+ snprintf(buf, sizeof(buf), "⚡ %.1f tok/s | %d tokens | %.1fs total",
+ stats.gen_tok_per_sec, stats.generated_tokens, stats.total_time_sec);
+ add_system_message(buf);
+ }
+ } else {
+ add_response("(VLM not available. Requires llama.cpp engine and a VLM model. Use [M] → VLM Models to download.)", "");
+ }
+ voice_state_ = VoiceState::IDLE;
+ {
+ pid_t pid;
+ const char* argv[] = {"open", photo_path.c_str(), nullptr};
+ posix_spawnp(&pid, "open", nullptr, nullptr,
+                       const_cast<char* const*>(argv), environ);
+ }
+ screen_->Post(Event::Custom);
+ }).detach();
+ }
+
+ void run_screen_vlm(const std::string& prompt) {
+ char app_name[256];
+ screen_capture_target_app_name(app_name, sizeof(app_name));
+ add_system_message(std::string("Capturing screenshot of ") + app_name + "...");
+ voice_state_ = VoiceState::THINKING;
+ std::string prompt_copy = prompt;
+ std::thread([this, prompt_copy]() {
+ std::string screen_path = "/tmp/rcli_screen_" +
+ std::to_string(std::chrono::system_clock::now().time_since_epoch().count()) + ".jpg";
+ int rc = screen_capture_screenshot(screen_path.c_str());
+ if (rc != 0) {
+ add_response("(Screen capture failed. Check screen recording permissions.)", "");
+ voice_state_ = VoiceState::IDLE;
+ screen_->Post(Event::Custom);
+ return;
+ }
+ add_system_message("Loading VLM...");
+ screen_->Post(Event::Custom);
+
+ std::string accumulated;
+ auto stream_cb = [](const char* event, const char* data, void* ud) {
+                auto* accum = static_cast<std::string*>(ud);
+ if (std::strcmp(event, "token") == 0) {
+ accum->append(data);
+ }
+ };
+ int vlm_rc = rcli_vlm_analyze_stream(engine_, screen_path.c_str(),
+ prompt_copy.c_str(), stream_cb, &accumulated);
+
+ // Show which backend handled it
+ const char* vbe = rcli_vlm_backend_name(engine_);
+ const char* vmodel = rcli_vlm_model_name(engine_);
+ if (vbe && vbe[0]) {
+ add_system_message(std::string("VLM: ") + vmodel + " via " + vbe);
+ screen_->Post(Event::Custom);
+ }
+
+ if (vlm_rc == 0 && !accumulated.empty()) {
+ add_response(accumulated, "VLM");
+ voice_state_ = VoiceState::SPEAKING;
+ screen_->Post(Event::Custom);
+ rcli_speak_streaming(engine_, accumulated.c_str(), nullptr, nullptr);
+ RCLIVlmStats stats;
+ if (rcli_vlm_get_stats(engine_, &stats) == 0) {
+ char buf[128];
+ snprintf(buf, sizeof(buf), "⚡ %.1f tok/s | %d tokens | %.1fs total",
+ stats.gen_tok_per_sec, stats.generated_tokens, stats.total_time_sec);
+ add_system_message(buf);
+ }
+ } else {
+ add_response("(VLM not available. Requires llama.cpp engine and a VLM model. Use [M] → VLM Models to download.)", "");
+ }
+ voice_state_ = VoiceState::IDLE;
+ screen_->Post(Event::Custom);
+ }).detach();
+ }
+
void process_input(const std::string& input) {
if (input.empty()) return;
@@ -2202,6 +2404,26 @@ class TuiApp {
return;
}
+ if (cmd == "visual") {
+ if (screen_capture_overlay_active()) {
+ screen_capture_hide_overlay();
+ add_system_message("Visual mode OFF");
+ } else {
+ screen_capture_show_overlay(0, 0, 0, 0);
+ add_system_message("Visual mode ON — drag/resize the green frame, then ask a question");
+ }
+ return;
+ }
+
+ if (cmd == "screen" || cmd == "screenshot") {
+ run_screen_vlm("Describe what you see on this screen in detail.");
+ return;
+ }
+
+ if (cmd == "camera" || cmd == "photo" || cmd == "webcam") {
+ run_camera_vlm("Describe what you see in this photo in detail.");
+ return;
+ }
if (!engine_) {
add_response("Engine not initialized.", "");
@@ -2340,6 +2562,34 @@ class TuiApp {
struct stat path_st;
if (!resolved.empty() && resolved[0] == '/' && stat(resolved.c_str(), &path_st) == 0) {
+ // Check if this is an image file → route to VLM analysis
+ if (S_ISREG(path_st.st_mode) && rastack::VlmEngine::is_supported_image(resolved)) {
+ add_system_message("Image detected: " + resolved);
+ add_system_message("Analyzing image with VLM...");
+ voice_state_ = VoiceState::THINKING;
+ std::string path_copy = resolved;
+ std::thread([this, path_copy]() {
+ const char* response = rcli_vlm_analyze(
+ engine_, path_copy.c_str(), "Describe this image in detail.");
+ if (response && response[0]) {
+ add_response(response, "VLM");
+ RCLIVlmStats stats;
+ if (rcli_vlm_get_stats(engine_, &stats) == 0) {
+ char buf[128];
+ snprintf(buf, sizeof(buf), "⚡ %.1f tok/s | %d tokens | %.1fs total",
+ stats.gen_tok_per_sec, stats.generated_tokens, stats.total_time_sec);
+ add_system_message(buf);
+ }
+ } else {
+ add_response("(VLM not available. Requires llama.cpp engine and a VLM model. Use [M] → VLM Models to download.)", "");
+ }
+ voice_state_ = VoiceState::IDLE;
+ screen_->Post(Event::Custom);
+ }).detach();
+ return;
+ }
+
+ // Non-image path → RAG ingest
add_system_message("Detected path: " + resolved);
add_system_message("Indexing for RAG... this may take a moment.");
std::string path_copy = resolved;
diff --git a/src/engines/metalrt_loader.cpp b/src/engines/metalrt_loader.cpp
index 7dd5363..ba0f1c8 100644
--- a/src/engines/metalrt_loader.cpp
+++ b/src/engines/metalrt_loader.cpp
@@ -186,6 +186,22 @@ bool MetalRTLoader::load() {
LOG_DEBUG("MetalRT", "TTS symbols: tts_create=%p tts_synthesize=%p tts_sample_rate=%p",
(void*)tts_create, (void*)tts_synthesize, (void*)tts_sample_rate);
+ // Vision (VLM) symbols (optional)
+ vision_create = resolve("metalrt_vision_create");
+ vision_destroy = resolve("metalrt_vision_destroy");
+ vision_load = resolve("metalrt_vision_load");
+ vision_analyze = resolve("metalrt_vision_analyze");
+ vision_analyze_stream = resolve("metalrt_vision_analyze_stream");
+ vision_generate = resolve("metalrt_vision_generate");
+ vision_generate_stream = resolve("metalrt_vision_generate_stream");
+ vision_reset = resolve("metalrt_vision_reset");
+ vision_model_name = resolve("metalrt_vision_model_name");
+ vision_device_name = resolve("metalrt_vision_device_name");
+ vision_free_result = resolve("metalrt_vision_free_result");
+
+ LOG_DEBUG("MetalRT", "VLM symbols: vision_create=%p vision_analyze=%p vision_stream=%p",
+ (void*)vision_create, (void*)vision_analyze, (void*)vision_analyze_stream);
+
if (!fn_abi_version_ || !create || !destroy || !load_model || !generate) {
LOG_ERROR("MetalRT", "dylib missing required LLM symbols: abi=%p create=%p destroy=%p load=%p gen=%p",
(void*)fn_abi_version_, (void*)create, (void*)destroy, (void*)load_model, (void*)generate);
diff --git a/src/engines/metalrt_loader.h b/src/engines/metalrt_loader.h
index 6d6b0b8..41247ed 100644
--- a/src/engines/metalrt_loader.h
+++ b/src/engines/metalrt_loader.h
@@ -128,6 +128,47 @@ class MetalRTLoader {
TtsFreeAudioFn tts_free_audio = nullptr;
TtsSampleRateFn tts_sample_rate = nullptr;
+ // --- Vision (VLM) function pointers ---
+
+ struct MetalRTVisionResult {
+ const char* text;
+ const char* thinking;
+ const char* response;
+ int prompt_tokens;
+ int generated_tokens;
+ double vision_encode_ms;
+ double prefill_ms;
+ double decode_ms;
+ double tps;
+ };
+
+ struct MetalRTVisionOptions {
+ int max_tokens;
+ int top_k;
+ float temperature;
+ bool think;
+ };
+
+ using VisionAnalyzeFn = MetalRTVisionResult (*)(void*, const char*, const char*, const MetalRTVisionOptions*);
+ using VisionAnalyzeStreamFn = MetalRTVisionResult (*)(void*, const char*, const char*, MetalRTStreamCb, void*, const MetalRTVisionOptions*);
+ using VisionGenerateFn = MetalRTVisionResult (*)(void*, const char*, const MetalRTVisionOptions*);
+ using VisionGenerateStreamFn = MetalRTVisionResult (*)(void*, const char*, MetalRTStreamCb, void*, const MetalRTVisionOptions*);
+ using VisionFreeResultFn = void (*)(MetalRTVisionResult);
+
+ CreateFn vision_create = nullptr;
+ DestroyFn vision_destroy = nullptr;
+ LoadFn vision_load = nullptr;
+ VisionAnalyzeFn vision_analyze = nullptr;
+ VisionAnalyzeStreamFn vision_analyze_stream = nullptr;
+ VisionGenerateFn vision_generate = nullptr;
+ VisionGenerateStreamFn vision_generate_stream = nullptr;
+ ResetFn vision_reset = nullptr;
+ ModelNameFn vision_model_name = nullptr;
+ DeviceNameFn vision_device_name = nullptr;
+ VisionFreeResultFn vision_free_result = nullptr;
+
+ bool has_vision() const { return vision_create != nullptr && vision_analyze != nullptr; }
+
// --- Install / remove / version management ---
static bool install(const std::string& version = "latest");
diff --git a/src/engines/tts_engine.cpp b/src/engines/tts_engine.cpp
index cf5cd95..b139960 100644
--- a/src/engines/tts_engine.cpp
+++ b/src/engines/tts_engine.cpp
@@ -77,9 +77,26 @@ bool TtsEngine::init(const TtsConfig& config) {
return true;
}
+bool TtsEngine::reinit() {
+ if (!initialized_) return false;
+ LOG_DEBUG("TTS", "Reinitializing ONNX session to prevent audio degradation");
+ if (tts_) {
+ SherpaOnnxDestroyOfflineTts(tts_);
+ tts_ = nullptr;
+ }
+ initialized_ = false;
+ synth_count_ = 0;
+ return init(config_);
+}
+
+std::vector<float> TtsEngine::synthesize(const std::string& text) {
if (!initialized_ || !tts_) return {};
+ // Periodically reinit to prevent audio quality degradation
+ if (++synth_count_ >= kReinitInterval) {
+ reinit();
+ }
+
stats_ = TtsStats{};
int64_t t_start = now_us();
diff --git a/src/engines/tts_engine.h b/src/engines/tts_engine.h
index 40c36e9..90b9018 100644
--- a/src/engines/tts_engine.h
+++ b/src/engines/tts_engine.h
@@ -63,12 +63,18 @@ class TtsEngine {
// Change speaker at runtime (Kokoro multi-voice)
void set_speaker_id(int id) { config_.speaker_id = id; }
+ // Reinitialize the ONNX Runtime session to flush accumulated state.
+ // Call periodically to prevent audio degradation over long sessions.
+ bool reinit();
+
private:
const SherpaOnnxOfflineTts* tts_ = nullptr;
TtsConfig config_;
TtsStats stats_;
int sample_rate_ = 22050;
bool initialized_ = false;
+ int synth_count_ = 0; // synthesis calls since last reinit
+ static constexpr int kReinitInterval = 20; // reinit every N calls
};
} // namespace rastack
diff --git a/src/engines/vlm_engine.cpp b/src/engines/vlm_engine.cpp
new file mode 100644
index 0000000..1f2d09b
--- /dev/null
+++ b/src/engines/vlm_engine.cpp
@@ -0,0 +1,266 @@
+#include "engines/vlm_engine.h"
+#include "core/log.h"
+#include "llama.h"
+#include "ggml.h"
+#include "ggml-backend.h"
+#include "mtmd.h"
+#include "mtmd-helper.h"
+#include <algorithm>
+#include <cctype>
+#include <mutex>
+
+namespace rastack {
+
+VlmEngine::VlmEngine() = default;
+
+VlmEngine::~VlmEngine() {
+ shutdown();
+}
+
+void VlmEngine::shutdown() {
+ if (ctx_mtmd_) { mtmd_free(ctx_mtmd_); ctx_mtmd_ = nullptr; }
+ if (sampler_) { llama_sampler_free(sampler_); sampler_ = nullptr; }
+ if (ctx_) { llama_free(ctx_); ctx_ = nullptr; }
+ if (model_) { llama_model_free(model_); model_ = nullptr; }
+ vocab_ = nullptr;
+ initialized_ = false;
+ stats_ = VlmStats{};
+ LOG_DEBUG("VLM", "Shutdown complete");
+}
+
+bool VlmEngine::init(const VlmConfig& config) {
+ if (initialized_) shutdown();
+
+ config_ = config;
+
+ // Initialize backend (loads Metal, etc.) — safe to call multiple times
+ static std::once_flag backend_init_flag;
+ std::call_once(backend_init_flag, [] { ggml_backend_load_all(); });
+
+ // Load language model
+ llama_model_params model_params = llama_model_default_params();
+ model_params.n_gpu_layers = config.n_gpu_layers;
+ model_params.use_mmap = config.use_mmap;
+ model_params.use_mlock = config.use_mlock;
+
+ LOG_DEBUG("VLM", "Loading VLM model: %s", config.model_path.c_str());
+ model_ = llama_model_load_from_file(config.model_path.c_str(), model_params);
+ if (!model_) {
+ LOG_ERROR("VLM", "Failed to load VLM model");
+ return false;
+ }
+
+ vocab_ = llama_model_get_vocab(model_);
+
+ // Create inference context
+ llama_context_params ctx_params = llama_context_default_params();
+ ctx_params.n_ctx = config.n_ctx;
+ ctx_params.n_batch = config.n_batch;
+ ctx_params.n_threads = config.n_threads;
+ ctx_params.n_threads_batch = config.n_threads_batch;
+ ctx_params.no_perf = false;
+ ctx_params.flash_attn_type = config.flash_attn ? LLAMA_FLASH_ATTN_TYPE_AUTO : LLAMA_FLASH_ATTN_TYPE_DISABLED;
+
+ ctx_ = llama_init_from_model(model_, ctx_params);
+ if (!ctx_) {
+ LOG_ERROR("VLM", "Failed to create VLM context");
+ llama_model_free(model_);
+ model_ = nullptr;
+ return false;
+ }
+
+ // Initialize mtmd (vision projector)
+ LOG_DEBUG("VLM", "Loading vision projector: %s", config.mmproj_path.c_str());
+ mtmd_context_params mtmd_params = mtmd_context_params_default();
+ mtmd_params.use_gpu = (config.n_gpu_layers > 0);
+ mtmd_params.n_threads = config.n_threads_batch;
+ mtmd_params.flash_attn_type = config.flash_attn ? LLAMA_FLASH_ATTN_TYPE_AUTO : LLAMA_FLASH_ATTN_TYPE_DISABLED;
+
+ ctx_mtmd_ = mtmd_init_from_file(config.mmproj_path.c_str(), model_, mtmd_params);
+ if (!ctx_mtmd_) {
+ LOG_ERROR("VLM", "Failed to load vision projector (mmproj)");
+ llama_free(ctx_);
+ llama_model_free(model_);
+ ctx_ = nullptr;
+ model_ = nullptr;
+ return false;
+ }
+
+ if (!mtmd_support_vision(ctx_mtmd_)) {
+ LOG_ERROR("VLM", "Model does not support vision input");
+ mtmd_free(ctx_mtmd_);
+ llama_free(ctx_);
+ llama_model_free(model_);
+ ctx_mtmd_ = nullptr;
+ ctx_ = nullptr;
+ model_ = nullptr;
+ return false;
+ }
+
+ // Setup sampler chain
+ auto sparams = llama_sampler_chain_default_params();
+ sampler_ = llama_sampler_chain_init(sparams);
+ if (config.temperature > 0.0f) {
+ llama_sampler_chain_add(sampler_, llama_sampler_init_temp(config.temperature));
+ llama_sampler_chain_add(sampler_, llama_sampler_init_top_k(config.top_k));
+ llama_sampler_chain_add(sampler_, llama_sampler_init_top_p(config.top_p, 1));
+ llama_sampler_chain_add(sampler_, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
+ } else {
+ llama_sampler_chain_add(sampler_, llama_sampler_init_greedy());
+ }
+
+ initialized_ = true;
+ LOG_INFO("VLM", "Initialized (vision support: yes)");
+ return true;
+}
+
+std::string VlmEngine::analyze_image(const std::string& image_path,
+ const std::string& prompt,
+ TokenCallback on_token) {
+ if (!initialized_) return "";
+
+ cancelled_.store(false, std::memory_order_relaxed);
+ stats_ = VlmStats{};
+
+ // Clear KV cache
+ llama_memory_clear(llama_get_memory(ctx_), true);
+ if (sampler_) llama_sampler_reset(sampler_);
+
+ // 1. Load image
+ LOG_DEBUG("VLM", "Loading image: %s", image_path.c_str());
+ mtmd_bitmap* bitmap = mtmd_helper_bitmap_init_from_file(ctx_mtmd_, image_path.c_str());
+ if (!bitmap) {
+ LOG_ERROR("VLM", "Failed to load image: %s", image_path.c_str());
+ return "";
+ }
+
+ // 2. Build prompt with media marker using ChatML template (Qwen3-VL format)
+ // The model expects: <|im_start|>system\n...<|im_end|>\n<|im_start|>user\n\nprompt<|im_end|>\n<|im_start|>assistant\n
+ std::string marker = mtmd_default_marker();
+ std::string full_prompt =
+ "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+ "<|im_start|>user\n" + marker + "\n" + prompt + "<|im_end|>\n"
+ "<|im_start|>assistant\n";
+
+ mtmd_input_text input_text;
+ input_text.text = full_prompt.c_str();
+ input_text.add_special = true;
+ input_text.parse_special = true;
+
+ // 3. Tokenize (combines text tokens + image tokens)
+ mtmd_input_chunks* chunks = mtmd_input_chunks_init();
+ const mtmd_bitmap* bitmap_ptr = bitmap;
+
+ int64_t t_encode_start = now_us();
+ int32_t tokenize_result = mtmd_tokenize(ctx_mtmd_, chunks, &input_text, &bitmap_ptr, 1);
+ if (tokenize_result != 0) {
+ LOG_ERROR("VLM", "Failed to tokenize image+text (error=%d)", tokenize_result);
+ mtmd_input_chunks_free(chunks);
+ mtmd_bitmap_free(bitmap);
+ return "";
+ }
+
+ size_t n_tokens = mtmd_helper_get_n_tokens(chunks);
+ stats_.prompt_tokens = n_tokens;
+ LOG_DEBUG("VLM", "Tokenized: %zu total tokens (text + image)", n_tokens);
+
+ // 4. Evaluate all chunks (text + image encoding + decoding)
+ int64_t t_prompt_start = now_us();
+ llama_pos n_past = 0;
+ int32_t eval_result = mtmd_helper_eval_chunks(
+ ctx_mtmd_, ctx_, chunks,
+ n_past, // n_past
+ 0, // seq_id
+ config_.n_batch, // n_batch
+ true, // logits_last
+ &n_past // updated n_past
+ );
+
+ stats_.image_encode_us = now_us() - t_encode_start;
+ stats_.prompt_eval_us = now_us() - t_prompt_start;
+
+ // Clean up image resources
+ mtmd_input_chunks_free(chunks);
+ mtmd_bitmap_free(bitmap);
+
+ if (eval_result != 0) {
+ LOG_ERROR("VLM", "Failed to evaluate image+text chunks (error=%d)", eval_result);
+ return "";
+ }
+
+ LOG_DEBUG("VLM", "Image encoded in %.1fms, prompt eval in %.1fms",
+ stats_.image_encode_us / 1000.0, stats_.prompt_eval_us / 1000.0);
+
+ // 5. Generate tokens (same pattern as LlmEngine::generate)
+ std::string result;
+ int64_t t_gen_start = now_us();
+ bool first_token = true;
+
+ for (int i = 0; i < config_.max_tokens; i++) {
+ if (cancelled_.load(std::memory_order_relaxed)) {
+ LOG_DEBUG("VLM", "Generation cancelled");
+ break;
+ }
+
+ int32_t new_token = llama_sampler_sample(sampler_, ctx_, -1);
+
+ if (first_token) {
+ stats_.first_token_us = now_us() - t_prompt_start;
+ first_token = false;
+ }
+
+ if (llama_vocab_is_eog(vocab_, new_token)) {
+ break;
+ }
+
+ // Decode token to text
+ char buf[256];
+ int n = llama_token_to_piece(vocab_, new_token, buf, sizeof(buf), 0, true);
+ if (n < 0) continue;
+ std::string piece(buf, n);
+
+ result += piece;
+ stats_.generated_tokens++;
+
+ if (on_token) {
+ TokenOutput tok;
+ tok.text = piece;
+ tok.token_id = new_token;
+ tok.is_eos = false;
+ tok.is_tool_call = false;
+ on_token(tok);
+ }
+
+ // Feed token back for next iteration
+ llama_batch batch = llama_batch_get_one(&new_token, 1);
+ if (llama_decode(ctx_, batch) != 0) {
+ LOG_ERROR("VLM", "Failed to decode token");
+ break;
+ }
+ }
+
+ stats_.generation_us = now_us() - t_gen_start;
+
+ LOG_DEBUG("VLM", "Generated %lld tokens (%.1f tok/s), first token: %.1fms",
+ stats_.generated_tokens, stats_.gen_tps(),
+ stats_.first_token_us / 1000.0);
+
+ return result;
+}
+
+bool VlmEngine::is_supported_image(const std::string& path) {
+ // Get extension (case-insensitive)
+ auto dot = path.rfind('.');
+ if (dot == std::string::npos) return false;
+
+ std::string ext = path.substr(dot);
+ // Convert to lowercase
+ std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
+
+ return ext == ".jpg" || ext == ".jpeg" ||
+ ext == ".png" || ext == ".bmp" ||
+ ext == ".gif" || ext == ".webp" ||
+ ext == ".tga";
+}
+
+} // namespace rastack
diff --git a/src/engines/vlm_engine.h b/src/engines/vlm_engine.h
new file mode 100644
index 0000000..57739a2
--- /dev/null
+++ b/src/engines/vlm_engine.h
@@ -0,0 +1,88 @@
+#pragma once
+
+#include "core/types.h"
+#include <string>
+#include <atomic>
+#include <cstdint>
+
+// Forward declare llama types
+struct llama_model;
+struct llama_context;
+struct llama_sampler;
+struct llama_vocab;
+
+// Forward declare mtmd types
+struct mtmd_context;
+
+namespace rastack {
+
+struct VlmConfig {
+ std::string model_path; // Path to VLM language model GGUF
+ std::string mmproj_path; // Path to vision projector (mmproj) GGUF
+ int n_gpu_layers = 99;
+ int n_ctx = 4096; // VLM needs larger context for image tokens
+ int n_batch = 512;
+ int n_threads = 1;
+ int n_threads_batch = 8;
+ float temperature = 0.7f;
+ float top_p = 0.9f;
+ int top_k = 40;
+ int max_tokens = 512;
+ bool use_mmap = true;
+ bool use_mlock = false;
+ bool flash_attn = true;
+};
+
+struct VlmStats {
+ int64_t prompt_tokens = 0;
+ int64_t generated_tokens = 0;
+ int64_t prompt_eval_us = 0;
+ int64_t generation_us = 0;
+ int64_t image_encode_us = 0; // Time spent encoding the image
+ double prompt_tps() const { return prompt_tokens > 0 ? prompt_tokens * 1e6 / prompt_eval_us : 0; }
+ double gen_tps() const { return generated_tokens > 0 ? generated_tokens * 1e6 / generation_us : 0; }
+ int64_t first_token_us = 0;
+};
+
+class VlmEngine {
+public:
+ VlmEngine();
+ ~VlmEngine();
+
+ // Initialize model + vision projector
+ bool init(const VlmConfig& config);
+
+ // Release all resources
+ void shutdown();
+
+ // Analyze an image with a text prompt
+ // Returns the generated description/analysis text
+ std::string analyze_image(const std::string& image_path,
+ const std::string& prompt,
+ TokenCallback on_token = nullptr);
+
+ // Cancel ongoing generation
+ void cancel() { cancelled_.store(true, std::memory_order_release); }
+
+ // Get stats from last generation
+ const VlmStats& last_stats() const { return stats_; }
+
+ bool is_initialized() const { return initialized_; }
+
+ // Check if an image file is a supported format
+ static bool is_supported_image(const std::string& path);
+
+private:
+ llama_model* model_ = nullptr;
+ llama_context* ctx_ = nullptr;
+ llama_sampler* sampler_ = nullptr;
+ const llama_vocab* vocab_ = nullptr;
+ mtmd_context* ctx_mtmd_ = nullptr;
+
+ VlmConfig config_;
+ VlmStats stats_;
+ bool initialized_ = false;
+ std::atomic<bool> cancelled_{false};
+};
+
+} // namespace rastack
diff --git a/src/models/model_registry.h b/src/models/model_registry.h
index 79d3da4..e0084d1 100644
--- a/src/models/model_registry.h
+++ b/src/models/model_registry.h
@@ -287,7 +287,7 @@ inline bool is_metalrt_model_installed(const LlmModelDef& m) {
struct MetalRTComponentModel {
std::string id;
std::string name;
- std::string component; // "stt" or "tts"
+ std::string component; // "stt", "tts", or "vlm"
std::string hf_repo; // HuggingFace repo path (org/repo)
std::string hf_subdir; // subdirectory within repo (empty for flat repos)
std::string dir_name; // local dir under metalrt_models_dir()
@@ -350,6 +350,7 @@ inline std::vector<MetalRTComponentModel> metalrt_component_models() {
};
}
+
inline bool is_metalrt_component_installed(const MetalRTComponentModel& m) {
std::string dir = metalrt_models_dir() + "/" + m.dir_name;
if (access(dir.c_str(), R_OK) != 0) return false;
diff --git a/src/models/vlm_model_registry.h b/src/models/vlm_model_registry.h
new file mode 100644
index 0000000..5556d7a
--- /dev/null
+++ b/src/models/vlm_model_registry.h
@@ -0,0 +1,94 @@
+#pragma once
+// =============================================================================
+// RCLI VLM Model Registry
+// =============================================================================
+//
+// Registry of supported VLM (Vision Language Model) models.
+// Each model consists of a language model GGUF + an mmproj (vision projector) GGUF.
+//
+// =============================================================================
+
+#include <string>
+#include <vector>
+#include <unistd.h>
+
+namespace rcli {
+
+struct VlmModelDef {
+ std::string id; // Unique slug: "smolvlm-500m"
+ std::string name; // Display name: "SmolVLM 500M Instruct"
+ std::string model_filename; // Language model GGUF filename
+ std::string mmproj_filename; // Vision projector GGUF filename
+ std::string model_url; // HuggingFace download URL for language model
+ std::string mmproj_url; // HuggingFace download URL for mmproj
+ int model_size_mb; // Approximate model download size
+ int mmproj_size_mb; // Approximate mmproj download size
+ std::string description; // One-line description
+ bool is_default; // Default model for `rcli vlm`
+};
+
+inline std::vector<VlmModelDef> all_vlm_models() {
+ return {
+ {
+ /* id */ "qwen3-vl-2b",
+ /* name */ "Qwen3 VL 2B Instruct",
+ /* model_filename */ "Qwen3-VL-2B-Instruct-Q8_0.gguf",
+ /* mmproj_filename */ "mmproj-Qwen3-VL-2B-Instruct-Q8_0.gguf",
+ /* model_url */ "https://huggingface.co/ggml-org/Qwen3-VL-2B-Instruct-GGUF/resolve/main/Qwen3-VL-2B-Instruct-Q8_0.gguf",
+ /* mmproj_url */ "https://huggingface.co/ggml-org/Qwen3-VL-2B-Instruct-GGUF/resolve/main/mmproj-Qwen3-VL-2B-Instruct-Q8_0.gguf",
+ /* model_size_mb */ 1830,
+ /* mmproj_size_mb */ 445,
+ /* description */ "Qwen3 Vision-Language model. High quality image analysis.",
+ /* is_default */ false,
+ },
+ {
+ /* id */ "lfm2-vl-1.6b",
+ /* name */ "Liquid LFM2 VL 1.6B",
+ /* model_filename */ "LFM2-VL-1.6B-Q8_0.gguf",
+ /* mmproj_filename */ "mmproj-LFM2-VL-1.6B-Q8_0.gguf",
+ /* model_url */ "https://huggingface.co/LiquidAI/LFM2-VL-1.6B-GGUF/resolve/main/LFM2-VL-1.6B-Q8_0.gguf",
+ /* mmproj_url */ "https://huggingface.co/LiquidAI/LFM2-VL-1.6B-GGUF/resolve/main/mmproj-LFM2-VL-1.6B-Q8_0.gguf",
+ /* model_size_mb */ 1250,
+ /* mmproj_size_mb */ 210,
+ /* description */ "Liquid Foundation Model for vision. Fast, 128K context.",
+ /* is_default */ false,
+ },
+ {
+ /* id */ "smolvlm-500m",
+ /* name */ "SmolVLM 500M Instruct",
+ /* model_filename */ "SmolVLM-500M-Instruct-Q8_0.gguf",
+ /* mmproj_filename */ "mmproj-SmolVLM-500M-Instruct-Q8_0.gguf",
+ /* model_url */ "https://huggingface.co/ggml-org/SmolVLM-500M-Instruct-GGUF/resolve/main/SmolVLM-500M-Instruct-Q8_0.gguf",
+ /* mmproj_url */ "https://huggingface.co/ggml-org/SmolVLM-500M-Instruct-GGUF/resolve/main/mmproj-SmolVLM-500M-Instruct-Q8_0.gguf",
+ /* model_size_mb */ 437,
+ /* mmproj_size_mb */ 109,
+ /* description */ "Smallest VLM. Fast image analysis, lower quality.",
+ /* is_default */ false,
+ },
+ };
+}
+
+inline std::pair<bool, VlmModelDef> get_default_vlm_model() {
+ auto models = all_vlm_models();
+ for (auto& m : models) {
+ if (m.is_default) return {true, m};
+ }
+ return {false, {}};
+}
+
+inline bool is_vlm_model_installed(const std::string& models_dir, const VlmModelDef& m) {
+ std::string model_path = models_dir + "/" + m.model_filename;
+ std::string mmproj_path = models_dir + "/" + m.mmproj_filename;
+ return access(model_path.c_str(), R_OK) == 0 &&
+ access(mmproj_path.c_str(), R_OK) == 0;
+}
+
+inline std::pair<bool, VlmModelDef> find_installed_vlm(const std::string& models_dir) {
+ auto models = all_vlm_models();
+ for (auto& m : models) {
+ if (is_vlm_model_installed(models_dir, m)) return {true, m};
+ }
+ return {false, {}};
+}
+
+} // namespace rcli
diff --git a/src/pipeline/orchestrator.h b/src/pipeline/orchestrator.h
index 8648374..51a5527 100644
--- a/src/pipeline/orchestrator.h
+++ b/src/pipeline/orchestrator.h
@@ -6,6 +6,7 @@
#include "core/ring_buffer.h"
#include "engines/stt_engine.h"
#include "engines/llm_engine.h"
+#include "engines/vlm_engine.h"
#include "engines/metalrt_engine.h"
#include "engines/metalrt_stt_engine.h"
#include "engines/metalrt_tts_engine.h"
@@ -93,12 +94,16 @@ class Orchestrator {
VadEngine& vad() { return vad_; }
ToolEngine& tools() { return tools_; }
AudioIO& audio() { return audio_; }
+ VlmEngine& vlm() { return vlm_; }
RingBuffer* playback_ring_buffer() { return playback_rb_.get(); }
// Active LLM backend
LlmBackend active_llm_backend() const { return active_backend_; }
bool using_metalrt() const { return active_backend_ == LlmBackend::METALRT; }
+ // Access the pipeline config (e.g. for MetalRT model dir during VLM swap)
+ const PipelineConfig& config() const { return config_; }
+
// Update the base system prompt (e.g. when personality changes)
void set_system_prompt(const std::string& prompt) { config_.system_prompt = prompt; }
@@ -168,6 +173,7 @@ class Orchestrator {
SttEngine stt_;
OfflineSttEngine offline_stt_; // Whisper for file pipeline
LlmEngine llm_;
+ VlmEngine vlm_;
MetalRTEngine metalrt_;
MetalRTSttEngine metalrt_stt_;
MetalRTTtsEngine metalrt_tts_;
diff --git a/src/pipeline/text_sanitizer.h b/src/pipeline/text_sanitizer.h
index b21b1a0..5c454a3 100644
--- a/src/pipeline/text_sanitizer.h
+++ b/src/pipeline/text_sanitizer.h
@@ -73,6 +73,33 @@ inline std::string sanitize_for_tts(const std::string& text) {
out = std::move(cleaned);
}
+ // 4b. Strip emote/action markers like *laughs*, *sighs*, *smiles*, etc.
+ // These are non-speakable stage directions that LLMs often generate.
+ {
+ std::string cleaned;
+ cleaned.reserve(out.size());
+ for (size_t i = 0; i < out.size(); i++) {
+ if (out[i] == '*') {
+ size_t close = out.find('*', i + 1);
+ if (close != std::string::npos && close - i <= 30) {
+ // Check it looks like an emote (single word or short phrase, no nested formatting)
+ bool is_emote = true;
+ for (size_t j = i + 1; j < close; j++) {
+ if (out[j] == '*' || out[j] == '\n') { is_emote = false; break; }
+ }
+ if (is_emote) {
+ i = close; // skip past closing *
+ // Also skip trailing space if present
+ if (i + 1 < out.size() && out[i + 1] == ' ') i++;
+ continue;
+ }
+ }
+ }
+ cleaned += out[i];
+ }
+ out = std::move(cleaned);
+ }
+
// 5. Strip markdown symbols and non-speakable formatting
{
std::string cleaned;
@@ -215,6 +242,84 @@ inline std::string sanitize_for_tts(const std::string& text) {
}
}
+ // 6c. Replace brand names / proper nouns that G2P spells letter-by-letter
+ // with phonetic approximations so TTS pronounces them naturally.
+ {
+ struct Phonetic { const char* from; const char* to; };
+ static const Phonetic table[] = {
+ {"Spotify", "Spotifye"},
+ {"spotify", "spotifye"},
+ {"SPOTIFY", "Spotifye"},
+ {"YouTube", "You Tube"},
+ {"Youtube", "You Tube"},
+ {"youtube", "you tube"},
+ {"YOUTUBE", "You Tube"},
+ {"WiFi", "Why Fye"},
+ {"wifi", "why fye"},
+ {"WIFI", "Why Fye"},
+ {"Wi-Fi", "Why Fye"},
+ {"iPhone", "eye phone"},
+ {"iphone", "eye phone"},
+ {"IPHONE", "eye phone"},
+ {"iPad", "eye pad"},
+ {"ipad", "eye pad"},
+ {"IPAD", "eye pad"},
+ {"macOS", "mac O S"},
+ {"MacOS", "mac O S"},
+ {"iOS", "eye O S"},
+ {"AirPods", "Air Pods"},
+ {"airpods", "air pods"},
+ {"AIRPODS", "Air Pods"},
+ {"ChatGPT", "Chat G P T"},
+ {"WhatsApp", "Whats App"},
+ {"whatsapp", "whats app"},
+ {"WHATSAPP", "Whats App"},
+ {"TikTok", "Tick Tock"},
+ {"tiktok", "tick tock"},
+ {"TIKTOK", "Tick Tock"},
+ {"LinkedIn", "Linked In"},
+ {"linkedin", "linked in"},
+ {"LINKEDIN", "Linked In"},
+ };
+ for (auto& p : table) {
+ std::string needle(p.from);
+ std::string replacement(p.to);
+ size_t pos = 0;
+ while ((pos = out.find(needle, pos)) != std::string::npos) {
+ bool left_ok = (pos == 0 || out[pos - 1] == ' ' || out[pos - 1] == '\n' ||
+ out[pos - 1] == '"' || out[pos - 1] == '\'');
+ size_t end = pos + needle.size();
+ bool right_ok = (end >= out.size() || out[end] == ' ' || out[end] == ',' ||
+ out[end] == '.' || out[end] == '!' || out[end] == '?' ||
+ out[end] == '\n' || out[end] == ';' || out[end] == ':' ||
+ out[end] == '\'' || out[end] == '"');
+ if (left_ok && right_ok) {
+ out.replace(pos, needle.size(), replacement);
+ pos += replacement.size();
+ } else {
+ pos += needle.size();
+ }
+ }
+ }
+ }
+
+ // 6d. Replace hyphens between letters/words with spaces so G2P does not
+ // spell out hyphenated compounds (e.g. "well-known" → "well known").
+ {
+ std::string cleaned;
+ cleaned.reserve(out.size());
+ for (size_t i = 0; i < out.size(); i++) {
+ if (out[i] == '-' && i > 0 && i + 1 < out.size() &&
+ std::isalpha((unsigned char)out[i - 1]) &&
+ std::isalpha((unsigned char)out[i + 1])) {
+ cleaned += ' ';
+ } else {
+ cleaned += out[i];
+ }
+ }
+ out = std::move(cleaned);
+ }
+
// 7. Collapse multiple whitespace to single space, trim
{
std::string cleaned;
diff --git a/src/test/test_pipeline.cpp b/src/test/test_pipeline.cpp
index a4b7bfb..d73a1b8 100644
--- a/src/test/test_pipeline.cpp
+++ b/src/test/test_pipeline.cpp
@@ -783,31 +783,36 @@ static void test_metalrt_llm(const std::string& models_dir) {
engine.reset_conversation();
engine.generate("hi");
- // Benchmark 3 prompts
- const char* prompts[] = {
- "What is 2+2?",
- "Write a haiku about the sea.",
- "Explain gravity in one sentence.",
- };
-
- TEST_SECTION("MetalRT LLM Inference (Metal GPU)");
- for (int i = 0; i < 3; i++) {
+ // Benchmark across max_tokens sweep: 64, 128, 256, 512, 1024, 2048
+ const int token_limits[] = { 64, 128, 256, 512, 1024, 2048 };
+ const char* prompt = "Write a detailed essay about the history and future of artificial intelligence, "
+ "covering early pioneers, neural networks, deep learning breakthroughs, "
+ "large language models, and predictions for the next decade.";
+
+ TEST_SECTION("MetalRT LLM Token Sweep Benchmark (Metal GPU)");
+ fprintf(stderr, "\n \033[1;33m%-12s %8s %12s %10s %12s %10s %10s\033[0m\n",
+ "max_tokens", "gen_tok", "decode_ms", "tok/s", "prefill_ms", "pf_tok/s", "wall_ms");
+ fprintf(stderr, " \033[33m%s\033[0m\n",
+ "------------ -------- ------------ ---------- ------------ ---------- ----------");
+
+ for (int limit : token_limits) {
+ engine.set_max_tokens(limit);
+ engine.set_ignore_eos(true);
engine.reset_conversation();
+
t0 = std::chrono::steady_clock::now();
- std::string result = engine.generate(prompts[i]);
+ std::string result = engine.generate(prompt);
double gen_ms = elapsed_ms(t0);
const auto& stats = engine.last_stats();
- TEST_INFO("--- Run %d ---", i + 1);
- TEST_INFO(" Prompt: \"%s\"", prompts[i]);
- TEST_INFO(" Response: \"%.*s%s\"", (int)std::min(result.size(), (size_t)80),
- result.c_str(), result.size() > 80 ? "..." : "");
- TEST_INFO(" Backend: MetalRT (Metal GPU)");
- TEST_INFO(" Prefill: %.1f ms (%d tokens, %.0f tok/s)",
- stats.prompt_eval_us / 1000.0, stats.prompt_tokens, stats.prompt_tps());
- TEST_INFO(" Decode: %.1f ms (%d tokens, %.0f tok/s)",
- stats.generation_us / 1000.0, stats.generated_tokens, stats.gen_tps());
- TEST_INFO(" Wall: %.1f ms", gen_ms);
+ fprintf(stderr, " %-12d %8d %10.1f ms %8.1f %10.1f ms %8.0f %8.1f ms\n",
+ limit,
+ stats.generated_tokens,
+ stats.generation_us / 1000.0,
+ stats.gen_tps(),
+ stats.prompt_eval_us / 1000.0,
+ stats.prompt_tps(),
+ gen_ms);
TEST("run produces output", !result.empty());
}
}