diff --git a/CMakeLists.txt b/CMakeLists.txt
index 00f5224..e9515d9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -46,6 +46,10 @@ set(LLAMA_BUILD_TESTS OFF CACHE BOOL "" FORCE)
set(LLAMA_BUILD_SERVER OFF CACHE BOOL "" FORCE)
add_subdirectory(deps/llama.cpp ${CMAKE_BINARY_DIR}/llama.cpp EXCLUDE_FROM_ALL)
+# --- libmtmd (multimodal/vision support from llama.cpp) ---
+set(LLAMA_INSTALL_VERSION "0.0.0" CACHE STRING "" FORCE)
+add_subdirectory(deps/llama.cpp/tools/mtmd ${CMAKE_BINARY_DIR}/mtmd EXCLUDE_FROM_ALL)
+
# --- sherpa-onnx (STT + TTS + VAD) ---
set(SHERPA_ONNX_ENABLE_C_API ON CACHE BOOL "Enable C API" FORCE)
set(SHERPA_ONNX_ENABLE_BINARY OFF CACHE BOOL "" FORCE)
@@ -99,8 +103,11 @@ add_library(rcli STATIC
src/engines/metalrt_engine.cpp
src/engines/metalrt_stt_engine.cpp
src/engines/metalrt_tts_engine.cpp
+ src/engines/vlm_engine.cpp
src/audio/audio_io.cpp
src/audio/mic_permission.mm
+ src/audio/camera_capture.mm
+ src/audio/screen_capture.mm
src/pipeline/orchestrator.cpp
src/pipeline/sentence_detector.cpp
src/tools/tool_engine.cpp
@@ -133,13 +140,14 @@ add_library(rcli STATIC
src/api/rcli_api.cpp
)
-set_source_files_properties(src/audio/mic_permission.mm
+set_source_files_properties(src/audio/mic_permission.mm src/audio/camera_capture.mm src/audio/screen_capture.mm
PROPERTIES LANGUAGE CXX)
target_include_directories(rcli PUBLIC
${CMAKE_CURRENT_SOURCE_DIR}/src
${CMAKE_CURRENT_SOURCE_DIR}/deps/llama.cpp/include
${CMAKE_CURRENT_SOURCE_DIR}/deps/llama.cpp/ggml/include
+ ${CMAKE_CURRENT_SOURCE_DIR}/deps/llama.cpp/tools/mtmd
${CMAKE_CURRENT_SOURCE_DIR}/deps/sherpa-onnx/sherpa-onnx/c-api
${usearch_SOURCE_DIR}/include
)
@@ -147,12 +155,18 @@ target_include_directories(rcli PUBLIC
target_link_libraries(rcli PUBLIC
llama
ggml
+ mtmd
sherpa-onnx-c-api
"-framework CoreAudio"
"-framework AudioToolbox"
"-framework AudioUnit"
"-framework Foundation"
"-framework AVFoundation"
+ "-framework AppKit"
+ "-framework CoreImage"
+ "-framework CoreMedia"
+ "-framework CoreVideo"
+ "-framework CoreGraphics"
"-framework IOKit"
)
@@ -186,6 +200,27 @@ target_compile_definitions(rcli_cli PRIVATE
RCLI_VERSION="${PROJECT_VERSION}"
)
+# =============================================================================
+# rcli_overlay — standalone Cocoa helper for visual overlay window
+# =============================================================================
+add_executable(rcli_overlay
+ src/audio/rcli_overlay.m
+)
+
+set_source_files_properties(src/audio/rcli_overlay.m PROPERTIES LANGUAGE CXX)
+
+target_compile_options(rcli_overlay PRIVATE -x objective-c++)
+
+target_link_libraries(rcli_overlay PRIVATE
+ "-framework AppKit"
+ "-framework CoreGraphics"
+)
+
+set_target_properties(rcli_overlay PROPERTIES
+ OUTPUT_NAME "rcli_overlay"
+ RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}"
+)
+
# =============================================================================
# rcli_test — test executable
# =============================================================================
diff --git a/README.md b/README.md
index dcefc11..972342a 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@
-**RCLI** is an on-device voice AI for macOS. A complete STT + LLM + TTS pipeline running natively on Apple Silicon — 38 macOS actions via voice, local RAG over your documents, sub-200ms end-to-end latency. No cloud, no API keys.
+**RCLI** is an on-device voice AI for macOS. A complete STT + LLM + TTS + VLM pipeline running natively on Apple Silicon — 40 macOS actions via voice, local RAG over your documents, on-device vision (camera & screen analysis), sub-200ms end-to-end latency. No cloud, no API keys.
Powered by [MetalRT](#metalrt-gpu-engine), a proprietary GPU inference engine built by [RunAnywhere, Inc.](https://runanywhere.ai) specifically for Apple Silicon.
@@ -112,6 +112,9 @@ rcli # interactive TUI (push-to-talk + text)
rcli listen # continuous voice mode
rcli ask "open Safari" # one-shot command
rcli ask "play some jazz on Spotify"
+rcli vlm photo.jpg "what's in this image?" # vision analysis
+rcli camera # live camera VLM
+rcli screen # screen capture VLM
rcli metalrt # MetalRT GPU engine management
rcli llamacpp # llama.cpp engine management
```
@@ -149,7 +152,18 @@ A full STT + LLM + TTS pipeline running on Metal GPU with three concurrent threa
- **Tool Calling** — LLM-native tool call formats (Qwen3, LFM2, etc.)
- **Multi-turn Memory** — Sliding window conversation history with token-budget trimming
-### 38 macOS Actions
+### Vision (VLM)
+
+Analyze images, camera captures, and screen regions using on-device vision-language models. VLM runs on the llama.cpp engine via Metal GPU — no cloud.
+
+- **Image Analysis** — `rcli vlm photo.jpg "describe this"` for single-image queries
+- **Camera** — Press **V** in the TUI or run `rcli camera` for live camera analysis
+- **Screen Capture** — Press **S** in the TUI or run `rcli screen` to analyze screen regions
+- **Models** — Qwen3 VL 2B, Liquid LFM2 VL 1.6B, SmolVLM 500M — download on demand via `rcli models vlm`
+
+> **Note:** VLM is currently available on the llama.cpp engine. MetalRT VLM support is coming soon.
+
+### 40 macOS Actions
Control your Mac by voice or text. The LLM routes intent to actions executed locally via AppleScript and shell commands.
@@ -161,7 +175,7 @@ Control your Mac by voice or text. The LLM routes intent to actions executed loc
| **System** | `open_app`, `quit_app`, `set_volume`, `toggle_dark_mode`, `screenshot`, `lock_screen` |
| **Web** | `search_web`, `search_youtube`, `open_url`, `open_maps` |
-Run `rcli actions` to see all 38, or toggle them on/off in the TUI Actions panel.
+Run `rcli actions` to see all 40, or toggle them on/off in the TUI Actions panel.
> **Tip:** If tool calling feels unreliable, press **X** in the TUI to clear the conversation and reset context. With small LLMs, accumulated context can degrade tool-calling accuracy — a fresh context often fixes it.
@@ -181,7 +195,9 @@ A terminal dashboard with push-to-talk, live hardware monitoring, model manageme
| Key | Action |
|-----|--------|
| **SPACE** | Push-to-talk |
-| **M** | Models — browse, download, hot-swap LLM/STT/TTS |
+| **V** | Camera — capture and analyze with VLM |
+| **S** | Screen — capture and analyze a screen region with VLM |
+| **M** | Models — browse, download, hot-swap LLM/STT/TTS/VLM |
| **A** | Actions — browse, enable/disable macOS actions |
| **R** | RAG — ingest documents |
| **X** | Clear conversation and reset context |
@@ -207,7 +223,7 @@ MetalRT is distributed under a [proprietary license](https://github.com/Runanywh
## Supported Models
-RCLI supports 20+ models across LLM, STT, TTS, VAD, and embeddings. All run locally on Apple Silicon. Use `rcli models` to browse, download, or switch.
+RCLI supports 20+ models across LLM, STT, TTS, VLM, VAD, and embeddings. All run locally on Apple Silicon. Use `rcli models` to browse, download, or switch.
**LLM:** LFM2 1.2B (default), LFM2 350M, LFM2.5 1.2B, LFM2 2.6B, Qwen3 0.6B, Qwen3.5 0.8B/2B/4B, Qwen3 4B
@@ -215,10 +231,13 @@ RCLI supports 20+ models across LLM, STT, TTS, VAD, and embeddings. All run loca
**TTS:** Piper Lessac/Amy, KittenTTS Nano, Matcha LJSpeech, Kokoro English/Multi-lang
-**Default install** (`rcli setup`): ~1GB — LFM2 1.2B + Whisper + Piper + Silero VAD + Snowflake embeddings.
+**VLM:** Qwen3 VL 2B, Liquid LFM2 VL 1.6B, SmolVLM 500M — on-demand download via `rcli models vlm` (llama.cpp engine only)
+
+**Default install** (`rcli setup`): ~1GB — LFM2 1.2B + Whisper + Piper + Silero VAD + Snowflake embeddings. VLM models are downloaded on demand.
```bash
rcli models # interactive model management
+rcli models vlm # download/manage VLM models
rcli upgrade-llm # guided LLM upgrade
rcli voices # browse and switch TTS voices
rcli cleanup # remove unused models
@@ -247,10 +266,13 @@ All dependencies are vendored or CMake-fetched. Requires CMake 3.15+ and Apple C
rcli Interactive TUI (push-to-talk + text + trace)
rcli listen Continuous voice mode
rcli ask One-shot text command
+rcli vlm [prompt] Analyze an image with VLM
+rcli camera [prompt] Live camera capture + VLM analysis
+rcli screen [prompt] Screen capture + VLM analysis
rcli actions [name] List actions or show detail
rcli rag ingest Index documents for RAG
rcli rag query Query indexed documents
-rcli models [llm|stt|tts] Manage AI models
+rcli models [llm|stt|tts|vlm] Manage AI models
rcli voices Manage TTS voices
rcli metalrt MetalRT GPU engine management
rcli llamacpp llama.cpp engine management
diff --git a/src/api/rcli_api.cpp b/src/api/rcli_api.cpp
index 8baa3ef..f292c78 100644
--- a/src/api/rcli_api.cpp
+++ b/src/api/rcli_api.cpp
@@ -16,6 +16,7 @@
#include "rag/index_builder.h"
#include "pipeline/text_sanitizer.h"
#include "pipeline/sentence_detector.h"
+#include "audio/screen_capture.h"
#include
#include
#include
@@ -32,9 +33,15 @@
#include
#include
#include
+#include <spawn.h>
+#include <sys/wait.h>
+
+extern char** environ;
#include "actions/action_registry.h"
#include "actions/macos_actions.h"
+#include "engines/vlm_engine.h"
+#include "models/vlm_model_registry.h"
using namespace rastack;
@@ -109,6 +116,13 @@ struct RCLIEngine {
// so the context gauge shows stable, meaningful usage.
int ctx_main_prompt_tokens = 0;
+ // VLM (Vision Language Model) subsystem
+ VlmEngine vlm_engine;
+ bool vlm_initialized = false;
+ std::string last_vlm_response;
+ std::string vlm_backend_name; // "llama.cpp (Metal GPU)" or "MetalRT"
+ std::string vlm_model_name; // e.g. "Qwen3 VL 2B"
+
std::mutex mutex;
bool initialized = false;
};
@@ -969,6 +983,113 @@ static std::vector try_parse_bare_tool_calls(
return calls;
}
+// Forward declaration (defined later in VLM section)
+static int vlm_init_locked(RCLIEngine* engine);
+
+// =============================================================================
+// Screen intent detection — intercept voice commands about the user's screen
+// =============================================================================
+
+static bool has_word(const std::string& text, const char* word) {
+ return text.find(word) != std::string::npos;
+}
+
+static bool is_screen_intent(const std::string& input) {
+ // Normalize to lowercase for matching
+ std::string lower = input;
+ for (auto& c : lower) c = (char)std::tolower((unsigned char)c);
+
+ // --- Tier 1: explicit screenshot keywords (always trigger) ---
+ if (has_word(lower, "screenshot") || has_word(lower, "screen capture") ||
+ has_word(lower, "screen shot"))
+ return true;
+
+ // --- Tier 2: "screen" + any vision/action verb ---
+ bool has_screen = has_word(lower, "screen");
+ if (has_screen) {
+ static const char* screen_verbs[] = {
+ "look", "see", "show", "what", "tell", "describe", "explain",
+ "check", "analyze", "read", "capture", "going on", "happening",
+ };
+ for (const auto* v : screen_verbs) {
+ if (has_word(lower, v)) return true;
+ }
+ }
+
+ // --- Tier 3: visual context phrases (no "screen" needed) ---
+ // "does this look good/right/ok", "how does this look", etc.
+ if (has_word(lower, "does this look") || has_word(lower, "how does this look"))
+ return true;
+ // "what am I looking at"
+ if (has_word(lower, "looking at") && has_word(lower, "what"))
+ return true;
+ // "can you see this/that", "what do you see", "what can you see"
+ if ((has_word(lower, "can you see") || has_word(lower, "do you see")) &&
+ !has_word(lower, "file") && !has_word(lower, "code") && !has_word(lower, "error"))
+ return true;
+ // "what's happening here", "explain what's happening"
+ if (has_word(lower, "happening here") || has_word(lower, "happening on"))
+ return true;
+
+ return false;
+}
+
+// Capture active window + analyze with VLM. Returns response or empty on failure.
+// Caller must hold engine->mutex.
+static std::string handle_screen_intent(RCLIEngine* engine, const std::string& user_text) {
+ // Generate a temp path
+ auto ts = std::chrono::system_clock::now().time_since_epoch().count();
+ std::string path = "/tmp/rcli_screen_" + std::to_string(ts) + ".jpg";
+
+ int rc;
+ const char* capture_source;
+ if (screen_capture_overlay_active()) {
+ // Visual mode: capture the overlay region
+ capture_source = "visual frame";
+ rc = screen_capture_overlay_region(path.c_str());
+ } else {
+ // Fallback: capture the previously active app's window
+ char target_app[256];
+ screen_capture_target_app_name(target_app, sizeof(target_app));
+ capture_source = target_app;
+ rc = screen_capture_behind_terminal(path.c_str());
+ }
+ LOG_INFO("RCLI", "[screen_intent] Capturing %s → %s", capture_source, path.c_str());
+ if (rc != 0) {
+ LOG_ERROR("RCLI", "[screen_intent] Screen capture failed");
+ return "I couldn't capture your screen. Please check screen recording permissions "
+ "in System Settings > Privacy & Security > Screen Recording.";
+ }
+
+ // Initialize VLM if needed
+ if (!engine->vlm_initialized) {
+ if (vlm_init_locked(engine) != 0) {
+ return "I can see you're asking about your screen, but VLM isn't available. "
+ "It requires the llama.cpp engine and a VLM model. "
+ "Switch with: rcli engine llamacpp, then download a model: rcli models vlm";
+ }
+ }
+
+ // Build a natural prompt from the user's words
+ std::string vlm_prompt = user_text;
+ if (vlm_prompt.empty()) {
+ vlm_prompt = "Describe what you see on this screen in detail.";
+ }
+
+ std::string result = engine->vlm_engine.analyze_image(path, vlm_prompt, nullptr);
+
+ if (result.empty()) {
+ return "I captured your screen but the analysis failed. Please try again.";
+ }
+
+ // Prepend which app was captured so the user knows
+ std::string prefixed = "[Captured: " + std::string(capture_source) + "]\n" + result;
+
+ // Store for stats retrieval
+ engine->last_vlm_response = prefixed;
+ return prefixed;
+}
+
// =============================================================================
// Process command entry points
// =============================================================================
@@ -984,6 +1105,14 @@ const char* rcli_process_command(RCLIHandle handle, const char* text) {
LOG_TRACE("RCLI", "[process_command] engine->mutex acquired, input='%.40s'", text);
std::string input(text);
+ // --- Screen intent intercept: capture active window + VLM ---
+ if (is_screen_intent(input)) {
+ engine->last_response = handle_screen_intent(engine, input);
+ engine->conversation_history.emplace_back("user", input);
+ engine->conversation_history.emplace_back("assistant", engine->last_response);
+ return engine->last_response.c_str();
+ }
+
// --- MetalRT path: tool-aware inference via generate_raw (pre-formatted prompt) ---
if (engine->pipeline.using_metalrt()) {
auto& mrt = engine->pipeline.metalrt_llm();
@@ -1027,19 +1156,12 @@ const char* rcli_process_command(RCLIHandle handle, const char* text) {
full_prompt.compare(0, cached.size(), cached) == 0) {
std::string full_continuation = full_prompt.substr(cached.size());
- if (engine->metalrt_kv_continuation_len > 0 &&
- engine->metalrt_kv_continuation_len < full_continuation.size()) {
- std::string new_part = full_continuation.substr(engine->metalrt_kv_continuation_len);
- LOG_TRACE("RCLI", "[process_command] incremental continue "
- "(new=%zu chars, skip=%zu already in KV)",
- new_part.size(), engine->metalrt_kv_continuation_len);
- raw_output = mrt.generate_raw_continue(new_part, nullptr, false);
- } else {
- LOG_TRACE("RCLI", "[process_command] full continue "
- "(continuation=%zu chars)", full_continuation.size());
- raw_output = mrt.generate_raw_continue(full_continuation, nullptr, true);
- }
- engine->metalrt_kv_continuation_len = full_continuation.size();
+ // Always re-prefill full continuation from cached system prompt.
+ // Incremental continue (reset_cache=false) is unsafe because the KV
+ // cache includes generated tokens not tracked by continuation_len.
+ LOG_TRACE("RCLI", "[process_command] full continue "
+ "(continuation=%zu chars)", full_continuation.size());
+ raw_output = mrt.generate_raw_continue(full_continuation, nullptr, true);
} else {
LOG_TRACE("RCLI", "[process_command] calling mrt.generate_raw() ...");
raw_output = mrt.generate_raw(full_prompt);
@@ -1499,6 +1621,92 @@ const char* rcli_process_and_speak(RCLIHandle handle, const char* text,
engine->streaming_cancelled.store(false, std::memory_order_release);
std::string input(text);
+ // --- Screen intent intercept: capture + VLM + sentence-streamed TTS ---
+ if (is_screen_intent(input)) {
+ auto t_start_screen = std::chrono::steady_clock::now();
+ std::string response = handle_screen_intent(engine, input);
+ engine->last_response = response;
+ engine->conversation_history.emplace_back("user", input);
+ engine->conversation_history.emplace_back("assistant", response);
+
+ // Fire "response" callback so TUI displays the text
+ if (callback) {
+ callback("response", response.c_str(), user_data);
+ }
+
+ // Sentence-streamed TTS (same pattern as LLM path for low TTFA)
+ std::string clean_text = rastack::sanitize_for_tts(response);
+ if (!clean_text.empty()) {
+ if (!engine->pipeline.audio().is_running()) {
+ engine->pipeline.audio().start();
+ }
+ auto* rb = engine->pipeline.playback_ring_buffer();
+ if (rb) {
+ rb->clear();
+
+ // Split into sentences and synthesize each one
+ std::vector<std::string> sentences;
+ rastack::SentenceDetector splitter([&](const std::string& s) {
+ sentences.push_back(s);
+ }, /*min_words=*/3);
+ // Feed the entire text token-by-token (word by word)
+ for (size_t i = 0; i < clean_text.size(); ) {
+ size_t end = clean_text.find(' ', i);
+ if (end == std::string::npos) end = clean_text.size();
+ else end++; // include space
+ splitter.feed(clean_text.substr(i, end - i));
+ i = end;
+ }
+ splitter.flush();
+
+ bool first_audio = false;
+ for (auto& sentence : sentences) {
+ if (engine->streaming_cancelled.load(std::memory_order_acquire)) break;
+
+ std::vector<float> samples;
+ if (engine->pipeline.using_metalrt_tts()) {
+ samples = engine->pipeline.metalrt_tts().synthesize(sentence);
+ } else {
+ samples = engine->pipeline.tts().synthesize(sentence);
+ }
+
+ // Write with backpressure
+ size_t offset = 0;
+ while (offset < samples.size() &&
+ !engine->streaming_cancelled.load(std::memory_order_acquire)) {
+ size_t written = rb->write(samples.data() + offset, samples.size() - offset);
+ offset += written;
+ if (offset < samples.size()) {
+ std::this_thread::sleep_for(std::chrono::milliseconds(10));
+ }
+ }
+
+ if (!first_audio) {
+ first_audio = true;
+ if (callback) {
+ auto now = std::chrono::steady_clock::now();
+ double ttfa_ms = std::chrono::duration(now - t_start_screen).count();
+ char buf[32];
+ snprintf(buf, sizeof(buf), "%.1f", ttfa_ms);
+ callback("first_audio", buf, user_data);
+ }
+ }
+ }
+
+ // Wait for playback to drain
+ size_t samples_per_frame = 256;
+ while (rb->available_read() > samples_per_frame &&
+ !engine->streaming_cancelled.load(std::memory_order_acquire)) {
+ std::this_thread::sleep_for(std::chrono::milliseconds(20));
+ }
+ std::this_thread::sleep_for(std::chrono::milliseconds(200));
+ }
+ }
+
+ if (callback) callback("complete", "{}", user_data);
+ return engine->last_response.c_str();
+ }
+
auto t_start = std::chrono::steady_clock::now();
// --- TTS worker thread (sentence queue → ring buffer → CoreAudio) ---
@@ -1711,19 +1919,14 @@ const char* rcli_process_and_speak(RCLIHandle handle, const char* text,
full_continuation.size(),
engine->metalrt_kv_continuation_len);
- if (engine->metalrt_kv_continuation_len > 0 &&
- engine->metalrt_kv_continuation_len < full_continuation.size()) {
- std::string new_part = full_continuation.substr(engine->metalrt_kv_continuation_len);
- LOG_DEBUG("RCLI", "[speak] incremental continue "
- "(new=%zu chars, skip=%zu already in KV)",
- new_part.size(), engine->metalrt_kv_continuation_len);
- response = mrt.generate_raw_continue(new_part, streaming_cb, false);
- } else {
- LOG_DEBUG("RCLI", "[speak] full continue "
- "(continuation=%zu chars)", full_continuation.size());
- response = mrt.generate_raw_continue(full_continuation, streaming_cb, true);
- }
- engine->metalrt_kv_continuation_len = full_continuation.size();
+ // Always truncate to cached system prompt and re-prefill the full
+ // continuation. The incremental path (reset_cache=false) is unsafe
+ // because the KV cache also contains generated-response tokens that
+ // metalrt_kv_continuation_len does not account for, which causes
+ // duplicate content in the KV and corrupts multi-turn attention.
+ LOG_DEBUG("RCLI", "[speak] full continue "
+ "(continuation=%zu chars)", full_continuation.size());
+ response = mrt.generate_raw_continue(full_continuation, streaming_cb, true);
} else {
LOG_DEBUG("RCLI", "[speak] cache MISS path — calling generate_raw() "
"(has_cache=%d prefix_match=%d)",
@@ -2745,6 +2948,243 @@ void rcli_get_context_info(RCLIHandle handle, int* out_prompt_tokens, int* out_c
}
}
+// =============================================================================
+// VLM (Vision Language Model)
+// =============================================================================
+
+// Recursively create directories (like mkdir -p)
+static bool mkdirs(const std::string& path) {
+ struct stat st;
+ if (stat(path.c_str(), &st) == 0) return S_ISDIR(st.st_mode);
+ // Recurse to create parent
+ auto slash = path.rfind('/');
+ if (slash != std::string::npos && slash > 0) {
+ if (!mkdirs(path.substr(0, slash))) return false;
+ }
+ return mkdir(path.c_str(), 0755) == 0 || errno == EEXIST;
+}
+
+// Download a file using fork/exec to avoid shell injection
+static bool safe_download(const std::string& url, const std::string& dest) {
+ pid_t pid;
+ const char* argv[] = {
+ "curl", "-L", "--progress-bar", "-o", dest.c_str(), url.c_str(), nullptr
+ };
+ int status = 0;
+ posix_spawn_file_actions_t actions;
+ posix_spawn_file_actions_init(&actions);
+ if (posix_spawnp(&pid, "curl", &actions, nullptr,
+ const_cast<char* const*>(argv), environ) != 0) {
+ posix_spawn_file_actions_destroy(&actions);
+ return false;
+ }
+ posix_spawn_file_actions_destroy(&actions);
+ waitpid(pid, &status, 0);
+ return WIFEXITED(status) && WEXITSTATUS(status) == 0;
+}
+
+// Internal init (caller must hold engine->mutex)
+// VLM is only available on the llama.cpp engine. MetalRT VLM support coming soon.
+static int vlm_init_locked(RCLIEngine* engine) {
+ if (engine->vlm_initialized) return 0;
+
+ if (engine->models_dir.empty()) {
+ if (const char* home = getenv("HOME"))
+ engine->models_dir = std::string(home) + "/Library/RCLI/models";
+ else
+ engine->models_dir = "./models";
+ }
+
+ // VLM requires the llama.cpp engine
+ if (engine->initialized && engine->pipeline.using_metalrt()) {
+ LOG_ERROR("VLM", "VLM is currently available with the llama.cpp engine. Switch with: rcli engine llamacpp");
+ return -1;
+ }
+
+ // Check if any VLM model is installed (on-demand, no auto-download)
+ auto vlm_models = rcli::all_vlm_models();
+ rcli::VlmModelDef model_def;
+ bool found = false;
+
+ for (auto& m : vlm_models) {
+ if (rcli::is_vlm_model_installed(engine->models_dir, m)) {
+ model_def = m;
+ found = true;
+ break;
+ }
+ }
+
+ if (!found) {
+ LOG_ERROR("VLM", "No VLM model installed. Download one with: rcli models vlm");
+ return -1;
+ }
+
+ // Initialize VLM engine with the installed model
+ VlmConfig config;
+ config.model_path = engine->models_dir + "/" + model_def.model_filename;
+ config.mmproj_path = engine->models_dir + "/" + model_def.mmproj_filename;
+ config.n_gpu_layers = 99;
+ config.n_ctx = 4096;
+ config.n_batch = 512;
+ config.n_threads = 1;
+ config.n_threads_batch = 8;
+ config.flash_attn = true;
+
+ if (!engine->vlm_engine.init(config)) {
+ LOG_ERROR("VLM", "Failed to initialize VLM engine");
+ return -1;
+ }
+
+ engine->vlm_initialized = true;
+ engine->vlm_backend_name = "llama.cpp (Metal GPU)";
+ engine->vlm_model_name = model_def.name;
+ LOG_INFO("VLM", "VLM engine ready — %s via llama.cpp (Metal GPU)", model_def.name.c_str());
+ return 0;
+}
+
+int rcli_vlm_init(RCLIHandle handle) {
+ if (!handle) return -1;
+ auto* engine = static_cast<RCLIEngine*>(handle);
+ std::lock_guard lock(engine->mutex);
+ return vlm_init_locked(engine);
+}
+
+const char* rcli_vlm_analyze(RCLIHandle handle, const char* image_path, const char* prompt) {
+ if (!handle || !image_path) return "";
+ auto* engine = static_cast<RCLIEngine*>(handle);
+ std::lock_guard lock(engine->mutex);
+
+ if (!engine->vlm_initialized) {
+ if (vlm_init_locked(engine) != 0) {
+ engine->last_vlm_response = "VLM not available. Requires llama.cpp engine (rcli engine llamacpp) and a VLM model (rcli models vlm).";
+ return engine->last_vlm_response.c_str();
+ }
+ }
+
+ std::string text_prompt = prompt && prompt[0]
+ ? std::string(prompt)
+ : "Describe this image in detail.";
+
+ {
+ std::string result = engine->vlm_engine.analyze_image(
+ std::string(image_path), text_prompt, nullptr);
+
+ if (result.empty()) {
+ engine->last_vlm_response = "Error: Failed to analyze image.";
+ } else {
+ engine->last_vlm_response = result;
+ }
+ }
+ return engine->last_vlm_response.c_str();
+}
+
+int rcli_vlm_is_ready(RCLIHandle handle) {
+ if (!handle) return 0;
+ auto* engine = static_cast<RCLIEngine*>(handle);
+ return engine->vlm_initialized ? 1 : 0;
+}
+
+const char* rcli_vlm_backend_name(RCLIHandle handle) {
+ if (!handle) return "";
+ auto* engine = static_cast<RCLIEngine*>(handle);
+ return engine->vlm_backend_name.c_str();
+}
+
+const char* rcli_vlm_model_name(RCLIHandle handle) {
+ if (!handle) return "";
+ auto* engine = static_cast<RCLIEngine*>(handle);
+ return engine->vlm_model_name.c_str();
+}
+
+int rcli_vlm_get_stats(RCLIHandle handle, RCLIVlmStats* out_stats) {
+ if (!handle || !out_stats) return -1;
+ auto* engine = static_cast<RCLIEngine*>(handle);
+ if (!engine->vlm_initialized) return -1;
+
+ auto& s = engine->vlm_engine.last_stats();
+ out_stats->gen_tok_per_sec = s.gen_tps();
+ out_stats->generated_tokens = static_cast<int>(s.generated_tokens);
+ out_stats->total_time_sec = (s.image_encode_us + s.generation_us) / 1e6;
+ out_stats->image_encode_ms = s.image_encode_us / 1000.0;
+ out_stats->first_token_ms = s.first_token_us / 1000.0;
+ return 0;
+}
+
+// =============================================================================
+// VLM GPU swap: enter/exit visual mode by swapping LLM ↔ VLM on GPU
+// =============================================================================
+
+int rcli_vlm_enter(RCLIHandle handle) {
+ if (!handle) return -1;
+ auto* engine = static_cast<RCLIEngine*>(handle);
+ std::lock_guard lock(engine->mutex);
+
+ if (engine->vlm_initialized) return 0;
+ return vlm_init_locked(engine);
+}
+
+int rcli_vlm_exit(RCLIHandle handle) {
+ if (!handle) return -1;
+ auto* engine = static_cast<RCLIEngine*>(handle);
+ std::lock_guard lock(engine->mutex);
+
+ if (engine->vlm_engine.is_initialized()) {
+ engine->vlm_engine.shutdown();
+ }
+
+ engine->vlm_initialized = false;
+ engine->vlm_backend_name.clear();
+ engine->vlm_model_name.clear();
+ LOG_INFO("VLM", "VLM unloaded");
+ return 0;
+}
+
+int rcli_vlm_analyze_stream(RCLIHandle handle, const char* image_path,
+ const char* prompt,
+ RCLIEventCallback callback, void* user_data) {
+ if (!handle || !image_path) return -1;
+ auto* engine = static_cast<RCLIEngine*>(handle);
+ std::lock_guard lock(engine->mutex);
+
+ // Lazy-init VLM if not yet loaded
+ if (!engine->vlm_initialized) {
+ if (vlm_init_locked(engine) != 0) {
+ LOG_ERROR("VLM", "Failed to initialize VLM engine for streaming");
+ return -1;
+ }
+ }
+
+ std::string text_prompt = (prompt && prompt[0])
+ ? std::string(prompt) : "Describe this image in detail.";
+
+ // llama.cpp VLM streaming path
+ rastack::TokenCallback token_cb = nullptr;
+ if (callback) {
+ token_cb = [callback, user_data](const rastack::TokenOutput& tok) {
+ if (!tok.text.empty()) {
+ callback("token", tok.text.c_str(), user_data);
+ }
+ };
+ }
+
+ std::string result = engine->vlm_engine.analyze_image(
+ std::string(image_path), text_prompt, token_cb);
+
+ engine->last_vlm_response = result.empty() ? "Error: Failed to analyze image." : result;
+
+ if (callback) {
+ callback("response", engine->last_vlm_response.c_str(), user_data);
+ auto& s = engine->vlm_engine.last_stats();
+ char stats_buf[256];
+ snprintf(stats_buf, sizeof(stats_buf),
+ "{\"tps\":%.1f,\"tokens\":%lld,\"vision_encode_ms\":%.1f}",
+ s.gen_tps(), s.generated_tokens, s.image_encode_us / 1000.0);
+ callback("stats", stats_buf, user_data);
+ }
+
+ return engine->last_vlm_response.find("Error:") == 0 ? -1 : 0;
+}
+
} // extern "C"
std::vector rcli_get_all_action_defs(RCLIHandle handle) {
diff --git a/src/api/rcli_api.h b/src/api/rcli_api.h
index 5a0e2d3..e6906d1 100644
--- a/src/api/rcli_api.h
+++ b/src/api/rcli_api.h
@@ -262,6 +262,60 @@ const char* rcli_get_stt_model(RCLIHandle handle);
// Both output pointers are optional (pass NULL to skip).
void rcli_get_context_info(RCLIHandle handle, int* out_prompt_tokens, int* out_ctx_size);
+// --- VLM (Vision Language Model) ---
+
+// Initialize the VLM engine with the default VLM model.
+// Lazily downloads the model if not present. Thread-safe.
+// Returns 0 on success, -1 on failure.
+int rcli_vlm_init(RCLIHandle handle);
+
+// Analyze an image with an optional text prompt.
+// image_path: absolute path to an image file (jpg, png, bmp, gif, webp, tga).
+// prompt: text prompt (e.g. "Describe this image"). NULL defaults to "Describe this image in detail."
+// Returns the analysis text. Caller must NOT free the returned pointer.
+const char* rcli_vlm_analyze(RCLIHandle handle, const char* image_path, const char* prompt);
+
+// Check if the VLM engine is initialized and ready for image analysis.
+// Returns 1 if ready, 0 if not.
+int rcli_vlm_is_ready(RCLIHandle handle);
+
+// Get the name of the active VLM backend (e.g. "llama.cpp (Metal GPU)" or "MetalRT").
+// Returns "" if VLM is not initialized.
+const char* rcli_vlm_backend_name(RCLIHandle handle);
+
+// Get the name of the active VLM model (e.g. "Qwen3 VL 2B Instruct").
+// Returns "" if VLM is not initialized.
+const char* rcli_vlm_model_name(RCLIHandle handle);
+
+// VLM performance stats from the last analysis call.
+typedef struct {
+ double gen_tok_per_sec; // Generation tokens/second
+ int generated_tokens; // Total tokens generated
+ double total_time_sec; // Total wall time (image encode + prompt eval + generation)
+ double image_encode_ms; // Time to encode image through vision projector
+ double first_token_ms; // Time-to-first-token (prompt eval + image encode)
+} RCLIVlmStats;
+
+// Get stats from the last VLM analysis. Returns 0 on success.
+int rcli_vlm_get_stats(RCLIHandle handle, RCLIVlmStats* out_stats);
+
+// Swap MetalRT LLM out and VLM in on the GPU (for visual mode).
+// Unloads the LLM model, loads the MetalRT VLM model.
+// Returns 0 on success, -1 on failure.
+int rcli_vlm_enter(RCLIHandle handle);
+
+// Swap MetalRT VLM out and LLM back in on the GPU (exit visual mode).
+// Unloads the VLM model, reloads the LLM and re-caches the system prompt.
+// Returns 0 on success, -1 on failure.
+int rcli_vlm_exit(RCLIHandle handle);
+
+// Streaming VLM image analysis (use after rcli_vlm_enter).
+// Fires callback with events: "token", "response", "stats".
+// Returns 0 on success, -1 on failure.
+int rcli_vlm_analyze_stream(RCLIHandle handle, const char* image_path,
+ const char* prompt,
+ RCLIEventCallback callback, void* user_data);
+
#ifdef __cplusplus
}
#endif
diff --git a/src/audio/camera_capture.h b/src/audio/camera_capture.h
new file mode 100644
index 0000000..1d5ade4
--- /dev/null
+++ b/src/audio/camera_capture.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Capture a single frame from the default camera and save as JPEG.
+// output_path: where to save the JPEG (e.g. "/tmp/rcli_camera.jpg").
+// Returns 0 on success, -1 on failure.
+int camera_capture_photo(const char* output_path);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/audio/camera_capture.mm b/src/audio/camera_capture.mm
new file mode 100644
index 0000000..a4cdf8b
--- /dev/null
+++ b/src/audio/camera_capture.mm
@@ -0,0 +1,142 @@
+// Frameworks: AVFoundation (capture session/device), AppKit (NSImage/JPEG
+// encoding), CoreImage (CIImage from the pixel buffer), CoreMedia
+// (CMSampleBuffer access). The stripped header names are restored here.
+#import <AVFoundation/AVFoundation.h>
+#import <AppKit/AppKit.h>
+#import <CoreImage/CoreImage.h>
+#import <CoreMedia/CoreMedia.h>
+#include "camera_capture.h"
+#include <stdio.h>
+
+// Delegate that skips warmup frames then captures one properly-exposed frame.
+// Must conform to AVCaptureVideoDataOutputSampleBufferDelegate so it can be
+// installed via -[AVCaptureVideoDataOutput setSampleBufferDelegate:queue:]
+// (the protocol conformance was stripped from the original line).
+@interface RCLISingleFrameCapture : NSObject <AVCaptureVideoDataOutputSampleBufferDelegate>
+@property (nonatomic, strong) NSString *outputPath;           // destination JPEG path
+@property (nonatomic, assign) BOOL captured;                  // set once a frame is handled
+@property (nonatomic, strong) dispatch_semaphore_t semaphore; // signaled when done (or failed)
+@property (nonatomic, assign) int frameCount;                 // frames seen so far
+@property (nonatomic, assign) int framesToSkip;               // warmup frames to discard
+@end
+
+@implementation RCLISingleFrameCapture
+
+// Capture-queue callback, invoked for every frame the camera delivers.
+// Discards the first `framesToSkip` frames, then encodes exactly one frame
+// to JPEG at `outputPath` and signals the semaphore so the waiting caller
+// in camera_capture_photo can proceed.
+- (void)captureOutput:(AVCaptureOutput *)output
+didOutputSampleBuffer:(CMSampleBufferRef)sampleBuffer
+       fromConnection:(AVCaptureConnection *)connection {
+    if (self.captured) return;
+
+    // Skip initial frames to let auto-exposure/white-balance stabilize
+    self.frameCount++;
+    if (self.frameCount < self.framesToSkip) return;
+
+    self.captured = YES;
+
+    CVImageBufferRef imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer);
+    if (!imageBuffer) {
+        // No pixel data — wake the waiter; the caller detects failure by the
+        // absence of the output file.
+        dispatch_semaphore_signal(self.semaphore);
+        return;
+    }
+
+    // Pixel buffer → CIImage → NSImage representation chain for encoding.
+    CIImage *ciImage = [CIImage imageWithCVImageBuffer:imageBuffer];
+    NSCIImageRep *rep = [NSCIImageRep imageRepWithCIImage:ciImage];
+    NSImage *nsImage = [[NSImage alloc] initWithSize:rep.size];
+    [nsImage addRepresentation:rep];
+
+    // Convert to JPEG at high quality. If any step yields nil, the
+    // writeToFile: message is a no-op and the caller sees a missing file.
+    NSData *tiffData = [nsImage TIFFRepresentation];
+    NSBitmapImageRep *bitmapRep = [NSBitmapImageRep imageRepWithData:tiffData];
+    NSData *jpegData = [bitmapRep representationUsingType:NSBitmapImageFileTypeJPEG
+                                               properties:@{NSImageCompressionFactor: @0.92}];
+    [jpegData writeToFile:self.outputPath atomically:YES];
+
+    dispatch_semaphore_signal(self.semaphore);
+}
+
+@end
+
+// Capture one frame from the default camera and write it to `output_path`
+// as JPEG. Blocks the calling thread for the permission prompt (if not yet
+// determined), the auto-exposure warmup, and up to a 10s capture timeout.
+// Returns 0 on success, -1 on failure.
+int camera_capture_photo(const char* output_path) {
+    @autoreleasepool {
+        // Check camera permission; bail early if the user already refused.
+        AVAuthorizationStatus status = [AVCaptureDevice authorizationStatusForMediaType:AVMediaTypeVideo];
+        if (status == AVAuthorizationStatusDenied || status == AVAuthorizationStatusRestricted) {
+            return -1;
+        }
+        if (status == AVAuthorizationStatusNotDetermined) {
+            // First use: trigger the system prompt and wait for the answer.
+            dispatch_semaphore_t perm_sem = dispatch_semaphore_create(0);
+            __block BOOL granted = NO;
+            [AVCaptureDevice requestAccessForMediaType:AVMediaTypeVideo completionHandler:^(BOOL g) {
+                granted = g;
+                dispatch_semaphore_signal(perm_sem);
+            }];
+            dispatch_semaphore_wait(perm_sem, DISPATCH_TIME_FOREVER);
+            if (!granted) return -1;
+        }
+
+        // Find default camera
+        AVCaptureDevice *device = [AVCaptureDevice defaultDeviceWithMediaType:AVMediaTypeVideo];
+        if (!device) return -1;
+
+        // Configure camera for best quality and let auto-exposure do its
+        // thing. Configuration failure is non-fatal — device defaults apply.
+        NSError *error = nil;
+        if ([device lockForConfiguration:&error]) {
+            // Enable continuous auto-exposure and white balance
+            if ([device isExposureModeSupported:AVCaptureExposureModeContinuousAutoExposure]) {
+                device.exposureMode = AVCaptureExposureModeContinuousAutoExposure;
+            }
+            if ([device isWhiteBalanceModeSupported:AVCaptureWhiteBalanceModeContinuousAutoWhiteBalance]) {
+                device.whiteBalanceMode = AVCaptureWhiteBalanceModeContinuousAutoWhiteBalance;
+            }
+            if ([device isFocusModeSupported:AVCaptureFocusModeContinuousAutoFocus]) {
+                device.focusMode = AVCaptureFocusModeContinuousAutoFocus;
+            }
+            [device unlockForConfiguration];
+        }
+
+        AVCaptureDeviceInput *input = [AVCaptureDeviceInput deviceInputWithDevice:device error:&error];
+        if (!input) return -1;
+
+        AVCaptureSession *session = [[AVCaptureSession alloc] init];
+        // Use Photo preset for highest quality, degrading gracefully.
+        if ([session canSetSessionPreset:AVCaptureSessionPresetPhoto]) {
+            session.sessionPreset = AVCaptureSessionPresetPhoto;
+        } else if ([session canSetSessionPreset:AVCaptureSessionPresetHigh]) {
+            session.sessionPreset = AVCaptureSessionPresetHigh;
+        } else {
+            session.sessionPreset = AVCaptureSessionPresetMedium;
+        }
+
+        if (![session canAddInput:input]) return -1;
+        [session addInput:input];
+
+        AVCaptureVideoDataOutput *videoOutput = [[AVCaptureVideoDataOutput alloc] init];
+        videoOutput.videoSettings = @{(NSString *)kCVPixelBufferPixelFormatTypeKey: @(kCVPixelFormatType_32BGRA)};
+        videoOutput.alwaysDiscardsLateVideoFrames = YES;
+
+        RCLISingleFrameCapture *delegate = [[RCLISingleFrameCapture alloc] init];
+        delegate.outputPath = [NSString stringWithUTF8String:output_path];
+        delegate.captured = NO;
+        delegate.semaphore = dispatch_semaphore_create(0);
+        delegate.frameCount = 0;
+        // Skip ~60 frames (~2 seconds at 30fps) to let auto-exposure fully stabilize
+        delegate.framesToSkip = 60;
+
+        dispatch_queue_t queue = dispatch_queue_create("com.rcli.camera", DISPATCH_QUEUE_SERIAL);
+        [videoOutput setSampleBufferDelegate:delegate queue:queue];
+
+        if (![session canAddOutput:videoOutput]) return -1;
+        [session addOutput:videoOutput];
+
+        // Start capture — delegate will skip first 60 frames for AE stabilization
+        [session startRunning];
+
+        // Wait for frame capture (timeout 10 seconds — allows for warmup + capture)
+        long result = dispatch_semaphore_wait(delegate.semaphore,
+                                              dispatch_time(DISPATCH_TIME_NOW, 10 * NSEC_PER_SEC));
+
+        [session stopRunning];
+
+        if (result != 0) return -1; // timeout
+
+        // Verify the file was written (the delegate signals even when the
+        // frame could not be encoded — the missing file is the failure signal).
+        NSFileManager *fm = [NSFileManager defaultManager];
+        if (![fm fileExistsAtPath:delegate.outputPath]) return -1;
+
+        return 0;
+    }
+}
diff --git a/src/audio/rcli_overlay.m b/src/audio/rcli_overlay.m
new file mode 100644
index 0000000..274a3fc
--- /dev/null
+++ b/src/audio/rcli_overlay.m
@@ -0,0 +1,191 @@
+// rcli_overlay — standalone Cocoa app showing a draggable/resizable overlay
+// frame for screen capture. Communicates with parent RCLI via stdin/stdout.
+//
+// Commands (one per line on stdin):
+// frame → replies "x,y,w,h\n" (screen coords, top-left origin)
+// hide → sets alpha to 0 (for capture)
+// show → restores alpha to 1
+// quit → exits
+
+// Cocoa pulls in AppKit + Foundation — everything this helper app needs.
+// (The stripped header name is restored here.)
+#import <Cocoa/Cocoa.h>
+
+static const CGFloat kBorder   = 6.0;   // border stroke width
+static const CGFloat kRadius   = 12.0;  // corner radius of the frame
+static const CGFloat kHandle   = 18.0;  // corner handle size
+static const CGFloat kEdgeGrab = 14.0;  // invisible edge grab zone
+
+// ── Custom view: bold border + corner handles + label pill ─────────────
+@interface OverlayView : NSView
+@end
+
+@implementation OverlayView
+
+// Draws the capture frame: translucent outer glow, solid green rounded
+// border, four corner handles with white dots, and a "RCLI Visual Mode"
+// pill centered on the top edge. The background stays clear so the screen
+// content underneath remains visible.
+- (void)drawRect:(NSRect)dirtyRect {
+    [[NSColor clearColor] set];
+    NSRectFill(dirtyRect);
+
+    // Inset so the stroke stays fully inside the window bounds.
+    NSRect inner = NSInsetRect(self.bounds, kBorder, kBorder);
+    NSColor *green = [NSColor colorWithRed:0.15 green:0.9 blue:0.45 alpha:0.92];
+
+    // Outer glow
+    NSBezierPath *glow = [NSBezierPath bezierPathWithRoundedRect:inner
+                                                         xRadius:kRadius yRadius:kRadius];
+    [glow setLineWidth:kBorder + 6];
+    [[green colorWithAlphaComponent:0.12] set];
+    [glow stroke];
+
+    // Main border — solid, thick, rounded
+    NSBezierPath *border = [NSBezierPath bezierPathWithRoundedRect:inner
+                                                           xRadius:kRadius yRadius:kRadius];
+    [border setLineWidth:kBorder];
+    [green set];
+    [border stroke];
+
+    // Corner handles — filled rounded squares with white dot
+    CGFloat hs = kHandle;
+    CGFloat off = kBorder / 2;
+    NSRect corners[4] = {
+        NSMakeRect(NSMinX(inner) - off, NSMinY(inner) - off, hs, hs),
+        NSMakeRect(NSMaxX(inner) + off - hs, NSMinY(inner) - off, hs, hs),
+        NSMakeRect(NSMinX(inner) - off, NSMaxY(inner) + off - hs, hs, hs),
+        NSMakeRect(NSMaxX(inner) + off - hs, NSMaxY(inner) + off - hs, hs, hs),
+    };
+    for (int i = 0; i < 4; i++) {
+        NSBezierPath *h = [NSBezierPath bezierPathWithRoundedRect:corners[i]
+                                                          xRadius:4 yRadius:4];
+        [green set];
+        [h fill];
+        // White center dot
+        NSRect dot = NSInsetRect(corners[i], 5, 5);
+        [[NSColor colorWithWhite:1.0 alpha:0.85] set];
+        [[NSBezierPath bezierPathWithOvalInRect:dot] fill];
+    }
+
+    // Label pill — centered at top
+    NSString *label = @" RCLI Visual Mode ";
+    NSDictionary *attrs = @{
+        NSFontAttributeName: [NSFont systemFontOfSize:11 weight:NSFontWeightBold],
+        NSForegroundColorAttributeName: [NSColor blackColor],
+    };
+    NSSize sz = [label sizeWithAttributes:attrs];
+    CGFloat px = NSMidX(self.bounds) - sz.width / 2 - 6;
+    CGFloat py = NSMaxY(inner) - 2;
+    NSRect pill = NSMakeRect(px, py, sz.width + 12, sz.height + 6);
+    NSBezierPath *pillPath = [NSBezierPath bezierPathWithRoundedRect:pill
+                                                             xRadius:10 yRadius:10];
+    [green set];
+    [pillPath fill];
+    [label drawAtPoint:NSMakePoint(px + 6, py + 3) withAttributes:attrs];
+}
+
+// Respond to the very first click even when the window isn't key,
+// so a single click can start a drag.
+- (BOOL)acceptsFirstMouse:(NSEvent *)e { return YES; }
+@end
+
+// ── Custom window: borderless, transparent, floating, draggable ───────
+@interface OverlayWindow : NSWindow
+@end
+
+@implementation OverlayWindow
+// Builds a borderless, resizable, fully transparent window that floats
+// above normal windows, joins all Spaces, and can be dragged by its
+// (clear) background. Hosts an OverlayView as its content view.
+- (instancetype)initWithRect:(NSRect)rect {
+    self = [super initWithContentRect:rect
+                            styleMask:NSWindowStyleMaskBorderless |
+                                      NSWindowStyleMaskResizable
+                              backing:NSBackingStoreBuffered
+                                defer:NO];
+    if (self) {
+        self.opaque = NO;
+        self.backgroundColor = [NSColor clearColor];
+        self.level = NSFloatingWindowLevel;   // stay above normal windows
+        self.hasShadow = NO;                  // shadow would pollute captures
+        self.movableByWindowBackground = YES;
+        self.contentView = [[OverlayView alloc] initWithFrame:rect];
+        self.collectionBehavior = NSWindowCollectionBehaviorCanJoinAllSpaces |
+                                  NSWindowCollectionBehaviorStationary;
+        self.minSize = NSMakeSize(120, 80);
+    }
+    return self;
+}
+// Borderless windows refuse key status by default — opt in so the overlay
+// can receive events; never become main (this is an accessory window).
+- (BOOL)canBecomeKeyWindow { return YES; }
+- (BOOL)canBecomeMainWindow { return NO; }
+@end
+
+// ── Stdin reader (runs on a background thread) ────────────────────────
+// Reads newline-delimited commands from stdin and dispatches them to the
+// main thread (AppKit is main-thread-only). EOF on stdin — parent closed
+// the pipe or died — terminates the app so the overlay never outlives RCLI.
+@interface StdinReader : NSObject
+@property (nonatomic, strong) OverlayWindow *window;
+- (void)startReading;
+- (void)handleCommand:(NSString *)cmd;
+@end
+
+@implementation StdinReader
+
+- (void)startReading {
+    dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
+        char buf[256];
+        while (fgets(buf, sizeof(buf), stdin)) {
+            NSString *cmd = [[NSString stringWithUTF8String:buf]
+                stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]];
+            if (cmd.length == 0) continue;
+            // Commands must run on the main thread; wait so replies are
+            // written back in request order.
+            [self performSelectorOnMainThread:@selector(handleCommand:)
+                                   withObject:cmd
+                                waitUntilDone:YES];
+        }
+        dispatch_async(dispatch_get_main_queue(), ^{
+            [NSApp terminate:nil];
+        });
+    });
+}
+
+- (void)handleCommand:(NSString *)cmd {
+    if ([cmd isEqualToString:@"frame"]) {
+        // Convert the window frame from Cocoa coordinates (origin at the
+        // bottom-left of the PRIMARY screen) to the top-left-origin global
+        // coordinates `screencapture -R` expects. Use the primary screen
+        // ([NSScreen screens][0]) — NOT mainScreen, which is the screen of
+        // the key window and yields wrong offsets on multi-display setups.
+        NSRect f = self.window.frame;
+        CGFloat screenH = [[NSScreen screens] firstObject].frame.size.height;
+        int x = (int)f.origin.x;
+        int y = (int)(screenH - f.origin.y - f.size.height);
+        int w = (int)f.size.width;
+        int h = (int)f.size.height;
+        printf("%d,%d,%d,%d\n", x, y, w, h);
+        fflush(stdout);
+    } else if ([cmd isEqualToString:@"hide"]) {
+        [self.window setAlphaValue:0.0];
+        // Give the window server a beat to actually remove the overlay
+        // before the parent grabs the screen.
+        [NSThread sleepForTimeInterval:0.05];
+        printf("ok\n");
+        fflush(stdout);
+    } else if ([cmd isEqualToString:@"show"]) {
+        [self.window setAlphaValue:1.0];
+        printf("ok\n");
+        fflush(stdout);
+    } else if ([cmd isEqualToString:@"quit"]) {
+        [NSApp terminate:nil];
+    }
+}
+
+@end
+
+// ── Main ──────────────────────────────────────────────────────────────
+// Entry point: creates an 800x600 overlay centered on the main screen,
+// starts the stdin command reader, prints the "ready" handshake for the
+// parent process, then hands control to the AppKit run loop.
+int main(int argc, const char *argv[]) {
+    @autoreleasepool {
+        NSApplication *app = [NSApplication sharedApplication];
+        // Accessory policy: no Dock icon, no menu bar takeover.
+        [app setActivationPolicy:NSApplicationActivationPolicyAccessory];
+
+        NSScreen *scr = [NSScreen mainScreen];
+        NSRect sf = scr.frame;
+        CGFloat w = 800, h = 600;
+        CGFloat x = (sf.size.width - w) / 2;
+        CGFloat y = (sf.size.height - h) / 2;
+
+        OverlayWindow *win = [[OverlayWindow alloc]
+            initWithRect:NSMakeRect(x, y, w, h)];
+        [win makeKeyAndOrderFront:nil];
+        [app activateIgnoringOtherApps:YES];
+
+        StdinReader *reader = [[StdinReader alloc] init];
+        reader.window = win;
+        [reader startReading];
+
+        // Handshake: the parent waits for this line before sending commands.
+        printf("ready\n");
+        fflush(stdout);
+
+        [app run];  // blocks until "quit" or stdin EOF terminates the app
+    }
+    return 0;
+}
diff --git a/src/audio/screen_capture.h b/src/audio/screen_capture.h
new file mode 100644
index 0000000..0cc5421
--- /dev/null
+++ b/src/audio/screen_capture.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// --- Visual Mode (overlay frame) ---
+// NOTE(review): the overlay functions share process-global state (the
+// helper's pid/pipes) with no locking — confirm single-threaded use.
+
+// Show the visual overlay window. User can drag/resize it over content.
+// Spawns the rcli_overlay helper process; no-op if already shown.
+// x, y, w, h: initial position and size in screen coordinates (0 = defaults).
+void screen_capture_show_overlay(int x, int y, int w, int h);
+
+// Hide the visual overlay window (terminates the helper process).
+void screen_capture_hide_overlay(void);
+
+// Returns 1 if the overlay is currently visible.
+int screen_capture_overlay_active(void);
+
+// Capture the screen region behind the overlay (hides overlay briefly).
+// Returns 0 on success, -1 on failure.
+int screen_capture_overlay_region(const char* output_path);
+
+// --- Legacy capture functions ---
+// All capture functions write a JPEG (downscaled to at most 2048px on the
+// long side for the VLM) and return 0 on success, -1 on failure.
+
+// Capture the frontmost/active window and save as JPEG.
+int screen_capture_active_window(const char* output_path);
+
+// Capture the window behind our own terminal (for voice triggers).
+int screen_capture_behind_terminal(const char* output_path);
+
+// Capture the entire main display and save as JPEG (fallback).
+int screen_capture_full_screen(const char* output_path);
+
+// Convenience: tries overlay if active, then active window, then full screen.
+int screen_capture_screenshot(const char* output_path);
+
+// Get the name of the app targeted by screen_capture_behind_terminal.
+// Copies into buf (truncated to buf_size) and returns buf.
+const char* screen_capture_target_app_name(char* buf, int buf_size);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/audio/screen_capture.mm b/src/audio/screen_capture.mm
new file mode 100644
index 0000000..e2f3ea8
--- /dev/null
+++ b/src/audio/screen_capture.mm
@@ -0,0 +1,425 @@
+// Frameworks: AppKit (NSWorkspace/NSImage/NSBitmapImageRep), CoreGraphics
+// (CGWindowList APIs). The stripped header names below are reconstructed
+// from the symbols this file uses.
+#import <AppKit/AppKit.h>
+#import <CoreGraphics/CoreGraphics.h>
+#include "screen_capture.h"
+#include <atomic>       // std::atomic overlay/tracker state
+#include <mutex>        // std::mutex, std::lock_guard
+#include <string>       // std::string
+#include <thread>       // std::thread poller
+#include <cmath>        // fmax
+#include <cstdio>       // fprintf, snprintf, FILE
+#include <cstring>      // strlen, strlcpy (macOS exposes strlcpy via string.h)
+#include <fcntl.h>      // open, O_WRONLY
+#include <spawn.h>      // posix_spawnp
+#include <unistd.h>     // fork, pipe, dup2, access, usleep, getpid, getppid
+#include <sys/stat.h>   // stat
+#include <sys/sysctl.h> // sysctl, kinfo_proc
+#include <sys/wait.h>   // waitpid
+#include <mach-o/dyld.h> // _NSGetExecutablePath
+
+extern char** environ;
+
+// ---------------------------------------------------------------------------
+// Helper: downscale a JPEG on disk if it exceeds max dimension (for VLM)
+// ---------------------------------------------------------------------------
+// Downscale the JPEG at `path` in place so its longest side is <= max_dim.
+// No-op when the file is unreadable, not an image, or already small enough.
+static void downscale_jpeg_if_needed(const char* path, int max_dim) {
+    @autoreleasepool {
+        NSString *nsPath = [NSString stringWithUTF8String:path];
+        NSData *data = [NSData dataWithContentsOfFile:nsPath];
+        if (!data) return;   // unreadable — leave the file untouched
+
+        NSBitmapImageRep *srcRep = [NSBitmapImageRep imageRepWithData:data];
+        if (!srcRep) return; // not decodable as an image
+
+        NSInteger w = srcRep.pixelsWide;
+        NSInteger h = srcRep.pixelsHigh;
+        if (w <= max_dim && h <= max_dim) return;  // already small enough
+
+        // Uniform scale so the longest side becomes max_dim.
+        CGFloat scale = (CGFloat)max_dim / fmax((CGFloat)w, (CGFloat)h);
+        NSInteger nw = (NSInteger)floor(w * scale);
+        NSInteger nh = (NSInteger)floor(h * scale);
+
+        // Offscreen RGBA bitmap to draw the scaled image into.
+        NSBitmapImageRep *dstRep = [[NSBitmapImageRep alloc]
+            initWithBitmapDataPlanes:NULL
+                          pixelsWide:nw
+                          pixelsHigh:nh
+                       bitsPerSample:8
+                     samplesPerPixel:4
+                            hasAlpha:YES
+                            isPlanar:NO
+                      colorSpaceName:NSCalibratedRGBColorSpace
+                         bytesPerRow:0
+                        bitsPerPixel:0];
+
+        [NSGraphicsContext saveGraphicsState];
+        NSGraphicsContext *ctx = [NSGraphicsContext graphicsContextWithBitmapImageRep:dstRep];
+        [NSGraphicsContext setCurrentContext:ctx];
+        [ctx setImageInterpolation:NSImageInterpolationHigh];
+
+        NSImage *nsImage = [[NSImage alloc] initWithSize:NSMakeSize((CGFloat)w, (CGFloat)h)];
+        [nsImage addRepresentation:srcRep];
+        [nsImage drawInRect:NSMakeRect(0, 0, (CGFloat)nw, (CGFloat)nh)
+                   fromRect:NSZeroRect
+                  operation:NSCompositingOperationCopy
+                   fraction:1.0];
+
+        [NSGraphicsContext restoreGraphicsState];
+
+        // Re-encode as JPEG and overwrite the original atomically.
+        NSData *jpegData = [dstRep representationUsingType:NSBitmapImageFileTypeJPEG
+                                                properties:@{NSImageCompressionFactor: @0.85}];
+        if (jpegData && jpegData.length > 0) {
+            [jpegData writeToFile:nsPath atomically:YES];
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Helper: run screencapture with given args, verify output
+// ---------------------------------------------------------------------------
+// Run the macOS `screencapture` CLI with the given argv, wait for it to
+// exit cleanly, and verify that output_path exists and is non-empty.
+// Downscales the result to at most 2048px on the long side for the VLM.
+// Returns 0 on success, -1 on failure.
+static int run_screencapture(const char* const argv[], const char* output_path) {
+    pid_t pid;
+    int status = 0;
+    // posix_spawnp takes `char *const argv[]`; the strings are not modified,
+    // so casting away the inner const is safe. (The template argument was
+    // stripped from the original `const_cast`, which did not compile.)
+    if (posix_spawnp(&pid, "screencapture", nullptr, nullptr,
+                     const_cast<char* const*>(argv), environ) != 0) {
+        return -1;
+    }
+    waitpid(pid, &status, 0);
+    if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) return -1;
+
+    struct stat st;
+    if (stat(output_path, &st) != 0 || st.st_size == 0) return -1;
+
+    downscale_jpeg_if_needed(output_path, 2048);
+    return 0;
+}
+
+// ===========================================================================
+// Visual overlay — spawns rcli_overlay helper process (separate Cocoa app)
+// because AppKit window management requires the main thread, which FTXUI owns.
+// Communication via stdin/stdout pipes.
+// ===========================================================================
+
+static pid_t g_overlay_pid = 0;              // helper process (0 = not running)
+static FILE *g_overlay_stdin  = nullptr;     // we write commands here
+static FILE *g_overlay_stdout = nullptr;     // we read responses here
+static std::atomic<bool> g_overlay_visible{false};  // template arg restored
+
+// Find rcli_overlay binary next to the rcli binary.
+// Returns an absolute path when an executable sibling exists; otherwise the
+// bare name "rcli_overlay" so exec falls back to a PATH lookup.
+static std::string find_overlay_binary() {
+    // Try next to our own executable
+    char path[1024];
+    uint32_t size = sizeof(path);
+    if (_NSGetExecutablePath(path, &size) == 0) {
+        std::string dir(path);
+        auto slash = dir.rfind('/');
+        if (slash != std::string::npos) {
+            std::string candidate = dir.substr(0, slash + 1) + "rcli_overlay";
+            if (access(candidate.c_str(), X_OK) == 0) return candidate;
+        }
+    }
+    // Fallback: try PATH
+    return "rcli_overlay";
+}
+
+// Send a command to the overlay process and read the single response line.
+// Blocking: waits on the helper's stdout. Returns the reply with its
+// trailing newline stripped, or "" if the pipes are gone or the read fails.
+static std::string overlay_cmd(const char* cmd) {
+    if (!g_overlay_stdin || !g_overlay_stdout) return "";
+    fprintf(g_overlay_stdin, "%s\n", cmd);
+    fflush(g_overlay_stdin);
+    char buf[256] = {0};
+    if (fgets(buf, sizeof(buf), g_overlay_stdout)) {
+        // Strip trailing newline
+        size_t len = strlen(buf);
+        if (len > 0 && buf[len-1] == '\n') buf[len-1] = '\0';
+        return std::string(buf);
+    }
+    return "";
+}
+
+// Launch the rcli_overlay helper with stdin/stdout pipes and wait for its
+// "ready" handshake. Idempotent: returns immediately if already running.
+// Fixes over the original: pipe and fork failures now release any file
+// descriptors already created instead of leaking them (the original fell
+// through into the parent path with pid == -1 on fork failure).
+void screen_capture_show_overlay(int x, int y, int w, int h) {
+    (void)x; (void)y; (void)w; (void)h; // TODO: pass initial rect to helper
+
+    if (g_overlay_pid > 0) {
+        // Already running — just return
+        return;
+    }
+
+    std::string binary = find_overlay_binary();
+
+    // Create pipes: parent→child stdin, child→parent stdout
+    int pipe_in[2], pipe_out[2];
+    if (pipe(pipe_in) != 0) return;
+    if (pipe(pipe_out) != 0) {
+        close(pipe_in[0]); close(pipe_in[1]);
+        return;
+    }
+
+    pid_t pid = fork();
+    if (pid < 0) {
+        // fork failed — release both pipes so retries don't leak fds.
+        close(pipe_in[0]);  close(pipe_in[1]);
+        close(pipe_out[0]); close(pipe_out[1]);
+        return;
+    }
+    if (pid == 0) {
+        // Child: wire up pipes
+        close(pipe_in[1]);  // close write end of stdin pipe
+        close(pipe_out[0]); // close read end of stdout pipe
+        dup2(pipe_in[0], STDIN_FILENO);
+        dup2(pipe_out[1], STDOUT_FILENO);
+        close(pipe_in[0]);
+        close(pipe_out[1]);
+        // Redirect stderr to /dev/null to keep terminal clean
+        int devnull = open("/dev/null", O_WRONLY);
+        if (devnull >= 0) { dup2(devnull, STDERR_FILENO); close(devnull); }
+        execl(binary.c_str(), "rcli_overlay", nullptr);
+        _exit(1);
+    }
+
+    // Parent
+    close(pipe_in[0]);
+    close(pipe_out[1]);
+    g_overlay_pid = pid;
+    g_overlay_stdin = fdopen(pipe_in[1], "w");
+    g_overlay_stdout = fdopen(pipe_out[0], "r");
+
+    // Only mark visible after the child's "ready" handshake arrives.
+    char buf[64] = {0};
+    if (g_overlay_stdout && fgets(buf, sizeof(buf), g_overlay_stdout)) {
+        g_overlay_visible.store(true);
+    }
+}
+
+// Tear down the overlay: ask the helper to quit, close both pipes, and
+// reap the child so it doesn't linger as a zombie.
+void screen_capture_hide_overlay(void) {
+    if (g_overlay_pid <= 0) return;
+
+    overlay_cmd("quit");
+
+    // Clean up
+    if (g_overlay_stdin) { fclose(g_overlay_stdin); g_overlay_stdin = nullptr; }
+    if (g_overlay_stdout) { fclose(g_overlay_stdout); g_overlay_stdout = nullptr; }
+    int status;
+    waitpid(g_overlay_pid, &status, 0);
+    g_overlay_pid = 0;
+    g_overlay_visible.store(false);
+}
+
+// Returns 1 while the overlay helper is up (set after its "ready" handshake).
+int screen_capture_overlay_active(void) {
+    return g_overlay_visible.load() ? 1 : 0;
+}
+
+// Capture the screen region currently framed by the overlay.
+// Asks the helper for its frame ("x,y,w,h", top-left origin — the format
+// `screencapture -R` expects), hides the overlay so it doesn't appear in
+// the shot, captures, then shows it again. Returns 0 on success.
+int screen_capture_overlay_region(const char* output_path) {
+    if (!g_overlay_visible.load() || g_overlay_pid <= 0) return -1;
+
+    // Get frame coordinates (top-left origin)
+    std::string frame_str = overlay_cmd("frame");
+    if (frame_str.empty()) return -1;
+
+    // Hide overlay for capture
+    overlay_cmd("hide");
+
+    // Capture the region
+    char region[128];
+    strlcpy(region, frame_str.c_str(), sizeof(region));
+    const char* argv[] = {
+        "screencapture", "-x", "-t", "jpg", "-R", region, output_path, nullptr
+    };
+    int result = run_screencapture(argv, output_path);
+
+    // Show overlay again (even if the capture failed)
+    overlay_cmd("show");
+
+    return result;
+}
+
+// ---------------------------------------------------------------------------
+// Track the previously active app (before our terminal got focus)
+// Polls frontmostApplication every 200ms on a background thread.
+// NSWorkspace notifications don't work in CLI apps (no NSApplication run loop).
+// ---------------------------------------------------------------------------
+
+static std::atomic<pid_t> g_prev_active_pid{0};  // template arg restored
+static pid_t g_our_terminal_pid = 0;   // PID of the app that owns our terminal
+static char g_prev_app_name[256] = {0}; // guarded by g_name_mutex
+static std::mutex g_name_mutex;
+
+// Walk up the process tree to find which ancestor owns an on-screen window —
+// that ancestor is taken to be our terminal app. Falls back to the
+// great-grandparent (or getppid()) if no window-owning ancestor is found.
+static pid_t find_terminal_pid() {
+    @autoreleasepool {
+        // Collect up to 8 ancestors of our PID via sysctl(KERN_PROC_PID).
+        pid_t cur = getpid();
+        pid_t ancestors[8];
+        int n = 0;
+        while (cur > 1 && n < 8) {
+            ancestors[n++] = cur;
+            struct kinfo_proc kp;
+            size_t length = sizeof(kp);
+            int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PID, cur };
+            if (sysctl(mib, 4, &kp, &length, NULL, 0) != 0) break;
+            pid_t ppid = kp.kp_eproc.e_ppid;
+            if (ppid == cur) break;  // guard against self-parenting loops
+            cur = ppid;
+        }
+
+        // Check which ancestor owns on-screen windows — that's the terminal.
+        // Scanned outermost-first so the top-level app (not a shell) wins.
+        #pragma clang diagnostic push
+        #pragma clang diagnostic ignored "-Wdeprecated-declarations"
+        CFArrayRef windowList = CGWindowListCopyWindowInfo(
+            kCGWindowListOptionOnScreenOnly | kCGWindowListExcludeDesktopElements,
+            kCGNullWindowID);
+        #pragma clang diagnostic pop
+        if (windowList) {
+            NSArray *windows = CFBridgingRelease(windowList);
+            for (int i = n - 1; i >= 0; i--) {
+                for (NSDictionary *info in windows) {
+                    pid_t ownerPid = [[info objectForKey:(NSString *)kCGWindowOwnerPID] intValue];
+                    if (ownerPid == ancestors[i]) {
+                        return ancestors[i];
+                    }
+                }
+            }
+        }
+        // No window-owning ancestor found — guess: self → shell → terminal.
+        return (n >= 3) ? ancestors[2] : getppid();
+    }
+}
+
+// Background poller — tracks which non-terminal app is frontmost.
+// Runs at load time (constructor attribute): seeds the state with the
+// current frontmost app, then polls NSWorkspace every 200ms on a detached
+// thread. Polling is used because NSWorkspace notifications don't fire in
+// CLI apps (no NSApplication run loop). The stripped std::lock_guard
+// template arguments are restored below.
+__attribute__((constructor))
+static void start_app_tracking() {
+    @autoreleasepool {
+        g_our_terminal_pid = find_terminal_pid();
+
+        // Seed with current frontmost app if it's not our terminal
+        NSRunningApplication *front = [[NSWorkspace sharedWorkspace] frontmostApplication];
+        if (front && front.processIdentifier != g_our_terminal_pid) {
+            g_prev_active_pid.store(front.processIdentifier, std::memory_order_relaxed);
+            NSString *name = front.localizedName ?: @"unknown";
+            std::lock_guard<std::mutex> lock(g_name_mutex);
+            strlcpy(g_prev_app_name, [name UTF8String], sizeof(g_prev_app_name));
+        }
+
+        // Poll frontmostApplication every 200ms on a background thread
+        std::thread([]() {
+            pthread_setname_np("rcli.app_tracker");
+            pid_t last_seen_pid = 0;
+            while (true) {
+                @autoreleasepool {
+                    NSRunningApplication *front =
+                        [[NSWorkspace sharedWorkspace] frontmostApplication];
+                    if (front) {
+                        pid_t pid = front.processIdentifier;
+                        // If a non-terminal app is frontmost and it changed, record it
+                        if (pid != g_our_terminal_pid && pid != last_seen_pid) {
+                            last_seen_pid = pid;
+                            g_prev_active_pid.store(pid, std::memory_order_relaxed);
+                            NSString *name = front.localizedName ?: @"unknown";
+                            std::lock_guard<std::mutex> lock(g_name_mutex);
+                            strlcpy(g_prev_app_name, [name UTF8String],
+                                    sizeof(g_prev_app_name));
+                        }
+                    }
+                }
+                usleep(200000); // 200ms
+            }
+        }).detach();
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Window lookup helpers
+// ---------------------------------------------------------------------------
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+
+// Heuristic filter for "real" app windows: must report bounds and be at
+// least 100x100 (skips menu-bar extras, tooltips, and other window chrome).
+static bool is_normal_window(NSDictionary *info) {
+    NSDictionary *bounds = [info objectForKey:(NSString *)kCGWindowBounds];
+    if (!bounds) return false;
+    CGFloat w = [[bounds objectForKey:@"Width"] floatValue];
+    CGFloat h = [[bounds objectForKey:@"Height"] floatValue];
+    return (w >= 100 && h >= 100);
+}
+
+// Find a normal window belonging to a specific PID.
+// Returns the first qualifying on-screen window in the order reported by
+// CGWindowListCopyWindowInfo (front-to-back), or kCGNullWindowID if none.
+static CGWindowID find_window_for_pid(pid_t target_pid) {
+    CFArrayRef windowList = CGWindowListCopyWindowInfo(
+        kCGWindowListOptionOnScreenOnly | kCGWindowListExcludeDesktopElements,
+        kCGNullWindowID);
+    if (!windowList) return kCGNullWindowID;
+
+    NSArray *windows = CFBridgingRelease(windowList);
+    for (NSDictionary *info in windows) {
+        pid_t ownerPid = [[info objectForKey:(NSString *)kCGWindowOwnerPID] intValue];
+        if (ownerPid != target_pid) continue;
+        if (!is_normal_window(info)) continue;
+        return [[info objectForKey:(NSString *)kCGWindowNumber] unsignedIntValue];
+    }
+    return kCGNullWindowID;
+}
+
+// Find the frontmost normal window of the frontmost app.
+// Returns kCGNullWindowID when there is no frontmost app or no such window.
+static CGWindowID get_frontmost_window_id() {
+    @autoreleasepool {
+        NSRunningApplication *frontApp = [[NSWorkspace sharedWorkspace] frontmostApplication];
+        if (!frontApp) return kCGNullWindowID;
+        return find_window_for_pid(frontApp.processIdentifier);
+    }
+}
+
+// Find the window of the previously active app (before terminal got focus),
+// as recorded by the background poller. kCGNullWindowID if none tracked yet.
+static CGWindowID get_previous_app_window_id() {
+    @autoreleasepool {
+        pid_t prev_pid = g_prev_active_pid.load(std::memory_order_relaxed);
+        if (prev_pid <= 0) return kCGNullWindowID;
+        return find_window_for_pid(prev_pid);
+    }
+}
+
+#pragma clang diagnostic pop
+
+// ---------------------------------------------------------------------------
+// Public API
+// ---------------------------------------------------------------------------
+
+// Capture a single window by CGWindowID via `screencapture -l`.
+// Returns 0 on success, -1 on failure (including a null window id).
+static int capture_window_id(CGWindowID wid, const char* output_path) {
+    if (wid == kCGNullWindowID) return -1;
+    char wid_str[32];
+    snprintf(wid_str, sizeof(wid_str), "%u", wid);
+    const char* argv[] = {
+        "screencapture", "-x", "-t", "jpg", "-l", wid_str, output_path, nullptr
+    };
+    return run_screencapture(argv, output_path);
+}
+
+// Capture the frontmost app's window; falls back to a full-screen grab
+// when no suitable window is found.
+int screen_capture_active_window(const char* output_path) {
+    CGWindowID wid = get_frontmost_window_id();
+    if (wid == kCGNullWindowID) {
+        return screen_capture_full_screen(output_path);
+    }
+    return capture_window_id(wid, output_path);
+}
+
+// Capture the window of the app that was frontmost before our terminal took
+// focus (tracked by the background poller). Falls back to a full-screen
+// grab if no such window exists. Returns 0 on success, -1 on failure.
+// (The stripped std::lock_guard template argument is restored.)
+int screen_capture_behind_terminal(const char* output_path) {
+    // Log which app we are targeting (name + PID) for debuggability.
+    {
+        std::lock_guard<std::mutex> lock(g_name_mutex);
+        pid_t prev = g_prev_active_pid.load(std::memory_order_relaxed);
+        fprintf(stderr, "[Screen] Targeting: %s (PID %d)\n",
+                g_prev_app_name[0] ? g_prev_app_name : "none", prev);
+    }
+    CGWindowID wid = get_previous_app_window_id();
+    if (wid == kCGNullWindowID) {
+        fprintf(stderr, "[Screen] No previous app window found, falling back to full screen\n");
+        return screen_capture_full_screen(output_path);
+    }
+    return capture_window_id(wid, output_path);
+}
+
+// Capture the entire main display with `screencapture` (last-resort path).
+int screen_capture_full_screen(const char* output_path) {
+    const char* argv[] = {
+        "screencapture", "-x", "-t", "jpg", output_path, nullptr
+    };
+    return run_screencapture(argv, output_path);
+}
+
+// Convenience capture implementing the fallback chain documented in
+// screen_capture.h: overlay region (if visual mode is active), then the
+// active window, then full screen. The original returned the overlay
+// failure directly instead of falling through; this restores the
+// documented chain.
+int screen_capture_screenshot(const char* output_path) {
+    if (screen_capture_overlay_active()) {
+        if (screen_capture_overlay_region(output_path) == 0) return 0;
+        // Overlay capture failed — fall through to the window path
+        // (which itself falls back to full screen).
+    }
+    return screen_capture_active_window(output_path);
+}
+
+// Copy the name of the app targeted by screen_capture_behind_terminal into
+// buf (NUL-terminated, truncated to buf_size) and return buf. Falls back to
+// "unknown" when no previous app has been observed yet. Guards against a
+// null/zero-size buffer; the stripped std::lock_guard template argument is
+// restored.
+const char* screen_capture_target_app_name(char* buf, int buf_size) {
+    if (!buf || buf_size <= 0) return buf;  // nothing we can safely write
+    std::lock_guard<std::mutex> lock(g_name_mutex);
+    if (g_prev_app_name[0]) {
+        strlcpy(buf, g_prev_app_name, (size_t)buf_size);
+    } else {
+        strlcpy(buf, "unknown", (size_t)buf_size);
+    }
+    return buf;
+}
diff --git a/src/cli/help.h b/src/cli/help.h
index bb9b37a..9ecca9b 100644
--- a/src/cli/help.h
+++ b/src/cli/help.h
@@ -19,6 +19,8 @@ inline void print_usage(const char* argv0) {
" %sask%s One-shot text command\n"
" %sactions%s [name] List all actions, or show detail for one\n"
" %saction%s [json] Execute a named action directly\n"
+    "  %svlm%s <image> [prompt]   Analyze image with Vision Language Model\n"
+ " %sscreen%s [prompt] Capture screenshot & analyze with VLM\n"
" %srag%s RAG: ingest docs, query, status\n"
" %ssetup%s Download AI models (~1GB)\n"
" %smodels%s Manage all AI models (LLM, STT, TTS)\n"
@@ -45,6 +47,10 @@ inline void print_usage(const char* argv0) {
" rcli ask \"open Safari\" # one-shot command\n"
" rcli ask \"create a note called Ideas\" # triggers action\n"
" rcli actions # see all actions\n"
+ " rcli vlm photo.jpg # analyze an image\n"
+ " rcli vlm photo.jpg \"What is this?\" # image with custom prompt\n"
+ " rcli screen # capture & analyze screen\n"
+ " rcli screen \"What app is open?\" # screen with custom prompt\n"
" rcli actions create_note # action detail\n"
" rcli setup # download models\n\n",
color::bold, color::orange, color::reset,
@@ -69,6 +75,8 @@ inline void print_usage(const char* argv0) {
color::green, color::reset,
color::green, color::reset,
color::green, color::reset,
+ color::green, color::reset,
+ color::green, color::reset,
color::dim, color::reset,
color::dim, color::reset);
}
@@ -130,7 +138,13 @@ inline void print_help_interactive() {
fprintf(stderr, " %sdo [text]%s execute action directly (no JSON needed)\n", color::bold, color::reset);
fprintf(stderr, " %srag status%s show indexed documents\n", color::bold, color::reset);
+    fprintf(stderr, "  %srag ingest <path>%s  index docs for Q&A\n", color::bold, color::reset);
+ fprintf(stderr, " %scamera%s capture photo from webcam & analyze\n", color::bold, color::reset);
+ fprintf(stderr, " %sscreen%s capture screenshot & analyze\n", color::bold, color::reset);
fprintf(stderr, " %squit%s exit\n\n", color::bold, color::reset);
+ fprintf(stderr, " %s%s Vision:%s\n", color::bold, color::orange, color::reset);
+ fprintf(stderr, " Drag & drop an image file to analyze it with the VLM.\n");
+ fprintf(stderr, " Type %scamera%s to capture a photo from your webcam.\n", color::bold, color::reset);
+ fprintf(stderr, " Type %sscreen%s to capture and analyze your screen.\n\n", color::bold, color::reset);
fprintf(stderr, " %s%s Try:%s\n", color::bold, color::orange, color::reset);
fprintf(stderr, " %s\"Open Safari\" \"What's on my calendar?\" \"Set volume to 50\"%s\n\n",
color::dim, color::reset);
diff --git a/src/cli/main.cpp b/src/cli/main.cpp
index 4f49472..58cd4e1 100644
--- a/src/cli/main.cpp
+++ b/src/cli/main.cpp
@@ -27,6 +27,13 @@
#include "audio/mic_permission.h"
#include "core/personality.h"
#include "llama.h"
+#include "mtmd.h"
+#include "mtmd-helper.h"
+#include "audio/camera_capture.h"
+#include "audio/screen_capture.h"
+#include <sys/stat.h>
+
+extern char** environ;
// Defined in cli_common.h as a forward declaration; implemented here because
// it depends on the Objective-C mic_permission bridge compiled into this TU.
@@ -427,6 +434,229 @@ static int cmd_ask(const Args& args) {
return 0;
}
+// =============================================================================
+// VLM subcommand
+// =============================================================================
+
+static int cmd_vlm(const Args& args) {
+ if (args.arg1.empty() || args.help) {
+        fprintf(stderr, "\n  Usage: rcli vlm <image> [prompt]\n\n");
+ fprintf(stderr, " Analyze an image using a Vision Language Model.\n\n");
+ fprintf(stderr, " Examples:\n");
+ fprintf(stderr, " rcli vlm photo.jpg\n");
+ fprintf(stderr, " rcli vlm screenshot.png \"What text do you see?\"\n");
+ fprintf(stderr, " rcli vlm diagram.jpg \"Explain this diagram\"\n\n");
+ return args.help ? 0 : 1;
+ }
+
+ // Resolve image path
+ std::string image_path = args.arg1;
+ if (!image_path.empty() && image_path[0] == '~') {
+ if (const char* home = getenv("HOME"))
+ image_path = std::string(home) + image_path.substr(1);
+ }
+ // Make relative paths absolute
+ if (!image_path.empty() && image_path[0] != '/') {
+ char cwd[4096];
+ if (getcwd(cwd, sizeof(cwd)))
+ image_path = std::string(cwd) + "/" + image_path;
+ }
+
+ struct stat st;
+ if (stat(image_path.c_str(), &st) != 0) {
+ fprintf(stderr, "%s%sError: Image not found: %s%s\n",
+ color::bold, color::red, image_path.c_str(), color::reset);
+ return 1;
+ }
+
+ if (!rastack::VlmEngine::is_supported_image(image_path)) {
+ fprintf(stderr, "%s%sError: Unsupported image format. Supported: jpg, png, bmp, gif, webp, tga%s\n",
+ color::bold, color::red, color::reset);
+ return 1;
+ }
+
+ std::string prompt = args.arg2.empty() ? "Describe this image in detail." : args.arg2;
+
+ // Create engine with models_dir set (we only need VLM, not the full pipeline)
+ std::string config_json = "{\"models_dir\": \"" + args.models_dir + "\"}";
+ g_engine = rcli_create(config_json.c_str());
+ if (!g_engine) return 1;
+
+ // Initialize VLM
+ fprintf(stderr, "%sInitializing VLM...%s\n", color::dim, color::reset);
+ if (rcli_vlm_init(g_engine) != 0) {
+ fprintf(stderr, "\n%s%s VLM not available.%s\n\n", color::bold, color::red, color::reset);
+ fprintf(stderr, " VLM requires the llama.cpp engine and a VLM model.\n");
+ fprintf(stderr, " Switch engine: %srcli engine llamacpp%s\n", color::bold, color::reset);
+ fprintf(stderr, " Download model: %srcli models vlm%s\n\n", color::bold, color::reset);
+ rcli_destroy(g_engine);
+ return 1;
+ }
+
+ // Show which VLM backend is active
+ const char* backend = rcli_vlm_backend_name(g_engine);
+ const char* model = rcli_vlm_model_name(g_engine);
+ if (backend && backend[0]) {
+ fprintf(stderr, "%s VLM: %s%s%s via %s%s%s%s\n",
+ color::dim, color::reset, color::bold, model,
+ color::reset, color::dim, backend, color::reset);
+ }
+
+ fprintf(stderr, "%sAnalyzing image: %s%s\n", color::dim, image_path.c_str(), color::reset);
+
+ const char* response = rcli_vlm_analyze(g_engine, image_path.c_str(), prompt.c_str());
+ if (response && response[0]) {
+ fprintf(stdout, "%s\n", response);
+ RCLIVlmStats stats;
+ if (rcli_vlm_get_stats(g_engine, &stats) == 0) {
+ fprintf(stderr, "\n%s⚡ %.1f tok/s (%d tokens, %.1fs total, first token %.0fms)%s\n",
+ color::dim, stats.gen_tok_per_sec, stats.generated_tokens,
+ stats.total_time_sec, stats.first_token_ms, color::reset);
+ }
+ } else {
+ fprintf(stderr, "%s%sError: VLM analysis failed%s\n",
+ color::bold, color::red, color::reset);
+ rcli_destroy(g_engine);
+ return 1;
+ }
+
+ rcli_destroy(g_engine);
+ return 0;
+}
+
+// =============================================================================
+// Camera subcommand — capture + analyze
+// =============================================================================
+
+static int cmd_camera(const Args& args) {
+ std::string prompt = args.arg1.empty() ? "Describe what you see in this photo in detail." : args.arg1;
+
+ fprintf(stderr, "%sCapturing photo from camera...%s\n", color::dim, color::reset);
+ std::string photo_path = "/tmp/rcli_camera.jpg";
+
+ int rc = camera_capture_photo(photo_path.c_str());
+ if (rc != 0) {
+ fprintf(stderr, "%s%sError: Camera capture failed. Check camera permissions.%s\n",
+ color::bold, color::red, color::reset);
+ return 1;
+ }
+ fprintf(stderr, "%sPhoto captured! Analyzing with VLM...%s\n", color::dim, color::reset);
+
+ std::string config_json = "{\"models_dir\": \"" + args.models_dir + "\"}";
+ g_engine = rcli_create(config_json.c_str());
+ if (!g_engine) return 1;
+
+ if (rcli_vlm_init(g_engine) != 0) {
+ fprintf(stderr, "\n%s%s VLM not available.%s\n\n", color::bold, color::red, color::reset);
+ fprintf(stderr, " VLM requires the llama.cpp engine and a VLM model.\n");
+ fprintf(stderr, " Switch engine: %srcli engine llamacpp%s\n", color::bold, color::reset);
+ fprintf(stderr, " Download model: %srcli models vlm%s\n\n", color::bold, color::reset);
+ rcli_destroy(g_engine);
+ return 1;
+ }
+
+ const char* backend = rcli_vlm_backend_name(g_engine);
+ const char* model = rcli_vlm_model_name(g_engine);
+ if (backend && backend[0]) {
+ fprintf(stderr, "%s VLM: %s%s%s via %s%s%s%s\n",
+ color::dim, color::reset, color::bold, model,
+ color::reset, color::dim, backend, color::reset);
+ }
+
+ const char* response = rcli_vlm_analyze(g_engine, photo_path.c_str(), prompt.c_str());
+ if (response && response[0]) {
+ fprintf(stdout, "%s\n", response);
+ if (!args.no_speak) {
+ rcli_init(g_engine, args.models_dir.c_str(), args.gpu_layers);
+ rcli_speak(g_engine, response);
+ }
+ RCLIVlmStats stats;
+ if (rcli_vlm_get_stats(g_engine, &stats) == 0) {
+ fprintf(stderr, "\n%s⚡ %.1f tok/s (%d tokens, %.1fs total, first token %.0fms)%s\n",
+ color::dim, stats.gen_tok_per_sec, stats.generated_tokens,
+ stats.total_time_sec, stats.first_token_ms, color::reset);
+ }
+ {
+ pid_t pid;
+ const char* argv[] = {"open", photo_path.c_str(), nullptr};
+ posix_spawnp(&pid, "open", nullptr, nullptr,
+                     const_cast<char* const*>(argv), environ);
+ }
+ } else {
+ fprintf(stderr, "%s%sError: VLM analysis failed%s\n",
+ color::bold, color::red, color::reset);
+ rcli_destroy(g_engine);
+ return 1;
+ }
+
+ rcli_destroy(g_engine);
+ return 0;
+}
+
+// =============================================================================
+// Screen subcommand — screenshot + analyze
+// =============================================================================
+
+static int cmd_screen(const Args& args) {
+ std::string prompt = args.arg1.empty()
+ ? "Describe what you see on this screen in detail." : args.arg1;
+
+ fprintf(stderr, "%sCapturing screenshot...%s\n", color::dim, color::reset);
+ std::string screen_path = "/tmp/rcli_screen.jpg";
+
+ int rc = screen_capture_screenshot(screen_path.c_str());
+ if (rc != 0) {
+ fprintf(stderr, "%s%sError: Screen capture failed. Check screen recording permissions.%s\n",
+ color::bold, color::red, color::reset);
+ return 1;
+ }
+ fprintf(stderr, "%sScreenshot captured! Analyzing with VLM...%s\n", color::dim, color::reset);
+
+ std::string config_json = "{\"models_dir\": \"" + args.models_dir + "\"}";
+ g_engine = rcli_create(config_json.c_str());
+ if (!g_engine) return 1;
+
+ if (rcli_vlm_init(g_engine) != 0) {
+ fprintf(stderr, "\n%s%s VLM not available.%s\n\n", color::bold, color::red, color::reset);
+ fprintf(stderr, " VLM requires the llama.cpp engine and a VLM model.\n");
+ fprintf(stderr, " Switch engine: %srcli engine llamacpp%s\n", color::bold, color::reset);
+ fprintf(stderr, " Download model: %srcli models vlm%s\n\n", color::bold, color::reset);
+ rcli_destroy(g_engine);
+ return 1;
+ }
+
+ const char* backend = rcli_vlm_backend_name(g_engine);
+ const char* model = rcli_vlm_model_name(g_engine);
+ if (backend && backend[0]) {
+ fprintf(stderr, "%s VLM: %s%s%s via %s%s%s%s\n",
+ color::dim, color::reset, color::bold, model,
+ color::reset, color::dim, backend, color::reset);
+ }
+
+ const char* response = rcli_vlm_analyze(g_engine, screen_path.c_str(), prompt.c_str());
+ if (response && response[0]) {
+ fprintf(stdout, "%s\n", response);
+ if (!args.no_speak) {
+ rcli_init(g_engine, args.models_dir.c_str(), args.gpu_layers);
+ rcli_speak(g_engine, response);
+ }
+ RCLIVlmStats stats;
+ if (rcli_vlm_get_stats(g_engine, &stats) == 0) {
+ fprintf(stderr, "\n%s⚡ %.1f tok/s (%d tokens, %.1fs total, first token %.0fms)%s\n",
+ color::dim, stats.gen_tok_per_sec, stats.generated_tokens,
+ stats.total_time_sec, stats.first_token_ms, color::reset);
+ }
+ } else {
+ fprintf(stderr, "%s%sError: VLM analysis failed%s\n",
+ color::bold, color::red, color::reset);
+ rcli_destroy(g_engine);
+ return 1;
+ }
+
+ rcli_destroy(g_engine);
+ return 0;
+}
+
// =============================================================================
// RAG subcommands
// =============================================================================
@@ -654,16 +884,17 @@ static int cmd_metalrt(const Args& args) {
inst ? color::reset : "");
}
- // STT/TTS component models
+ // STT/TTS/VLM component models
size_t offset = mrt_models.size();
- fprintf(stderr, "\n %s— STT/TTS Components —%s\n", color::bold, color::reset);
+ fprintf(stderr, "\n %s— STT/TTS/VLM Components —%s\n", color::bold, color::reset);
fprintf(stderr, " %s# %-28s %-8s %-5s Status%s\n",
color::bold, "Model", "Size", "Type", color::reset);
for (size_t i = 0; i < comp_models.size(); i++) {
auto& cm = comp_models[i];
bool inst = rcli::is_metalrt_component_installed(cm);
- std::string type_label = (cm.component == "stt") ? "STT" : "TTS";
+ std::string type_label = (cm.component == "stt") ? "STT"
+ : (cm.component == "vlm") ? "VLM" : "TTS";
fprintf(stderr, " %s%zu%s %-28s %-8s %-5s %s%s%s\n",
color::bold, offset + i + 1, color::reset,
cm.name.c_str(),
@@ -917,6 +1148,7 @@ int main(int argc, char** argv) {
if (!args.verbose) {
llama_log_set([](enum ggml_log_level, const char*, void*) {}, nullptr);
+ mtmd_helper_log_set([](enum ggml_log_level, const char*, void*) {}, nullptr);
}
if (args.command.empty()) {
@@ -930,6 +1162,9 @@ int main(int argc, char** argv) {
if (args.command == "actions") return cmd_actions(args);
if (args.command == "action") return cmd_action(args);
if (args.command == "rag") return cmd_rag(args);
+ if (args.command == "vlm") return cmd_vlm(args);
+ if (args.command == "camera") return cmd_camera(args);
+ if (args.command == "screen") return cmd_screen(args);
if (args.command == "setup") return cmd_setup(args);
if (args.command == "models") return cmd_models(args);
if (args.command == "voices") return cmd_voices(args);
diff --git a/src/cli/model_pickers.h b/src/cli/model_pickers.h
index 949e25b..ec0b847 100644
--- a/src/cli/model_pickers.h
+++ b/src/cli/model_pickers.h
@@ -12,6 +12,7 @@
#include "models/model_registry.h"
#include "models/tts_model_registry.h"
#include "models/stt_model_registry.h"
+#include "models/vlm_model_registry.h"
#include "engines/metalrt_loader.h"
// =============================================================================
@@ -407,6 +408,83 @@ inline int pick_metalrt_stt() {
return 0;
}
+// =============================================================================
+// VLM picker
+// =============================================================================
+
+inline int pick_vlm(const std::string& models_dir) {
+ auto all = rcli::all_vlm_models();
+
+ fprintf(stderr, "\n %s%s VLM Models (Vision \xC2\xB7 llama.cpp)%s\n\n", color::bold, color::orange, color::reset);
+
+ fprintf(stderr, " %s# %-30s %-12s %s%s\n",
+ color::bold, "Model", "Size", "Status", color::reset);
+ fprintf(stderr, " %s── %-30s %-12s %s%s\n",
+ color::dim, "──────────────────────────────", "────────────", "──────────", color::reset);
+
+ for (size_t i = 0; i < all.size(); i++) {
+ auto& m = all[i];
+ bool installed = rcli::is_vlm_model_installed(models_dir, m);
+ std::string status;
+ if (installed) status = "\033[32minstalled\033[0m";
+ else status = "\033[2mnot installed\033[0m";
+ std::string label = m.name;
+ if (m.is_default) label += " (default)";
+ char size_str[32];
+ int total_mb = m.model_size_mb + m.mmproj_size_mb;
+ if (total_mb >= 1024)
+ snprintf(size_str, sizeof(size_str), "%.1f GB", total_mb / 1024.0);
+ else
+ snprintf(size_str, sizeof(size_str), "%d MB", total_mb);
+ fprintf(stderr, " %s%-2zu%s %-30s %-12s %s\n",
+ installed ? "\033[32m" : "", i + 1, installed ? "\033[0m" : "",
+ label.c_str(), size_str, status.c_str());
+ }
+ fprintf(stderr, "\n %sCommands:%s [1-%zu] download/select | q cancel\n Choice: ",
+ color::bold, color::reset, all.size());
+ fflush(stderr);
+
+ int choice = read_picker_choice();
+ if (choice == 0 || choice == -1) { picker_no_changes(); return 0; }
+ if (choice < 1 || choice > (int)all.size()) { fprintf(stderr, "\n Invalid choice.\n\n"); return 1; }
+
+ auto& sel = all[choice - 1];
+ bool installed = rcli::is_vlm_model_installed(models_dir, sel);
+ if (installed) {
+ fprintf(stderr, "\n %s%s%s is already installed.%s\n\n",
+ color::bold, color::green, sel.name.c_str(), color::reset);
+ return 0;
+ }
+
+ int total_mb = sel.model_size_mb + sel.mmproj_size_mb;
+ char size_str[32];
+ if (total_mb >= 1024)
+ snprintf(size_str, sizeof(size_str), "%.1f GB", total_mb / 1024.0);
+ else
+ snprintf(size_str, sizeof(size_str), "%d MB", total_mb);
+ fprintf(stderr, "\n %s%s%s%s is not installed (%s). Download? [Y/n]: ",
+ color::bold, color::yellow, sel.name.c_str(), color::reset, size_str);
+ fflush(stderr);
+ if (!confirm_download()) { picker_cancelled(); return 0; }
+
+ std::string model_path = models_dir + "/" + sel.model_filename;
+ std::string mmproj_path = models_dir + "/" + sel.mmproj_filename;
+ std::string cmd = "bash -c '"
+ "set -e; echo \" Downloading " + sel.name + " model...\"; echo \"\"; "
+ "curl -L -# -o \"" + model_path + "\" \"" + sel.model_url + "\"; "
+ "echo \"\"; echo \" Downloading vision projector...\"; echo \"\"; "
+ "curl -L -# -o \"" + mmproj_path + "\" \"" + sel.mmproj_url + "\"; "
+ "echo \"\"; echo \" Done!\"; '";
+ fprintf(stderr, "\n");
+ if (system(cmd.c_str()) != 0) {
+ fprintf(stderr, "\n %s%sDownload failed.%s\n\n", color::bold, color::red, color::reset);
+ return 1;
+ }
+    fprintf(stderr, "\n  %s%sInstalled: %s%s\n  Use: rcli vlm <image> [prompt]\n\n",
+ color::bold, color::green, sel.name.c_str(), color::reset);
+ return 0;
+}
+
// =============================================================================
// Unified models dashboard
// =============================================================================
@@ -417,6 +495,7 @@ inline int cmd_models(const Args& args) {
if (args.arg1 == "llm") return pick_llm(models_dir);
if (args.arg1 == "stt") return pick_stt(models_dir);
if (args.arg1 == "tts") return pick_tts(models_dir);
+ if (args.arg1 == "vlm") return pick_vlm(models_dir);
if (args.arg1 == "metalrt-stt" || args.arg1 == "whisper") return pick_metalrt_stt();
if (args.help) {
@@ -426,12 +505,14 @@ inline int cmd_models(const Args& args) {
" models Unified model dashboard\n"
" models llm LLM model picker\n"
" models stt STT model picker\n"
- " models tts TTS voice picker\n\n"
+ " models tts TTS voice picker\n"
+ " models vlm VLM (vision) model picker\n\n"
" %sEXAMPLES%s\n"
" rcli models # dashboard — pick a modality\n"
" rcli models llm # switch LLM directly\n"
" rcli models stt # switch offline STT directly\n"
- " rcli models tts # switch TTS voice directly\n\n",
+ " rcli models tts # switch TTS voice directly\n"
+ " rcli models vlm # manage VLM models for image analysis\n\n",
color::bold, color::orange, color::reset,
color::bold, color::reset,
color::bold, color::reset);
@@ -483,6 +564,21 @@ inline int cmd_models(const Args& args) {
color::green, tts_name.c_str(), color::reset,
tts_inst, tts_all.size());
+ // VLM row
+ auto vlm_all = rcli::all_vlm_models();
+ int vlm_inst = 0;
+ std::string vlm_name = "not installed";
+ for (auto& m : vlm_all) {
+ if (rcli::is_vlm_model_installed(models_dir, m)) {
+ vlm_inst++;
+ if (vlm_name == "not installed") vlm_name = m.name;
+ }
+ }
+ fprintf(stderr, " %s4%s %sVLM (vision)%s %s%-28s%s %d / %zu\n",
+ color::green, color::reset, color::bold, color::reset,
+ vlm_inst > 0 ? color::green : color::dim, vlm_name.c_str(), color::reset,
+ vlm_inst, vlm_all.size());
+
// MetalRT Whisper row
auto mrt_comps = rcli::metalrt_component_models();
std::string mrt_stt_pref = rcli::read_selected_metalrt_stt_id();
@@ -498,7 +594,7 @@ inline int cmd_models(const Args& args) {
}
}
if (mrt_stt_pref.empty() && mrt_stt_inst > 0) mrt_stt_name = "auto (first installed)";
- fprintf(stderr, " %s4%s %sMetalRT STT%s %s%-28s%s %d / %d\n",
+ fprintf(stderr, " %s5%s %sMetalRT STT%s %s%-28s%s %d / %d\n",
color::green, color::reset, color::bold, color::reset,
color::green, mrt_stt_name.c_str(), color::reset,
mrt_stt_inst, mrt_stt_total);
@@ -521,7 +617,7 @@ inline int cmd_models(const Args& args) {
}
fprintf(stderr, " %sNote: STT streaming (Zipformer) is always active for live mic.%s\n\n",
color::dim, color::reset);
- fprintf(stderr, " %sSelect modality:%s 1 LLM | 2 STT | 3 TTS | 4 MetalRT STT | q cancel\n Choice: ",
+ fprintf(stderr, " %sSelect modality:%s 1 LLM | 2 STT | 3 TTS | 4 VLM | 5 MetalRT STT | q cancel\n Choice: ",
color::bold, color::reset);
fflush(stderr);
@@ -530,7 +626,8 @@ inline int cmd_models(const Args& args) {
if (choice == 1 || choice == -2) return pick_llm(models_dir); // -2 (a) → LLM as first
if (choice == 2) return pick_stt(models_dir);
if (choice == 3) return pick_tts(models_dir);
- if (choice == 4) return pick_metalrt_stt();
+ if (choice == 4) return pick_vlm(models_dir);
+ if (choice == 5) return pick_metalrt_stt();
fprintf(stderr, "\n Invalid choice.\n\n");
return 1;
@@ -595,10 +692,20 @@ inline int cmd_info() {
? "MetalRT (Metal GPU — LLM, STT, TTS on-device)"
: "llama.cpp + sherpa-onnx (ONNX Runtime)";
+ auto vlm_all_info = rcli::all_vlm_models();
+ auto [vlm_found, vlm_def] = rcli::find_installed_vlm(models_dir);
+ std::string vlm_info;
+ if (vlm_found) {
+ vlm_info = vlm_def.name + " (llama.cpp, Metal GPU)";
+ } else {
+ vlm_info = "not installed — run: rcli models vlm";
+ }
+
fprintf(stdout,
"\n%s%s RCLI%s %s%s%s\n\n"
" %sEngine:%s %s\n"
" %sLLM:%s %s\n"
+ " %sVLM:%s %s\n"
" %sSTT:%s %s\n"
" %sTTS:%s %s\n"
" %sVAD:%s Silero VAD\n"
@@ -610,6 +717,7 @@ inline int cmd_info() {
color::dim, RA_VERSION, color::reset,
color::bold, color::reset, engine_info.c_str(),
color::bold, color::reset, llm_info.c_str(),
+ color::bold, color::reset, vlm_info.c_str(),
color::bold, color::reset, stt_info.c_str(),
color::bold, color::reset, tts_info.c_str(),
color::bold, color::reset,
@@ -677,5 +785,24 @@ inline int cmd_info() {
if (!any_tts) fprintf(stdout, " (none — run: rcli setup)\n");
fprintf(stdout, "\n");
+ // Installed VLM
+ fprintf(stdout, " %sInstalled VLM:%s\n", color::bold, color::reset);
+ bool any_vlm = false;
+ for (auto& m : vlm_all_info) {
+ if (rcli::is_vlm_model_installed(models_dir, m)) {
+ char size_str[32];
+ int total_mb = m.model_size_mb + m.mmproj_size_mb;
+ if (total_mb >= 1024)
+ snprintf(size_str, sizeof(size_str), "%.1f GB", total_mb / 1024.0);
+ else
+ snprintf(size_str, sizeof(size_str), "%d MB", total_mb);
+ fprintf(stdout, " %-28s %-7s installed\n",
+ m.name.c_str(), size_str);
+ any_vlm = true;
+ }
+ }
+ if (!any_vlm) fprintf(stdout, " (none — run: rcli models vlm)\n");
+ fprintf(stdout, "\n");
+
return 0;
}
diff --git a/src/cli/setup_cmds.h b/src/cli/setup_cmds.h
index f33dcc7..b5f85fb 100644
--- a/src/cli/setup_cmds.h
+++ b/src/cli/setup_cmds.h
@@ -178,13 +178,15 @@ inline int cmd_setup(const Args& args) {
if (!cm.default_install) continue;
std::string cm_dir = rcli::metalrt_models_dir() + "/" + cm.dir_name;
if (rcli::is_metalrt_component_installed(cm)) {
- std::string skip_label = (cm.component == "stt") ? "STT" : "TTS";
+ std::string skip_label = (cm.component == "stt") ? "STT"
+ : (cm.component == "vlm") ? "VLM" : "TTS";
fprintf(stderr, " %s%sMetalRT %s already installed:%s %s\n",
color::bold, color::green, skip_label.c_str(), color::reset, cm.name.c_str());
continue;
}
- std::string type_label = (cm.component == "stt") ? "STT" : "TTS";
+ std::string type_label = (cm.component == "stt") ? "STT"
+ : (cm.component == "vlm") ? "VLM" : "TTS";
fprintf(stderr, " %sDownloading MetalRT %s: %s (~%s)...%s\n",
color::dim, type_label.c_str(), cm.name.c_str(),
rcli::format_size(cm.size_mb).c_str(), color::reset);
diff --git a/src/cli/tui_app.h b/src/cli/tui_app.h
index 6ec4ed1..7b01d1e 100644
--- a/src/cli/tui_app.h
+++ b/src/cli/tui_app.h
@@ -12,8 +12,15 @@
#include "models/stt_model_registry.h"
#include "actions/action_registry.h"
#include "engines/metalrt_loader.h"
+#include "engines/vlm_engine.h"
+#include "audio/camera_capture.h"
+#include "audio/screen_capture.h"
+#include "models/vlm_model_registry.h"
#include "core/log.h"
#include "core/personality.h"
+#include <spawn.h>
+
+extern char** environ;
#include
#include
@@ -432,7 +439,43 @@ class TuiApp {
if (c == "r" || c == "R") { enter_rag_mode(); return true; }
if (c == "d" || c == "D") { close_all_panels(); enter_cleanup_mode(); return true; }
if (c == "p" || c == "P") { enter_personality_mode(); return true; }
- // V key: voice mode removed — push-to-talk via SPACE is always active
+ // V key: capture photo from camera and analyze with VLM
+ if (c == "v" || c == "V") {
+ run_camera_vlm("Describe what you see in this photo in detail.");
+ return true;
+ }
+ // S key: toggle visual mode (VLM only on llama.cpp engine)
+ if (c == "s" || c == "S") {
+ if (screen_capture_overlay_active()) {
+ screen_capture_hide_overlay();
+ add_system_message("Exiting visual mode...");
+ screen_->Post(Event::Custom);
+ std::thread([this]() {
+ rcli_vlm_exit(engine_);
+ add_system_message("Visual mode OFF");
+ screen_->Post(Event::Custom);
+ }).detach();
+ } else {
+ add_system_message("Entering visual mode, loading VLM...");
+ screen_->Post(Event::Custom);
+ std::thread([this]() {
+ if (rcli_vlm_init(engine_) == 0) {
+ const char* vbe = rcli_vlm_backend_name(engine_);
+ const char* vmodel = rcli_vlm_model_name(engine_);
+ screen_capture_show_overlay(0, 0, 0, 0);
+ std::string msg = "Visual mode ON";
+ if (vbe && vbe[0])
+ msg += std::string(" — ") + vmodel + " via " + vbe;
+ msg += ". Drag/resize the green frame, then ask a question";
+ add_system_message(msg);
+ } else {
+ add_system_message("VLM requires the llama.cpp engine. Switch with: rcli engine llamacpp, then download a model via [M] \xe2\x86\x92 VLM Models");
+ }
+ screen_->Post(Event::Custom);
+ }).detach();
+ }
+ return true;
+ }
if (c == "t" || c == "T") {
tool_trace_enabled_ = !tool_trace_enabled_.load(std::memory_order_relaxed);
add_system_message(tool_trace_enabled_ ? "Tool call trace: ON" : "Tool call trace: OFF");
@@ -538,6 +581,11 @@ class TuiApp {
std::string user_text = transcript;
add_user_message(user_text);
+ // Visual mode: route voice to VLM screen analysis instead of LLM
+ if (screen_capture_overlay_active()) {
+ run_screen_vlm(user_text);
+ return;
+ }
voice_state_ = VoiceState::THINKING;
screen_->Post(Event::Custom);
@@ -1069,6 +1117,11 @@ class TuiApp {
else
right.push_back(text("[A] actions ") | dim);
right.push_back(text("[C] convo ") | dim);
+ right.push_back(text("[V] camera ") | dim);
+ if (screen_capture_overlay_active())
+ right.push_back(text("[S] visual ● ") | ftxui::color(ftxui::Color::Green));
+ else
+ right.push_back(text("[S] visual ") | dim);
right.push_back(text("[R] RAG ") | dim);
right.push_back(text("[P] personality ") | dim);
right.push_back(text("[D] cleanup ") | dim);
@@ -1458,6 +1511,7 @@ class TuiApp {
e.is_archive = false;
models_entries_.push_back(e);
}
+
} else {
// ---- llama.cpp engine: show GGUF models only ----
const auto* llm_active = rcli::resolve_active_model(dir, llm_all);
@@ -1501,6 +1555,21 @@ class TuiApp {
e.archive_dir = v.archive_dir;
models_entries_.push_back(e);
}
+
+ // VLM models (vision)
+ auto vlm_all = rcli::all_vlm_models();
+ { ModelEntry h; h.name = "VLM Models (Vision \xC2\xB7 llama.cpp)"; h.is_header = true; models_entries_.push_back(h); }
+ for (auto& m : vlm_all) {
+ ModelEntry e;
+ e.name = m.name; e.id = m.id; e.modality = "VLM";
+ e.size_mb = m.model_size_mb + m.mmproj_size_mb;
+ e.installed = rcli::is_vlm_model_installed(dir, m);
+ e.is_active = false; // VLM is lazy-loaded, no "active" concept
+ e.is_default = m.is_default; e.is_recommended = m.is_default;
+ e.description = m.description;
+ e.url = m.model_url; e.filename = m.model_filename; e.is_archive = false;
+ models_entries_.push_back(e);
+ }
}
for (int i = 0; i < (int)models_entries_.size(); i++) {
@@ -1666,7 +1735,20 @@ class TuiApp {
bool archive = e.is_archive;
std::string archive_dir_name = e.archive_dir;
- std::thread([this, idx, dir, url, fname, mod, id, nm, archive, archive_dir_name]() {
+ // For VLM, also capture the mmproj URL
+ std::string vlm_mmproj_url, vlm_mmproj_fname;
+ if (mod == "VLM") {
+ auto vlm_models = rcli::all_vlm_models();
+ for (auto& vm : vlm_models) {
+ if (vm.id == id) {
+ vlm_mmproj_url = vm.mmproj_url;
+ vlm_mmproj_fname = vm.mmproj_filename;
+ break;
+ }
+ }
+ }
+ std::thread([this, idx, dir, url, fname, mod, id, nm, archive, archive_dir_name,
+ vlm_mmproj_url, vlm_mmproj_fname]() {
int rc;
if (archive) {
rc = system(("curl -sL '" + url + "' | tar xj -C '" + dir + "' 2>/dev/null").c_str());
@@ -1677,6 +1759,12 @@ class TuiApp {
if (stat(src.c_str(), &st) == 0 && stat(dst.c_str(), &st) != 0)
rename(src.c_str(), dst.c_str());
}
+ } else if (mod == "VLM" && !vlm_mmproj_url.empty()) {
+ // VLM needs two files: language model + mmproj
+ rc = system(("curl -sL -o '" + dir + "/" + fname + "' '" + url + "' 2>/dev/null").c_str());
+ if (rc == 0) {
+ rc = system(("curl -sL -o '" + dir + "/" + vlm_mmproj_fname + "' '" + vlm_mmproj_url + "' 2>/dev/null").c_str());
+ }
} else {
rc = system(("curl -sL -o '" + dir + "/" + fname + "' '" + url + "' 2>/dev/null").c_str());
}
@@ -1698,6 +1786,9 @@ class TuiApp {
} else {
if (mod == "STT") rcli::write_selected_stt_id(id);
else if (mod == "TTS") rcli::write_selected_tts_id(id);
+ else if (mod == "VLM") {
+ // VLM doesn't need selection — just mark installed
+ }
models_message_ = "Downloaded & selected: " + nm + ". Restart RCLI to apply.";
models_msg_color_ = theme_.success;
}
@@ -2143,6 +2234,117 @@ class TuiApp {
// process_input
// ====================================================================
+ void run_camera_vlm(const std::string& prompt) {
+ add_system_message("Capturing photo from camera...");
+ voice_state_ = VoiceState::THINKING;
+ std::string prompt_copy = prompt;
+ std::thread([this, prompt_copy]() {
+ std::string photo_path = "/tmp/rcli_camera_" +
+ std::to_string(std::chrono::system_clock::now().time_since_epoch().count()) + ".jpg";
+ int rc = camera_capture_photo(photo_path.c_str());
+ if (rc != 0) {
+ add_response("(Camera capture failed. Check camera permissions in System Settings > Privacy & Security > Camera.)", "");
+ voice_state_ = VoiceState::IDLE;
+ screen_->Post(Event::Custom);
+ return;
+ }
+ add_system_message("Photo captured! Loading VLM...");
+ screen_->Post(Event::Custom);
+
+ const char* response = rcli_vlm_analyze(
+ engine_, photo_path.c_str(), prompt_copy.c_str());
+
+ // Show which backend handled it
+ const char* vbe = rcli_vlm_backend_name(engine_);
+ const char* vmodel = rcli_vlm_model_name(engine_);
+ if (vbe && vbe[0]) {
+ add_system_message(std::string("VLM: ") + vmodel + " via " + vbe);
+ screen_->Post(Event::Custom);
+ }
+
+ if (response && response[0]) {
+ add_response(response, "VLM");
+ voice_state_ = VoiceState::SPEAKING;
+ screen_->Post(Event::Custom);
+ rcli_speak(engine_, response);
+ RCLIVlmStats stats;
+ if (rcli_vlm_get_stats(engine_, &stats) == 0) {
+ char buf[128];
+ snprintf(buf, sizeof(buf), "⚡ %.1f tok/s | %d tokens | %.1fs total",
+ stats.gen_tok_per_sec, stats.generated_tokens, stats.total_time_sec);
+ add_system_message(buf);
+ }
+ } else {
+ add_response("(VLM not available. Requires llama.cpp engine and a VLM model. Use [M] → VLM Models to download.)", "");
+ }
+ voice_state_ = VoiceState::IDLE;
+ {
+ pid_t pid;
+ const char* argv[] = {"open", photo_path.c_str(), nullptr};
+ posix_spawnp(&pid, "open", nullptr, nullptr,
+                       const_cast<char* const*>(argv), environ);
+ }
+ screen_->Post(Event::Custom);
+ }).detach();
+ }
+
+ void run_screen_vlm(const std::string& prompt) {
+ char app_name[256];
+ screen_capture_target_app_name(app_name, sizeof(app_name));
+ add_system_message(std::string("Capturing screenshot of ") + app_name + "...");
+ voice_state_ = VoiceState::THINKING;
+ std::string prompt_copy = prompt;
+ std::thread([this, prompt_copy]() {
+ std::string screen_path = "/tmp/rcli_screen_" +
+ std::to_string(std::chrono::system_clock::now().time_since_epoch().count()) + ".jpg";
+ int rc = screen_capture_screenshot(screen_path.c_str());
+ if (rc != 0) {
+ add_response("(Screen capture failed. Check screen recording permissions.)", "");
+ voice_state_ = VoiceState::IDLE;
+ screen_->Post(Event::Custom);
+ return;
+ }
+ add_system_message("Loading VLM...");
+ screen_->Post(Event::Custom);
+
+ std::string accumulated;
+ auto stream_cb = [](const char* event, const char* data, void* ud) {
+                auto* accum = static_cast<std::string*>(ud);
+ if (std::strcmp(event, "token") == 0) {
+ accum->append(data);
+ }
+ };
+ int vlm_rc = rcli_vlm_analyze_stream(engine_, screen_path.c_str(),
+ prompt_copy.c_str(), stream_cb, &accumulated);
+
+ // Show which backend handled it
+ const char* vbe = rcli_vlm_backend_name(engine_);
+ const char* vmodel = rcli_vlm_model_name(engine_);
+ if (vbe && vbe[0]) {
+ add_system_message(std::string("VLM: ") + vmodel + " via " + vbe);
+ screen_->Post(Event::Custom);
+ }
+
+ if (vlm_rc == 0 && !accumulated.empty()) {
+ add_response(accumulated, "VLM");
+ voice_state_ = VoiceState::SPEAKING;
+ screen_->Post(Event::Custom);
+ rcli_speak_streaming(engine_, accumulated.c_str(), nullptr, nullptr);
+ RCLIVlmStats stats;
+ if (rcli_vlm_get_stats(engine_, &stats) == 0) {
+ char buf[128];
+ snprintf(buf, sizeof(buf), "⚡ %.1f tok/s | %d tokens | %.1fs total",
+ stats.gen_tok_per_sec, stats.generated_tokens, stats.total_time_sec);
+ add_system_message(buf);
+ }
+ } else {
+ add_response("(VLM not available. Requires llama.cpp engine and a VLM model. Use [M] → VLM Models to download.)", "");
+ }
+ voice_state_ = VoiceState::IDLE;
+ screen_->Post(Event::Custom);
+ }).detach();
+ }
+
void process_input(const std::string& input) {
if (input.empty()) return;
@@ -2202,6 +2404,26 @@ class TuiApp {
return;
}
+ if (cmd == "visual") {
+ if (screen_capture_overlay_active()) {
+ screen_capture_hide_overlay();
+ add_system_message("Visual mode OFF");
+ } else {
+ screen_capture_show_overlay(0, 0, 0, 0);
+ add_system_message("Visual mode ON — drag/resize the green frame, then ask a question");
+ }
+ return;
+ }
+
+ if (cmd == "screen" || cmd == "screenshot") {
+ run_screen_vlm("Describe what you see on this screen in detail.");
+ return;
+ }
+
+ if (cmd == "camera" || cmd == "photo" || cmd == "webcam") {
+ run_camera_vlm("Describe what you see in this photo in detail.");
+ return;
+ }
if (!engine_) {
add_response("Engine not initialized.", "");
@@ -2340,6 +2562,34 @@ class TuiApp {
struct stat path_st;
if (!resolved.empty() && resolved[0] == '/' && stat(resolved.c_str(), &path_st) == 0) {
+ // Check if this is an image file → route to VLM analysis
+ if (S_ISREG(path_st.st_mode) && rastack::VlmEngine::is_supported_image(resolved)) {
+ add_system_message("Image detected: " + resolved);
+ add_system_message("Analyzing image with VLM...");
+ voice_state_ = VoiceState::THINKING;
+ std::string path_copy = resolved;
+ std::thread([this, path_copy]() {
+ const char* response = rcli_vlm_analyze(
+ engine_, path_copy.c_str(), "Describe this image in detail.");
+ if (response && response[0]) {
+ add_response(response, "VLM");
+ RCLIVlmStats stats;
+ if (rcli_vlm_get_stats(engine_, &stats) == 0) {
+ char buf[128];
+ snprintf(buf, sizeof(buf), "⚡ %.1f tok/s | %d tokens | %.1fs total",
+ stats.gen_tok_per_sec, stats.generated_tokens, stats.total_time_sec);
+ add_system_message(buf);
+ }
+ } else {
+ add_response("(VLM not available. Requires llama.cpp engine and a VLM model. Use [M] → VLM Models to download.)", "");
+ }
+ voice_state_ = VoiceState::IDLE;
+ screen_->Post(Event::Custom);
+ }).detach();
+ return;
+ }
+
+ // Non-image path → RAG ingest
add_system_message("Detected path: " + resolved);
add_system_message("Indexing for RAG... this may take a moment.");
std::string path_copy = resolved;
diff --git a/src/engines/metalrt_loader.cpp b/src/engines/metalrt_loader.cpp
index 7dd5363..ba0f1c8 100644
--- a/src/engines/metalrt_loader.cpp
+++ b/src/engines/metalrt_loader.cpp
@@ -186,6 +186,22 @@ bool MetalRTLoader::load() {
LOG_DEBUG("MetalRT", "TTS symbols: tts_create=%p tts_synthesize=%p tts_sample_rate=%p",
(void*)tts_create, (void*)tts_synthesize, (void*)tts_sample_rate);
+ // Vision (VLM) symbols (optional)
+ vision_create = resolve("metalrt_vision_create");
+ vision_destroy = resolve("metalrt_vision_destroy");
+ vision_load = resolve("metalrt_vision_load");
+ vision_analyze = resolve("metalrt_vision_analyze");
+ vision_analyze_stream = resolve("metalrt_vision_analyze_stream");
+ vision_generate = resolve("metalrt_vision_generate");
+ vision_generate_stream = resolve("metalrt_vision_generate_stream");
+ vision_reset = resolve("metalrt_vision_reset");
+ vision_model_name = resolve("metalrt_vision_model_name");
+ vision_device_name = resolve("metalrt_vision_device_name");
+ vision_free_result = resolve("metalrt_vision_free_result");
+
+ LOG_DEBUG("MetalRT", "VLM symbols: vision_create=%p vision_analyze=%p vision_stream=%p",
+ (void*)vision_create, (void*)vision_analyze, (void*)vision_analyze_stream);
+
if (!fn_abi_version_ || !create || !destroy || !load_model || !generate) {
LOG_ERROR("MetalRT", "dylib missing required LLM symbols: abi=%p create=%p destroy=%p load=%p gen=%p",
(void*)fn_abi_version_, (void*)create, (void*)destroy, (void*)load_model, (void*)generate);
diff --git a/src/engines/metalrt_loader.h b/src/engines/metalrt_loader.h
index 6d6b0b8..41247ed 100644
--- a/src/engines/metalrt_loader.h
+++ b/src/engines/metalrt_loader.h
@@ -128,6 +128,47 @@ class MetalRTLoader {
TtsFreeAudioFn tts_free_audio = nullptr;
TtsSampleRateFn tts_sample_rate = nullptr;
+ // --- Vision (VLM) function pointers ---
+
+ struct MetalRTVisionResult {
+ const char* text;
+ const char* thinking;
+ const char* response;
+ int prompt_tokens;
+ int generated_tokens;
+ double vision_encode_ms;
+ double prefill_ms;
+ double decode_ms;
+ double tps;
+ };
+
+ struct MetalRTVisionOptions {
+ int max_tokens;
+ int top_k;
+ float temperature;
+ bool think;
+ };
+
+ using VisionAnalyzeFn = MetalRTVisionResult (*)(void*, const char*, const char*, const MetalRTVisionOptions*);
+ using VisionAnalyzeStreamFn = MetalRTVisionResult (*)(void*, const char*, const char*, MetalRTStreamCb, void*, const MetalRTVisionOptions*);
+ using VisionGenerateFn = MetalRTVisionResult (*)(void*, const char*, const MetalRTVisionOptions*);
+ using VisionGenerateStreamFn = MetalRTVisionResult (*)(void*, const char*, MetalRTStreamCb, void*, const MetalRTVisionOptions*);
+ using VisionFreeResultFn = void (*)(MetalRTVisionResult);
+
+ CreateFn vision_create = nullptr;
+ DestroyFn vision_destroy = nullptr;
+ LoadFn vision_load = nullptr;
+ VisionAnalyzeFn vision_analyze = nullptr;
+ VisionAnalyzeStreamFn vision_analyze_stream = nullptr;
+ VisionGenerateFn vision_generate = nullptr;
+ VisionGenerateStreamFn vision_generate_stream = nullptr;
+ ResetFn vision_reset = nullptr;
+ ModelNameFn vision_model_name = nullptr;
+ DeviceNameFn vision_device_name = nullptr;
+ VisionFreeResultFn vision_free_result = nullptr;
+
+ bool has_vision() const { return vision_create != nullptr && vision_analyze != nullptr; }
+
// --- Install / remove / version management ---
static bool install(const std::string& version = "latest");
diff --git a/src/engines/tts_engine.cpp b/src/engines/tts_engine.cpp
index cf5cd95..b139960 100644
--- a/src/engines/tts_engine.cpp
+++ b/src/engines/tts_engine.cpp
@@ -77,9 +77,26 @@ bool TtsEngine::init(const TtsConfig& config) {
return true;
}
+bool TtsEngine::reinit() {
+ if (!initialized_) return false;
+ LOG_DEBUG("TTS", "Reinitializing ONNX session to prevent audio degradation");
+ if (tts_) {
+ SherpaOnnxDestroyOfflineTts(tts_);
+ tts_ = nullptr;
+ }
+ initialized_ = false;
+ synth_count_ = 0;
+ return init(config_);
+}
+
+std::vector<float> TtsEngine::synthesize(const std::string& text) {
if (!initialized_ || !tts_) return {};
+ // Periodically reinit to prevent audio quality degradation
+ if (++synth_count_ >= kReinitInterval) {
+ reinit();
+ }
+
stats_ = TtsStats{};
int64_t t_start = now_us();
diff --git a/src/engines/tts_engine.h b/src/engines/tts_engine.h
index 40c36e9..90b9018 100644
--- a/src/engines/tts_engine.h
+++ b/src/engines/tts_engine.h
@@ -63,12 +63,18 @@ class TtsEngine {
// Change speaker at runtime (Kokoro multi-voice)
void set_speaker_id(int id) { config_.speaker_id = id; }
+ // Reinitialize the ONNX Runtime session to flush accumulated state.
+ // Call periodically to prevent audio degradation over long sessions.
+ bool reinit();
+
private:
const SherpaOnnxOfflineTts* tts_ = nullptr;
TtsConfig config_;
TtsStats stats_;
int sample_rate_ = 22050;
bool initialized_ = false;
+ int synth_count_ = 0; // synthesis calls since last reinit
+ static constexpr int kReinitInterval = 20; // reinit every N calls
};
} // namespace rastack
diff --git a/src/engines/vlm_engine.cpp b/src/engines/vlm_engine.cpp
new file mode 100644
index 0000000..1f2d09b
--- /dev/null
+++ b/src/engines/vlm_engine.cpp
@@ -0,0 +1,266 @@
+#include "engines/vlm_engine.h"
+#include "core/log.h"
+#include "llama.h"
+#include "ggml.h"
+#include "ggml-backend.h"
+#include "mtmd.h"
+#include "mtmd-helper.h"
+#include <algorithm>
+#include <cctype>
+#include <mutex>
+
+namespace rastack {
+
+VlmEngine::VlmEngine() = default;
+
+VlmEngine::~VlmEngine() {
+ shutdown();
+}
+
+void VlmEngine::shutdown() {
+ if (ctx_mtmd_) { mtmd_free(ctx_mtmd_); ctx_mtmd_ = nullptr; }
+ if (sampler_) { llama_sampler_free(sampler_); sampler_ = nullptr; }
+ if (ctx_) { llama_free(ctx_); ctx_ = nullptr; }
+ if (model_) { llama_model_free(model_); model_ = nullptr; }
+ vocab_ = nullptr;
+ initialized_ = false;
+ stats_ = VlmStats{};
+ LOG_DEBUG("VLM", "Shutdown complete");
+}
+
+bool VlmEngine::init(const VlmConfig& config) {
+ if (initialized_) shutdown();
+
+ config_ = config;
+
+ // Initialize backend (loads Metal, etc.) — safe to call multiple times
+ static std::once_flag backend_init_flag;
+ std::call_once(backend_init_flag, [] { ggml_backend_load_all(); });
+
+ // Load language model
+ llama_model_params model_params = llama_model_default_params();
+ model_params.n_gpu_layers = config.n_gpu_layers;
+ model_params.use_mmap = config.use_mmap;
+ model_params.use_mlock = config.use_mlock;
+
+ LOG_DEBUG("VLM", "Loading VLM model: %s", config.model_path.c_str());
+ model_ = llama_model_load_from_file(config.model_path.c_str(), model_params);
+ if (!model_) {
+ LOG_ERROR("VLM", "Failed to load VLM model");
+ return false;
+ }
+
+ vocab_ = llama_model_get_vocab(model_);
+
+ // Create inference context
+ llama_context_params ctx_params = llama_context_default_params();
+ ctx_params.n_ctx = config.n_ctx;
+ ctx_params.n_batch = config.n_batch;
+ ctx_params.n_threads = config.n_threads;
+ ctx_params.n_threads_batch = config.n_threads_batch;
+ ctx_params.no_perf = false;
+ ctx_params.flash_attn_type = config.flash_attn ? LLAMA_FLASH_ATTN_TYPE_AUTO : LLAMA_FLASH_ATTN_TYPE_DISABLED;
+
+ ctx_ = llama_init_from_model(model_, ctx_params);
+ if (!ctx_) {
+ LOG_ERROR("VLM", "Failed to create VLM context");
+ llama_model_free(model_);
+ model_ = nullptr;
+ return false;
+ }
+
+ // Initialize mtmd (vision projector)
+ LOG_DEBUG("VLM", "Loading vision projector: %s", config.mmproj_path.c_str());
+ mtmd_context_params mtmd_params = mtmd_context_params_default();
+ mtmd_params.use_gpu = (config.n_gpu_layers > 0);
+ mtmd_params.n_threads = config.n_threads_batch;
+ mtmd_params.flash_attn_type = config.flash_attn ? LLAMA_FLASH_ATTN_TYPE_AUTO : LLAMA_FLASH_ATTN_TYPE_DISABLED;
+
+ ctx_mtmd_ = mtmd_init_from_file(config.mmproj_path.c_str(), model_, mtmd_params);
+ if (!ctx_mtmd_) {
+ LOG_ERROR("VLM", "Failed to load vision projector (mmproj)");
+ llama_free(ctx_);
+ llama_model_free(model_);
+ ctx_ = nullptr;
+ model_ = nullptr;
+ return false;
+ }
+
+ if (!mtmd_support_vision(ctx_mtmd_)) {
+ LOG_ERROR("VLM", "Model does not support vision input");
+ mtmd_free(ctx_mtmd_);
+ llama_free(ctx_);
+ llama_model_free(model_);
+ ctx_mtmd_ = nullptr;
+ ctx_ = nullptr;
+ model_ = nullptr;
+ return false;
+ }
+
+ // Setup sampler chain
+ auto sparams = llama_sampler_chain_default_params();
+ sampler_ = llama_sampler_chain_init(sparams);
+ if (config.temperature > 0.0f) {
+ llama_sampler_chain_add(sampler_, llama_sampler_init_temp(config.temperature));
+ llama_sampler_chain_add(sampler_, llama_sampler_init_top_k(config.top_k));
+ llama_sampler_chain_add(sampler_, llama_sampler_init_top_p(config.top_p, 1));
+ llama_sampler_chain_add(sampler_, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
+ } else {
+ llama_sampler_chain_add(sampler_, llama_sampler_init_greedy());
+ }
+
+ initialized_ = true;
+ LOG_INFO("VLM", "Initialized (vision support: yes)");
+ return true;
+}
+
+std::string VlmEngine::analyze_image(const std::string& image_path,
+ const std::string& prompt,
+ TokenCallback on_token) {
+ if (!initialized_) return "";
+
+ cancelled_.store(false, std::memory_order_relaxed);
+ stats_ = VlmStats{};
+
+ // Clear KV cache
+ llama_memory_clear(llama_get_memory(ctx_), true);
+ if (sampler_) llama_sampler_reset(sampler_);
+
+ // 1. Load image
+ LOG_DEBUG("VLM", "Loading image: %s", image_path.c_str());
+ mtmd_bitmap* bitmap = mtmd_helper_bitmap_init_from_file(ctx_mtmd_, image_path.c_str());
+ if (!bitmap) {
+ LOG_ERROR("VLM", "Failed to load image: %s", image_path.c_str());
+ return "";
+ }
+
+ // 2. Build prompt with media marker using ChatML template (Qwen3-VL format)
+ // The model expects: <|im_start|>system\n...<|im_end|>\n<|im_start|>user\n\nprompt<|im_end|>\n<|im_start|>assistant\n
+ std::string marker = mtmd_default_marker();
+ std::string full_prompt =
+ "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
+ "<|im_start|>user\n" + marker + "\n" + prompt + "<|im_end|>\n"
+ "<|im_start|>assistant\n";
+
+ mtmd_input_text input_text;
+ input_text.text = full_prompt.c_str();
+ input_text.add_special = true;
+ input_text.parse_special = true;
+
+ // 3. Tokenize (combines text tokens + image tokens)
+ mtmd_input_chunks* chunks = mtmd_input_chunks_init();
+ const mtmd_bitmap* bitmap_ptr = bitmap;
+
+ int64_t t_encode_start = now_us();
+ int32_t tokenize_result = mtmd_tokenize(ctx_mtmd_, chunks, &input_text, &bitmap_ptr, 1);
+ if (tokenize_result != 0) {
+ LOG_ERROR("VLM", "Failed to tokenize image+text (error=%d)", tokenize_result);
+ mtmd_input_chunks_free(chunks);
+ mtmd_bitmap_free(bitmap);
+ return "";
+ }
+
+ size_t n_tokens = mtmd_helper_get_n_tokens(chunks);
+ stats_.prompt_tokens = n_tokens;
+ LOG_DEBUG("VLM", "Tokenized: %zu total tokens (text + image)", n_tokens);
+
+ // 4. Evaluate all chunks (text + image encoding + decoding)
+ int64_t t_prompt_start = now_us();
+ llama_pos n_past = 0;
+ int32_t eval_result = mtmd_helper_eval_chunks(
+ ctx_mtmd_, ctx_, chunks,
+ n_past, // n_past
+ 0, // seq_id
+ config_.n_batch, // n_batch
+ true, // logits_last
+ &n_past // updated n_past
+ );
+
+ stats_.image_encode_us = now_us() - t_encode_start;
+ stats_.prompt_eval_us = now_us() - t_prompt_start;
+
+ // Clean up image resources
+ mtmd_input_chunks_free(chunks);
+ mtmd_bitmap_free(bitmap);
+
+ if (eval_result != 0) {
+ LOG_ERROR("VLM", "Failed to evaluate image+text chunks (error=%d)", eval_result);
+ return "";
+ }
+
+ LOG_DEBUG("VLM", "Image encoded in %.1fms, prompt eval in %.1fms",
+ stats_.image_encode_us / 1000.0, stats_.prompt_eval_us / 1000.0);
+
+ // 5. Generate tokens (same pattern as LlmEngine::generate)
+ std::string result;
+ int64_t t_gen_start = now_us();
+ bool first_token = true;
+
+ for (int i = 0; i < config_.max_tokens; i++) {
+ if (cancelled_.load(std::memory_order_relaxed)) {
+ LOG_DEBUG("VLM", "Generation cancelled");
+ break;
+ }
+
+ int32_t new_token = llama_sampler_sample(sampler_, ctx_, -1);
+
+ if (first_token) {
+ stats_.first_token_us = now_us() - t_prompt_start;
+ first_token = false;
+ }
+
+ if (llama_vocab_is_eog(vocab_, new_token)) {
+ break;
+ }
+
+ // Decode token to text
+ char buf[256];
+ int n = llama_token_to_piece(vocab_, new_token, buf, sizeof(buf), 0, true);
+ if (n < 0) continue;
+ std::string piece(buf, n);
+
+ result += piece;
+ stats_.generated_tokens++;
+
+ if (on_token) {
+ TokenOutput tok;
+ tok.text = piece;
+ tok.token_id = new_token;
+ tok.is_eos = false;
+ tok.is_tool_call = false;
+ on_token(tok);
+ }
+
+ // Feed token back for next iteration
+ llama_batch batch = llama_batch_get_one(&new_token, 1);
+ if (llama_decode(ctx_, batch) != 0) {
+ LOG_ERROR("VLM", "Failed to decode token");
+ break;
+ }
+ }
+
+ stats_.generation_us = now_us() - t_gen_start;
+
+ LOG_DEBUG("VLM", "Generated %lld tokens (%.1f tok/s), first token: %.1fms",
+ stats_.generated_tokens, stats_.gen_tps(),
+ stats_.first_token_us / 1000.0);
+
+ return result;
+}
+
+bool VlmEngine::is_supported_image(const std::string& path) {
+ // Get extension (case-insensitive)
+ auto dot = path.rfind('.');
+ if (dot == std::string::npos) return false;
+
+ std::string ext = path.substr(dot);
+ // Convert to lowercase
+ std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
+
+ return ext == ".jpg" || ext == ".jpeg" ||
+ ext == ".png" || ext == ".bmp" ||
+ ext == ".gif" || ext == ".webp" ||
+ ext == ".tga";
+}
+
+} // namespace rastack
diff --git a/src/engines/vlm_engine.h b/src/engines/vlm_engine.h
new file mode 100644
index 0000000..57739a2
--- /dev/null
+++ b/src/engines/vlm_engine.h
@@ -0,0 +1,88 @@
+#pragma once
+
+#include "core/types.h"
+#include <string>
+#include <atomic>
+#include <cstdint>
+
+// Forward declare llama types
+struct llama_model;
+struct llama_context;
+struct llama_sampler;
+struct llama_vocab;
+
+// Forward declare mtmd types
+struct mtmd_context;
+
+namespace rastack {
+
+struct VlmConfig {
+ std::string model_path; // Path to VLM language model GGUF
+ std::string mmproj_path; // Path to vision projector (mmproj) GGUF
+ int n_gpu_layers = 99;
+ int n_ctx = 4096; // VLM needs larger context for image tokens
+ int n_batch = 512;
+ int n_threads = 1;
+ int n_threads_batch = 8;
+ float temperature = 0.7f;
+ float top_p = 0.9f;
+ int top_k = 40;
+ int max_tokens = 512;
+ bool use_mmap = true;
+ bool use_mlock = false;
+ bool flash_attn = true;
+};
+
+struct VlmStats {
+ int64_t prompt_tokens = 0;
+ int64_t generated_tokens = 0;
+ int64_t prompt_eval_us = 0;
+ int64_t generation_us = 0;
+ int64_t image_encode_us = 0; // Time spent encoding the image
+ double prompt_tps() const { return prompt_tokens > 0 ? prompt_tokens * 1e6 / prompt_eval_us : 0; }
+ double gen_tps() const { return generated_tokens > 0 ? generated_tokens * 1e6 / generation_us : 0; }
+ int64_t first_token_us = 0;
+};
+
+class VlmEngine {
+public:
+ VlmEngine();
+ ~VlmEngine();
+
+ // Initialize model + vision projector
+ bool init(const VlmConfig& config);
+
+ // Release all resources
+ void shutdown();
+
+ // Analyze an image with a text prompt
+ // Returns the generated description/analysis text
+ std::string analyze_image(const std::string& image_path,
+ const std::string& prompt,
+ TokenCallback on_token = nullptr);
+
+ // Cancel ongoing generation
+ void cancel() { cancelled_.store(true, std::memory_order_release); }
+
+ // Get stats from last generation
+ const VlmStats& last_stats() const { return stats_; }
+
+ bool is_initialized() const { return initialized_; }
+
+ // Check if an image file is a supported format
+ static bool is_supported_image(const std::string& path);
+
+private:
+ llama_model* model_ = nullptr;
+ llama_context* ctx_ = nullptr;
+ llama_sampler* sampler_ = nullptr;
+ const llama_vocab* vocab_ = nullptr;
+ mtmd_context* ctx_mtmd_ = nullptr;
+
+ VlmConfig config_;
+ VlmStats stats_;
+ bool initialized_ = false;
+ std::atomic<bool> cancelled_{false};
+};
+
+} // namespace rastack
diff --git a/src/models/model_registry.h b/src/models/model_registry.h
index 79d3da4..e0084d1 100644
--- a/src/models/model_registry.h
+++ b/src/models/model_registry.h
@@ -287,7 +287,7 @@ inline bool is_metalrt_model_installed(const LlmModelDef& m) {
struct MetalRTComponentModel {
std::string id;
std::string name;
- std::string component; // "stt" or "tts"
+ std::string component; // "stt", "tts", or "vlm"
std::string hf_repo; // HuggingFace repo path (org/repo)
std::string hf_subdir; // subdirectory within repo (empty for flat repos)
std::string dir_name; // local dir under metalrt_models_dir()
@@ -350,6 +350,7 @@ inline std::vector<MetalRTComponentModel> metalrt_component_models() {
};
}
+
inline bool is_metalrt_component_installed(const MetalRTComponentModel& m) {
std::string dir = metalrt_models_dir() + "/" + m.dir_name;
if (access(dir.c_str(), R_OK) != 0) return false;
diff --git a/src/models/vlm_model_registry.h b/src/models/vlm_model_registry.h
new file mode 100644
index 0000000..5556d7a
--- /dev/null
+++ b/src/models/vlm_model_registry.h
@@ -0,0 +1,94 @@
+#pragma once
+// =============================================================================
+// RCLI VLM Model Registry
+// =============================================================================
+//
+// Registry of supported VLM (Vision Language Model) models.
+// Each model consists of a language model GGUF + an mmproj (vision projector) GGUF.
+//
+// =============================================================================
+
+#include <string>
+#include <vector>
+#include <unistd.h>
+
+namespace rcli {
+
+struct VlmModelDef {
+ std::string id; // Unique slug: "smolvlm-500m"
+ std::string name; // Display name: "SmolVLM 500M Instruct"
+ std::string model_filename; // Language model GGUF filename
+ std::string mmproj_filename; // Vision projector GGUF filename
+ std::string model_url; // HuggingFace download URL for language model
+ std::string mmproj_url; // HuggingFace download URL for mmproj
+ int model_size_mb; // Approximate model download size
+ int mmproj_size_mb; // Approximate mmproj download size
+ std::string description; // One-line description
+ bool is_default; // Default model for `rcli vlm`
+};
+
+inline std::vector<VlmModelDef> all_vlm_models() {
+ return {
+ {
+ /* id */ "qwen3-vl-2b",
+ /* name */ "Qwen3 VL 2B Instruct",
+ /* model_filename */ "Qwen3-VL-2B-Instruct-Q8_0.gguf",
+ /* mmproj_filename */ "mmproj-Qwen3-VL-2B-Instruct-Q8_0.gguf",
+ /* model_url */ "https://huggingface.co/ggml-org/Qwen3-VL-2B-Instruct-GGUF/resolve/main/Qwen3-VL-2B-Instruct-Q8_0.gguf",
+ /* mmproj_url */ "https://huggingface.co/ggml-org/Qwen3-VL-2B-Instruct-GGUF/resolve/main/mmproj-Qwen3-VL-2B-Instruct-Q8_0.gguf",
+ /* model_size_mb */ 1830,
+ /* mmproj_size_mb */ 445,
+ /* description */ "Qwen3 Vision-Language model. High quality image analysis.",
+ /* is_default */ false,
+ },
+ {
+ /* id */ "lfm2-vl-1.6b",
+ /* name */ "Liquid LFM2 VL 1.6B",
+ /* model_filename */ "LFM2-VL-1.6B-Q8_0.gguf",
+ /* mmproj_filename */ "mmproj-LFM2-VL-1.6B-Q8_0.gguf",
+ /* model_url */ "https://huggingface.co/LiquidAI/LFM2-VL-1.6B-GGUF/resolve/main/LFM2-VL-1.6B-Q8_0.gguf",
+ /* mmproj_url */ "https://huggingface.co/LiquidAI/LFM2-VL-1.6B-GGUF/resolve/main/mmproj-LFM2-VL-1.6B-Q8_0.gguf",
+ /* model_size_mb */ 1250,
+ /* mmproj_size_mb */ 210,
+ /* description */ "Liquid Foundation Model for vision. Fast, 128K context.",
+ /* is_default */ false,
+ },
+ {
+ /* id */ "smolvlm-500m",
+ /* name */ "SmolVLM 500M Instruct",
+ /* model_filename */ "SmolVLM-500M-Instruct-Q8_0.gguf",
+ /* mmproj_filename */ "mmproj-SmolVLM-500M-Instruct-Q8_0.gguf",
+ /* model_url */ "https://huggingface.co/ggml-org/SmolVLM-500M-Instruct-GGUF/resolve/main/SmolVLM-500M-Instruct-Q8_0.gguf",
+ /* mmproj_url */ "https://huggingface.co/ggml-org/SmolVLM-500M-Instruct-GGUF/resolve/main/mmproj-SmolVLM-500M-Instruct-Q8_0.gguf",
+ /* model_size_mb */ 437,
+ /* mmproj_size_mb */ 109,
+ /* description */ "Smallest VLM. Fast image analysis, lower quality.",
+ /* is_default */ false,
+ },
+ };
+}
+
+inline std::pair<bool, VlmModelDef> get_default_vlm_model() {
+ auto models = all_vlm_models();
+ for (auto& m : models) {
+ if (m.is_default) return {true, m};
+ }
+ return {false, {}};
+}
+
+inline bool is_vlm_model_installed(const std::string& models_dir, const VlmModelDef& m) {
+ std::string model_path = models_dir + "/" + m.model_filename;
+ std::string mmproj_path = models_dir + "/" + m.mmproj_filename;
+ return access(model_path.c_str(), R_OK) == 0 &&
+ access(mmproj_path.c_str(), R_OK) == 0;
+}
+
+inline std::pair<bool, VlmModelDef> find_installed_vlm(const std::string& models_dir) {
+ auto models = all_vlm_models();
+ for (auto& m : models) {
+ if (is_vlm_model_installed(models_dir, m)) return {true, m};
+ }
+ return {false, {}};
+}
+
+} // namespace rcli
diff --git a/src/pipeline/orchestrator.h b/src/pipeline/orchestrator.h
index 8648374..51a5527 100644
--- a/src/pipeline/orchestrator.h
+++ b/src/pipeline/orchestrator.h
@@ -6,6 +6,7 @@
#include "core/ring_buffer.h"
#include "engines/stt_engine.h"
#include "engines/llm_engine.h"
+#include "engines/vlm_engine.h"
#include "engines/metalrt_engine.h"
#include "engines/metalrt_stt_engine.h"
#include "engines/metalrt_tts_engine.h"
@@ -93,12 +94,16 @@ class Orchestrator {
VadEngine& vad() { return vad_; }
ToolEngine& tools() { return tools_; }
AudioIO& audio() { return audio_; }
+ VlmEngine& vlm() { return vlm_; }
RingBuffer* playback_ring_buffer() { return playback_rb_.get(); }
// Active LLM backend
LlmBackend active_llm_backend() const { return active_backend_; }
bool using_metalrt() const { return active_backend_ == LlmBackend::METALRT; }
+ // Access the pipeline config (e.g. for MetalRT model dir during VLM swap)
+ const PipelineConfig& config() const { return config_; }
+
// Update the base system prompt (e.g. when personality changes)
void set_system_prompt(const std::string& prompt) { config_.system_prompt = prompt; }
@@ -168,6 +173,7 @@ class Orchestrator {
SttEngine stt_;
OfflineSttEngine offline_stt_; // Whisper for file pipeline
LlmEngine llm_;
+ VlmEngine vlm_;
MetalRTEngine metalrt_;
MetalRTSttEngine metalrt_stt_;
MetalRTTtsEngine metalrt_tts_;
diff --git a/src/pipeline/text_sanitizer.h b/src/pipeline/text_sanitizer.h
index b21b1a0..5c454a3 100644
--- a/src/pipeline/text_sanitizer.h
+++ b/src/pipeline/text_sanitizer.h
@@ -73,6 +73,33 @@ inline std::string sanitize_for_tts(const std::string& text) {
out = std::move(cleaned);
}
+ // 4b. Strip emote/action markers like *laughs*, *sighs*, *smiles*, etc.
+ // These are non-speakable stage directions that LLMs often generate.
+ {
+ std::string cleaned;
+ cleaned.reserve(out.size());
+ for (size_t i = 0; i < out.size(); i++) {
+ if (out[i] == '*') {
+ size_t close = out.find('*', i + 1);
+ if (close != std::string::npos && close - i <= 30) {
+ // Check it looks like an emote (single word or short phrase, no nested formatting)
+ bool is_emote = true;
+ for (size_t j = i + 1; j < close; j++) {
+ if (out[j] == '*' || out[j] == '\n') { is_emote = false; break; }
+ }
+ if (is_emote) {
+ i = close; // skip past closing *
+ // Also skip trailing space if present
+ if (i + 1 < out.size() && out[i + 1] == ' ') i++;
+ continue;
+ }
+ }
+ }
+ cleaned += out[i];
+ }
+ out = std::move(cleaned);
+ }
+
// 5. Strip markdown symbols and non-speakable formatting
{
std::string cleaned;
@@ -215,6 +242,84 @@ inline std::string sanitize_for_tts(const std::string& text) {
}
}
+ // 6c. Replace brand names / proper nouns that G2P spells letter-by-letter
+ // with phonetic approximations so TTS pronounces them naturally.
+ {
+ struct Phonetic { const char* from; const char* to; };
+ static const Phonetic table[] = {
+ {"Spotify", "Spotifye"},
+ {"spotify", "spotifye"},
+ {"SPOTIFY", "Spotifye"},
+ {"YouTube", "You Tube"},
+ {"Youtube", "You Tube"},
+ {"youtube", "you tube"},
+ {"YOUTUBE", "You Tube"},
+ {"WiFi", "Why Fye"},
+ {"wifi", "why fye"},
+ {"WIFI", "Why Fye"},
+ {"Wi-Fi", "Why Fye"},
+ {"iPhone", "eye phone"},
+ {"iphone", "eye phone"},
+ {"IPHONE", "eye phone"},
+ {"iPad", "eye pad"},
+ {"ipad", "eye pad"},
+ {"IPAD", "eye pad"},
+ {"macOS", "mac O S"},
+ {"MacOS", "mac O S"},
+ {"iOS", "eye O S"},
+ {"AirPods", "Air Pods"},
+ {"airpods", "air pods"},
+ {"AIRPODS", "Air Pods"},
+ {"ChatGPT", "Chat G P T"},
+ {"WhatsApp", "Whats App"},
+ {"whatsapp", "whats app"},
+ {"WHATSAPP", "Whats App"},
+ {"TikTok", "Tick Tock"},
+ {"tiktok", "tick tock"},
+ {"TIKTOK", "Tick Tock"},
+ {"LinkedIn", "Linked In"},
+ {"linkedin", "linked in"},
+ {"LINKEDIN", "Linked In"},
+ };
+ for (auto& p : table) {
+ std::string needle(p.from);
+ std::string replacement(p.to);
+ size_t pos = 0;
+ while ((pos = out.find(needle, pos)) != std::string::npos) {
+ bool left_ok = (pos == 0 || out[pos - 1] == ' ' || out[pos - 1] == '\n' ||
+ out[pos - 1] == '"' || out[pos - 1] == '\'');
+ size_t end = pos + needle.size();
+ bool right_ok = (end >= out.size() || out[end] == ' ' || out[end] == ',' ||
+ out[end] == '.' || out[end] == '!' || out[end] == '?' ||
+ out[end] == '\n' || out[end] == ';' || out[end] == ':' ||
+ out[end] == '\'' || out[end] == '"');
+ if (left_ok && right_ok) {
+ out.replace(pos, needle.size(), replacement);
+ pos += replacement.size();
+ } else {
+ pos += needle.size();
+ }
+ }
+ }
+ }
+
+ // 6d. Replace hyphens between letters/words with spaces so G2P does not
+ // spell out hyphenated compounds (e.g. "well-known" → "well known").
+ {
+ std::string cleaned;
+ cleaned.reserve(out.size());
+ for (size_t i = 0; i < out.size(); i++) {
+ if (out[i] == '-' && i > 0 && i + 1 < out.size() &&
+ std::isalpha((unsigned char)out[i - 1]) &&
+ std::isalpha((unsigned char)out[i + 1])) {
+ cleaned += ' ';
+ } else {
+ cleaned += out[i];
+ }
+ }
+ out = std::move(cleaned);
+ }
+
// 7. Collapse multiple whitespace to single space, trim
{
std::string cleaned;
diff --git a/src/test/test_pipeline.cpp b/src/test/test_pipeline.cpp
index a4b7bfb..d73a1b8 100644
--- a/src/test/test_pipeline.cpp
+++ b/src/test/test_pipeline.cpp
@@ -783,31 +783,36 @@ static void test_metalrt_llm(const std::string& models_dir) {
engine.reset_conversation();
engine.generate("hi");
- // Benchmark 3 prompts
- const char* prompts[] = {
- "What is 2+2?",
- "Write a haiku about the sea.",
- "Explain gravity in one sentence.",
- };
-
- TEST_SECTION("MetalRT LLM Inference (Metal GPU)");
- for (int i = 0; i < 3; i++) {
+ // Benchmark across max_tokens sweep: 64, 128, 256, 512, 1024, 2048
+ const int token_limits[] = { 64, 128, 256, 512, 1024, 2048 };
+ const char* prompt = "Write a detailed essay about the history and future of artificial intelligence, "
+ "covering early pioneers, neural networks, deep learning breakthroughs, "
+ "large language models, and predictions for the next decade.";
+
+ TEST_SECTION("MetalRT LLM Token Sweep Benchmark (Metal GPU)");
+ fprintf(stderr, "\n \033[1;33m%-12s %8s %12s %10s %12s %10s %10s\033[0m\n",
+ "max_tokens", "gen_tok", "decode_ms", "tok/s", "prefill_ms", "pf_tok/s", "wall_ms");
+ fprintf(stderr, " \033[33m%s\033[0m\n",
+ "------------ -------- ------------ ---------- ------------ ---------- ----------");
+
+ for (int limit : token_limits) {
+ engine.set_max_tokens(limit);
+ engine.set_ignore_eos(true);
engine.reset_conversation();
+
t0 = std::chrono::steady_clock::now();
- std::string result = engine.generate(prompts[i]);
+ std::string result = engine.generate(prompt);
double gen_ms = elapsed_ms(t0);
const auto& stats = engine.last_stats();
- TEST_INFO("--- Run %d ---", i + 1);
- TEST_INFO(" Prompt: \"%s\"", prompts[i]);
- TEST_INFO(" Response: \"%.*s%s\"", (int)std::min(result.size(), (size_t)80),
- result.c_str(), result.size() > 80 ? "..." : "");
- TEST_INFO(" Backend: MetalRT (Metal GPU)");
- TEST_INFO(" Prefill: %.1f ms (%d tokens, %.0f tok/s)",
- stats.prompt_eval_us / 1000.0, stats.prompt_tokens, stats.prompt_tps());
- TEST_INFO(" Decode: %.1f ms (%d tokens, %.0f tok/s)",
- stats.generation_us / 1000.0, stats.generated_tokens, stats.gen_tps());
- TEST_INFO(" Wall: %.1f ms", gen_ms);
+ fprintf(stderr, " %-12d %8d %10.1f ms %8.1f %10.1f ms %8.0f %8.1f ms\n",
+ limit,
+ stats.generated_tokens,
+ stats.generation_us / 1000.0,
+ stats.gen_tps(),
+ stats.prompt_eval_us / 1000.0,
+ stats.prompt_tps(),
+ gen_ms);
TEST("run produces output", !result.empty());
}
}