diff --git a/CMakeLists.txt b/CMakeLists.txt index 00f5224..e9515d9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,6 +46,10 @@ set(LLAMA_BUILD_TESTS OFF CACHE BOOL "" FORCE) set(LLAMA_BUILD_SERVER OFF CACHE BOOL "" FORCE) add_subdirectory(deps/llama.cpp ${CMAKE_BINARY_DIR}/llama.cpp EXCLUDE_FROM_ALL) +# --- libmtmd (multimodal/vision support from llama.cpp) --- +set(LLAMA_INSTALL_VERSION "0.0.0" CACHE STRING "" FORCE) +add_subdirectory(deps/llama.cpp/tools/mtmd ${CMAKE_BINARY_DIR}/mtmd EXCLUDE_FROM_ALL) + # --- sherpa-onnx (STT + TTS + VAD) --- set(SHERPA_ONNX_ENABLE_C_API ON CACHE BOOL "Enable C API" FORCE) set(SHERPA_ONNX_ENABLE_BINARY OFF CACHE BOOL "" FORCE) @@ -99,8 +103,11 @@ add_library(rcli STATIC src/engines/metalrt_engine.cpp src/engines/metalrt_stt_engine.cpp src/engines/metalrt_tts_engine.cpp + src/engines/vlm_engine.cpp src/audio/audio_io.cpp src/audio/mic_permission.mm + src/audio/camera_capture.mm + src/audio/screen_capture.mm src/pipeline/orchestrator.cpp src/pipeline/sentence_detector.cpp src/tools/tool_engine.cpp @@ -133,13 +140,14 @@ add_library(rcli STATIC src/api/rcli_api.cpp ) -set_source_files_properties(src/audio/mic_permission.mm +set_source_files_properties(src/audio/mic_permission.mm src/audio/camera_capture.mm src/audio/screen_capture.mm PROPERTIES LANGUAGE CXX) target_include_directories(rcli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/src ${CMAKE_CURRENT_SOURCE_DIR}/deps/llama.cpp/include ${CMAKE_CURRENT_SOURCE_DIR}/deps/llama.cpp/ggml/include + ${CMAKE_CURRENT_SOURCE_DIR}/deps/llama.cpp/tools/mtmd ${CMAKE_CURRENT_SOURCE_DIR}/deps/sherpa-onnx/sherpa-onnx/c-api ${usearch_SOURCE_DIR}/include ) @@ -147,12 +155,18 @@ target_include_directories(rcli PUBLIC target_link_libraries(rcli PUBLIC llama ggml + mtmd sherpa-onnx-c-api "-framework CoreAudio" "-framework AudioToolbox" "-framework AudioUnit" "-framework Foundation" "-framework AVFoundation" + "-framework AppKit" + "-framework CoreImage" + "-framework CoreMedia" + "-framework CoreVideo" 
+ "-framework CoreGraphics" "-framework IOKit" ) @@ -186,6 +200,27 @@ target_compile_definitions(rcli_cli PRIVATE RCLI_VERSION="${PROJECT_VERSION}" ) +# ============================================================================= +# rcli_overlay — standalone Cocoa helper for visual overlay window +# ============================================================================= +add_executable(rcli_overlay + src/audio/rcli_overlay.m +) + +set_source_files_properties(src/audio/rcli_overlay.m PROPERTIES LANGUAGE CXX) + +target_compile_options(rcli_overlay PRIVATE -x objective-c++) + +target_link_libraries(rcli_overlay PRIVATE + "-framework AppKit" + "-framework CoreGraphics" +) + +set_target_properties(rcli_overlay PROPERTIES + OUTPUT_NAME "rcli_overlay" + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" +) + # ============================================================================= # rcli_test — test executable # ============================================================================= diff --git a/README.md b/README.md index dcefc11..972342a 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ MIT

-**RCLI** is an on-device voice AI for macOS. A complete STT + LLM + TTS pipeline running natively on Apple Silicon — 38 macOS actions via voice, local RAG over your documents, sub-200ms end-to-end latency. No cloud, no API keys. +**RCLI** is an on-device voice AI for macOS. A complete STT + LLM + TTS + VLM pipeline running natively on Apple Silicon — 40 macOS actions via voice, local RAG over your documents, on-device vision (camera & screen analysis), sub-200ms end-to-end latency. No cloud, no API keys. Powered by [MetalRT](#metalrt-gpu-engine), a proprietary GPU inference engine built by [RunAnywhere, Inc.](https://runanywhere.ai) specifically for Apple Silicon. @@ -112,6 +112,9 @@ rcli # interactive TUI (push-to-talk + text) rcli listen # continuous voice mode rcli ask "open Safari" # one-shot command rcli ask "play some jazz on Spotify" +rcli vlm photo.jpg "what's in this image?" # vision analysis +rcli camera # live camera VLM +rcli screen # screen capture VLM rcli metalrt # MetalRT GPU engine management rcli llamacpp # llama.cpp engine management ``` @@ -149,7 +152,18 @@ A full STT + LLM + TTS pipeline running on Metal GPU with three concurrent threa - **Tool Calling** — LLM-native tool call formats (Qwen3, LFM2, etc.) - **Multi-turn Memory** — Sliding window conversation history with token-budget trimming -### 38 macOS Actions +### Vision (VLM) + +Analyze images, camera captures, and screen regions using on-device vision-language models. VLM runs on the llama.cpp engine via Metal GPU — no cloud. + +- **Image Analysis** — `rcli vlm photo.jpg "describe this"` for single-image queries +- **Camera** — Press **V** in the TUI or run `rcli camera` for live camera analysis +- **Screen Capture** — Press **S** in the TUI or run `rcli screen` to analyze screen regions +- **Models** — Qwen3 VL 2B, Liquid LFM2 VL 1.6B, SmolVLM 500M — download on demand via `rcli models vlm` + +> **Note:** VLM is currently available on the llama.cpp engine. 
MetalRT VLM support is coming soon. + +### 40 macOS Actions Control your Mac by voice or text. The LLM routes intent to actions executed locally via AppleScript and shell commands. @@ -161,7 +175,7 @@ Control your Mac by voice or text. The LLM routes intent to actions executed loc | **System** | `open_app`, `quit_app`, `set_volume`, `toggle_dark_mode`, `screenshot`, `lock_screen` | | **Web** | `search_web`, `search_youtube`, `open_url`, `open_maps` | -Run `rcli actions` to see all 38, or toggle them on/off in the TUI Actions panel. +Run `rcli actions` to see all 40, or toggle them on/off in the TUI Actions panel. > **Tip:** If tool calling feels unreliable, press **X** in the TUI to clear the conversation and reset context. With small LLMs, accumulated context can degrade tool-calling accuracy — a fresh context often fixes it. @@ -181,7 +195,9 @@ A terminal dashboard with push-to-talk, live hardware monitoring, model manageme | Key | Action | |-----|--------| | **SPACE** | Push-to-talk | -| **M** | Models — browse, download, hot-swap LLM/STT/TTS | +| **V** | Camera — capture and analyze with VLM | +| **S** | Screen — capture and analyze a screen region with VLM | +| **M** | Models — browse, download, hot-swap LLM/STT/TTS/VLM | | **A** | Actions — browse, enable/disable macOS actions | | **R** | RAG — ingest documents | | **X** | Clear conversation and reset context | @@ -207,7 +223,7 @@ MetalRT is distributed under a [proprietary license](https://github.com/Runanywh ## Supported Models -RCLI supports 20+ models across LLM, STT, TTS, VAD, and embeddings. All run locally on Apple Silicon. Use `rcli models` to browse, download, or switch. +RCLI supports 20+ models across LLM, STT, TTS, VLM, VAD, and embeddings. All run locally on Apple Silicon. Use `rcli models` to browse, download, or switch. 
**LLM:** LFM2 1.2B (default), LFM2 350M, LFM2.5 1.2B, LFM2 2.6B, Qwen3 0.6B, Qwen3.5 0.8B/2B/4B, Qwen3 4B @@ -215,10 +231,13 @@ RCLI supports 20+ models across LLM, STT, TTS, VAD, and embeddings. All run loca **TTS:** Piper Lessac/Amy, KittenTTS Nano, Matcha LJSpeech, Kokoro English/Multi-lang -**Default install** (`rcli setup`): ~1GB — LFM2 1.2B + Whisper + Piper + Silero VAD + Snowflake embeddings. +**VLM:** Qwen3 VL 2B, Liquid LFM2 VL 1.6B, SmolVLM 500M — on-demand download via `rcli models vlm` (llama.cpp engine only) + +**Default install** (`rcli setup`): ~1GB — LFM2 1.2B + Whisper + Piper + Silero VAD + Snowflake embeddings. VLM models are downloaded on demand. ```bash rcli models # interactive model management +rcli models vlm # download/manage VLM models rcli upgrade-llm # guided LLM upgrade rcli voices # browse and switch TTS voices rcli cleanup # remove unused models @@ -247,10 +266,13 @@ All dependencies are vendored or CMake-fetched. Requires CMake 3.15+ and Apple C rcli Interactive TUI (push-to-talk + text + trace) rcli listen Continuous voice mode rcli ask One-shot text command +rcli vlm [prompt] Analyze an image with VLM +rcli camera [prompt] Live camera capture + VLM analysis +rcli screen [prompt] Screen capture + VLM analysis rcli actions [name] List actions or show detail rcli rag ingest Index documents for RAG rcli rag query Query indexed documents -rcli models [llm|stt|tts] Manage AI models +rcli models [llm|stt|tts|vlm] Manage AI models rcli voices Manage TTS voices rcli metalrt MetalRT GPU engine management rcli llamacpp llama.cpp engine management diff --git a/src/api/rcli_api.cpp b/src/api/rcli_api.cpp index 8baa3ef..f292c78 100644 --- a/src/api/rcli_api.cpp +++ b/src/api/rcli_api.cpp @@ -16,6 +16,7 @@ #include "rag/index_builder.h" #include "pipeline/text_sanitizer.h" #include "pipeline/sentence_detector.h" +#include "audio/screen_capture.h" #include #include #include @@ -32,9 +33,15 @@ #include #include #include +#include +#include + 
+extern char** environ; #include "actions/action_registry.h" #include "actions/macos_actions.h" +#include "engines/vlm_engine.h" +#include "models/vlm_model_registry.h" using namespace rastack; @@ -109,6 +116,13 @@ struct RCLIEngine { // so the context gauge shows stable, meaningful usage. int ctx_main_prompt_tokens = 0; + // VLM (Vision Language Model) subsystem + VlmEngine vlm_engine; + bool vlm_initialized = false; + std::string last_vlm_response; + std::string vlm_backend_name; // "llama.cpp (Metal GPU)" or "MetalRT" + std::string vlm_model_name; // e.g. "Qwen3 VL 2B" + std::mutex mutex; bool initialized = false; }; @@ -969,6 +983,113 @@ static std::vector try_parse_bare_tool_calls( return calls; } +// Forward declaration (defined later in VLM section) +static int vlm_init_locked(RCLIEngine* engine); + +// ============================================================================= +// Screen intent detection — intercept voice commands about the user's screen +// ============================================================================= + +static bool has_word(const std::string& text, const char* word) { + return text.find(word) != std::string::npos; +} + +static bool is_screen_intent(const std::string& input) { + // Normalize to lowercase for matching + std::string lower = input; + for (auto& c : lower) c = (char)std::tolower((unsigned char)c); + + // --- Tier 1: explicit screenshot keywords (always trigger) --- + if (has_word(lower, "screenshot") || has_word(lower, "screen capture") || + has_word(lower, "screen shot")) + return true; + + // --- Tier 2: "screen" + any vision/action verb --- + bool has_screen = has_word(lower, "screen"); + if (has_screen) { + static const char* screen_verbs[] = { + "look", "see", "show", "what", "tell", "describe", "explain", + "check", "analyze", "read", "capture", "going on", "happening", + }; + for (const auto* v : screen_verbs) { + if (has_word(lower, v)) return true; + } + } + + // --- Tier 3: visual context phrases 
(no "screen" needed) --- + // "does this look good/right/ok", "how does this look", etc. + if (has_word(lower, "does this look") || has_word(lower, "how does this look")) + return true; + // "what am I looking at" + if (has_word(lower, "looking at") && has_word(lower, "what")) + return true; + // "can you see this/that", "what do you see", "what can you see" + if ((has_word(lower, "can you see") || has_word(lower, "do you see")) && + !has_word(lower, "file") && !has_word(lower, "code") && !has_word(lower, "error")) + return true; + // "what's happening here", "explain what's happening" + if (has_word(lower, "happening here") || has_word(lower, "happening on")) + return true; + + return false; +} + +// Capture active window + analyze with VLM. Returns response or empty on failure. +// Caller must hold engine->mutex. +static std::string handle_screen_intent(RCLIEngine* engine, const std::string& user_text) { + // Generate a temp path + auto ts = std::chrono::system_clock::now().time_since_epoch().count(); + std::string path = "/tmp/rcli_screen_" + std::to_string(ts) + ".jpg"; + + int rc; + const char* capture_source; + if (screen_capture_overlay_active()) { + // Visual mode: capture the overlay region + capture_source = "visual frame"; + rc = screen_capture_overlay_region(path.c_str()); + } else { + // Fallback: capture the previously active app's window + char target_app[256]; + screen_capture_target_app_name(target_app, sizeof(target_app)); + capture_source = target_app; + rc = screen_capture_behind_terminal(path.c_str()); + } + LOG_INFO("RCLI", "[screen_intent] Capturing %s → %s", capture_source, path.c_str()); + if (rc != 0) { + LOG_ERROR("RCLI", "[screen_intent] Screen capture failed"); + return "I couldn't capture your screen. 
Please check screen recording permissions " + "in System Settings > Privacy & Security > Screen Recording."; + } + + // Initialize VLM if needed + if (!engine->vlm_initialized) { + if (vlm_init_locked(engine) != 0) { + return "I can see you're asking about your screen, but VLM isn't available. " + "It requires the llama.cpp engine and a VLM model. " + "Switch with: rcli engine llamacpp, then download a model: rcli models vlm"; + } + } + + // Build a natural prompt from the user's words + std::string vlm_prompt = user_text; + if (vlm_prompt.empty()) { + vlm_prompt = "Describe what you see on this screen in detail."; + } + + std::string result = engine->vlm_engine.analyze_image(path, vlm_prompt, nullptr); + + if (result.empty()) { + return "I captured your screen but the analysis failed. Please try again."; + } + + // Prepend which app was captured so the user knows + std::string prefixed = "[Captured: " + std::string(capture_source) + "]\n" + result; + + // Store for stats retrieval + engine->last_vlm_response = prefixed; + return prefixed; +} + // ============================================================================= // Process command entry points // ============================================================================= @@ -984,6 +1105,14 @@ const char* rcli_process_command(RCLIHandle handle, const char* text) { LOG_TRACE("RCLI", "[process_command] engine->mutex acquired, input='%.40s'", text); std::string input(text); + // --- Screen intent intercept: capture active window + VLM --- + if (is_screen_intent(input)) { + engine->last_response = handle_screen_intent(engine, input); + engine->conversation_history.emplace_back("user", input); + engine->conversation_history.emplace_back("assistant", engine->last_response); + return engine->last_response.c_str(); + } + // --- MetalRT path: tool-aware inference via generate_raw (pre-formatted prompt) --- if (engine->pipeline.using_metalrt()) { auto& mrt = engine->pipeline.metalrt_llm(); @@ -1027,19 +1156,12 
@@ const char* rcli_process_command(RCLIHandle handle, const char* text) { full_prompt.compare(0, cached.size(), cached) == 0) { std::string full_continuation = full_prompt.substr(cached.size()); - if (engine->metalrt_kv_continuation_len > 0 && - engine->metalrt_kv_continuation_len < full_continuation.size()) { - std::string new_part = full_continuation.substr(engine->metalrt_kv_continuation_len); - LOG_TRACE("RCLI", "[process_command] incremental continue " - "(new=%zu chars, skip=%zu already in KV)", - new_part.size(), engine->metalrt_kv_continuation_len); - raw_output = mrt.generate_raw_continue(new_part, nullptr, false); - } else { - LOG_TRACE("RCLI", "[process_command] full continue " - "(continuation=%zu chars)", full_continuation.size()); - raw_output = mrt.generate_raw_continue(full_continuation, nullptr, true); - } - engine->metalrt_kv_continuation_len = full_continuation.size(); + // Always re-prefill full continuation from cached system prompt. + // Incremental continue (reset_cache=false) is unsafe because the KV + // cache includes generated tokens not tracked by continuation_len. 
+ LOG_TRACE("RCLI", "[process_command] full continue " + "(continuation=%zu chars)", full_continuation.size()); + raw_output = mrt.generate_raw_continue(full_continuation, nullptr, true); } else { LOG_TRACE("RCLI", "[process_command] calling mrt.generate_raw() ..."); raw_output = mrt.generate_raw(full_prompt); @@ -1499,6 +1621,92 @@ const char* rcli_process_and_speak(RCLIHandle handle, const char* text, engine->streaming_cancelled.store(false, std::memory_order_release); std::string input(text); + // --- Screen intent intercept: capture + VLM + sentence-streamed TTS --- + if (is_screen_intent(input)) { + auto t_start_screen = std::chrono::steady_clock::now(); + std::string response = handle_screen_intent(engine, input); + engine->last_response = response; + engine->conversation_history.emplace_back("user", input); + engine->conversation_history.emplace_back("assistant", response); + + // Fire "response" callback so TUI displays the text + if (callback) { + callback("response", response.c_str(), user_data); + } + + // Sentence-streamed TTS (same pattern as LLM path for low TTFA) + std::string clean_text = rastack::sanitize_for_tts(response); + if (!clean_text.empty()) { + if (!engine->pipeline.audio().is_running()) { + engine->pipeline.audio().start(); + } + auto* rb = engine->pipeline.playback_ring_buffer(); + if (rb) { + rb->clear(); + + // Split into sentences and synthesize each one + std::vector sentences; + rastack::SentenceDetector splitter([&](const std::string& s) { + sentences.push_back(s); + }, /*min_words=*/3); + // Feed the entire text token-by-token (word by word) + for (size_t i = 0; i < clean_text.size(); ) { + size_t end = clean_text.find(' ', i); + if (end == std::string::npos) end = clean_text.size(); + else end++; // include space + splitter.feed(clean_text.substr(i, end - i)); + i = end; + } + splitter.flush(); + + bool first_audio = false; + for (auto& sentence : sentences) { + if (engine->streaming_cancelled.load(std::memory_order_acquire)) 
break; + + std::vector samples; + if (engine->pipeline.using_metalrt_tts()) { + samples = engine->pipeline.metalrt_tts().synthesize(sentence); + } else { + samples = engine->pipeline.tts().synthesize(sentence); + } + + // Write with backpressure + size_t offset = 0; + while (offset < samples.size() && + !engine->streaming_cancelled.load(std::memory_order_acquire)) { + size_t written = rb->write(samples.data() + offset, samples.size() - offset); + offset += written; + if (offset < samples.size()) { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + } + + if (!first_audio) { + first_audio = true; + if (callback) { + auto now = std::chrono::steady_clock::now(); + double ttfa_ms = std::chrono::duration(now - t_start_screen).count(); + char buf[32]; + snprintf(buf, sizeof(buf), "%.1f", ttfa_ms); + callback("first_audio", buf, user_data); + } + } + } + + // Wait for playback to drain + size_t samples_per_frame = 256; + while (rb->available_read() > samples_per_frame && + !engine->streaming_cancelled.load(std::memory_order_acquire)) { + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + } + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + } + } + + if (callback) callback("complete", "{}", user_data); + return engine->last_response.c_str(); + } + auto t_start = std::chrono::steady_clock::now(); // --- TTS worker thread (sentence queue → ring buffer → CoreAudio) --- @@ -1711,19 +1919,14 @@ const char* rcli_process_and_speak(RCLIHandle handle, const char* text, full_continuation.size(), engine->metalrt_kv_continuation_len); - if (engine->metalrt_kv_continuation_len > 0 && - engine->metalrt_kv_continuation_len < full_continuation.size()) { - std::string new_part = full_continuation.substr(engine->metalrt_kv_continuation_len); - LOG_DEBUG("RCLI", "[speak] incremental continue " - "(new=%zu chars, skip=%zu already in KV)", - new_part.size(), engine->metalrt_kv_continuation_len); - response = mrt.generate_raw_continue(new_part, 
streaming_cb, false); - } else { - LOG_DEBUG("RCLI", "[speak] full continue " - "(continuation=%zu chars)", full_continuation.size()); - response = mrt.generate_raw_continue(full_continuation, streaming_cb, true); - } - engine->metalrt_kv_continuation_len = full_continuation.size(); + // Always truncate to cached system prompt and re-prefill the full + // continuation. The incremental path (reset_cache=false) is unsafe + // because the KV cache also contains generated-response tokens that + // metalrt_kv_continuation_len does not account for, which causes + // duplicate content in the KV and corrupts multi-turn attention. + LOG_DEBUG("RCLI", "[speak] full continue " + "(continuation=%zu chars)", full_continuation.size()); + response = mrt.generate_raw_continue(full_continuation, streaming_cb, true); } else { LOG_DEBUG("RCLI", "[speak] cache MISS path — calling generate_raw() " "(has_cache=%d prefix_match=%d)", @@ -2745,6 +2948,243 @@ void rcli_get_context_info(RCLIHandle handle, int* out_prompt_tokens, int* out_c } } +// ============================================================================= +// VLM (Vision Language Model) +// ============================================================================= + +// Recursively create directories (like mkdir -p) +static bool mkdirs(const std::string& path) { + struct stat st; + if (stat(path.c_str(), &st) == 0) return S_ISDIR(st.st_mode); + // Recurse to create parent + auto slash = path.rfind('/'); + if (slash != std::string::npos && slash > 0) { + if (!mkdirs(path.substr(0, slash))) return false; + } + return mkdir(path.c_str(), 0755) == 0 || errno == EEXIST; +} + +// Download a file using fork/exec to avoid shell injection +static bool safe_download(const std::string& url, const std::string& dest) { + pid_t pid; + const char* argv[] = { + "curl", "-L", "--progress-bar", "-o", dest.c_str(), url.c_str(), nullptr + }; + int status = 0; + posix_spawn_file_actions_t actions; + posix_spawn_file_actions_init(&actions); 
+ if (posix_spawnp(&pid, "curl", &actions, nullptr, + const_cast(argv), environ) != 0) { + posix_spawn_file_actions_destroy(&actions); + return false; + } + posix_spawn_file_actions_destroy(&actions); + waitpid(pid, &status, 0); + return WIFEXITED(status) && WEXITSTATUS(status) == 0; +} + +// Internal init (caller must hold engine->mutex) +// VLM is only available on the llama.cpp engine. MetalRT VLM support coming soon. +static int vlm_init_locked(RCLIEngine* engine) { + if (engine->vlm_initialized) return 0; + + if (engine->models_dir.empty()) { + if (const char* home = getenv("HOME")) + engine->models_dir = std::string(home) + "/Library/RCLI/models"; + else + engine->models_dir = "./models"; + } + + // VLM requires the llama.cpp engine + if (engine->initialized && engine->pipeline.using_metalrt()) { + LOG_ERROR("VLM", "VLM is currently available with the llama.cpp engine. Switch with: rcli engine llamacpp"); + return -1; + } + + // Check if any VLM model is installed (on-demand, no auto-download) + auto vlm_models = rcli::all_vlm_models(); + rcli::VlmModelDef model_def; + bool found = false; + + for (auto& m : vlm_models) { + if (rcli::is_vlm_model_installed(engine->models_dir, m)) { + model_def = m; + found = true; + break; + } + } + + if (!found) { + LOG_ERROR("VLM", "No VLM model installed. 
Download one with: rcli models vlm"); + return -1; + } + + // Initialize VLM engine with the installed model + VlmConfig config; + config.model_path = engine->models_dir + "/" + model_def.model_filename; + config.mmproj_path = engine->models_dir + "/" + model_def.mmproj_filename; + config.n_gpu_layers = 99; + config.n_ctx = 4096; + config.n_batch = 512; + config.n_threads = 1; + config.n_threads_batch = 8; + config.flash_attn = true; + + if (!engine->vlm_engine.init(config)) { + LOG_ERROR("VLM", "Failed to initialize VLM engine"); + return -1; + } + + engine->vlm_initialized = true; + engine->vlm_backend_name = "llama.cpp (Metal GPU)"; + engine->vlm_model_name = model_def.name; + LOG_INFO("VLM", "VLM engine ready — %s via llama.cpp (Metal GPU)", model_def.name.c_str()); + return 0; +} + +int rcli_vlm_init(RCLIHandle handle) { + if (!handle) return -1; + auto* engine = static_cast(handle); + std::lock_guard lock(engine->mutex); + return vlm_init_locked(engine); +} + +const char* rcli_vlm_analyze(RCLIHandle handle, const char* image_path, const char* prompt) { + if (!handle || !image_path) return ""; + auto* engine = static_cast(handle); + std::lock_guard lock(engine->mutex); + + if (!engine->vlm_initialized) { + if (vlm_init_locked(engine) != 0) { + engine->last_vlm_response = "VLM not available. Requires llama.cpp engine (rcli engine llamacpp) and a VLM model (rcli models vlm)."; + return engine->last_vlm_response.c_str(); + } + } + + std::string text_prompt = prompt && prompt[0] + ? 
std::string(prompt) + : "Describe this image in detail."; + + { + std::string result = engine->vlm_engine.analyze_image( + std::string(image_path), text_prompt, nullptr); + + if (result.empty()) { + engine->last_vlm_response = "Error: Failed to analyze image."; + } else { + engine->last_vlm_response = result; + } + } + return engine->last_vlm_response.c_str(); +} + +int rcli_vlm_is_ready(RCLIHandle handle) { + if (!handle) return 0; + auto* engine = static_cast(handle); + return engine->vlm_initialized ? 1 : 0; +} + +const char* rcli_vlm_backend_name(RCLIHandle handle) { + if (!handle) return ""; + auto* engine = static_cast(handle); + return engine->vlm_backend_name.c_str(); +} + +const char* rcli_vlm_model_name(RCLIHandle handle) { + if (!handle) return ""; + auto* engine = static_cast(handle); + return engine->vlm_model_name.c_str(); +} + +int rcli_vlm_get_stats(RCLIHandle handle, RCLIVlmStats* out_stats) { + if (!handle || !out_stats) return -1; + auto* engine = static_cast(handle); + if (!engine->vlm_initialized) return -1; + + auto& s = engine->vlm_engine.last_stats(); + out_stats->gen_tok_per_sec = s.gen_tps(); + out_stats->generated_tokens = static_cast(s.generated_tokens); + out_stats->total_time_sec = (s.image_encode_us + s.generation_us) / 1e6; + out_stats->image_encode_ms = s.image_encode_us / 1000.0; + out_stats->first_token_ms = s.first_token_us / 1000.0; + return 0; +} + +// ============================================================================= +// VLM GPU swap: enter/exit visual mode by swapping LLM ↔ VLM on GPU +// ============================================================================= + +int rcli_vlm_enter(RCLIHandle handle) { + if (!handle) return -1; + auto* engine = static_cast(handle); + std::lock_guard lock(engine->mutex); + + if (engine->vlm_initialized) return 0; + return vlm_init_locked(engine); +} + +int rcli_vlm_exit(RCLIHandle handle) { + if (!handle) return -1; + auto* engine = static_cast(handle); + std::lock_guard 
lock(engine->mutex); + + if (engine->vlm_engine.is_initialized()) { + engine->vlm_engine.shutdown(); + } + + engine->vlm_initialized = false; + engine->vlm_backend_name.clear(); + engine->vlm_model_name.clear(); + LOG_INFO("VLM", "VLM unloaded"); + return 0; +} + +int rcli_vlm_analyze_stream(RCLIHandle handle, const char* image_path, + const char* prompt, + RCLIEventCallback callback, void* user_data) { + if (!handle || !image_path) return -1; + auto* engine = static_cast(handle); + std::lock_guard lock(engine->mutex); + + // Lazy-init VLM if not yet loaded + if (!engine->vlm_initialized) { + if (vlm_init_locked(engine) != 0) { + LOG_ERROR("VLM", "Failed to initialize VLM engine for streaming"); + return -1; + } + } + + std::string text_prompt = (prompt && prompt[0]) + ? std::string(prompt) : "Describe this image in detail."; + + // llama.cpp VLM streaming path + rastack::TokenCallback token_cb = nullptr; + if (callback) { + token_cb = [callback, user_data](const rastack::TokenOutput& tok) { + if (!tok.text.empty()) { + callback("token", tok.text.c_str(), user_data); + } + }; + } + + std::string result = engine->vlm_engine.analyze_image( + std::string(image_path), text_prompt, token_cb); + + engine->last_vlm_response = result.empty() ? "Error: Failed to analyze image." : result; + + if (callback) { + callback("response", engine->last_vlm_response.c_str(), user_data); + auto& s = engine->vlm_engine.last_stats(); + char stats_buf[256]; + snprintf(stats_buf, sizeof(stats_buf), + "{\"tps\":%.1f,\"tokens\":%lld,\"vision_encode_ms\":%.1f}", + s.gen_tps(), s.generated_tokens, s.image_encode_us / 1000.0); + callback("stats", stats_buf, user_data); + } + + return engine->last_vlm_response.find("Error:") == 0 ? 
-1 : 0; +} + } // extern "C" std::vector rcli_get_all_action_defs(RCLIHandle handle) { diff --git a/src/api/rcli_api.h b/src/api/rcli_api.h index 5a0e2d3..e6906d1 100644 --- a/src/api/rcli_api.h +++ b/src/api/rcli_api.h @@ -262,6 +262,60 @@ const char* rcli_get_stt_model(RCLIHandle handle); // Both output pointers are optional (pass NULL to skip). void rcli_get_context_info(RCLIHandle handle, int* out_prompt_tokens, int* out_ctx_size); +// --- VLM (Vision Language Model) --- + +// Initialize the VLM engine with the default VLM model. +// Lazily downloads the model if not present. Thread-safe. +// Returns 0 on success, -1 on failure. +int rcli_vlm_init(RCLIHandle handle); + +// Analyze an image with an optional text prompt. +// image_path: absolute path to an image file (jpg, png, bmp, gif, webp, tga). +// prompt: text prompt (e.g. "Describe this image"). NULL defaults to "Describe this image in detail." +// Returns the analysis text. Caller must NOT free the returned pointer. +const char* rcli_vlm_analyze(RCLIHandle handle, const char* image_path, const char* prompt); + +// Check if the VLM engine is initialized and ready for image analysis. +// Returns 1 if ready, 0 if not. +int rcli_vlm_is_ready(RCLIHandle handle); + +// Get the name of the active VLM backend (e.g. "llama.cpp (Metal GPU)" or "MetalRT"). +// Returns "" if VLM is not initialized. +const char* rcli_vlm_backend_name(RCLIHandle handle); + +// Get the name of the active VLM model (e.g. "Qwen3 VL 2B Instruct"). +// Returns "" if VLM is not initialized. +const char* rcli_vlm_model_name(RCLIHandle handle); + +// VLM performance stats from the last analysis call. 
+typedef struct { + double gen_tok_per_sec; // Generation tokens/second + int generated_tokens; // Total tokens generated + double total_time_sec; // Total wall time (image encode + prompt eval + generation) + double image_encode_ms; // Time to encode image through vision projector + double first_token_ms; // Time-to-first-token (prompt eval + image encode) +} RCLIVlmStats; + +// Get stats from the last VLM analysis. Returns 0 on success. +int rcli_vlm_get_stats(RCLIHandle handle, RCLIVlmStats* out_stats); + +// Swap MetalRT LLM out and VLM in on the GPU (for visual mode). +// Unloads the LLM model, loads the MetalRT VLM model. +// Returns 0 on success, -1 on failure. +int rcli_vlm_enter(RCLIHandle handle); + +// Swap MetalRT VLM out and LLM back in on the GPU (exit visual mode). +// Unloads the VLM model, reloads the LLM and re-caches the system prompt. +// Returns 0 on success, -1 on failure. +int rcli_vlm_exit(RCLIHandle handle); + +// Streaming VLM image analysis (use after rcli_vlm_enter). +// Fires callback with events: "token", "response", "stats". +// Returns 0 on success, -1 on failure. +int rcli_vlm_analyze_stream(RCLIHandle handle, const char* image_path, + const char* prompt, + RCLIEventCallback callback, void* user_data); + #ifdef __cplusplus } #endif diff --git a/src/audio/camera_capture.h b/src/audio/camera_capture.h new file mode 100644 index 0000000..1d5ade4 --- /dev/null +++ b/src/audio/camera_capture.h @@ -0,0 +1,14 @@ +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +// Capture a single frame from the default camera and save as JPEG. +// output_path: where to save the JPEG (e.g. "/tmp/rcli_camera.jpg"). +// Returns 0 on success, -1 on failure. 
int camera_capture_photo(const char* output_path);

#ifdef __cplusplus
}
#endif

// ===========================================================================
// src/audio/camera_capture.mm (new file)
// ===========================================================================
// NOTE(review): the framework imports below were garbled in the patch
// (angle-bracket text stripped); restored to the set this file requires —
// confirm against the original commit.
#import <AVFoundation/AVFoundation.h>
#import <AppKit/AppKit.h>
#import <CoreImage/CoreImage.h>
#import <CoreMedia/CoreMedia.h>
#include "camera_capture.h"
#include <string.h>

// Delegate that skips warmup frames then captures one properly-exposed frame.
// NOTE(review): the protocol conformance was stripped by the same garbling;
// restored <AVCaptureVideoDataOutputSampleBufferDelegate> — required for
// -setSampleBufferDelegate:queue: to deliver frames without a compiler warning.
@interface RCLISingleFrameCapture : NSObject <AVCaptureVideoDataOutputSampleBufferDelegate>
@property (nonatomic, strong) NSString *outputPath;           // destination JPEG path
@property (nonatomic, assign) BOOL captured;                  // set once a frame has been handled
@property (nonatomic, strong) dispatch_semaphore_t semaphore; // signaled when capture finishes
@property (nonatomic, assign) int frameCount;                 // frames seen so far
@property (nonatomic, assign) int framesToSkip;               // warmup frames to discard
@end

@implementation RCLISingleFrameCapture

// Called on the capture queue for every camera frame.
- (void)captureOutput:(AVCaptureOutput *)output
didOutputSampleBuffer:(CMSampleBufferRef)sampleBuffer
       fromConnection:(AVCaptureConnection *)connection {
    if (self.captured) return;

    // Skip initial frames to let auto-exposure/white-balance stabilize;
    // the capture happens on the framesToSkip-th frame.
    self.frameCount++;
    if (self.frameCount < self.framesToSkip) return;

    self.captured = YES;

    CVImageBufferRef imageBuffer = CMSampleBufferGetImageBuffer(sampleBuffer);
    if (!imageBuffer) {
        // Signal anyway; the caller verifies the output file exists and
        // reports failure, so we must not leave it blocked until timeout.
        dispatch_semaphore_signal(self.semaphore);
        return;
    }

    // CVPixelBuffer → CIImage → NSImage so AppKit can re-encode it.
    CIImage *ciImage = [CIImage imageWithCVImageBuffer:imageBuffer];
    NSCIImageRep *rep = [NSCIImageRep imageRepWithCIImage:ciImage];
    NSImage *nsImage = [[NSImage alloc] initWithSize:rep.size];
    [nsImage addRepresentation:rep];

    // Convert to JPEG at high quality.
    NSData *tiffData = [nsImage TIFFRepresentation];
    NSBitmapImageRep *bitmapRep = [NSBitmapImageRep imageRepWithData:tiffData];
    NSData *jpegData = [bitmapRep representationUsingType:NSBitmapImageFileTypeJPEG
                                               properties:@{NSImageCompressionFactor: @0.92}];
    [jpegData writeToFile:self.outputPath atomically:YES];

    dispatch_semaphore_signal(self.semaphore);
}

@end

// Capture one frame from the default camera and write it as a JPEG to
// output_path. Blocks while permission is requested and while auto-exposure
// settles (~2s). Returns 0 on success, -1 on failure.
int camera_capture_photo(const char* output_path) {
    @autoreleasepool {
        // Check camera permission (prompting the user if undetermined).
        AVAuthorizationStatus status = [AVCaptureDevice authorizationStatusForMediaType:AVMediaTypeVideo];
        if (status == AVAuthorizationStatusDenied || status == AVAuthorizationStatusRestricted) {
            return -1;
        }
        if (status == AVAuthorizationStatusNotDetermined) {
            dispatch_semaphore_t perm_sem = dispatch_semaphore_create(0);
            __block BOOL granted = NO;
            [AVCaptureDevice requestAccessForMediaType:AVMediaTypeVideo completionHandler:^(BOOL g) {
                granted = g;
                dispatch_semaphore_signal(perm_sem);
            }];
            dispatch_semaphore_wait(perm_sem, DISPATCH_TIME_FOREVER);
            if (!granted) return -1;
        }

        // Find default camera.
        AVCaptureDevice *device = [AVCaptureDevice defaultDeviceWithMediaType:AVMediaTypeVideo];
        if (!device) return -1;

        // Configure camera for best quality; failures here are non-fatal —
        // we just capture with whatever modes the device defaults to.
        NSError *error = nil;
        if ([device lockForConfiguration:&error]) {
            if ([device isExposureModeSupported:AVCaptureExposureModeContinuousAutoExposure]) {
                device.exposureMode = AVCaptureExposureModeContinuousAutoExposure;
            }
            if ([device isWhiteBalanceModeSupported:AVCaptureWhiteBalanceModeContinuousAutoWhiteBalance]) {
                device.whiteBalanceMode = AVCaptureWhiteBalanceModeContinuousAutoWhiteBalance;
            }
            if ([device isFocusModeSupported:AVCaptureFocusModeContinuousAutoFocus]) {
                device.focusMode = AVCaptureFocusModeContinuousAutoFocus;
            }
            [device unlockForConfiguration];
        }

        AVCaptureDeviceInput *input = [AVCaptureDeviceInput deviceInputWithDevice:device error:&error];
        if (!input) return -1;

        AVCaptureSession *session = [[AVCaptureSession alloc] init];
        // Use Photo preset for highest quality, degrading gracefully.
        if ([session canSetSessionPreset:AVCaptureSessionPresetPhoto]) {
            session.sessionPreset = AVCaptureSessionPresetPhoto;
        } else if ([session canSetSessionPreset:AVCaptureSessionPresetHigh]) {
            session.sessionPreset = AVCaptureSessionPresetHigh;
        } else {
            session.sessionPreset = AVCaptureSessionPresetMedium;
        }

        if (![session canAddInput:input]) return -1;
        [session addInput:input];

        AVCaptureVideoDataOutput *videoOutput = [[AVCaptureVideoDataOutput alloc] init];
        videoOutput.videoSettings = @{(NSString *)kCVPixelBufferPixelFormatTypeKey: @(kCVPixelFormatType_32BGRA)};
        videoOutput.alwaysDiscardsLateVideoFrames = YES;

        RCLISingleFrameCapture *delegate = [[RCLISingleFrameCapture alloc] init];
        delegate.outputPath = [NSString stringWithUTF8String:output_path];
        delegate.captured = NO;
        delegate.semaphore = dispatch_semaphore_create(0);
        delegate.frameCount = 0;
        // Skip ~60 frames (~2 seconds at 30fps) to let auto-exposure fully stabilize.
        delegate.framesToSkip = 60;

        dispatch_queue_t queue = dispatch_queue_create("com.rcli.camera", DISPATCH_QUEUE_SERIAL);
        [videoOutput setSampleBufferDelegate:delegate queue:queue];

        if (![session canAddOutput:videoOutput]) return -1;
        [session addOutput:videoOutput];

        // Start capture — the delegate skips warmup frames for AE stabilization.
        [session startRunning];

        // Wait for frame capture (timeout 10 seconds — allows warmup + capture).
        long result = dispatch_semaphore_wait(delegate.semaphore,
                                              dispatch_time(DISPATCH_TIME_NOW, 10 * NSEC_PER_SEC));

        [session stopRunning];

        if (result != 0) return -1;  // timeout

        // Verify the file was written (the delegate signals even when the
        // sample buffer had no image, so presence of the file is the check).
        NSFileManager *fm = [NSFileManager defaultManager];
        if (![fm fileExistsAtPath:delegate.outputPath]) return -1;

        return 0;
    }
}

// ===========================================================================
// src/audio/rcli_overlay.m (new file)
// ===========================================================================
// rcli_overlay — standalone Cocoa app showing a draggable/resizable overlay
// frame for screen capture. Communicates with parent RCLI via stdin/stdout.
//
// Commands (one per line on stdin):
//   frame  → replies "x,y,w,h\n" (screen coords, top-left origin)
//   hide   → sets alpha to 0 (for capture)
//   show   → restores alpha to 1
//   quit   → exits

// NOTE(review): the import target was garbled in the patch; restored.
#import <Cocoa/Cocoa.h>

static const CGFloat kBorder = 6.0;
static const CGFloat kRadius = 12.0;
static const CGFloat kHandle = 18.0;    // corner handle size
static const CGFloat kEdgeGrab = 14.0;  // invisible edge grab zone

// ── Custom view: bold border + corner handles + label pill ─────────────
@interface OverlayView : NSView
@end

@implementation OverlayView

- (void)drawRect:(NSRect)dirtyRect {
    [[NSColor clearColor] set];
    NSRectFill(dirtyRect);

    NSRect inner = NSInsetRect(self.bounds, kBorder, kBorder);
    NSColor *green = [NSColor colorWithRed:0.15 green:0.9 blue:0.45 alpha:0.92];

    // Outer glow
    NSBezierPath *glow = [NSBezierPath bezierPathWithRoundedRect:inner
                                                         xRadius:kRadius yRadius:kRadius];
    [glow setLineWidth:kBorder + 6];
    [[green colorWithAlphaComponent:0.12] set];
    [glow stroke];

    // Main border — solid, thick, rounded
    NSBezierPath *border = [NSBezierPath bezierPathWithRoundedRect:inner
                                                           xRadius:kRadius yRadius:kRadius];
    [border setLineWidth:kBorder];
    [green set];
    [border stroke];

    // Corner handles — filled rounded squares with white dot
    CGFloat hs = kHandle;
    CGFloat off = kBorder / 2;
    NSRect corners[4] = {
        NSMakeRect(NSMinX(inner) - off,      NSMinY(inner) - off,      hs, hs),
        NSMakeRect(NSMaxX(inner) + off - hs, NSMinY(inner) - off,      hs, hs),
        NSMakeRect(NSMinX(inner) - off,      NSMaxY(inner) + off - hs, hs, hs),
        NSMakeRect(NSMaxX(inner) + off - hs, NSMaxY(inner) + off - hs, hs, hs),
    };
    for (int i = 0; i < 4; i++) {
        NSBezierPath *h = [NSBezierPath bezierPathWithRoundedRect:corners[i]
                                                          xRadius:4 yRadius:4];
        [green set];
        [h fill];
        // White center dot
        NSRect dot = NSInsetRect(corners[i], 5, 5);
        [[NSColor colorWithWhite:1.0 alpha:0.85] set];
        [[NSBezierPath bezierPathWithOvalInRect:dot] fill];
    }

    // Label pill — centered at top
    NSString *label = @" RCLI Visual Mode ";
    NSDictionary *attrs = @{
        NSFontAttributeName: [NSFont systemFontOfSize:11 weight:NSFontWeightBold],
        NSForegroundColorAttributeName: [NSColor blackColor],
    };
    NSSize sz = [label sizeWithAttributes:attrs];
    CGFloat px = NSMidX(self.bounds) - sz.width / 2 - 6;
    CGFloat py = NSMaxY(inner) - 2;
    NSRect pill = NSMakeRect(px, py, sz.width + 12, sz.height + 6);
    NSBezierPath *pillPath = [NSBezierPath bezierPathWithRoundedRect:pill
                                                             xRadius:10 yRadius:10];
    [green set];
    [pillPath fill];
    [label drawAtPoint:NSMakePoint(px + 6, py + 3) withAttributes:attrs];
}

- (BOOL)acceptsFirstMouse:(NSEvent *)e { return YES; }
@end

// ── Custom window: borderless, transparent, floating, draggable ───────
@interface OverlayWindow : NSWindow
@end

@implementation OverlayWindow
- (instancetype)initWithRect:(NSRect)rect {
    self = [super initWithContentRect:rect
                            styleMask:NSWindowStyleMaskBorderless |
                                      NSWindowStyleMaskResizable
                              backing:NSBackingStoreBuffered
                                defer:NO];
    if (self) {
        self.opaque = NO;
        self.backgroundColor = [NSColor clearColor];
        self.level = NSFloatingWindowLevel;
        self.hasShadow = NO;
        self.movableByWindowBackground = YES;
        self.contentView = [[OverlayView alloc] initWithFrame:rect];
        self.collectionBehavior = NSWindowCollectionBehaviorCanJoinAllSpaces |
                                  NSWindowCollectionBehaviorStationary;
        self.minSize = NSMakeSize(120, 80);
    }
    return self;
}
- (BOOL)canBecomeKeyWindow { return YES; }
- (BOOL)canBecomeMainWindow { return NO; }
@end

// ── Stdin reader (runs on a background thread) ────────────────────────
@interface StdinReader : NSObject
@property (nonatomic, strong) OverlayWindow *window;
- (void)startReading;
- (void)handleCommand:(NSString *)cmd;
@end

@implementation StdinReader

- (void)startReading {
    dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
        char buf[256];
        while (fgets(buf, sizeof(buf), stdin)) {
            NSString *cmd = [[NSString stringWithUTF8String:buf]
                stringByTrimmingCharactersInSet:[NSCharacterSet whitespaceAndNewlineCharacterSet]];
            if (cmd.length == 0) continue;
            // AppKit window manipulation must happen on the main thread.
            [self performSelectorOnMainThread:@selector(handleCommand:)
                                   withObject:cmd
                                waitUntilDone:YES];
        }
        // Parent closed the pipe — shut down.
        dispatch_async(dispatch_get_main_queue(), ^{
            [NSApp terminate:nil];
        });
    });
}

- (void)handleCommand:(NSString *)cmd {
    if ([cmd isEqualToString:@"frame"]) {
        // Convert Cocoa coords (bottom-left origin) to the global top-left
        // coordinate space expected by `screencapture -R`.
        // FIX: that space is anchored at the PRIMARY display, which is
        // [NSScreen screens][0] — NOT -mainScreen (the screen containing the
        // key window). Using mainScreen produced wrong rects whenever the
        // overlay was dragged onto a secondary display.
        NSRect f = self.window.frame;
        CGFloat screenH = [[NSScreen screens] firstObject].frame.size.height;
        int x = (int)f.origin.x;
        int y = (int)(screenH - f.origin.y - f.size.height);
        int w = (int)f.size.width;
        int h = (int)f.size.height;
        printf("%d,%d,%d,%d\n", x, y, w, h);
        fflush(stdout);
    } else if ([cmd isEqualToString:@"hide"]) {
        [self.window setAlphaValue:0.0];
        // Give the compositor a moment to actually remove the window
        // before the parent takes the screenshot.
        [NSThread sleepForTimeInterval:0.05];
        printf("ok\n");
        fflush(stdout);
    } else if ([cmd isEqualToString:@"show"]) {
        [self.window setAlphaValue:1.0];
        printf("ok\n");
        fflush(stdout);
    } else if ([cmd isEqualToString:@"quit"]) {
        [NSApp terminate:nil];
    }
}

@end

// ── Main ──────────────────────────────────────────────────────────────
int main(int argc, const char *argv[]) {
    @autoreleasepool {
        NSApplication *app = [NSApplication sharedApplication];
        // Accessory policy: no Dock icon, no menu bar.
        [app setActivationPolicy:NSApplicationActivationPolicyAccessory];

        NSScreen *scr = [NSScreen mainScreen];
        NSRect sf = scr.frame;
        CGFloat w = 800, h = 600;
        CGFloat x = (sf.size.width - w) / 2;
        CGFloat y = (sf.size.height - h) / 2;

        OverlayWindow *win = [[OverlayWindow alloc]
            initWithRect:NSMakeRect(x, y, w, h)];
        [win makeKeyAndOrderFront:nil];
        [app activateIgnoringOtherApps:YES];

        StdinReader *reader = [[StdinReader alloc] init];
        reader.window = win;
        [reader startReading];

        // Tell the parent we are ready to accept commands.
        printf("ready\n");
        fflush(stdout);

        [app run];
    }
    return 0;
}
// ===========================================================================
// src/audio/screen_capture.h (new file)
// ===========================================================================
#pragma once

#ifdef __cplusplus
extern "C" {
#endif

// --- Visual Mode (overlay frame) ---

// Show the visual overlay window. User can drag/resize it over content.
// x, y, w, h: initial position and size in screen coordinates (0 = defaults).
void screen_capture_show_overlay(int x, int y, int w, int h);

// Hide the visual overlay window.
void screen_capture_hide_overlay(void);

// Returns 1 if the overlay is currently visible.
int screen_capture_overlay_active(void);

// Capture the screen region behind the overlay (hides overlay briefly).
// Returns 0 on success, -1 on failure.
int screen_capture_overlay_region(const char* output_path);

// --- Legacy capture functions ---

// Capture the frontmost/active window and save as JPEG.
int screen_capture_active_window(const char* output_path);

// Capture the window behind our own terminal (for voice triggers).
int screen_capture_behind_terminal(const char* output_path);

// Capture the entire main display and save as JPEG (fallback).
int screen_capture_full_screen(const char* output_path);

// Convenience: tries overlay if active, then active window, then full screen.
int screen_capture_screenshot(const char* output_path);

// Get the name of the app targeted by screen_capture_behind_terminal.
const char* screen_capture_target_app_name(char* buf, int buf_size);

#ifdef __cplusplus
}
#endif

// ===========================================================================
// src/audio/screen_capture.mm (new file)
// ===========================================================================
// NOTE(review): every import/include target below was garbled in the patch
// (angle-bracket text stripped); restored to the set this file actually
// uses — confirm against the original commit.
#import <AppKit/AppKit.h>
#import <CoreGraphics/CoreGraphics.h>
#include "screen_capture.h"
#include <atomic>
#include <cmath>
#include <cstring>
#include <mutex>
#include <string>
#include <thread>
#include <fcntl.h>
#include <spawn.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/wait.h>
#include <unistd.h>
#include <mach-o/dyld.h>

extern char** environ;

// ---------------------------------------------------------------------------
// Helper: downscale a JPEG on disk if it exceeds max dimension (for VLM)
// ---------------------------------------------------------------------------
static void downscale_jpeg_if_needed(const char* path, int max_dim) {
    @autoreleasepool {
        NSString *nsPath = [NSString stringWithUTF8String:path];
        NSData *data = [NSData dataWithContentsOfFile:nsPath];
        if (!data) return;

        NSBitmapImageRep *srcRep = [NSBitmapImageRep imageRepWithData:data];
        if (!srcRep) return;

        NSInteger w = srcRep.pixelsWide;
        NSInteger h = srcRep.pixelsHigh;
        if (w <= max_dim && h <= max_dim) return;  // already small enough

        // Scale so the longest edge equals max_dim, preserving aspect ratio.
        CGFloat scale = (CGFloat)max_dim / fmax((CGFloat)w, (CGFloat)h);
        NSInteger nw = (NSInteger)floor(w * scale);
        NSInteger nh = (NSInteger)floor(h * scale);

        NSBitmapImageRep *dstRep = [[NSBitmapImageRep alloc]
            initWithBitmapDataPlanes:NULL
                          pixelsWide:nw
                          pixelsHigh:nh
                       bitsPerSample:8
                     samplesPerPixel:4
                            hasAlpha:YES
                            isPlanar:NO
                      colorSpaceName:NSCalibratedRGBColorSpace
                         bytesPerRow:0
                        bitsPerPixel:0];

        [NSGraphicsContext saveGraphicsState];
        NSGraphicsContext *ctx = [NSGraphicsContext graphicsContextWithBitmapImageRep:dstRep];
        [NSGraphicsContext setCurrentContext:ctx];
        [ctx setImageInterpolation:NSImageInterpolationHigh];

        NSImage *nsImage = [[NSImage alloc] initWithSize:NSMakeSize((CGFloat)w, (CGFloat)h)];
        [nsImage addRepresentation:srcRep];
        [nsImage drawInRect:NSMakeRect(0, 0, (CGFloat)nw, (CGFloat)nh)
                   fromRect:NSZeroRect
                  operation:NSCompositingOperationCopy
                   fraction:1.0];

        [NSGraphicsContext restoreGraphicsState];

        NSData *jpegData = [dstRep representationUsingType:NSBitmapImageFileTypeJPEG
                                                properties:@{NSImageCompressionFactor: @0.85}];
        // Only overwrite the original if re-encoding actually produced data.
        if (jpegData && jpegData.length > 0) {
            [jpegData writeToFile:nsPath atomically:YES];
        }
    }
}

// ---------------------------------------------------------------------------
// Helper: run screencapture with given args, verify output
// ---------------------------------------------------------------------------
static int run_screencapture(const char* const argv[], const char* output_path) {
    pid_t pid;
    int status = 0;
    // NOTE(review): const_cast template argument restored (garbled in patch).
    if (posix_spawnp(&pid, "screencapture", nullptr, nullptr,
                     const_cast<char* const*>(argv), environ) != 0) {
        return -1;
    }
    waitpid(pid, &status, 0);
    if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) return -1;

    // screencapture can exit 0 but write nothing (e.g. missing permission).
    struct stat st;
    if (stat(output_path, &st) != 0 || st.st_size == 0) return -1;

    downscale_jpeg_if_needed(output_path, 2048);
    return 0;
}

// ===========================================================================
// Visual overlay — spawns rcli_overlay helper process (separate Cocoa app)
// because AppKit window management requires the main thread, which FTXUI owns.
// Communication via stdin/stdout pipes.
// ===========================================================================

static pid_t g_overlay_pid = 0;
static FILE *g_overlay_stdin = nullptr;   // we write commands here
static FILE *g_overlay_stdout = nullptr;  // we read responses here
static std::atomic<bool> g_overlay_visible{false};

// Find the rcli_overlay binary next to the rcli binary; fall back to a bare
// name resolved through PATH by the exec call.
static std::string find_overlay_binary() {
    char path[1024];
    uint32_t size = sizeof(path);
    if (_NSGetExecutablePath(path, &size) == 0) {
        std::string dir(path);
        auto slash = dir.rfind('/');
        if (slash != std::string::npos) {
            std::string candidate = dir.substr(0, slash + 1) + "rcli_overlay";
            if (access(candidate.c_str(), X_OK) == 0) return candidate;
        }
    }
    // Fallback: try PATH
    return "rcli_overlay";
}

// Send a command to the overlay process and read the one-line response.
static std::string overlay_cmd(const char* cmd) {
    if (!g_overlay_stdin || !g_overlay_stdout) return "";
    fprintf(g_overlay_stdin, "%s\n", cmd);
    fflush(g_overlay_stdin);
    char buf[256] = {0};
    if (fgets(buf, sizeof(buf), g_overlay_stdout)) {
        size_t len = strlen(buf);
        if (len > 0 && buf[len-1] == '\n') buf[len-1] = '\0';
        return std::string(buf);
    }
    return "";
}

void screen_capture_show_overlay(int x, int y, int w, int h) {
    (void)x; (void)y; (void)w; (void)h;  // TODO: pass initial rect to helper

    if (g_overlay_pid > 0) {
        // Already running — just return
        return;
    }

    std::string binary = find_overlay_binary();

    // Create pipes: parent→child stdin, child→parent stdout
    int pipe_in[2], pipe_out[2];
    if (pipe(pipe_in) != 0) return;
    if (pipe(pipe_out) != 0) {
        // FIX: don't leak the first pipe's fds when the second pipe() fails.
        close(pipe_in[0]);
        close(pipe_in[1]);
        return;
    }

    pid_t pid = fork();
    if (pid == 0) {
        // Child: wire up pipes
        close(pipe_in[1]);   // close write end of stdin pipe
        close(pipe_out[0]);  // close read end of stdout pipe
        dup2(pipe_in[0], STDIN_FILENO);
        dup2(pipe_out[1], STDOUT_FILENO);
        close(pipe_in[0]);
        close(pipe_out[1]);
        // Redirect stderr to /dev/null to keep terminal clean
        int devnull = open("/dev/null", O_WRONLY);
        if (devnull >= 0) { dup2(devnull, STDERR_FILENO); close(devnull); }
        // FIX: execl() never searches PATH, so the bare "rcli_overlay"
        // fallback above could never be found. execlp() searches PATH for
        // slash-less names and behaves exactly like execl() for paths.
        execlp(binary.c_str(), "rcli_overlay", (char*)nullptr);
        _exit(1);
    }
    if (pid < 0) {
        // FIX: fork() failure was previously recorded as pid -1; clean up.
        close(pipe_in[0]);  close(pipe_in[1]);
        close(pipe_out[0]); close(pipe_out[1]);
        return;
    }

    // Parent
    close(pipe_in[0]);
    close(pipe_out[1]);
    g_overlay_pid = pid;
    g_overlay_stdin = fdopen(pipe_in[1], "w");
    g_overlay_stdout = fdopen(pipe_out[0], "r");

    // Wait for "ready" from child
    char buf[64] = {0};
    if (g_overlay_stdout && fgets(buf, sizeof(buf), g_overlay_stdout)) {
        g_overlay_visible.store(true);
    }
}

void screen_capture_hide_overlay(void) {
    if (g_overlay_pid <= 0) return;

    overlay_cmd("quit");

    // Clean up pipes and reap the helper.
    if (g_overlay_stdin)  { fclose(g_overlay_stdin);  g_overlay_stdin = nullptr; }
    if (g_overlay_stdout) { fclose(g_overlay_stdout); g_overlay_stdout = nullptr; }
    int status;
    waitpid(g_overlay_pid, &status, 0);
    g_overlay_pid = 0;
    g_overlay_visible.store(false);
}

int screen_capture_overlay_active(void) {
    return g_overlay_visible.load() ? 1 : 0;
}

int screen_capture_overlay_region(const char* output_path) {
    if (!g_overlay_visible.load() || g_overlay_pid <= 0) return -1;

    // Get frame coordinates (top-left origin)
    std::string frame_str = overlay_cmd("frame");
    if (frame_str.empty()) return -1;

    // Hide overlay for capture (the helper sleeps briefly before replying
    // so the compositor has removed the window).
    overlay_cmd("hide");

    // Capture the region
    char region[128];
    strlcpy(region, frame_str.c_str(), sizeof(region));
    const char* argv[] = {
        "screencapture", "-x", "-t", "jpg", "-R", region, output_path, nullptr
    };
    int result = run_screencapture(argv, output_path);

    // Show overlay again
    overlay_cmd("show");

    return result;
}

// ---------------------------------------------------------------------------
// Track the previously active app (before our terminal got focus)
// Polls frontmostApplication every 200ms on a background thread.
// NSWorkspace notifications don't work in CLI apps (no NSApplication run loop).
// ---------------------------------------------------------------------------

static std::atomic<pid_t> g_prev_active_pid{0};
static pid_t g_our_terminal_pid = 0;
static char g_prev_app_name[256] = {0};
static std::mutex g_name_mutex;   // guards g_prev_app_name

// Walk up the process tree to find which ancestor owns a window (our terminal).
static pid_t find_terminal_pid() {
    @autoreleasepool {
        pid_t cur = getpid();
        pid_t ancestors[8];
        int n = 0;
        while (cur > 1 && n < 8) {
            ancestors[n++] = cur;
            struct kinfo_proc kp;
            size_t length = sizeof(kp);
            int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PID, cur };
            if (sysctl(mib, 4, &kp, &length, NULL, 0) != 0) break;
            pid_t ppid = kp.kp_eproc.e_ppid;
            if (ppid == cur) break;
            cur = ppid;
        }

        // Check which ancestor owns on-screen windows — that's the terminal.
        #pragma clang diagnostic push
        #pragma clang diagnostic ignored "-Wdeprecated-declarations"
        CFArrayRef windowList = CGWindowListCopyWindowInfo(
            kCGWindowListOptionOnScreenOnly | kCGWindowListExcludeDesktopElements,
            kCGNullWindowID);
        #pragma clang diagnostic pop
        if (windowList) {
            NSArray *windows = CFBridgingRelease(windowList);
            // Search outermost ancestor first.
            for (int i = n - 1; i >= 0; i--) {
                for (NSDictionary *info in windows) {
                    pid_t ownerPid = [[info objectForKey:(NSString *)kCGWindowOwnerPID] intValue];
                    if (ownerPid == ancestors[i]) {
                        return ancestors[i];
                    }
                }
            }
        }
        // Heuristic fallback: grandparent of the shell, else direct parent.
        return (n >= 3) ? ancestors[2] : getppid();
    }
}

// Background poller — tracks which non-terminal app is frontmost.
__attribute__((constructor))
static void start_app_tracking() {
    @autoreleasepool {
        g_our_terminal_pid = find_terminal_pid();

        // Seed with current frontmost app if it's not our terminal.
        NSRunningApplication *front = [[NSWorkspace sharedWorkspace] frontmostApplication];
        if (front && front.processIdentifier != g_our_terminal_pid) {
            g_prev_active_pid.store(front.processIdentifier, std::memory_order_relaxed);
            NSString *name = front.localizedName ?: @"unknown";
            std::lock_guard<std::mutex> lock(g_name_mutex);
            strlcpy(g_prev_app_name, [name UTF8String], sizeof(g_prev_app_name));
        }

        // Poll frontmostApplication every 200ms on a background thread.
        std::thread([]() {
            pthread_setname_np("rcli.app_tracker");
            pid_t last_seen_pid = 0;
            while (true) {
                @autoreleasepool {
                    NSRunningApplication *front =
                        [[NSWorkspace sharedWorkspace] frontmostApplication];
                    if (front) {
                        pid_t pid = front.processIdentifier;
                        // If a non-terminal app is frontmost and it changed, record it.
                        if (pid != g_our_terminal_pid && pid != last_seen_pid) {
                            last_seen_pid = pid;
                            g_prev_active_pid.store(pid, std::memory_order_relaxed);
                            NSString *name = front.localizedName ?: @"unknown";
                            std::lock_guard<std::mutex> lock(g_name_mutex);
                            strlcpy(g_prev_app_name, [name UTF8String],
                                    sizeof(g_prev_app_name));
                        }
                    }
                }
                usleep(200000);  // 200ms
            }
        }).detach();
    }
}

// ---------------------------------------------------------------------------
// Window lookup helpers
// ---------------------------------------------------------------------------
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"

// Filter out tiny utility/panel windows — "normal" means at least 100x100.
static bool is_normal_window(NSDictionary *info) {
    NSDictionary *bounds = [info objectForKey:(NSString *)kCGWindowBounds];
    if (!bounds) return false;
    CGFloat w = [[bounds objectForKey:@"Width"] floatValue];
    CGFloat h = [[bounds objectForKey:@"Height"] floatValue];
    return (w >= 100 && h >= 100);
}

// Find a normal window belonging to a specific PID.
static CGWindowID find_window_for_pid(pid_t target_pid) {
    CFArrayRef windowList = CGWindowListCopyWindowInfo(
        kCGWindowListOptionOnScreenOnly | kCGWindowListExcludeDesktopElements,
        kCGNullWindowID);
    if (!windowList) return kCGNullWindowID;

    NSArray *windows = CFBridgingRelease(windowList);
    for (NSDictionary *info in windows) {
        pid_t ownerPid = [[info objectForKey:(NSString *)kCGWindowOwnerPID] intValue];
        if (ownerPid != target_pid) continue;
        if (!is_normal_window(info)) continue;
        return [[info objectForKey:(NSString *)kCGWindowNumber] unsignedIntValue];
    }
    return kCGNullWindowID;
}

// Find the frontmost normal window of the frontmost app.
static CGWindowID get_frontmost_window_id() {
    @autoreleasepool {
        NSRunningApplication *frontApp = [[NSWorkspace sharedWorkspace] frontmostApplication];
        if (!frontApp) return kCGNullWindowID;
        return find_window_for_pid(frontApp.processIdentifier);
    }
}

// Find the window of the previously active app (before terminal got focus).
static CGWindowID get_previous_app_window_id() {
    @autoreleasepool {
        pid_t prev_pid = g_prev_active_pid.load(std::memory_order_relaxed);
        if (prev_pid <= 0) return kCGNullWindowID;
        return find_window_for_pid(prev_pid);
    }
}

#pragma clang diagnostic pop

// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------

static int capture_window_id(CGWindowID wid, const char* output_path) {
    if (wid == kCGNullWindowID) return -1;
    char wid_str[32];
    snprintf(wid_str, sizeof(wid_str), "%u", wid);
    const char* argv[] = {
        "screencapture", "-x", "-t", "jpg", "-l", wid_str, output_path, nullptr
    };
    return run_screencapture(argv, output_path);
}

int screen_capture_active_window(const char* output_path) {
    CGWindowID wid = get_frontmost_window_id();
    if (wid == kCGNullWindowID) {
        return screen_capture_full_screen(output_path);
    }
    return capture_window_id(wid, output_path);
}

int screen_capture_behind_terminal(const char* output_path) {
    // Use the tracked previously-active app (before terminal got focus)
    {
        std::lock_guard<std::mutex> lock(g_name_mutex);
        pid_t prev = g_prev_active_pid.load(std::memory_order_relaxed);
        fprintf(stderr, "[Screen] Targeting: %s (PID %d)\n",
                g_prev_app_name[0] ? g_prev_app_name : "none", prev);
    }
    CGWindowID wid = get_previous_app_window_id();
    if (wid == kCGNullWindowID) {
        fprintf(stderr, "[Screen] No previous app window found, falling back to full screen\n");
        return screen_capture_full_screen(output_path);
    }
    return capture_window_id(wid, output_path);
}

int screen_capture_full_screen(const char* output_path) {
    const char* argv[] = {
        "screencapture", "-x", "-t", "jpg", output_path, nullptr
    };
    return run_screencapture(argv, output_path);
}

int screen_capture_screenshot(const char* output_path) {
    // Prefer overlay if active, then active window, then full screen.
    // FIX: the header documents a fallback chain, but a failed overlay
    // capture previously returned -1 without trying the other strategies.
    if (screen_capture_overlay_active()) {
        if (screen_capture_overlay_region(output_path) == 0) return 0;
    }
    return screen_capture_active_window(output_path);
}

const char* screen_capture_target_app_name(char* buf, int buf_size) {
    std::lock_guard<std::mutex> lock(g_name_mutex);
    if (g_prev_app_name[0]) {
        strlcpy(buf, g_prev_app_name, buf_size);
    } else {
        strlcpy(buf, "unknown", buf_size);
    }
    return buf;
}

// ===========================================================================
// (patch continues: src/cli/help.h — usage-text additions for the new
//  "vlm" and "screen" commands; hunk context lives in the original file)
// ===========================================================================
with VLM\n" " %srag%s RAG: ingest docs, query, status\n" " %ssetup%s Download AI models (~1GB)\n" " %smodels%s Manage all AI models (LLM, STT, TTS)\n" @@ -45,6 +47,10 @@ inline void print_usage(const char* argv0) { " rcli ask \"open Safari\" # one-shot command\n" " rcli ask \"create a note called Ideas\" # triggers action\n" " rcli actions # see all actions\n" + " rcli vlm photo.jpg # analyze an image\n" + " rcli vlm photo.jpg \"What is this?\" # image with custom prompt\n" + " rcli screen # capture & analyze screen\n" + " rcli screen \"What app is open?\" # screen with custom prompt\n" " rcli actions create_note # action detail\n" " rcli setup # download models\n\n", color::bold, color::orange, color::reset, @@ -69,6 +75,8 @@ inline void print_usage(const char* argv0) { color::green, color::reset, color::green, color::reset, color::green, color::reset, + color::green, color::reset, + color::green, color::reset, color::dim, color::reset, color::dim, color::reset); } @@ -130,7 +138,13 @@ inline void print_help_interactive() { fprintf(stderr, " %sdo [text]%s execute action directly (no JSON needed)\n", color::bold, color::reset); fprintf(stderr, " %srag status%s show indexed documents\n", color::bold, color::reset); fprintf(stderr, " %srag ingest %s index docs for Q&A\n", color::bold, color::reset); + fprintf(stderr, " %scamera%s capture photo from webcam & analyze\n", color::bold, color::reset); + fprintf(stderr, " %sscreen%s capture screenshot & analyze\n", color::bold, color::reset); fprintf(stderr, " %squit%s exit\n\n", color::bold, color::reset); + fprintf(stderr, " %s%s Vision:%s\n", color::bold, color::orange, color::reset); + fprintf(stderr, " Drag & drop an image file to analyze it with the VLM.\n"); + fprintf(stderr, " Type %scamera%s to capture a photo from your webcam.\n", color::bold, color::reset); + fprintf(stderr, " Type %sscreen%s to capture and analyze your screen.\n\n", color::bold, color::reset); fprintf(stderr, " %s%s Try:%s\n", color::bold, 
color::orange, color::reset); fprintf(stderr, " %s\"Open Safari\" \"What's on my calendar?\" \"Set volume to 50\"%s\n\n", color::dim, color::reset); diff --git a/src/cli/main.cpp b/src/cli/main.cpp index 4f49472..58cd4e1 100644 --- a/src/cli/main.cpp +++ b/src/cli/main.cpp @@ -27,6 +27,13 @@ #include "audio/mic_permission.h" #include "core/personality.h" #include "llama.h" +#include "mtmd.h" +#include "mtmd-helper.h" +#include "audio/camera_capture.h" +#include "audio/screen_capture.h" +#include + +extern char** environ; // Defined in cli_common.h as a forward declaration; implemented here because // it depends on the Objective-C mic_permission bridge compiled into this TU. @@ -427,6 +434,229 @@ static int cmd_ask(const Args& args) { return 0; } +// ============================================================================= +// VLM subcommand +// ============================================================================= + +static int cmd_vlm(const Args& args) { + if (args.arg1.empty() || args.help) { + fprintf(stderr, "\n Usage: rcli vlm [prompt]\n\n"); + fprintf(stderr, " Analyze an image using a Vision Language Model.\n\n"); + fprintf(stderr, " Examples:\n"); + fprintf(stderr, " rcli vlm photo.jpg\n"); + fprintf(stderr, " rcli vlm screenshot.png \"What text do you see?\"\n"); + fprintf(stderr, " rcli vlm diagram.jpg \"Explain this diagram\"\n\n"); + return args.help ? 
0 : 1; + } + + // Resolve image path + std::string image_path = args.arg1; + if (!image_path.empty() && image_path[0] == '~') { + if (const char* home = getenv("HOME")) + image_path = std::string(home) + image_path.substr(1); + } + // Make relative paths absolute + if (!image_path.empty() && image_path[0] != '/') { + char cwd[4096]; + if (getcwd(cwd, sizeof(cwd))) + image_path = std::string(cwd) + "/" + image_path; + } + + struct stat st; + if (stat(image_path.c_str(), &st) != 0) { + fprintf(stderr, "%s%sError: Image not found: %s%s\n", + color::bold, color::red, image_path.c_str(), color::reset); + return 1; + } + + if (!rastack::VlmEngine::is_supported_image(image_path)) { + fprintf(stderr, "%s%sError: Unsupported image format. Supported: jpg, png, bmp, gif, webp, tga%s\n", + color::bold, color::red, color::reset); + return 1; + } + + std::string prompt = args.arg2.empty() ? "Describe this image in detail." : args.arg2; + + // Create engine with models_dir set (we only need VLM, not the full pipeline) + std::string config_json = "{\"models_dir\": \"" + args.models_dir + "\"}"; + g_engine = rcli_create(config_json.c_str()); + if (!g_engine) return 1; + + // Initialize VLM + fprintf(stderr, "%sInitializing VLM...%s\n", color::dim, color::reset); + if (rcli_vlm_init(g_engine) != 0) { + fprintf(stderr, "\n%s%s VLM not available.%s\n\n", color::bold, color::red, color::reset); + fprintf(stderr, " VLM requires the llama.cpp engine and a VLM model.\n"); + fprintf(stderr, " Switch engine: %srcli engine llamacpp%s\n", color::bold, color::reset); + fprintf(stderr, " Download model: %srcli models vlm%s\n\n", color::bold, color::reset); + rcli_destroy(g_engine); + return 1; + } + + // Show which VLM backend is active + const char* backend = rcli_vlm_backend_name(g_engine); + const char* model = rcli_vlm_model_name(g_engine); + if (backend && backend[0]) { + fprintf(stderr, "%s VLM: %s%s%s via %s%s%s%s\n", + color::dim, color::reset, color::bold, model, + color::reset, 
color::dim, backend, color::reset); + } + + fprintf(stderr, "%sAnalyzing image: %s%s\n", color::dim, image_path.c_str(), color::reset); + + const char* response = rcli_vlm_analyze(g_engine, image_path.c_str(), prompt.c_str()); + if (response && response[0]) { + fprintf(stdout, "%s\n", response); + RCLIVlmStats stats; + if (rcli_vlm_get_stats(g_engine, &stats) == 0) { + fprintf(stderr, "\n%s⚡ %.1f tok/s (%d tokens, %.1fs total, first token %.0fms)%s\n", + color::dim, stats.gen_tok_per_sec, stats.generated_tokens, + stats.total_time_sec, stats.first_token_ms, color::reset); + } + } else { + fprintf(stderr, "%s%sError: VLM analysis failed%s\n", + color::bold, color::red, color::reset); + rcli_destroy(g_engine); + return 1; + } + + rcli_destroy(g_engine); + return 0; +} + +// ============================================================================= +// Camera subcommand — capture + analyze +// ============================================================================= + +static int cmd_camera(const Args& args) { + std::string prompt = args.arg1.empty() ? "Describe what you see in this photo in detail." : args.arg1; + + fprintf(stderr, "%sCapturing photo from camera...%s\n", color::dim, color::reset); + std::string photo_path = "/tmp/rcli_camera.jpg"; + + int rc = camera_capture_photo(photo_path.c_str()); + if (rc != 0) { + fprintf(stderr, "%s%sError: Camera capture failed. Check camera permissions.%s\n", + color::bold, color::red, color::reset); + return 1; + } + fprintf(stderr, "%sPhoto captured! 
Analyzing with VLM...%s\n", color::dim, color::reset); + + std::string config_json = "{\"models_dir\": \"" + args.models_dir + "\"}"; + g_engine = rcli_create(config_json.c_str()); + if (!g_engine) return 1; + + if (rcli_vlm_init(g_engine) != 0) { + fprintf(stderr, "\n%s%s VLM not available.%s\n\n", color::bold, color::red, color::reset); + fprintf(stderr, " VLM requires the llama.cpp engine and a VLM model.\n"); + fprintf(stderr, " Switch engine: %srcli engine llamacpp%s\n", color::bold, color::reset); + fprintf(stderr, " Download model: %srcli models vlm%s\n\n", color::bold, color::reset); + rcli_destroy(g_engine); + return 1; + } + + const char* backend = rcli_vlm_backend_name(g_engine); + const char* model = rcli_vlm_model_name(g_engine); + if (backend && backend[0]) { + fprintf(stderr, "%s VLM: %s%s%s via %s%s%s%s\n", + color::dim, color::reset, color::bold, model, + color::reset, color::dim, backend, color::reset); + } + + const char* response = rcli_vlm_analyze(g_engine, photo_path.c_str(), prompt.c_str()); + if (response && response[0]) { + fprintf(stdout, "%s\n", response); + if (!args.no_speak) { + rcli_init(g_engine, args.models_dir.c_str(), args.gpu_layers); + rcli_speak(g_engine, response); + } + RCLIVlmStats stats; + if (rcli_vlm_get_stats(g_engine, &stats) == 0) { + fprintf(stderr, "\n%s⚡ %.1f tok/s (%d tokens, %.1fs total, first token %.0fms)%s\n", + color::dim, stats.gen_tok_per_sec, stats.generated_tokens, + stats.total_time_sec, stats.first_token_ms, color::reset); + } + { + pid_t pid; + const char* argv[] = {"open", photo_path.c_str(), nullptr}; + posix_spawnp(&pid, "open", nullptr, nullptr, + const_cast<char* const*>(argv), environ); + } + } else { + fprintf(stderr, "%s%sError: VLM analysis failed%s\n", + color::bold, color::red, color::reset); + rcli_destroy(g_engine); + return 1; + } + + rcli_destroy(g_engine); + return 0; +} + +// ============================================================================= +// Screen subcommand — screenshot + analyze +// 
============================================================================= + +static int cmd_screen(const Args& args) { + std::string prompt = args.arg1.empty() + ? "Describe what you see on this screen in detail." : args.arg1; + + fprintf(stderr, "%sCapturing screenshot...%s\n", color::dim, color::reset); + std::string screen_path = "/tmp/rcli_screen.jpg"; + + int rc = screen_capture_screenshot(screen_path.c_str()); + if (rc != 0) { + fprintf(stderr, "%s%sError: Screen capture failed. Check screen recording permissions.%s\n", + color::bold, color::red, color::reset); + return 1; + } + fprintf(stderr, "%sScreenshot captured! Analyzing with VLM...%s\n", color::dim, color::reset); + + std::string config_json = "{\"models_dir\": \"" + args.models_dir + "\"}"; + g_engine = rcli_create(config_json.c_str()); + if (!g_engine) return 1; + + if (rcli_vlm_init(g_engine) != 0) { + fprintf(stderr, "\n%s%s VLM not available.%s\n\n", color::bold, color::red, color::reset); + fprintf(stderr, " VLM requires the llama.cpp engine and a VLM model.\n"); + fprintf(stderr, " Switch engine: %srcli engine llamacpp%s\n", color::bold, color::reset); + fprintf(stderr, " Download model: %srcli models vlm%s\n\n", color::bold, color::reset); + rcli_destroy(g_engine); + return 1; + } + + const char* backend = rcli_vlm_backend_name(g_engine); + const char* model = rcli_vlm_model_name(g_engine); + if (backend && backend[0]) { + fprintf(stderr, "%s VLM: %s%s%s via %s%s%s%s\n", + color::dim, color::reset, color::bold, model, + color::reset, color::dim, backend, color::reset); + } + + const char* response = rcli_vlm_analyze(g_engine, screen_path.c_str(), prompt.c_str()); + if (response && response[0]) { + fprintf(stdout, "%s\n", response); + if (!args.no_speak) { + rcli_init(g_engine, args.models_dir.c_str(), args.gpu_layers); + rcli_speak(g_engine, response); + } + RCLIVlmStats stats; + if (rcli_vlm_get_stats(g_engine, &stats) == 0) { + fprintf(stderr, "\n%s⚡ %.1f tok/s (%d tokens, %.1fs total, 
first token %.0fms)%s\n", + color::dim, stats.gen_tok_per_sec, stats.generated_tokens, + stats.total_time_sec, stats.first_token_ms, color::reset); + } + } else { + fprintf(stderr, "%s%sError: VLM analysis failed%s\n", + color::bold, color::red, color::reset); + rcli_destroy(g_engine); + return 1; + } + + rcli_destroy(g_engine); + return 0; +} + // ============================================================================= // RAG subcommands // ============================================================================= @@ -654,16 +884,17 @@ static int cmd_metalrt(const Args& args) { inst ? color::reset : ""); } - // STT/TTS component models + // STT/TTS/VLM component models size_t offset = mrt_models.size(); - fprintf(stderr, "\n %s— STT/TTS Components —%s\n", color::bold, color::reset); + fprintf(stderr, "\n %s— STT/TTS/VLM Components —%s\n", color::bold, color::reset); fprintf(stderr, " %s# %-28s %-8s %-5s Status%s\n", color::bold, "Model", "Size", "Type", color::reset); for (size_t i = 0; i < comp_models.size(); i++) { auto& cm = comp_models[i]; bool inst = rcli::is_metalrt_component_installed(cm); - std::string type_label = (cm.component == "stt") ? "STT" : "TTS"; + std::string type_label = (cm.component == "stt") ? "STT" + : (cm.component == "vlm") ? 
"VLM" : "TTS"; fprintf(stderr, " %s%zu%s %-28s %-8s %-5s %s%s%s\n", color::bold, offset + i + 1, color::reset, cm.name.c_str(), @@ -917,6 +1148,7 @@ int main(int argc, char** argv) { if (!args.verbose) { llama_log_set([](enum ggml_log_level, const char*, void*) {}, nullptr); + mtmd_helper_log_set([](enum ggml_log_level, const char*, void*) {}, nullptr); } if (args.command.empty()) { @@ -930,6 +1162,9 @@ int main(int argc, char** argv) { if (args.command == "actions") return cmd_actions(args); if (args.command == "action") return cmd_action(args); if (args.command == "rag") return cmd_rag(args); + if (args.command == "vlm") return cmd_vlm(args); + if (args.command == "camera") return cmd_camera(args); + if (args.command == "screen") return cmd_screen(args); if (args.command == "setup") return cmd_setup(args); if (args.command == "models") return cmd_models(args); if (args.command == "voices") return cmd_voices(args); diff --git a/src/cli/model_pickers.h b/src/cli/model_pickers.h index 949e25b..ec0b847 100644 --- a/src/cli/model_pickers.h +++ b/src/cli/model_pickers.h @@ -12,6 +12,7 @@ #include "models/model_registry.h" #include "models/tts_model_registry.h" #include "models/stt_model_registry.h" +#include "models/vlm_model_registry.h" #include "engines/metalrt_loader.h" // ============================================================================= @@ -407,6 +408,83 @@ inline int pick_metalrt_stt() { return 0; } +// ============================================================================= +// VLM picker +// ============================================================================= + +inline int pick_vlm(const std::string& models_dir) { + auto all = rcli::all_vlm_models(); + + fprintf(stderr, "\n %s%s VLM Models (Vision \xC2\xB7 llama.cpp)%s\n\n", color::bold, color::orange, color::reset); + + fprintf(stderr, " %s# %-30s %-12s %s%s\n", + color::bold, "Model", "Size", "Status", color::reset); + fprintf(stderr, " %s── %-30s %-12s %s%s\n", + color::dim, 
"──────────────────────────────", "────────────", "──────────", color::reset); + + for (size_t i = 0; i < all.size(); i++) { + auto& m = all[i]; + bool installed = rcli::is_vlm_model_installed(models_dir, m); + std::string status; + if (installed) status = "\033[32minstalled\033[0m"; + else status = "\033[2mnot installed\033[0m"; + std::string label = m.name; + if (m.is_default) label += " (default)"; + char size_str[32]; + int total_mb = m.model_size_mb + m.mmproj_size_mb; + if (total_mb >= 1024) + snprintf(size_str, sizeof(size_str), "%.1f GB", total_mb / 1024.0); + else + snprintf(size_str, sizeof(size_str), "%d MB", total_mb); + fprintf(stderr, " %s%-2zu%s %-30s %-12s %s\n", + installed ? "\033[32m" : "", i + 1, installed ? "\033[0m" : "", + label.c_str(), size_str, status.c_str()); + } + fprintf(stderr, "\n %sCommands:%s [1-%zu] download/select | q cancel\n Choice: ", + color::bold, color::reset, all.size()); + fflush(stderr); + + int choice = read_picker_choice(); + if (choice == 0 || choice == -1) { picker_no_changes(); return 0; } + if (choice < 1 || choice > (int)all.size()) { fprintf(stderr, "\n Invalid choice.\n\n"); return 1; } + + auto& sel = all[choice - 1]; + bool installed = rcli::is_vlm_model_installed(models_dir, sel); + if (installed) { + fprintf(stderr, "\n %s%s%s is already installed.%s\n\n", + color::bold, color::green, sel.name.c_str(), color::reset); + return 0; + } + + int total_mb = sel.model_size_mb + sel.mmproj_size_mb; + char size_str[32]; + if (total_mb >= 1024) + snprintf(size_str, sizeof(size_str), "%.1f GB", total_mb / 1024.0); + else + snprintf(size_str, sizeof(size_str), "%d MB", total_mb); + fprintf(stderr, "\n %s%s%s%s is not installed (%s). Download? 
[Y/n]: ", + color::bold, color::yellow, sel.name.c_str(), color::reset, size_str); + fflush(stderr); + if (!confirm_download()) { picker_cancelled(); return 0; } + + std::string model_path = models_dir + "/" + sel.model_filename; + std::string mmproj_path = models_dir + "/" + sel.mmproj_filename; + std::string cmd = "bash -c '" + "set -e; echo \" Downloading " + sel.name + " model...\"; echo \"\"; " + "curl -L -# -o \"" + model_path + "\" \"" + sel.model_url + "\"; " + "echo \"\"; echo \" Downloading vision projector...\"; echo \"\"; " + "curl -L -# -o \"" + mmproj_path + "\" \"" + sel.mmproj_url + "\"; " + "echo \"\"; echo \" Done!\"; '"; + fprintf(stderr, "\n"); + if (system(cmd.c_str()) != 0) { + fprintf(stderr, "\n %s%sDownload failed.%s\n\n", color::bold, color::red, color::reset); + return 1; + } + fprintf(stderr, "\n %s%sInstalled: %s%s\n Use: rcli vlm [prompt]\n\n", + color::bold, color::green, sel.name.c_str(), color::reset); + return 0; +} + // ============================================================================= // Unified models dashboard // ============================================================================= @@ -417,6 +495,7 @@ inline int cmd_models(const Args& args) { if (args.arg1 == "llm") return pick_llm(models_dir); if (args.arg1 == "stt") return pick_stt(models_dir); if (args.arg1 == "tts") return pick_tts(models_dir); + if (args.arg1 == "vlm") return pick_vlm(models_dir); if (args.arg1 == "metalrt-stt" || args.arg1 == "whisper") return pick_metalrt_stt(); if (args.help) { @@ -426,12 +505,14 @@ inline int cmd_models(const Args& args) { " models Unified model dashboard\n" " models llm LLM model picker\n" " models stt STT model picker\n" - " models tts TTS voice picker\n\n" + " models tts TTS voice picker\n" + " models vlm VLM (vision) model picker\n\n" " %sEXAMPLES%s\n" " rcli models # dashboard — pick a modality\n" " rcli models llm # switch LLM directly\n" " rcli models stt # switch offline STT directly\n" - " rcli models tts # 
switch TTS voice directly\n\n", + " rcli models tts # switch TTS voice directly\n" + " rcli models vlm # manage VLM models for image analysis\n\n", color::bold, color::orange, color::reset, color::bold, color::reset, color::bold, color::reset); @@ -483,6 +564,21 @@ inline int cmd_models(const Args& args) { color::green, tts_name.c_str(), color::reset, tts_inst, tts_all.size()); + // VLM row + auto vlm_all = rcli::all_vlm_models(); + int vlm_inst = 0; + std::string vlm_name = "not installed"; + for (auto& m : vlm_all) { + if (rcli::is_vlm_model_installed(models_dir, m)) { + vlm_inst++; + if (vlm_name == "not installed") vlm_name = m.name; + } + } + fprintf(stderr, " %s4%s %sVLM (vision)%s %s%-28s%s %d / %zu\n", + color::green, color::reset, color::bold, color::reset, + vlm_inst > 0 ? color::green : color::dim, vlm_name.c_str(), color::reset, + vlm_inst, vlm_all.size()); + // MetalRT Whisper row auto mrt_comps = rcli::metalrt_component_models(); std::string mrt_stt_pref = rcli::read_selected_metalrt_stt_id(); @@ -498,7 +594,7 @@ inline int cmd_models(const Args& args) { } } if (mrt_stt_pref.empty() && mrt_stt_inst > 0) mrt_stt_name = "auto (first installed)"; - fprintf(stderr, " %s4%s %sMetalRT STT%s %s%-28s%s %d / %d\n", + fprintf(stderr, " %s5%s %sMetalRT STT%s %s%-28s%s %d / %d\n", color::green, color::reset, color::bold, color::reset, color::green, mrt_stt_name.c_str(), color::reset, mrt_stt_inst, mrt_stt_total); @@ -521,7 +617,7 @@ inline int cmd_models(const Args& args) { } fprintf(stderr, " %sNote: STT streaming (Zipformer) is always active for live mic.%s\n\n", color::dim, color::reset); - fprintf(stderr, " %sSelect modality:%s 1 LLM | 2 STT | 3 TTS | 4 MetalRT STT | q cancel\n Choice: ", + fprintf(stderr, " %sSelect modality:%s 1 LLM | 2 STT | 3 TTS | 4 VLM | 5 MetalRT STT | q cancel\n Choice: ", color::bold, color::reset); fflush(stderr); @@ -530,7 +626,8 @@ inline int cmd_models(const Args& args) { if (choice == 1 || choice == -2) return 
pick_llm(models_dir); // -2 (a) → LLM as first if (choice == 2) return pick_stt(models_dir); if (choice == 3) return pick_tts(models_dir); - if (choice == 4) return pick_metalrt_stt(); + if (choice == 4) return pick_vlm(models_dir); + if (choice == 5) return pick_metalrt_stt(); fprintf(stderr, "\n Invalid choice.\n\n"); return 1; @@ -595,10 +692,20 @@ inline int cmd_info() { ? "MetalRT (Metal GPU — LLM, STT, TTS on-device)" : "llama.cpp + sherpa-onnx (ONNX Runtime)"; + auto vlm_all_info = rcli::all_vlm_models(); + auto [vlm_found, vlm_def] = rcli::find_installed_vlm(models_dir); + std::string vlm_info; + if (vlm_found) { + vlm_info = vlm_def.name + " (llama.cpp, Metal GPU)"; + } else { + vlm_info = "not installed — run: rcli models vlm"; + } + fprintf(stdout, "\n%s%s RCLI%s %s%s%s\n\n" " %sEngine:%s %s\n" " %sLLM:%s %s\n" + " %sVLM:%s %s\n" " %sSTT:%s %s\n" " %sTTS:%s %s\n" " %sVAD:%s Silero VAD\n" @@ -610,6 +717,7 @@ inline int cmd_info() { color::dim, RA_VERSION, color::reset, color::bold, color::reset, engine_info.c_str(), color::bold, color::reset, llm_info.c_str(), + color::bold, color::reset, vlm_info.c_str(), color::bold, color::reset, stt_info.c_str(), color::bold, color::reset, tts_info.c_str(), color::bold, color::reset, @@ -677,5 +785,24 @@ inline int cmd_info() { if (!any_tts) fprintf(stdout, " (none — run: rcli setup)\n"); fprintf(stdout, "\n"); + // Installed VLM + fprintf(stdout, " %sInstalled VLM:%s\n", color::bold, color::reset); + bool any_vlm = false; + for (auto& m : vlm_all_info) { + if (rcli::is_vlm_model_installed(models_dir, m)) { + char size_str[32]; + int total_mb = m.model_size_mb + m.mmproj_size_mb; + if (total_mb >= 1024) + snprintf(size_str, sizeof(size_str), "%.1f GB", total_mb / 1024.0); + else + snprintf(size_str, sizeof(size_str), "%d MB", total_mb); + fprintf(stdout, " %-28s %-7s installed\n", + m.name.c_str(), size_str); + any_vlm = true; + } + } + if (!any_vlm) fprintf(stdout, " (none — run: rcli models vlm)\n"); + 
fprintf(stdout, "\n"); + return 0; } diff --git a/src/cli/setup_cmds.h b/src/cli/setup_cmds.h index f33dcc7..b5f85fb 100644 --- a/src/cli/setup_cmds.h +++ b/src/cli/setup_cmds.h @@ -178,13 +178,15 @@ inline int cmd_setup(const Args& args) { if (!cm.default_install) continue; std::string cm_dir = rcli::metalrt_models_dir() + "/" + cm.dir_name; if (rcli::is_metalrt_component_installed(cm)) { - std::string skip_label = (cm.component == "stt") ? "STT" : "TTS"; + std::string skip_label = (cm.component == "stt") ? "STT" + : (cm.component == "vlm") ? "VLM" : "TTS"; fprintf(stderr, " %s%sMetalRT %s already installed:%s %s\n", color::bold, color::green, skip_label.c_str(), color::reset, cm.name.c_str()); continue; } - std::string type_label = (cm.component == "stt") ? "STT" : "TTS"; + std::string type_label = (cm.component == "stt") ? "STT" + : (cm.component == "vlm") ? "VLM" : "TTS"; fprintf(stderr, " %sDownloading MetalRT %s: %s (~%s)...%s\n", color::dim, type_label.c_str(), cm.name.c_str(), rcli::format_size(cm.size_mb).c_str(), color::reset); diff --git a/src/cli/tui_app.h b/src/cli/tui_app.h index 6ec4ed1..7b01d1e 100644 --- a/src/cli/tui_app.h +++ b/src/cli/tui_app.h @@ -12,8 +12,15 @@ #include "models/stt_model_registry.h" #include "actions/action_registry.h" #include "engines/metalrt_loader.h" +#include "engines/vlm_engine.h" +#include "audio/camera_capture.h" +#include "audio/screen_capture.h" +#include "models/vlm_model_registry.h" #include "core/log.h" #include "core/personality.h" +#include + +extern char** environ; #include #include @@ -432,7 +439,43 @@ class TuiApp { if (c == "r" || c == "R") { enter_rag_mode(); return true; } if (c == "d" || c == "D") { close_all_panels(); enter_cleanup_mode(); return true; } if (c == "p" || c == "P") { enter_personality_mode(); return true; } - // V key: voice mode removed — push-to-talk via SPACE is always active + // V key: capture photo from camera and analyze with VLM + if (c == "v" || c == "V") { + 
run_camera_vlm("Describe what you see in this photo in detail."); + return true; + } + // S key: toggle visual mode (VLM only on llama.cpp engine) + if (c == "s" || c == "S") { + if (screen_capture_overlay_active()) { + screen_capture_hide_overlay(); + add_system_message("Exiting visual mode..."); + screen_->Post(Event::Custom); + std::thread([this]() { + rcli_vlm_exit(engine_); + add_system_message("Visual mode OFF"); + screen_->Post(Event::Custom); + }).detach(); + } else { + add_system_message("Entering visual mode, loading VLM..."); + screen_->Post(Event::Custom); + std::thread([this]() { + if (rcli_vlm_init(engine_) == 0) { + const char* vbe = rcli_vlm_backend_name(engine_); + const char* vmodel = rcli_vlm_model_name(engine_); + screen_capture_show_overlay(0, 0, 0, 0); + std::string msg = "Visual mode ON"; + if (vbe && vbe[0]) + msg += std::string(" — ") + vmodel + " via " + vbe; + msg += ". Drag/resize the green frame, then ask a question"; + add_system_message(msg); + } else { + add_system_message("VLM requires the llama.cpp engine. Switch with: rcli engine llamacpp, then download a model via [M] \xe2\x86\x92 VLM Models"); + } + screen_->Post(Event::Custom); + }).detach(); + } + return true; + } if (c == "t" || c == "T") { tool_trace_enabled_ = !tool_trace_enabled_.load(std::memory_order_relaxed); add_system_message(tool_trace_enabled_ ? 
"Tool call trace: ON" : "Tool call trace: OFF"); @@ -538,6 +581,11 @@ class TuiApp { std::string user_text = transcript; add_user_message(user_text); + // Visual mode: route voice to VLM screen analysis instead of LLM + if (screen_capture_overlay_active()) { + run_screen_vlm(user_text); + return; + } voice_state_ = VoiceState::THINKING; screen_->Post(Event::Custom); @@ -1069,6 +1117,11 @@ class TuiApp { else right.push_back(text("[A] actions ") | dim); right.push_back(text("[C] convo ") | dim); + right.push_back(text("[V] camera ") | dim); + if (screen_capture_overlay_active()) + right.push_back(text("[S] visual ● ") | ftxui::color(ftxui::Color::Green)); + else + right.push_back(text("[S] visual ") | dim); right.push_back(text("[R] RAG ") | dim); right.push_back(text("[P] personality ") | dim); right.push_back(text("[D] cleanup ") | dim); @@ -1458,6 +1511,7 @@ class TuiApp { e.is_archive = false; models_entries_.push_back(e); } + } else { // ---- llama.cpp engine: show GGUF models only ---- const auto* llm_active = rcli::resolve_active_model(dir, llm_all); @@ -1501,6 +1555,21 @@ class TuiApp { e.archive_dir = v.archive_dir; models_entries_.push_back(e); } + + // VLM models (vision) + auto vlm_all = rcli::all_vlm_models(); + { ModelEntry h; h.name = "VLM Models (Vision \xC2\xB7 llama.cpp)"; h.is_header = true; models_entries_.push_back(h); } + for (auto& m : vlm_all) { + ModelEntry e; + e.name = m.name; e.id = m.id; e.modality = "VLM"; + e.size_mb = m.model_size_mb + m.mmproj_size_mb; + e.installed = rcli::is_vlm_model_installed(dir, m); + e.is_active = false; // VLM is lazy-loaded, no "active" concept + e.is_default = m.is_default; e.is_recommended = m.is_default; + e.description = m.description; + e.url = m.model_url; e.filename = m.model_filename; e.is_archive = false; + models_entries_.push_back(e); + } } for (int i = 0; i < (int)models_entries_.size(); i++) { @@ -1666,7 +1735,20 @@ class TuiApp { bool archive = e.is_archive; std::string archive_dir_name = 
e.archive_dir; - std::thread([this, idx, dir, url, fname, mod, id, nm, archive, archive_dir_name]() { + // For VLM, also capture the mmproj URL + std::string vlm_mmproj_url, vlm_mmproj_fname; + if (mod == "VLM") { + auto vlm_models = rcli::all_vlm_models(); + for (auto& vm : vlm_models) { + if (vm.id == id) { + vlm_mmproj_url = vm.mmproj_url; + vlm_mmproj_fname = vm.mmproj_filename; + break; + } + } + } + std::thread([this, idx, dir, url, fname, mod, id, nm, archive, archive_dir_name, + vlm_mmproj_url, vlm_mmproj_fname]() { int rc; if (archive) { rc = system(("curl -sL '" + url + "' | tar xj -C '" + dir + "' 2>/dev/null").c_str()); @@ -1677,6 +1759,12 @@ class TuiApp { if (stat(src.c_str(), &st) == 0 && stat(dst.c_str(), &st) != 0) rename(src.c_str(), dst.c_str()); } + } else if (mod == "VLM" && !vlm_mmproj_url.empty()) { + // VLM needs two files: language model + mmproj + rc = system(("curl -sL -o '" + dir + "/" + fname + "' '" + url + "' 2>/dev/null").c_str()); + if (rc == 0) { + rc = system(("curl -sL -o '" + dir + "/" + vlm_mmproj_fname + "' '" + vlm_mmproj_url + "' 2>/dev/null").c_str()); + } } else { rc = system(("curl -sL -o '" + dir + "/" + fname + "' '" + url + "' 2>/dev/null").c_str()); } @@ -1698,6 +1786,9 @@ class TuiApp { } else { if (mod == "STT") rcli::write_selected_stt_id(id); else if (mod == "TTS") rcli::write_selected_tts_id(id); + else if (mod == "VLM") { + // VLM doesn't need selection — just mark installed + } models_message_ = "Downloaded & selected: " + nm + ". 
Restart RCLI to apply."; models_msg_color_ = theme_.success; } @@ -2143,6 +2234,117 @@ class TuiApp { // process_input // ==================================================================== + void run_camera_vlm(const std::string& prompt) { + add_system_message("Capturing photo from camera..."); + voice_state_ = VoiceState::THINKING; + std::string prompt_copy = prompt; + std::thread([this, prompt_copy]() { + std::string photo_path = "/tmp/rcli_camera_" + + std::to_string(std::chrono::system_clock::now().time_since_epoch().count()) + ".jpg"; + int rc = camera_capture_photo(photo_path.c_str()); + if (rc != 0) { + add_response("(Camera capture failed. Check camera permissions in System Settings > Privacy & Security > Camera.)", ""); + voice_state_ = VoiceState::IDLE; + screen_->Post(Event::Custom); + return; + } + add_system_message("Photo captured! Loading VLM..."); + screen_->Post(Event::Custom); + + const char* response = rcli_vlm_analyze( + engine_, photo_path.c_str(), prompt_copy.c_str()); + + // Show which backend handled it + const char* vbe = rcli_vlm_backend_name(engine_); + const char* vmodel = rcli_vlm_model_name(engine_); + if (vbe && vbe[0]) { + add_system_message(std::string("VLM: ") + vmodel + " via " + vbe); + screen_->Post(Event::Custom); + } + + if (response && response[0]) { + add_response(response, "VLM"); + voice_state_ = VoiceState::SPEAKING; + screen_->Post(Event::Custom); + rcli_speak(engine_, response); + RCLIVlmStats stats; + if (rcli_vlm_get_stats(engine_, &stats) == 0) { + char buf[128]; + snprintf(buf, sizeof(buf), "⚡ %.1f tok/s | %d tokens | %.1fs total", + stats.gen_tok_per_sec, stats.generated_tokens, stats.total_time_sec); + add_system_message(buf); + } + } else { + add_response("(VLM not available. Requires llama.cpp engine and a VLM model. 
Use [M] → VLM Models to download.)", ""); + } + voice_state_ = VoiceState::IDLE; + { + pid_t pid; + const char* argv[] = {"open", photo_path.c_str(), nullptr}; + posix_spawnp(&pid, "open", nullptr, nullptr, + const_cast<char* const*>(argv), environ); + } + screen_->Post(Event::Custom); + }).detach(); + } + + void run_screen_vlm(const std::string& prompt) { + char app_name[256]; + screen_capture_target_app_name(app_name, sizeof(app_name)); + add_system_message(std::string("Capturing screenshot of ") + app_name + "..."); + voice_state_ = VoiceState::THINKING; + std::string prompt_copy = prompt; + std::thread([this, prompt_copy]() { + std::string screen_path = "/tmp/rcli_screen_" + + std::to_string(std::chrono::system_clock::now().time_since_epoch().count()) + ".jpg"; + int rc = screen_capture_screenshot(screen_path.c_str()); + if (rc != 0) { + add_response("(Screen capture failed. Check screen recording permissions.)", ""); + voice_state_ = VoiceState::IDLE; + screen_->Post(Event::Custom); + return; + } + add_system_message("Loading VLM..."); + screen_->Post(Event::Custom); + + std::string accumulated; + auto stream_cb = [](const char* event, const char* data, void* ud) { + auto* accum = static_cast<std::string*>(ud); + if (std::strcmp(event, "token") == 0) { + accum->append(data); + } + }; + int vlm_rc = rcli_vlm_analyze_stream(engine_, screen_path.c_str(), + prompt_copy.c_str(), stream_cb, &accumulated); + + // Show which backend handled it + const char* vbe = rcli_vlm_backend_name(engine_); + const char* vmodel = rcli_vlm_model_name(engine_); + if (vbe && vbe[0]) { + add_system_message(std::string("VLM: ") + vmodel + " via " + vbe); + screen_->Post(Event::Custom); + } + + if (vlm_rc == 0 && !accumulated.empty()) { + add_response(accumulated, "VLM"); + voice_state_ = VoiceState::SPEAKING; + screen_->Post(Event::Custom); + rcli_speak_streaming(engine_, accumulated.c_str(), nullptr, nullptr); + RCLIVlmStats stats; + if (rcli_vlm_get_stats(engine_, &stats) == 0) { + char buf[128]; + 
snprintf(buf, sizeof(buf), "⚡ %.1f tok/s | %d tokens | %.1fs total", + stats.gen_tok_per_sec, stats.generated_tokens, stats.total_time_sec); + add_system_message(buf); + } + } else { + add_response("(VLM not available. Requires llama.cpp engine and a VLM model. Use [M] → VLM Models to download.)", ""); + } + voice_state_ = VoiceState::IDLE; + screen_->Post(Event::Custom); + }).detach(); + } + void process_input(const std::string& input) { if (input.empty()) return; @@ -2202,6 +2404,26 @@ class TuiApp { return; } + if (cmd == "visual") { + if (screen_capture_overlay_active()) { + screen_capture_hide_overlay(); + add_system_message("Visual mode OFF"); + } else { + screen_capture_show_overlay(0, 0, 0, 0); + add_system_message("Visual mode ON — drag/resize the green frame, then ask a question"); + } + return; + } + + if (cmd == "screen" || cmd == "screenshot") { + run_screen_vlm("Describe what you see on this screen in detail."); + return; + } + + if (cmd == "camera" || cmd == "photo" || cmd == "webcam") { + run_camera_vlm("Describe what you see in this photo in detail."); + return; + } if (!engine_) { add_response("Engine not initialized.", ""); @@ -2340,6 +2562,34 @@ class TuiApp { struct stat path_st; if (!resolved.empty() && resolved[0] == '/' && stat(resolved.c_str(), &path_st) == 0) { + // Check if this is an image file → route to VLM analysis + if (S_ISREG(path_st.st_mode) && rastack::VlmEngine::is_supported_image(resolved)) { + add_system_message("Image detected: " + resolved); + add_system_message("Analyzing image with VLM..."); + voice_state_ = VoiceState::THINKING; + std::string path_copy = resolved; + std::thread([this, path_copy]() { + const char* response = rcli_vlm_analyze( + engine_, path_copy.c_str(), "Describe this image in detail."); + if (response && response[0]) { + add_response(response, "VLM"); + RCLIVlmStats stats; + if (rcli_vlm_get_stats(engine_, &stats) == 0) { + char buf[128]; + snprintf(buf, sizeof(buf), "⚡ %.1f tok/s | %d tokens | %.1fs 
total", + stats.gen_tok_per_sec, stats.generated_tokens, stats.total_time_sec); + add_system_message(buf); + } + } else { + add_response("(VLM not available. Requires llama.cpp engine and a VLM model. Use [M] → VLM Models to download.)", ""); + } + voice_state_ = VoiceState::IDLE; + screen_->Post(Event::Custom); + }).detach(); + return; + } + + // Non-image path → RAG ingest add_system_message("Detected path: " + resolved); add_system_message("Indexing for RAG... this may take a moment."); std::string path_copy = resolved; diff --git a/src/engines/metalrt_loader.cpp b/src/engines/metalrt_loader.cpp index 7dd5363..ba0f1c8 100644 --- a/src/engines/metalrt_loader.cpp +++ b/src/engines/metalrt_loader.cpp @@ -186,6 +186,22 @@ bool MetalRTLoader::load() { LOG_DEBUG("MetalRT", "TTS symbols: tts_create=%p tts_synthesize=%p tts_sample_rate=%p", (void*)tts_create, (void*)tts_synthesize, (void*)tts_sample_rate); + // Vision (VLM) symbols (optional) + vision_create = resolve<CreateFn>("metalrt_vision_create"); + vision_destroy = resolve<DestroyFn>("metalrt_vision_destroy"); + vision_load = resolve<LoadFn>("metalrt_vision_load"); + vision_analyze = resolve<VisionAnalyzeFn>("metalrt_vision_analyze"); + vision_analyze_stream = resolve<VisionAnalyzeStreamFn>("metalrt_vision_analyze_stream"); + vision_generate = resolve<VisionGenerateFn>("metalrt_vision_generate"); + vision_generate_stream = resolve<VisionGenerateStreamFn>("metalrt_vision_generate_stream"); + vision_reset = resolve<ResetFn>("metalrt_vision_reset"); + vision_model_name = resolve<ModelNameFn>("metalrt_vision_model_name"); + vision_device_name = resolve<DeviceNameFn>("metalrt_vision_device_name"); + vision_free_result = resolve<VisionFreeResultFn>("metalrt_vision_free_result"); + + LOG_DEBUG("MetalRT", "VLM symbols: vision_create=%p vision_analyze=%p vision_stream=%p", + (void*)vision_create, (void*)vision_analyze, (void*)vision_analyze_stream); + if (!fn_abi_version_ || !create || !destroy || !load_model || !generate) { LOG_ERROR("MetalRT", "dylib missing required LLM symbols: abi=%p create=%p destroy=%p load=%p gen=%p", (void*)fn_abi_version_, (void*)create, (void*)destroy, 
(void*)load_model, (void*)generate); diff --git a/src/engines/metalrt_loader.h b/src/engines/metalrt_loader.h index 6d6b0b8..41247ed 100644 --- a/src/engines/metalrt_loader.h +++ b/src/engines/metalrt_loader.h @@ -128,6 +128,47 @@ class MetalRTLoader { TtsFreeAudioFn tts_free_audio = nullptr; TtsSampleRateFn tts_sample_rate = nullptr; + // --- Vision (VLM) function pointers --- + + struct MetalRTVisionResult { + const char* text; + const char* thinking; + const char* response; + int prompt_tokens; + int generated_tokens; + double vision_encode_ms; + double prefill_ms; + double decode_ms; + double tps; + }; + + struct MetalRTVisionOptions { + int max_tokens; + int top_k; + float temperature; + bool think; + }; + + using VisionAnalyzeFn = MetalRTVisionResult (*)(void*, const char*, const char*, const MetalRTVisionOptions*); + using VisionAnalyzeStreamFn = MetalRTVisionResult (*)(void*, const char*, const char*, MetalRTStreamCb, void*, const MetalRTVisionOptions*); + using VisionGenerateFn = MetalRTVisionResult (*)(void*, const char*, const MetalRTVisionOptions*); + using VisionGenerateStreamFn = MetalRTVisionResult (*)(void*, const char*, MetalRTStreamCb, void*, const MetalRTVisionOptions*); + using VisionFreeResultFn = void (*)(MetalRTVisionResult); + + CreateFn vision_create = nullptr; + DestroyFn vision_destroy = nullptr; + LoadFn vision_load = nullptr; + VisionAnalyzeFn vision_analyze = nullptr; + VisionAnalyzeStreamFn vision_analyze_stream = nullptr; + VisionGenerateFn vision_generate = nullptr; + VisionGenerateStreamFn vision_generate_stream = nullptr; + ResetFn vision_reset = nullptr; + ModelNameFn vision_model_name = nullptr; + DeviceNameFn vision_device_name = nullptr; + VisionFreeResultFn vision_free_result = nullptr; + + bool has_vision() const { return vision_create != nullptr && vision_analyze != nullptr; } + // --- Install / remove / version management --- static bool install(const std::string& version = "latest"); diff --git 
a/src/engines/tts_engine.cpp b/src/engines/tts_engine.cpp index cf5cd95..b139960 100644 --- a/src/engines/tts_engine.cpp +++ b/src/engines/tts_engine.cpp @@ -77,9 +77,26 @@ bool TtsEngine::init(const TtsConfig& config) { return true; } +bool TtsEngine::reinit() { + if (!initialized_) return false; + LOG_DEBUG("TTS", "Reinitializing ONNX session to prevent audio degradation"); + if (tts_) { + SherpaOnnxDestroyOfflineTts(tts_); + tts_ = nullptr; + } + initialized_ = false; + synth_count_ = 0; + return init(config_); +} + std::vector TtsEngine::synthesize(const std::string& text) { if (!initialized_ || !tts_) return {}; + // Periodically reinit to prevent audio quality degradation + if (++synth_count_ >= kReinitInterval) { + reinit(); + } + stats_ = TtsStats{}; int64_t t_start = now_us(); diff --git a/src/engines/tts_engine.h b/src/engines/tts_engine.h index 40c36e9..90b9018 100644 --- a/src/engines/tts_engine.h +++ b/src/engines/tts_engine.h @@ -63,12 +63,18 @@ class TtsEngine { // Change speaker at runtime (Kokoro multi-voice) void set_speaker_id(int id) { config_.speaker_id = id; } + // Reinitialize the ONNX Runtime session to flush accumulated state. + // Call periodically to prevent audio degradation over long sessions. 
+ bool reinit(); + private: const SherpaOnnxOfflineTts* tts_ = nullptr; TtsConfig config_; TtsStats stats_; int sample_rate_ = 22050; bool initialized_ = false; + int synth_count_ = 0; // synthesis calls since last reinit + static constexpr int kReinitInterval = 20; // reinit every N calls }; } // namespace rastack diff --git a/src/engines/vlm_engine.cpp b/src/engines/vlm_engine.cpp new file mode 100644 index 0000000..1f2d09b --- /dev/null +++ b/src/engines/vlm_engine.cpp @@ -0,0 +1,266 @@ +#include "engines/vlm_engine.h" +#include "core/log.h" +#include "llama.h" +#include "ggml.h" +#include "ggml-backend.h" +#include "mtmd.h" +#include "mtmd-helper.h" +#include +#include +#include + +namespace rastack { + +VlmEngine::VlmEngine() = default; + +VlmEngine::~VlmEngine() { + shutdown(); +} + +void VlmEngine::shutdown() { + if (ctx_mtmd_) { mtmd_free(ctx_mtmd_); ctx_mtmd_ = nullptr; } + if (sampler_) { llama_sampler_free(sampler_); sampler_ = nullptr; } + if (ctx_) { llama_free(ctx_); ctx_ = nullptr; } + if (model_) { llama_model_free(model_); model_ = nullptr; } + vocab_ = nullptr; + initialized_ = false; + stats_ = VlmStats{}; + LOG_DEBUG("VLM", "Shutdown complete"); +} + +bool VlmEngine::init(const VlmConfig& config) { + if (initialized_) shutdown(); + + config_ = config; + + // Initialize backend (loads Metal, etc.) 
— safe to call multiple times + static std::once_flag backend_init_flag; + std::call_once(backend_init_flag, [] { ggml_backend_load_all(); }); + + // Load language model + llama_model_params model_params = llama_model_default_params(); + model_params.n_gpu_layers = config.n_gpu_layers; + model_params.use_mmap = config.use_mmap; + model_params.use_mlock = config.use_mlock; + + LOG_DEBUG("VLM", "Loading VLM model: %s", config.model_path.c_str()); + model_ = llama_model_load_from_file(config.model_path.c_str(), model_params); + if (!model_) { + LOG_ERROR("VLM", "Failed to load VLM model"); + return false; + } + + vocab_ = llama_model_get_vocab(model_); + + // Create inference context + llama_context_params ctx_params = llama_context_default_params(); + ctx_params.n_ctx = config.n_ctx; + ctx_params.n_batch = config.n_batch; + ctx_params.n_threads = config.n_threads; + ctx_params.n_threads_batch = config.n_threads_batch; + ctx_params.no_perf = false; + ctx_params.flash_attn_type = config.flash_attn ? LLAMA_FLASH_ATTN_TYPE_AUTO : LLAMA_FLASH_ATTN_TYPE_DISABLED; + + ctx_ = llama_init_from_model(model_, ctx_params); + if (!ctx_) { + LOG_ERROR("VLM", "Failed to create VLM context"); + llama_model_free(model_); + model_ = nullptr; + return false; + } + + // Initialize mtmd (vision projector) + LOG_DEBUG("VLM", "Loading vision projector: %s", config.mmproj_path.c_str()); + mtmd_context_params mtmd_params = mtmd_context_params_default(); + mtmd_params.use_gpu = (config.n_gpu_layers > 0); + mtmd_params.n_threads = config.n_threads_batch; + mtmd_params.flash_attn_type = config.flash_attn ? 
LLAMA_FLASH_ATTN_TYPE_AUTO : LLAMA_FLASH_ATTN_TYPE_DISABLED; + + ctx_mtmd_ = mtmd_init_from_file(config.mmproj_path.c_str(), model_, mtmd_params); + if (!ctx_mtmd_) { + LOG_ERROR("VLM", "Failed to load vision projector (mmproj)"); + llama_free(ctx_); + llama_model_free(model_); + ctx_ = nullptr; + model_ = nullptr; + return false; + } + + if (!mtmd_support_vision(ctx_mtmd_)) { + LOG_ERROR("VLM", "Model does not support vision input"); + mtmd_free(ctx_mtmd_); + llama_free(ctx_); + llama_model_free(model_); + ctx_mtmd_ = nullptr; + ctx_ = nullptr; + model_ = nullptr; + return false; + } + + // Setup sampler chain + auto sparams = llama_sampler_chain_default_params(); + sampler_ = llama_sampler_chain_init(sparams); + if (config.temperature > 0.0f) { + llama_sampler_chain_add(sampler_, llama_sampler_init_temp(config.temperature)); + llama_sampler_chain_add(sampler_, llama_sampler_init_top_k(config.top_k)); + llama_sampler_chain_add(sampler_, llama_sampler_init_top_p(config.top_p, 1)); + llama_sampler_chain_add(sampler_, llama_sampler_init_dist(LLAMA_DEFAULT_SEED)); + } else { + llama_sampler_chain_add(sampler_, llama_sampler_init_greedy()); + } + + initialized_ = true; + LOG_INFO("VLM", "Initialized (vision support: yes)"); + return true; +} + +std::string VlmEngine::analyze_image(const std::string& image_path, + const std::string& prompt, + TokenCallback on_token) { + if (!initialized_) return ""; + + cancelled_.store(false, std::memory_order_relaxed); + stats_ = VlmStats{}; + + // Clear KV cache + llama_memory_clear(llama_get_memory(ctx_), true); + if (sampler_) llama_sampler_reset(sampler_); + + // 1. Load image + LOG_DEBUG("VLM", "Loading image: %s", image_path.c_str()); + mtmd_bitmap* bitmap = mtmd_helper_bitmap_init_from_file(ctx_mtmd_, image_path.c_str()); + if (!bitmap) { + LOG_ERROR("VLM", "Failed to load image: %s", image_path.c_str()); + return ""; + } + + // 2. 
Build prompt with media marker using ChatML template (Qwen3-VL format) + // The model expects: <|im_start|>system\n...<|im_end|>\n<|im_start|>user\n\nprompt<|im_end|>\n<|im_start|>assistant\n + std::string marker = mtmd_default_marker(); + std::string full_prompt = + "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\n" + marker + "\n" + prompt + "<|im_end|>\n" + "<|im_start|>assistant\n"; + + mtmd_input_text input_text; + input_text.text = full_prompt.c_str(); + input_text.add_special = true; + input_text.parse_special = true; + + // 3. Tokenize (combines text tokens + image tokens) + mtmd_input_chunks* chunks = mtmd_input_chunks_init(); + const mtmd_bitmap* bitmap_ptr = bitmap; + + int64_t t_encode_start = now_us(); + int32_t tokenize_result = mtmd_tokenize(ctx_mtmd_, chunks, &input_text, &bitmap_ptr, 1); + if (tokenize_result != 0) { + LOG_ERROR("VLM", "Failed to tokenize image+text (error=%d)", tokenize_result); + mtmd_input_chunks_free(chunks); + mtmd_bitmap_free(bitmap); + return ""; + } + + size_t n_tokens = mtmd_helper_get_n_tokens(chunks); + stats_.prompt_tokens = n_tokens; + LOG_DEBUG("VLM", "Tokenized: %zu total tokens (text + image)", n_tokens); + + // 4. 
Evaluate all chunks (text + image encoding + decoding) + int64_t t_prompt_start = now_us(); + llama_pos n_past = 0; + int32_t eval_result = mtmd_helper_eval_chunks( + ctx_mtmd_, ctx_, chunks, + n_past, // n_past + 0, // seq_id + config_.n_batch, // n_batch + true, // logits_last + &n_past // updated n_past + ); + + stats_.image_encode_us = now_us() - t_encode_start; + stats_.prompt_eval_us = now_us() - t_prompt_start; + + // Clean up image resources + mtmd_input_chunks_free(chunks); + mtmd_bitmap_free(bitmap); + + if (eval_result != 0) { + LOG_ERROR("VLM", "Failed to evaluate image+text chunks (error=%d)", eval_result); + return ""; + } + + LOG_DEBUG("VLM", "Image encoded in %.1fms, prompt eval in %.1fms", + stats_.image_encode_us / 1000.0, stats_.prompt_eval_us / 1000.0); + + // 5. Generate tokens (same pattern as LlmEngine::generate) + std::string result; + int64_t t_gen_start = now_us(); + bool first_token = true; + + for (int i = 0; i < config_.max_tokens; i++) { + if (cancelled_.load(std::memory_order_relaxed)) { + LOG_DEBUG("VLM", "Generation cancelled"); + break; + } + + int32_t new_token = llama_sampler_sample(sampler_, ctx_, -1); + + if (first_token) { + stats_.first_token_us = now_us() - t_prompt_start; + first_token = false; + } + + if (llama_vocab_is_eog(vocab_, new_token)) { + break; + } + + // Decode token to text + char buf[256]; + int n = llama_token_to_piece(vocab_, new_token, buf, sizeof(buf), 0, true); + if (n < 0) continue; + std::string piece(buf, n); + + result += piece; + stats_.generated_tokens++; + + if (on_token) { + TokenOutput tok; + tok.text = piece; + tok.token_id = new_token; + tok.is_eos = false; + tok.is_tool_call = false; + on_token(tok); + } + + // Feed token back for next iteration + llama_batch batch = llama_batch_get_one(&new_token, 1); + if (llama_decode(ctx_, batch) != 0) { + LOG_ERROR("VLM", "Failed to decode token"); + break; + } + } + + stats_.generation_us = now_us() - t_gen_start; + + LOG_DEBUG("VLM", "Generated %lld 
tokens (%.1f tok/s), first token: %.1fms", + stats_.generated_tokens, stats_.gen_tps(), + stats_.first_token_us / 1000.0); + + return result; +} + +bool VlmEngine::is_supported_image(const std::string& path) { + // Get extension (case-insensitive) + auto dot = path.rfind('.'); + if (dot == std::string::npos) return false; + + std::string ext = path.substr(dot); + // Convert to lowercase + std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); + + return ext == ".jpg" || ext == ".jpeg" || + ext == ".png" || ext == ".bmp" || + ext == ".gif" || ext == ".webp" || + ext == ".tga"; +} + +} // namespace rastack diff --git a/src/engines/vlm_engine.h b/src/engines/vlm_engine.h new file mode 100644 index 0000000..57739a2 --- /dev/null +++ b/src/engines/vlm_engine.h @@ -0,0 +1,88 @@ +#pragma once + +#include "core/types.h" +#include +#include +#include + +// Forward declare llama types +struct llama_model; +struct llama_context; +struct llama_sampler; +struct llama_vocab; + +// Forward declare mtmd types +struct mtmd_context; + +namespace rastack { + +struct VlmConfig { + std::string model_path; // Path to VLM language model GGUF + std::string mmproj_path; // Path to vision projector (mmproj) GGUF + int n_gpu_layers = 99; + int n_ctx = 4096; // VLM needs larger context for image tokens + int n_batch = 512; + int n_threads = 1; + int n_threads_batch = 8; + float temperature = 0.7f; + float top_p = 0.9f; + int top_k = 40; + int max_tokens = 512; + bool use_mmap = true; + bool use_mlock = false; + bool flash_attn = true; +}; + +struct VlmStats { + int64_t prompt_tokens = 0; + int64_t generated_tokens = 0; + int64_t prompt_eval_us = 0; + int64_t generation_us = 0; + int64_t image_encode_us = 0; // Time spent encoding the image + double prompt_tps() const { return prompt_tokens > 0 ? prompt_tokens * 1e6 / prompt_eval_us : 0; } + double gen_tps() const { return generated_tokens > 0 ? 
generated_tokens * 1e6 / generation_us : 0; } + int64_t first_token_us = 0; +}; + +class VlmEngine { +public: + VlmEngine(); + ~VlmEngine(); + + // Initialize model + vision projector + bool init(const VlmConfig& config); + + // Release all resources + void shutdown(); + + // Analyze an image with a text prompt + // Returns the generated description/analysis text + std::string analyze_image(const std::string& image_path, + const std::string& prompt, + TokenCallback on_token = nullptr); + + // Cancel ongoing generation + void cancel() { cancelled_.store(true, std::memory_order_release); } + + // Get stats from last generation + const VlmStats& last_stats() const { return stats_; } + + bool is_initialized() const { return initialized_; } + + // Check if an image file is a supported format + static bool is_supported_image(const std::string& path); + +private: + llama_model* model_ = nullptr; + llama_context* ctx_ = nullptr; + llama_sampler* sampler_ = nullptr; + const llama_vocab* vocab_ = nullptr; + mtmd_context* ctx_mtmd_ = nullptr; + + VlmConfig config_; + VlmStats stats_; + bool initialized_ = false; + std::atomic cancelled_{false}; +}; + +} // namespace rastack diff --git a/src/models/model_registry.h b/src/models/model_registry.h index 79d3da4..e0084d1 100644 --- a/src/models/model_registry.h +++ b/src/models/model_registry.h @@ -287,7 +287,7 @@ inline bool is_metalrt_model_installed(const LlmModelDef& m) { struct MetalRTComponentModel { std::string id; std::string name; - std::string component; // "stt" or "tts" + std::string component; // "stt", "tts", or "vlm" std::string hf_repo; // HuggingFace repo path (org/repo) std::string hf_subdir; // subdirectory within repo (empty for flat repos) std::string dir_name; // local dir under metalrt_models_dir() @@ -350,6 +350,7 @@ inline std::vector metalrt_component_models() { }; } + inline bool is_metalrt_component_installed(const MetalRTComponentModel& m) { std::string dir = metalrt_models_dir() + "/" + m.dir_name; if 
#pragma once
// =============================================================================
// RCLI VLM Model Registry
// =============================================================================
//
// Registry of supported VLM (Vision Language Model) models.
// Each model consists of a language model GGUF + an mmproj (vision projector) GGUF.
//
// =============================================================================

#include <string>
#include <utility>   // std::pair (reconstructed: <...> includes were stripped)
#include <vector>
#include <unistd.h>  // access() / R_OK

namespace rcli {

// Static description of one downloadable VLM (language model + projector).
struct VlmModelDef {
    std::string id;               // Unique slug: "smolvlm-500m"
    std::string name;             // Display name: "SmolVLM 500M Instruct"
    std::string model_filename;   // Language model GGUF filename
    std::string mmproj_filename;  // Vision projector GGUF filename
    std::string model_url;        // HuggingFace download URL for language model
    std::string mmproj_url;       // HuggingFace download URL for mmproj
    int model_size_mb;            // Approximate model download size
    int mmproj_size_mb;           // Approximate mmproj download size
    std::string description;      // One-line description
    bool is_default;              // Default model for `rcli vlm`
};

// All models known to the registry, in preference order.
inline std::vector<VlmModelDef> all_vlm_models() {
    return {
        {
            /* id              */ "qwen3-vl-2b",
            /* name            */ "Qwen3 VL 2B Instruct",
            /* model_filename  */ "Qwen3-VL-2B-Instruct-Q8_0.gguf",
            /* mmproj_filename */ "mmproj-Qwen3-VL-2B-Instruct-Q8_0.gguf",
            /* model_url       */ "https://huggingface.co/ggml-org/Qwen3-VL-2B-Instruct-GGUF/resolve/main/Qwen3-VL-2B-Instruct-Q8_0.gguf",
            /* mmproj_url      */ "https://huggingface.co/ggml-org/Qwen3-VL-2B-Instruct-GGUF/resolve/main/mmproj-Qwen3-VL-2B-Instruct-Q8_0.gguf",
            /* model_size_mb   */ 1830,
            /* mmproj_size_mb  */ 445,
            /* description     */ "Qwen3 Vision-Language model. High quality image analysis.",
            /* is_default      */ false,
        },
        {
            /* id              */ "lfm2-vl-1.6b",
            /* name            */ "Liquid LFM2 VL 1.6B",
            /* model_filename  */ "LFM2-VL-1.6B-Q8_0.gguf",
            /* mmproj_filename */ "mmproj-LFM2-VL-1.6B-Q8_0.gguf",
            /* model_url       */ "https://huggingface.co/LiquidAI/LFM2-VL-1.6B-GGUF/resolve/main/LFM2-VL-1.6B-Q8_0.gguf",
            /* mmproj_url      */ "https://huggingface.co/LiquidAI/LFM2-VL-1.6B-GGUF/resolve/main/mmproj-LFM2-VL-1.6B-Q8_0.gguf",
            /* model_size_mb   */ 1250,
            /* mmproj_size_mb  */ 210,
            /* description     */ "Liquid Foundation Model for vision. Fast, 128K context.",
            /* is_default      */ false,
        },
        {
            /* id              */ "smolvlm-500m",
            /* name            */ "SmolVLM 500M Instruct",
            /* model_filename  */ "SmolVLM-500M-Instruct-Q8_0.gguf",
            /* mmproj_filename */ "mmproj-SmolVLM-500M-Instruct-Q8_0.gguf",
            /* model_url       */ "https://huggingface.co/ggml-org/SmolVLM-500M-Instruct-GGUF/resolve/main/SmolVLM-500M-Instruct-Q8_0.gguf",
            /* mmproj_url      */ "https://huggingface.co/ggml-org/SmolVLM-500M-Instruct-GGUF/resolve/main/mmproj-SmolVLM-500M-Instruct-Q8_0.gguf",
            /* model_size_mb   */ 437,
            /* mmproj_size_mb  */ 109,
            /* description     */ "Smallest VLM. Fast image analysis, lower quality.",
            /* is_default      */ false,
        },
    };
}

// Returns {true, model} for the entry flagged is_default.
// NOTE(review): no registry entry currently sets is_default, so this always
// returns {false, {}} — confirm whether one model should be flagged.
inline std::pair<bool, VlmModelDef> get_default_vlm_model() {
    for (const auto& m : all_vlm_models()) {
        if (m.is_default) return {true, m};
    }
    return {false, {}};
}

// True when both the language model and the projector GGUF are readable
// under models_dir.
inline bool is_vlm_model_installed(const std::string& models_dir, const VlmModelDef& m) {
    std::string model_path = models_dir + "/" + m.model_filename;
    std::string mmproj_path = models_dir + "/" + m.mmproj_filename;
    return access(model_path.c_str(), R_OK) == 0 &&
           access(mmproj_path.c_str(), R_OK) == 0;
}

// First registry entry whose files both exist under models_dir, or {false, {}}.
inline std::pair<bool, VlmModelDef> find_installed_vlm(const std::string& models_dir) {
    for (const auto& m : all_vlm_models()) {
        if (is_vlm_model_installed(models_dir, m)) return {true, m};
    }
    return {false, {}};
}

} // namespace rcli
when personality changes) void set_system_prompt(const std::string& prompt) { config_.system_prompt = prompt; } @@ -168,6 +173,7 @@ class Orchestrator { SttEngine stt_; OfflineSttEngine offline_stt_; // Whisper for file pipeline LlmEngine llm_; + VlmEngine vlm_; MetalRTEngine metalrt_; MetalRTSttEngine metalrt_stt_; MetalRTTtsEngine metalrt_tts_; diff --git a/src/pipeline/text_sanitizer.h b/src/pipeline/text_sanitizer.h index b21b1a0..5c454a3 100644 --- a/src/pipeline/text_sanitizer.h +++ b/src/pipeline/text_sanitizer.h @@ -73,6 +73,33 @@ inline std::string sanitize_for_tts(const std::string& text) { out = std::move(cleaned); } + // 4b. Strip emote/action markers like *laughs*, *sighs*, *smiles*, etc. + // These are non-speakable stage directions that LLMs often generate. + { + std::string cleaned; + cleaned.reserve(out.size()); + for (size_t i = 0; i < out.size(); i++) { + if (out[i] == '*') { + size_t close = out.find('*', i + 1); + if (close != std::string::npos && close - i <= 30) { + // Check it looks like an emote (single word or short phrase, no nested formatting) + bool is_emote = true; + for (size_t j = i + 1; j < close; j++) { + if (out[j] == '*' || out[j] == '\n') { is_emote = false; break; } + } + if (is_emote) { + i = close; // skip past closing * + // Also skip trailing space if present + if (i + 1 < out.size() && out[i + 1] == ' ') i++; + continue; + } + } + } + cleaned += out[i]; + } + out = std::move(cleaned); + } + // 5. Strip markdown symbols and non-speakable formatting { std::string cleaned; @@ -215,6 +242,84 @@ inline std::string sanitize_for_tts(const std::string& text) { } } + // 6c. Replace brand names / proper nouns that G2P spells letter-by-letter + // with phonetic approximations so TTS pronounces them naturally. 
+ { + struct Phonetic { const char* from; const char* to; }; + static const Phonetic table[] = { + {"Spotify", "Spotifye"}, + {"spotify", "spotifye"}, + {"SPOTIFY", "Spotifye"}, + {"YouTube", "You Tube"}, + {"Youtube", "You Tube"}, + {"youtube", "you tube"}, + {"YOUTUBE", "You Tube"}, + {"WiFi", "Why Fye"}, + {"wifi", "why fye"}, + {"WIFI", "Why Fye"}, + {"Wi-Fi", "Why Fye"}, + {"iPhone", "eye phone"}, + {"iphone", "eye phone"}, + {"IPHONE", "eye phone"}, + {"iPad", "eye pad"}, + {"ipad", "eye pad"}, + {"IPAD", "eye pad"}, + {"macOS", "mac O S"}, + {"MacOS", "mac O S"}, + {"iOS", "eye O S"}, + {"AirPods", "Air Pods"}, + {"airpods", "air pods"}, + {"AIRPODS", "Air Pods"}, + {"ChatGPT", "Chat G P T"}, + {"WhatsApp", "Whats App"}, + {"whatsapp", "whats app"}, + {"WHATSAPP", "Whats App"}, + {"TikTok", "Tick Tock"}, + {"tiktok", "tick tock"}, + {"TIKTOK", "Tick Tock"}, + {"LinkedIn", "Linked In"}, + {"linkedin", "linked in"}, + {"LINKEDIN", "Linked In"}, + }; + for (auto& p : table) { + std::string needle(p.from); + std::string replacement(p.to); + size_t pos = 0; + while ((pos = out.find(needle, pos)) != std::string::npos) { + bool left_ok = (pos == 0 || out[pos - 1] == ' ' || out[pos - 1] == '\n' || + out[pos - 1] == '"' || out[pos - 1] == '\''); + size_t end = pos + needle.size(); + bool right_ok = (end >= out.size() || out[end] == ' ' || out[end] == ',' || + out[end] == '.' || out[end] == '!' || out[end] == '?' || + out[end] == '\n' || out[end] == ';' || out[end] == ':' || + out[end] == '\'' || out[end] == '"'); + if (left_ok && right_ok) { + out.replace(pos, needle.size(), replacement); + pos += replacement.size(); + } else { + pos += needle.size(); + } + } + } + } + + // 6d. Replace hyphens between letters/words with spaces so G2P does not + // spell out hyphenated compounds (e.g. "well-known" → "well known"). 
+ { + std::string cleaned; + cleaned.reserve(out.size()); + for (size_t i = 0; i < out.size(); i++) { + if (out[i] == '-' && i > 0 && i + 1 < out.size() && + std::isalpha((unsigned char)out[i - 1]) && + std::isalpha((unsigned char)out[i + 1])) { + cleaned += ' '; + } else { + cleaned += out[i]; + } + } + out = std::move(cleaned); + } + // 7. Collapse multiple whitespace to single space, trim { std::string cleaned; diff --git a/src/test/test_pipeline.cpp b/src/test/test_pipeline.cpp index a4b7bfb..d73a1b8 100644 --- a/src/test/test_pipeline.cpp +++ b/src/test/test_pipeline.cpp @@ -783,31 +783,36 @@ static void test_metalrt_llm(const std::string& models_dir) { engine.reset_conversation(); engine.generate("hi"); - // Benchmark 3 prompts - const char* prompts[] = { - "What is 2+2?", - "Write a haiku about the sea.", - "Explain gravity in one sentence.", - }; - - TEST_SECTION("MetalRT LLM Inference (Metal GPU)"); - for (int i = 0; i < 3; i++) { + // Benchmark across max_tokens sweep: 64, 128, 256, 512, 1024, 2048 + const int token_limits[] = { 64, 128, 256, 512, 1024, 2048 }; + const char* prompt = "Write a detailed essay about the history and future of artificial intelligence, " + "covering early pioneers, neural networks, deep learning breakthroughs, " + "large language models, and predictions for the next decade."; + + TEST_SECTION("MetalRT LLM Token Sweep Benchmark (Metal GPU)"); + fprintf(stderr, "\n \033[1;33m%-12s %8s %12s %10s %12s %10s %10s\033[0m\n", + "max_tokens", "gen_tok", "decode_ms", "tok/s", "prefill_ms", "pf_tok/s", "wall_ms"); + fprintf(stderr, " \033[33m%s\033[0m\n", + "------------ -------- ------------ ---------- ------------ ---------- ----------"); + + for (int limit : token_limits) { + engine.set_max_tokens(limit); + engine.set_ignore_eos(true); engine.reset_conversation(); + t0 = std::chrono::steady_clock::now(); - std::string result = engine.generate(prompts[i]); + std::string result = engine.generate(prompt); double gen_ms = elapsed_ms(t0); 
const auto& stats = engine.last_stats(); - TEST_INFO("--- Run %d ---", i + 1); - TEST_INFO(" Prompt: \"%s\"", prompts[i]); - TEST_INFO(" Response: \"%.*s%s\"", (int)std::min(result.size(), (size_t)80), - result.c_str(), result.size() > 80 ? "..." : ""); - TEST_INFO(" Backend: MetalRT (Metal GPU)"); - TEST_INFO(" Prefill: %.1f ms (%d tokens, %.0f tok/s)", - stats.prompt_eval_us / 1000.0, stats.prompt_tokens, stats.prompt_tps()); - TEST_INFO(" Decode: %.1f ms (%d tokens, %.0f tok/s)", - stats.generation_us / 1000.0, stats.generated_tokens, stats.gen_tps()); - TEST_INFO(" Wall: %.1f ms", gen_ms); + fprintf(stderr, " %-12d %8d %10.1f ms %8.1f %10.1f ms %8.0f %8.1f ms\n", + limit, + stats.generated_tokens, + stats.generation_us / 1000.0, + stats.gen_tps(), + stats.prompt_eval_us / 1000.0, + stats.prompt_tps(), + gen_ms); TEST("run produces output", !result.empty()); } }