From a8055322f1c53817292e4b7dabb3133baaf6be8d Mon Sep 17 00:00:00 2001 From: 9LogM <127714779+9LogM@users.noreply.github.com> Date: Sun, 3 May 2026 20:30:20 -0500 Subject: [PATCH] Pivot orchestrator from VLM-per-command to continuous-control VLA loop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drops the symbolic VLM + mission state machine in favor of a continuous control loop: each tick the orchestrator hands the latest RGB frame + held instruction + telemetry to a VLA backend and dispatches at most one tool call. Tool schema slims to the action-space the VLA emits (arm/disarm/set_velocity_ned/land/abort/kill). - Replace vlm_base / vlm_vla / vlm_gemini with vla_base + vla (no-op default backend; drop a model into vla.py to wire one in). - Remove OrchestratorMission, mission re-prompt loop, terminal-tool set, and stuck-detection — none apply under continuous control. - Replace mission_max_steps / telemetry_history_seconds config with control_hz / frame_max_age_s; drop perception.vlm.* in favor of perception.vla.{model_path,device}. - Voice-arm latch is now an orchestrator-side flag set by the command thread, not a per-command field — VLA-emitted arm without a prior spoken arm is rejected. - Frame staleness gate (frame_max_age_s) idles the loop when RGB is missing/old instead of letting the model extrapolate. - Drop Gemini from Dockerfile/pyproject and GEMINI_API_KEY from .env.example and preflight. - TUI alarm row vlm_error -> vla_error. - Delete docs/phase-2.md (mission-state design doc, superseded). - Sweep all VLM/Gemini references out of comments, docstrings, and README; project is now genuinely VLM-free. 43/43 tests green excluding the pre-existing replay slow-joiner flake. Co-Authored-By: Claude Opus 4.7 (1M context) --- .env.example | 4 - CMakeLists.txt | 6 +- README.md | 42 +- common/config.yaml | 27 +- docs/phase-2.md | 98 ---- include/lexaire/safety.hpp | 2 +- include/services_panel.hpp | 4 +- python/Dockerfile | 2 +- python/lexaire/config.py | 9 +- python/lexaire/messages.py | 4 +- python/pyproject.toml | 5 +- python/services/orchestrator/main.py | 537 ++++++--------------- python/services/orchestrator/tools.py | 68 +-- python/services/orchestrator/vla.py | 111 +++++ python/services/orchestrator/vla_base.py | 59 +++ python/services/orchestrator/vlm_base.py | 45 -- python/services/orchestrator/vlm_gemini.py | 188 -------- python/services/perception/detectors.py | 3 +- python/services/perception/main.py | 4 +- python/services/stt/main.py | 2 +- scripts/preflight.sh | 14 +- src/flight_bridge/handlers.cpp | 6 +- src/main.cpp | 8 +- 23 files changed, 397 insertions(+), 851 deletions(-) delete mode 100644 docs/phase-2.md create mode 100644 python/services/orchestrator/vla.py create mode 100644 python/services/orchestrator/vla_base.py delete mode 100644 python/services/orchestrator/vlm_base.py delete mode 100644 python/services/orchestrator/vlm_gemini.py diff --git a/.env.example b/.env.example index 4174807..099abcf 100644 --- a/.env.example +++ b/.env.example @@ -1,10 +1,6 @@ # Copy this file to `.env` and fill in real values. # `.env` is gitignored and never committed. -# Google AI Studio key (https://aistudio.google.com/app/apikey). -# Used by the orchestrator when perception.vlm.provider == "gemini". -GEMINI_API_KEY= - # IP of the Raspberry Pi running the relay + sensor publisher. Compose # substitutes this into extra_hosts so containers resolve `drone.local` # to the Pi without depending on the host's DNS / mDNS. diff --git a/CMakeLists.txt b/CMakeLists.txt index d20dede..244a3d1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,9 +8,9 @@ find_package(MAVSDK REQUIRED) find_package(Boost REQUIRED COMPONENTS system) find_package(Threads REQUIRED) # Wide-char ncurses (libncursesw) is required so the TUI renders any UTF-8 -# byte sequence — including Gemini-produced text in the orchestrator status -# row — as proper glyphs instead of raw multi-byte garbage. setlocale() in -# main() activates the byte-to-wchar pipeline. +# byte sequence — including VLA-emitted thought text in the orchestrator +# status row — as proper glyphs instead of raw multi-byte garbage. +# setlocale() in main() activates the byte-to-wchar pipeline. set(CURSES_NEED_WIDE TRUE) find_package(Curses REQUIRED) find_package(yaml-cpp REQUIRED) diff --git a/README.md b/README.md index 0d8879f..b8b2c7f 100644 --- a/README.md +++ b/README.md @@ -39,7 +39,7 @@ Per-machine values (secrets and the Pi's IP) live in `.env`. Copy the example an ```bash cp .env.example .env -# Edit .env: set GEMINI_API_KEY and DRONE_PI_IP +# Edit .env: set DRONE_PI_IP ``` ### Pi setup @@ -144,7 +144,7 @@ Different mechanism (the publisher's source isn't on the GCS, so the build conte | Service | Source | Role | |---|---|---| | `perception` | [`python/services/perception/`](python/services/perception/) | Subscribes to the sensor publisher's RGB+depth streams, runs YOLO11 on each frame, publishes a scene graph (label + bbox + camera-frame xyz) at `perception.tick_hz`. | -| `orchestrator` | [`python/services/orchestrator/`](python/services/orchestrator/) | Pulls voice commands from the STT service, fuses them with the latest scene + telemetry + RGB frame, calls the Gemini 2.5 Flash VLM for a tool-call decision, dispatches the calls to the flight bridge over REQ/REP. Owns the mission state machine and re-prompt loop. | +| `orchestrator` | [`python/services/orchestrator/`](python/services/orchestrator/) | Pulls voice commands from the STT service, holds the operator's current instruction, and runs a continuous control loop: each tick it hands the latest RGB frame + telemetry + instruction to a VLA backend, gets back at most one tool call, and dispatches it to the flight bridge over REQ/REP. The VLA backend is pluggable (see [`python/services/orchestrator/vla.py`](python/services/orchestrator/vla.py)); default is a no-op until a model is wired in. | | `flight-bridge` (C++) | [`src/flight_bridge/`](src/flight_bridge/) | The system's only MAVSDK consumer. Enforces the non-overridable safety envelope ([`include/lexaire/safety.hpp`](include/lexaire/safety.hpp)) below the tool-call layer; runs the heartbeat-loss watchdog (auto RTL/HOLD); publishes telemetry + QGC liveness on the PUB stream the TUI reads. | | `stt` | [`python/services/stt/`](python/services/stt/) | Voice command source. Modes: text-input via stdin / `--once` / `--from-file`, or `--audio-file` for pre-recorded WAV (uses `faster-whisper`). Mic capture is a follow-up. Profile-gated: `docker compose --profile tools run --rm stt --once "land"`. | | `replay` | [`python/services/replay/`](python/services/replay/) | Field-debug tool: SUBs the live sensor channels and writes a JSONL recording (`record`), or replays one back as PUBs (`play`). Profile-gated. | @@ -157,19 +157,19 @@ The sensor publisher (default: [`RS-L515-Docker`](https://github.com/9LogM/RS-L5 ┌──────────────────────────────────────────────────────────────────────┐ │ │ │ STT ──── PUSH ────► orchestrator ──── REQ/REP ────► flight-bridge │ -│ (voice) │ ▲ ▲ │ │ -│ │ │ │ │ MAVSDK │ -│ │ scene telemetry ▼ │ -│ │ │ │ autopilot │ -│ │ PUB PUB │ -│ │ │ │ │ -│ ▼ │ │ │ -│ Gemini │ │ │ -│ (RGB)│ │ │ -│ │ │ │ -│ L515 ──► perception ──────┘ │ │ -│ │ │ -│ flight-bridge ──────────────────┘ │ +│ (voice) │ ▲ │ │ +│ │ │ │ MAVSDK │ +│ │ telemetry ▼ │ +│ │ │ autopilot │ +│ │ PUB │ +│ │ │ │ +│ ▼ │ │ +│ VLA │ │ +│ (RGB+lang)│ │ +│ │ │ +│ L515 ──► perception ──────────┘ │ +│ │ +│ flight-bridge ──────────────────────────────────────────────────────│ │ │ └──────────────────────────────────────────────────────────────────────┘ ``` @@ -180,15 +180,14 @@ Project-shared defaults live in [`common/config.yaml`](common/config.yaml); secr - `sensor.publisher_repo` — URL of the docker-based ZMQ sensor publisher repo. The TUI auto-clones it on the Pi and keeps it in sync with origin (default L515). - `sensor.channels.{rgb,depth,imu,infrared,confidence}` — ZMQ endpoints the publisher exposes. Each can be left blank to disable that stream; perception/orchestrator require `rgb` and `depth` and fail at startup if either is blank, replay subscribes to whichever are non-empty. -- `perception.vlm.{provider,model,api_key_env,temperature}` — currently `gemini` with `gemini-2.5-flash`. Requires `GEMINI_API_KEY` in `.env`. -- `perception.detector.{model,weights,score_threshold,...}` — YOLO11; `yolo11n.pt` is auto-downloaded on first run. +- `perception.vla.{model_path,device}` — VLA backend to run inside the orchestrator. `model_path` left blank gives a no-op backend (control loop runs, no actions emitted) so the stack boots without a model. Wire your chosen VLA in [`python/services/orchestrator/vla.py`](python/services/orchestrator/vla.py). +- `perception.detector.{model,weights,score_threshold,...}` — YOLO11; `yolo11n.pt` is auto-downloaded on first run. Detector output is observability-only under VLA mode (the orchestrator does not subscribe to scene messages). - `safety.{max_altitude_m,geofence_radius_m,max_velocity_mps,require_spoken_arm,heartbeat_loss_action,heartbeat_loss_threshold_s}` — non-overridable bridge-side gate plus heartbeat-loss recovery thresholds. -- `orchestrator.{mission_max_steps,telemetry_history_seconds}` — mission re-prompt loop cap and telemetry ring-buffer depth fed to the VLM. -- `stt.{abort_keyword,whisper_model,whisper_device,...}` — voice command settings; the abort keyword (default `"abort"`) short-circuits the VLM and goes straight to the abort tool. +- `orchestrator.{control_hz,frame_max_age_s}` — VLA tick rate and the staleness gate that turns a missing/old RGB frame into an idle tick. +- `stt.{abort_keyword,whisper_model,whisper_device,...}` — voice command settings; the abort keyword (default `"abort"`) short-circuits the VLA and goes straight to the abort tool. `.env` (copy from `.env.example`): -- `GEMINI_API_KEY` — required when `perception.vlm.provider == "gemini"`. - `DRONE_PI_IP` — the Pi's IP address. Compose substitutes it into `extra_hosts` so every container resolves `drone.local`. ### Roadmap @@ -196,7 +195,8 @@ Project-shared defaults live in [`common/config.yaml`](common/config.yaml); secr Lexaire ships in phases: - **Phase 1** — Perception + orchestrator + flight bridge end-to-end against a desk autopilot. ✅ -- **Phase 2** — First flight: multi-step missions, telemetry-aware reasoning, recovery on connection loss. See [`docs/phase-2.md`](docs/phase-2.md). +- **Phase 2** — Safety + recovery shakedown: heartbeat watchdog, frame-staleness gate, voice-arm gate, abort drains the queue, telemetry-aware safety envelope. ✅ +- **VLA control loop** — Continuous-control orchestrator (RGB + held instruction → tool call at `orchestrator.control_hz`). Drop a model in [`python/services/orchestrator/vla.py`](python/services/orchestrator/vla.py). - **Phase 3** — Live microphone capture for STT. - **Phase 4** — RTAB-Map SLAM for persistent spatial memory ("go back to the table you saw earlier"). diff --git a/common/config.yaml b/common/config.yaml index 39e5a8b..a706795 100644 --- a/common/config.yaml +++ b/common/config.yaml @@ -40,11 +40,16 @@ sensor: perception: tick_hz: 2 - vlm: - provider: gemini # gemini - model: gemini-2.5-flash - api_key_env: GEMINI_API_KEY - temperature: 0.2 + vla: + # Path / repo / model identifier. The default backend is a no-op + # until you set this — orchestrator boots, control loop runs, + # but VLA emits no actions. See vla.py for backend wiring. + model_path: null # e.g. "openvla/openvla-7b" or a local checkpoint + device: auto # auto | cpu | cuda:0 + # YOLO detector is independent of the orchestrator under VLA mode. + # The orchestrator no longer subscribes to scene messages; perception + # is kept as an opt-in observability service. Set tick_hz: 0 above + # to disable. detector: model: yolo # yolo weights: yolo11n.pt # n|s|m|l|x size vs. speed @@ -70,10 +75,14 @@ safety: heartbeat_loss_threshold_s: 2.0 orchestrator: - # Telemetry ring-buffer depth (seconds of history retained for the VLM). - telemetry_history_seconds: 5.0 - # Hard cap on a mission's re-prompt loop. Bounds Gemini calls per mission. - mission_max_steps: 10 + # Continuous control loop tick rate. The VLA is invoked at this rate; + # each invocation may emit at most one tool call (set_velocity_ned in + # the common case). Typical VLA models: 5-30 Hz. + control_hz: 10.0 + # Frame staleness gate. The orchestrator serves rgb=None to the VLA + # if no fresh frame has arrived within this window — closed-loop + # control on a frozen frame is unsafe. + frame_max_age_s: 0.2 services: # Endpoints bound by the named service; everyone else connects. diff --git a/docs/phase-2.md b/docs/phase-2.md deleted file mode 100644 index 52a387e..0000000 --- a/docs/phase-2.md +++ /dev/null @@ -1,98 +0,0 @@ -# Phase 2 — First flight: multi-step missions and real-drone verification - -Phase 1 proved the perception → reasoning → tool dispatch loop end-to-end against a desk autopilot. The orchestrator can take a voice command, fuse it with a YOLO scene + a Gemini-reasoned RGB frame, and dispatch a tool call that the flight bridge executes against MAVSDK. - -Phase 2 takes the same loop into actual flight in a controlled indoor environment. - -## Phase 2A vs 2B — what's blocked on positioning - -Indoor flight needs a position source. The L515 publishes RGB + depth + IMU but doesn't run a SLAM/VIO solver, so PX4 has no way to hold position from those streams without one. That's a significant chunk of work (essentially most of Phase 4). - -We split Phase 2 into two halves so the no-positioning work isn't gated on the positioning decision: - -- **Phase 2A — safety + recovery shakedown.** Real-flight verification of the parts that don't need positional autonomy: arm, auto-takeoff to fixed altitude, hover via internal IMU stabilization, abort, auto-land, RTL. The safety envelope ([`include/lexaire/safety.hpp`](../include/lexaire/safety.hpp)) is exercised airborne. The bridge heartbeat + orchestrator mission-state work is testable here. **Goes first.** -- **Phase 2B — voice-named navigation.** "Fly to the doorway and hold there." Requires position holding indoors. Gated on the positioning choice below. - -### Indoor positioning options (Phase 2B prerequisite) - -| Option | What it needs | Effort | Trade-offs | -|---|---|---|---| -| **A. L515 + RTAB-Map / VINS-Fusion** | A SLAM/VIO solver consuming the existing L515 streams; a `VISION_POSITION_ESTIMATE` MAVLink bridge into PX4; `EKF2_AID_MASK` configured for vision pose. | 1–2 weeks focused | This is most of Phase 4 SLAM pulled forward. End state is the right architecture. Risk: VIO drift / latency / camera-FC calibration. | -| **B. PMW3901 optical flow + lidar range finder** | ~$30 of hardware on the airframe + PX4 params (`SENS_FLOW_*`, `EKF2_AID_MASK`). | A weekend | Quick win; PX4's flow stack is mature. Doesn't reuse the L515. Stops working over visually-uniform floors. | -| **C. External motion capture (Vicon / OptiTrack / OpenVR)** | A mocap rig and a `VISION_POSITION_ESTIMATE` bridge. | A day if you have the rig | Lab-only. | -| **D. Defer 2B** | Phase 2A only; 2B waits until either the L515 SLAM stack lands as part of Phase 4, or you decide on flow hardware. | Zero | Loses voice-named navigation but unblocks safety/recovery testing today. | - -The decision lives outside this doc — once you pick, Phase 2B's scope and prereqs lock in. - -## Goals - -### 1. Live flight — Phase 2A - -The flight bridge talks to the PX4 FC via MAVSDK on every run. - -- Validate the safety envelope **in flight**: takeoff capped at `safety.max_altitude_m`, geofence rejecting `goto_ned` outside `safety.geofence_radius_m` (Phase 2B only — needs position), velocity rejection on `set_velocity_ned` over `safety.max_velocity_mps` (Phase 2B). The altitude cap and `require_spoken_arm` gates are testable in 2A. -- Verify `abort` lands a flying drone via `Action.land()` (controlled descent + disarm-on-touchdown), and that the separate `kill` tool reserves `Action.kill()` for true emergencies. - -### 2. Multi-step mission orchestration — Phase 2A scaffold, 2B exercises - -The orchestrator currently handles one tool call per voice command, then returns to idle. That's enough for "arm" or "takeoff" but breaks for anything compositional like "fly to the doorway and hold there" — the VLM has to emit goto + hold in a single response and the orchestrator dispatches them sequentially with no state tracking between. - -Phase 2 adds an explicit mission state machine: - -- `OrchestratorMission` dataclass: `goal_text`, `current_step`, `step_history`, `started_ts_ns`. -- The VLM sees the active mission as part of `VlmContext` so it can reason about progress. -- When a tool call returns ok, the orchestrator advances the mission step and re-prompts the VLM with the new state. Loop until VLM emits a terminal call (`hold`, `land`, `abort`) or the user gives a new command. -- Mission abort (user voice or safety trip) cleanly cancels and returns to idle. - -The state machine itself is testable in 2A with a stub VLM and a fake bridge in unit/integration tests. 2B exercises it under real flight. - -### 3. Telemetry-aware reasoning — Phase 2A - -`safety_snapshot` is currently a flat dict of envelope limits. The VLM gets a one-shot telemetry snapshot but no history. - -Phase 2 widens the context: - -- Maintain a 5-second ring buffer of telemetry samples in the orchestrator. -- Expose to the VLM as `ctx.telemetry_history` so it can reason about "approaching geofence", "battery dropping", "altitude unstable". -- Add a tick-based orchestrator wakeup (currently the orchestrator only wakes on a voice command — it can't proactively call `hold` if it sees something concerning). - -### 4. Recovery / connection loss — Phase 2A - -Right now if the flight bridge loses the MAVSDK connection mid-flight, handlers return `connection_error` and the orchestrator just moves on. If the orchestrator dies, the bridge has no fallback policy. - -Phase 2 adds an explicit recovery layer: - -- Bridge: if airborne and the autopilot heartbeat drops for >2 s, command RTL (configurable to HOLD via `safety.heartbeat_loss_action`). -- Orchestrator: if it loses the bridge REQ socket for >3 s, surface it on the status PUB so the TUI can show it red, and reject any further tool calls until reconnect. -- TUI: a single banner row that calls out "FLIGHT BRIDGE OFFLINE" or similar — currently the services panel just goes stale silently. - -## Out of scope (still later) - -- **Live microphone capture for STT.** The pipeline accepts text or pre-recorded WAV; mic-in-Docker is host-specific and is a Phase 3 concern. -- **SLAM persistent memory.** "Go back to the table you saw earlier" needs RTAB-Map and a map store; Phase 4. (Note: option A above does *one* component of Phase 4 — the live pose estimate — but not the persistence/mapping layer.) -- **Outdoor / GPS-denied flight.** L515 is IR-based, doesn't work outdoors. Future hardware swap (D435i) and an RTK-style position source come with that. -- **Local LLM (Ollama).** Gemini works fine; offline operation is deferred until there's a concrete reason to swap. - -## Acceptance criteria - -### Phase 2A — done when, in a controlled indoor space: - -1. The drone arms, auto-takes-off to a fixed altitude (≤ `safety.max_altitude_m`), hovers, and lands — entirely via voice — with the safety envelope active. -2. The altitude cap and spoken-arm gate are exercised airborne (a takeoff over the cap is rejected; an LLM-only arm without a voice-confirmed flag is rejected). -3. Mid-flight `abort` voice command produces an `Action.land()` (not motor-kill). -4. A simulated bridge-side MAVSDK heartbeat drop for >2 s triggers an automatic RTL. -5. A simulated orchestrator-side bridge-REQ timeout >3 s flips the TUI offline banner and prevents new tool calls until the bridge comes back. -6. The TUI services panel shows the orchestrator's mission state in real time. - -### Phase 2B — done when (gated on the positioning choice above): - -7. The drone arms, takes off, navigates to a voice-named object ("the chair"), hovers, and returns to launch — entirely via voice. -8. Geofence rejection fires airborne when the VLM tries to fly outside `safety.geofence_radius_m`. -9. Velocity-cap rejection fires airborne on `set_velocity_ned` over the limit. - -## Defaults baked in by Phase 2A code (overridable via config) - -- `safety.heartbeat_loss_action: rtl` — bridge auto-action on autopilot heartbeat drop. `hold` also valid. -- `safety.heartbeat_loss_threshold_s: 2.0` — how long to wait before triggering. -- `orchestrator.telemetry_history_seconds: 5.0` — telemetry ring-buffer depth in seconds. -- `orchestrator.mission_max_steps: 10` — hard cap on mission re-prompt loop to bound Gemini calls per mission. diff --git a/include/lexaire/safety.hpp b/include/lexaire/safety.hpp index 85be740..8817853 100644 --- a/include/lexaire/safety.hpp +++ b/include/lexaire/safety.hpp @@ -1,7 +1,7 @@ #pragma once // Safety envelope enforced by the flight bridge below the tool-call layer. -// Regardless of what the VLM decides, a tool call that violates the envelope +// Regardless of what the VLA decides, a tool call that violates the envelope // is rejected. #include diff --git a/include/services_panel.hpp b/include/services_panel.hpp index 2d4df5b..bf31666 100644 --- a/include/services_panel.hpp +++ b/include/services_panel.hpp @@ -26,8 +26,8 @@ struct ServicesSnapshot { // One-line summaries from the most recent message on each channel. std::string perception_summary; std::string telemetry_summary; - // "idle" | "thinking" | "executing" | "aborted" | "abort_failed" - // | "bridge_offline" | "vlm_error" + // "idle" | "executing" | "aborted" | "abort_failed" + // | "bridge_offline" | "vla_error" std::string orch_state; std::string orch_thought; diff --git a/python/Dockerfile b/python/Dockerfile index 830f2d0..c94b3e8 100644 --- a/python/Dockerfile +++ b/python/Dockerfile @@ -24,7 +24,7 @@ COPY python/pyproject.toml ./ COPY python/lexaire ./lexaire COPY python/services ./services -RUN pip install -e ".[gemini,detector-yolo,stt-whisper]" \ +RUN pip install -e ".[detector-yolo,stt-whisper]" \ && pip uninstall -y opencv-python opencv-python-headless \ && pip install --no-cache-dir opencv-python-headless diff --git a/python/lexaire/config.py b/python/lexaire/config.py index f303db7..b6db2f4 100644 --- a/python/lexaire/config.py +++ b/python/lexaire/config.py @@ -4,14 +4,13 @@ Usage: cfg = load_config() # finds common/config.yaml from cwd or repo root cfg.sensor.channels.rgb # -> "tcp://drone.local:5555" - cfg.perception.vlm.provider # -> "gemini" + cfg.perception.vla.model_path # -> None (no-op backend) or model id cfg.get("safety.max_altitude_m", default=5.0) The loader also walks up to find a `.env` file and populates `os.environ` -without overwriting existing values. Fields whose name ends in `_env` -(e.g. `perception.vlm.api_key_env`) hold the env-var name; the consumer -reads `os.environ[]` itself — there is no automatic resolution -at load time. +without overwriting existing values. Fields whose name ends in `_env` hold +the env-var name; the consumer reads `os.environ[]` itself — there +is no automatic resolution at load time. """ from __future__ import annotations diff --git a/python/lexaire/messages.py b/python/lexaire/messages.py index c7e19b8..929b8ec 100644 --- a/python/lexaire/messages.py +++ b/python/lexaire/messages.py @@ -71,8 +71,8 @@ class ToolResult: @dataclass class OrchestratorStatus: ts_ns: int - # "idle" | "thinking" | "executing" | "aborted" | "abort_failed" - # | "bridge_offline" | "vlm_error" + # "idle" | "executing" | "aborted" | "abort_failed" + # | "bridge_offline" | "vla_error" state: str last_thought: str = "" diff --git a/python/pyproject.toml b/python/pyproject.toml index 794d4fd..ea81261 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -16,9 +16,8 @@ dependencies = [ ] [project.optional-dependencies] -gemini = [ - "google-genai>=1.0", -] +# Concrete VLA model deps go here when you pick one — e.g.: +# vla-openvla = ["transformers>=4.40", "torch>=2.2", ...] detector-yolo = [ "ultralytics>=8.3", ] diff --git a/python/services/orchestrator/main.py b/python/services/orchestrator/main.py index 029099a..cc6073c 100644 --- a/python/services/orchestrator/main.py +++ b/python/services/orchestrator/main.py @@ -1,26 +1,26 @@ """ -Orchestrator service. +Orchestrator service (VLA mode). -Ties voice commands, perception, and telemetry together. On every new user -command, it asks the VLM to decide what to do given the current state, then -dispatches the resulting ToolCalls to the flight bridge via REQ/REP. +Runs a continuous control loop at `orchestrator.control_hz`. On every +tick the VLA backend gets the latest RGB frame, the operator's current +spoken instruction (held in context across ticks until the operator +updates it), and telemetry, and emits at most one tool call. Threads: * command_thread — PULLs voice commands from the STT service queue. - * scene_thread — SUBs the perception scene channel. * telemetry_thread — SUBs the flight bridge telemetry. - * frame_thread — pulls RGB frames from the sensor subscriber so the VLM - has pixel context on every decision. - * main thread — decision loop: wake on a new command, call VLM, - dispatch tool calls, publish status. + * frame_thread — pulls RGB frames from the sensor subscriber. + * main thread — control loop: tick at control_hz, call VLA, + dispatch one tool, publish status. + +Voice commands are held as the current `instruction` in the context. +A new voice command replaces the previous instruction; an abort or +the abort-keyword short-circuits straight to the abort tool. """ from __future__ import annotations import argparse -import collections -import concurrent.futures -import dataclasses import json import queue import re @@ -40,98 +40,56 @@ ) from lexaire.subscriber import SensorSubscriber -from .vlm_base import VLM, VlmContext - - -# Tool calls that end a mission cleanly. After any of these the orchestrator -# stops re-prompting the VLM and returns to idle. -_TERMINAL_TOOLS = frozenset({"hold", "land", "abort", "return_to_launch", "kill", "disarm"}) - -# Mission re-prompt loop bails when the same (tool, error) appears at least -# _STUCK_FAILURE_THRESHOLD times in the trailing _STUCK_FAILURE_WINDOW steps. -_STUCK_FAILURE_WINDOW = 4 -_STUCK_FAILURE_THRESHOLD = 2 - +from .vla_base import VLA, VlaContext -@dataclasses.dataclass -class OrchestratorMission: - """Active multi-step mission state. Surfaced to the VLM via VlmContext.""" - goal_text: str - started_ts_ns: int - # Tools that actually dispatched. step_history holds executed + - # skipped entries; current_step counts only executed. - current_step: int = 0 - step_history: list[dict] = dataclasses.field(default_factory=list) - def record(self, tool_name: str, args: dict, result: ToolResult, - *, executed: bool = True) -> None: - self.step_history.append({ - "tool": tool_name, - "args": args, - "ok": result.ok, - "error": result.error, - "ts_ns": now_ns(), - }) - if executed: - self.current_step += 1 - - def to_dict(self) -> dict: - return dataclasses.asdict(self) - - -def _build_vlm(cfg) -> VLM: - provider = cfg.get("perception.vlm.provider", "gemini") - if provider == "gemini": - from .vlm_gemini import build_gemini - return build_gemini(cfg) - raise ValueError(f"unknown vlm provider: {provider}") +def _build_vla(cfg) -> VLA: + from .vla import build_vla + return build_vla(cfg) class Orchestrator: def __init__(self, cfg): self.cfg = cfg self.log = logs.configure("orchestrator", cfg.get("logging.level", "INFO")) - self.vlm = _build_vlm(cfg) + self.vla = _build_vla(cfg) self.stop_event = threading.Event() - self.command_q: queue.Queue[VoiceCommand] = queue.Queue(maxsize=16) - self.history: list[str] = [] self._threads: list[threading.Thread] = [] - self.latest_scene: dict = {} - self.latest_scene_lock = threading.Lock() + # Continuous control: tick rate the main loop drives the VLA at. + self._control_hz = float(cfg.get("orchestrator.control_hz", 10.0)) + if self._control_hz <= 0: + raise ValueError("orchestrator.control_hz must be > 0") + self._control_period_s = 1.0 / self._control_hz + + # Operator's current instruction. Updated by the command thread, + # read by the main loop. None means "no instruction yet — idle." + self._instruction: Optional[str] = None + self._instruction_lock = threading.Lock() self.latest_telem: dict = {} self.latest_telem_lock = threading.Lock() - # Telemetry ring buffer (oldest -> newest), pruned by wallclock age in - # _run_telemetry_thread. Configured depth in seconds; the buffer - # holds however many samples the bridge published in that window. - self._telem_history_s = float(cfg.get("orchestrator.telemetry_history_seconds", 5.0)) - self._telem_history: collections.deque = collections.deque() - - # One private zmq.Context for every socket this orchestrator owns — - # PUB, SUB, PULL, REQ, plus the SensorSubscriber's. Avoids the - # process-wide singleton and gives stop() a clean term(). + self._zmq = zmq.Context() - # Hold onto the latest frame so the VLM sees current pixels. self.sub = SensorSubscriber( self._zmq, rgb_endpoint=cfg.require("sensor.channels.rgb"), depth_endpoint=cfg.require("sensor.channels.depth"), ) self.latest_fs = None - self.latest_fs_ts: float = 0.0 # monotonic at write + self.latest_fs_ts: float = 0.0 self.latest_fs_lock = threading.Lock() - # If the publisher dies, the VLM would otherwise reason over a - # frozen scene and emit geometry tool calls against a moved world. + # Tight default for VLA closed-loop control — a 2 s gate would + # let the model command setpoints against pixels that are 60 + # ticks old at 30 Hz. self._frame_max_age_s = float( - cfg.get("orchestrator.frame_max_age_s", 2.0)) + cfg.get("orchestrator.frame_max_age_s", 0.2)) self.command_pull = transport.pull(self._zmq, cfg.require("services.orchestrator_command_pull")) - self.status_pub = transport.pub(self._zmq, cfg.require("services.orchestrator_status_pub")) - self.scene_sub = transport.sub(self._zmq, cfg.require("services.perception_scene_pub")) - self.telem_sub = transport.sub(self._zmq, cfg.require("services.telemetry_pub")) + self.status_pub = transport.pub(self._zmq, cfg.require("services.orchestrator_status_pub")) + self.telem_sub = transport.sub(self._zmq, cfg.require("services.telemetry_pub")) self.flight_req_endpoint = cfg.require("services.flight_bridge_rep") self.safety_snapshot = { @@ -140,59 +98,40 @@ def __init__(self, cfg): "max_velocity_mps": float(cfg.get("safety.max_velocity_mps", 1.5)), } - # Set only on REQ timeout, cleared on any reply. self._bridge_offline_flag = False self._bridge_lock = threading.Lock() - # Lazy REQ socket; recycled on timeout because REQ doesn't tolerate - # a missed recv (strict send/recv state machine). self._req_socket: Optional[zmq.Socket] = None - # Word-boundary match so "laboratory" / "abortive" don't trip the - # abort path. STT also flags is_abort up front; this is the - # belt-and-suspenders check on text we receive directly. - # Guard against empty/whitespace abort_keyword: an empty string - # would compile to `\b\b`, which matches every word boundary, - # turning every voice command into is_abort=True. + # Word-boundary so "laboratory"/"abortive" don't trip. Empty/ + # whitespace keyword would compile to `\b\b` and match every + # word boundary. abort_kw = cfg.get("stt.abort_keyword", "abort").strip() if not abort_kw: raise ValueError( - "stt.abort_keyword is empty/whitespace — would treat every " + "stt.abort_keyword is empty/whitespace — would flag every " "command as abort. Set a non-empty keyword in config.yaml." ) self._abort_pattern = re.compile( rf"\b{re.escape(abort_kw)}\b", re.IGNORECASE) - # Voice-arm authorization. The bridge's safety gate (safety.hpp) - # only allows arm when args.voice_confirmed is true; this regex - # decides whether the orchestrator is allowed to set that flag. - # Set per command in _on_command, consulted in _dispatch_tool_call. - # Without this, a VLM-emitted arm during an unrelated mission - # ("take a picture") would still pass the gate. + # Voice-arm gate. Held until disarm/abort, not per command. If + # the operator says "arm" once, subsequent commands inherit the + # authorization until an explicit disarm or an abort fires. self._arm_pattern = re.compile(r"\barm\b", re.IGNORECASE) - self._current_arm_authorized = False - - # Set by the command thread when an abort arrives so the main - # thread can abandon any in-flight VLM call instead of waiting - # out the full RTT (Gemini calls have no built-in timeout). - self._preempt_event = threading.Event() - - # Two workers so a stale post-preempt VLM call doesn't block the - # next normal command's submit. Stale futures keep running to - # completion in the background; we just never read their result. - self._vlm_executor = concurrent.futures.ThreadPoolExecutor( - max_workers=2, thread_name_prefix="orch-vlm") + self._arm_authorized = False + self._arm_authorized_lock = threading.Lock() # -- Lifecycle ------------------------------------------------------------ def start(self): self.sub.start() self._spawn(self._run_command_thread, "orch-command") - self._spawn(self._run_scene_thread, "orch-scene") self._spawn(self._run_telemetry_thread, "orch-telem") self._spawn(self._run_frame_thread, "orch-frame") self._publish_status("idle", "startup") - self.log.info("orchestrator up vlm=%s", type(self.vlm).__name__) + self.log.info("orchestrator up vla=%s control_hz=%.1f", + type(self.vla).__name__, self._control_hz) def _spawn(self, target, name): t = threading.Thread(target=target, name=name, daemon=True) @@ -201,13 +140,10 @@ def _spawn(self, target, name): def stop(self): self.stop_event.set() - self._preempt_event.set() - # Don't wait for in-flight VLM calls — they own external HTTP RTT. - self._vlm_executor.shutdown(wait=False, cancel_futures=True) self.sub.stop() for t in self._threads: t.join(timeout=1.5) - for s in (self.command_pull, self.status_pub, self.scene_sub, self.telem_sub): + for s in (self.command_pull, self.status_pub, self.telem_sub): if s is not None: s.close() self._reset_req_socket() @@ -231,47 +167,46 @@ def _run_command_thread(self): is_abort=bool(hdr.get("is_abort", False)), ) self.log.info("command: %r (abort=%s)", cmd.text, cmd.is_abort) - if cmd.is_abort: - # Abort jumps the queue: drain pending commands so the - # next get() returns abort, not the head of the FIFO. - drained = 0 - try: - while True: - self.command_q.get_nowait() - drained += 1 - except queue.Empty: - pass - if drained: - self.log.warning("abort drained %d pending commands", drained) - self.command_q.put_nowait(cmd) - # Wake the main thread out of any in-flight VLM call so - # the abort doesn't wait for Gemini's RTT (or forever - # if the call hangs). - self._preempt_event.set() - else: - try: - self.command_q.put_nowait(cmd) - except queue.Full: - self.log.warning("command queue full — dropping") + # Short-circuit aborts: dispatch directly, clear the + # arm-authorization, drop instruction. + if cmd.is_abort or self._abort_pattern.search(cmd.text): + self._on_abort(cmd) + continue + # Latch arm authorization on first "arm" until disarm/abort. + if self._arm_pattern.search(cmd.text): + with self._arm_authorized_lock: + self._arm_authorized = True + # Replace the current instruction. Continuous: VLA picks + # this up on its next tick; no per-command mission. + with self._instruction_lock: + self._instruction = cmd.text except Exception: - # log.exception preserves traceback so a malformed payload - # is debuggable; .warning('%s', e) loses the stack. self.log.exception("command decode error") - def _run_scene_thread(self): - poller = zmq.Poller() - poller.register(self.scene_sub, zmq.POLLIN) - while not self.stop_event.is_set(): - events = dict(poller.poll(250)) - if self.scene_sub not in events: - continue - try: - frame = self.scene_sub.recv() - hdr = decode_header(frame) - with self.latest_scene_lock: - self.latest_scene = hdr - except Exception: - self.log.exception("scene decode error") + def _on_abort(self, cmd: VoiceCommand): + """Abort fast-path. Bypasses the VLA, dispatches abort directly, + clears in-flight instruction and arm authorization.""" + self._publish_status("aborted", f"abort keyword: {cmd.text!r}") + with self._instruction_lock: + self._instruction = None + with self._arm_authorized_lock: + self._arm_authorized = False + result = self._dispatch_tool_call(ToolCall( + request_id=VLA.new_request_id(), + name="abort", + args={}, + )) + if not result.ok: + if self._bridge_offline(): + self._publish_status( + "bridge_offline", + f"abort {cmd.text!r} did not reach the flight bridge", + ) + else: + self._publish_status( + "abort_failed", + f"abort rejected by bridge: {result.error}", + ) def _run_telemetry_thread(self): poller = zmq.Poller() @@ -283,16 +218,8 @@ def _run_telemetry_thread(self): try: frame = self.telem_sub.recv() hdr = decode_header(frame) - now = time.monotonic() with self.latest_telem_lock: self.latest_telem = hdr - self._telem_history.append((now, hdr)) - # Prune samples older than the configured window. The - # bridge publishes at ~10 Hz so the buffer naturally - # grows to ~10*window-seconds entries. - cutoff = now - self._telem_history_s - while self._telem_history and self._telem_history[0][0] < cutoff: - self._telem_history.popleft() except Exception: self.log.exception("telem decode error") @@ -306,242 +233,70 @@ def _run_frame_thread(self): self.latest_fs = fs self.latest_fs_ts = now - # -- Decision loop -------------------------------------------------------- + # -- Control loop --------------------------------------------------------- def run(self): + """Continuous control loop at control_hz. On each tick: + 1. Build context from latest frame / telemetry / instruction. + 2. Call VLA.decide(). + 3. Dispatch the resulting tool call (if any). + 4. Sleep to the next tick. + """ + next_tick = time.monotonic() while not self.stop_event.is_set(): + ctx = self._build_context() try: - cmd = self.command_q.get(timeout=0.2) - except queue.Empty: - continue - self._on_command(cmd) - - def _on_command(self, cmd: VoiceCommand): - # Each new voice command gets a fresh attempt — without this, a - # single REQ timeout would lock out the orchestrator forever - # (the on-success clear in _dispatch_tool_call is unreachable - # while the bridge_offline short-circuit at the top of that - # function returns early). - with self._bridge_lock: - self._bridge_offline_flag = False - - # Clear any preempt signal from the previous command — we're - # about to start consuming from the command queue and the - # main-loop poll below should only react to NEW signals. - self._preempt_event.clear() - - # Decide arm authorization once per user command. This flips - # voice_confirmed in _dispatch_tool_call only when the operator - # actually said "arm" — without this, a VLM-emitted arm during - # any other utterance ("take a picture") would defeat the gate. - self._current_arm_authorized = bool(self._arm_pattern.search(cmd.text)) - - if cmd.is_abort or self._abort_pattern.search(cmd.text): - self._publish_status("aborted", f"abort keyword: {cmd.text!r}") - result = self._dispatch_tool_call(ToolCall( - request_id=VLM.new_request_id(), - name="abort", - args={}, - )) - if not result.ok: - if self._bridge_offline(): - self._publish_status( - "bridge_offline", - f"abort {cmd.text!r} did not reach the flight bridge", - ) - else: - self._publish_status( - "abort_failed", - f"abort rejected by bridge: {result.error}", - ) - # Aborts are the most safety-critical events; they MUST land - # in the audit trail even though the rest of the mission state - # machine is short-circuited above. - suffix = "" if result.ok else f" ({result.error})" - self.history.append(f"user: {cmd.text} | end: aborted{suffix}") - self.history[:] = self.history[-20:] - return - - max_steps = int(self.cfg.get("orchestrator.mission_max_steps", 10)) - mission = OrchestratorMission(goal_text=cmd.text, started_ts_ns=now_ns()) - # decision_count bounds VLM calls; mission.current_step counts - # individual tool calls in a batch. - decision_count = 0 - terminal_call: Optional[str] = None - stuck = False - vlm_error: Optional[str] = None - preempted = False - - # Re-prompt the VLM with each tool result; bounded by max_steps - # decisions and preempted by a fresh voice command (next run() - # iteration picks it up). - while decision_count < max_steps and not self.stop_event.is_set(): - if self._preempt_event.is_set() or not self.command_q.empty(): - self.log.info("mission preempted by new voice command") - preempted = True - break - - ctx = self._build_context(cmd, mission=mission) - step_label = f"decision {decision_count + 1}/{max_steps}" - self._publish_status("thinking", f"{step_label}: {cmd.text!r}") - - # Run the VLM call on a worker so we can poll _preempt_event - # and bail when the operator says "abort" mid-call. The stale - # future keeps running to completion; we just discard it. - fut = self._vlm_executor.submit(self.vlm.decide, ctx) - while not fut.done(): - if self.stop_event.is_set() or self._preempt_event.is_set(): - break - time.sleep(0.05) - if self._preempt_event.is_set() or self.stop_event.is_set(): - self.log.info("VLM call abandoned (preempt or shutdown)") - preempted = True - break - # fut.result() re-raises any exception thrown inside vlm.decide. - # Backends should self-handle quota / network / parse errors and - # return a vlm_error: VlmDecision; an unexpected raise here - # would otherwise propagate to run() and kill the orchestrator - # main thread, with worker threads still alive queueing commands - # nobody dispatches. - try: - decision = fut.result() + decision = self.vla.decide(ctx) except Exception as e: - self.log.exception("VLM decide() raised unhandled: %s", e) - vlm_error = f"vlm_error: {e!r}" - break - decision_count += 1 - self.log.info("%s decision: %s -> %d tool_calls", - step_label, decision.thought, len(decision.tool_calls)) - - if not decision.tool_calls: - if decision.thought.startswith("vlm_error:"): - # Quota / network / key error from the VLM backend. - # Surface as its own status so the TUI shows red, not - # "idle" with a stale thought string. - vlm_error = decision.thought - self.log.error("VLM call failed: %s", decision.thought) - break - - self._publish_status("executing", decision.thought) - offline = False - stuck = False - for i, tc in enumerate(decision.tool_calls): - if self.stop_event.is_set() or self._preempt_event.is_set(): - self._record_skipped(mission, decision.tool_calls[i:], - "skipped:preempted") - preempted = True - break - result = self._dispatch_tool_call(tc) - mission.record(tc.name, dict(tc.args or {}), result) - if (result.error in ("bridge_offline", "timeout") - or (isinstance(result.error, str) - and result.error.startswith("wire_error"))): - self._record_skipped(mission, decision.tool_calls[i + 1:], - "skipped:bridge_offline") - offline = True - break - if not result.ok and result.error is not None: - matches = sum( - 1 for step in mission.step_history[-_STUCK_FAILURE_WINDOW:] - if step["tool"] == tc.name and step["error"] == result.error - ) - # End the mission once the VLM is stuck on a loop it can't - # escape on its own (canonical case: PX4 'Arming denied'). - self._record_skipped(mission, decision.tool_calls[i + 1:], - "skipped:prior_failure") - if matches >= _STUCK_FAILURE_THRESHOLD: - self.log.warning( - "mission stuck: %s failed %d times with %r; bailing", - tc.name, matches, result.error) - stuck = True - break - break - if tc.name in _TERMINAL_TOOLS: - self._record_skipped(mission, decision.tool_calls[i + 1:], - "skipped:after_terminal") - terminal_call = tc.name - break - - if terminal_call is not None or offline or stuck or preempted: - break - - # Order matters: stop_event takes precedence over the no-flags - # fallthroughs below, otherwise a SIGTERM mid-batch reports - # "no_tool_calls" or "max_decisions=N" depending on where the - # outer-loop budget was at, which is misleading. - if self.stop_event.is_set(): - end_reason = "stopped" - elif preempted: - end_reason = "preempted" - elif terminal_call: - end_reason = f"terminal:{terminal_call}" - elif self._bridge_offline(): - end_reason = "bridge_offline" - elif vlm_error is not None: - end_reason = "vlm_error" - elif decision_count >= max_steps: - end_reason = f"max_decisions={max_steps}" - elif stuck: - end_reason = "stuck_on_repeated_failure" - else: - end_reason = "no_tool_calls" - self.history.append( - f"user: {cmd.text} | decisions: {decision_count} " - f"| tools: {mission.current_step} | end: {end_reason}" - ) - self.history[:] = self.history[-20:] - # Bridge-offline and vlm_error already / want to surface their own - # state — don't overwrite with "idle" at end of mission. - if end_reason == "vlm_error": - self._publish_status("vlm_error", vlm_error or "vlm_error") - elif end_reason != "bridge_offline": - self._publish_status("idle", f"mission done ({end_reason})") - - @staticmethod - def _record_skipped(mission: OrchestratorMission, - remaining: list, reason: str) -> None: - """Drop k+1..N from a partially-executed batch into step_history - as failures so the audit trail and the VLM's `mission` view - reflect what actually didn't run, instead of vanishing them. - executed=False so current_step stays accurate.""" - for skipped in remaining: - mission.record( - skipped.name, - dict(skipped.args or {}), - ToolResult(request_id=skipped.request_id, - ok=False, error=reason), - executed=False, - ) + self.log.exception("VLA decide() raised unhandled: %s", e) + self._publish_status("vla_error", f"vla_error: {type(e).__name__}") + # Don't kill the loop on a single bad inference; the + # next tick may recover. Skip this tick's dispatch. + next_tick += self._control_period_s + self._sleep_to_next_tick(next_tick) + continue - def _build_context( - self, - cmd: VoiceCommand, - mission: Optional[OrchestratorMission] = None, - ) -> VlmContext: - with self.latest_scene_lock: - scene = dict(self.latest_scene) + if decision.tool_call is not None: + self._publish_status("executing", + decision.thought or decision.tool_call.name) + self._dispatch_tool_call(decision.tool_call) + else: + self._publish_status("idle", decision.thought or "idle") + + next_tick += self._control_period_s + self._sleep_to_next_tick(next_tick) + + def _sleep_to_next_tick(self, next_tick: float): + """Sleep until next_tick (monotonic), responsive to stop_event.""" + while not self.stop_event.is_set(): + now = time.monotonic() + remaining = next_tick - now + if remaining <= 0: + return + time.sleep(min(remaining, 0.05)) + + def _build_context(self) -> VlaContext: with self.latest_telem_lock: telem = dict(self.latest_telem) - telem_hist = [hdr for (_t, hdr) in self._telem_history] with self.latest_fs_lock: - age = time.monotonic() - self.latest_fs_ts if self.latest_fs is not None else None + age = (time.monotonic() - self.latest_fs_ts + if self.latest_fs is not None else None) rgb = (self.latest_fs.rgb.copy() if self.latest_fs is not None and age is not None and age <= self._frame_max_age_s else None) - return VlmContext( - user_command=cmd.text, - telemetry=telem, - scene=scene, + with self._instruction_lock: + instruction = self._instruction + return VlaContext( + instruction=instruction, rgb=rgb, + telemetry=telem, safety=dict(self.safety_snapshot), - telemetry_history=telem_hist, - mission=mission.to_dict() if mission is not None else None, ) + # -- Tool dispatch -------------------------------------------------------- + def _bridge_offline(self) -> bool: - """True only after a dispatch has actually timed out and no later - reply has reset the flag.""" with self._bridge_lock: return self._bridge_offline_flag @@ -554,7 +309,6 @@ def _ensure_req_socket(self) -> zmq.Socket: s.setsockopt(zmq.SNDTIMEO, 3000) s.connect(self.flight_req_endpoint) except Exception: - # Close the half-built socket on partial-construction failure. try: s.close(linger=0) except Exception: @@ -572,9 +326,6 @@ def _reset_req_socket(self) -> None: self._req_socket = None def _dispatch_tool_call(self, tc: ToolCall) -> ToolResult: - # Once the offline flag is set, fail the call immediately rather - # than wait out another REQ timeout. The flag clears on the first - # successful reply, so the next dispatch after recovery goes through. if self._bridge_offline(): self.log.warning("tool %s rejected: bridge_offline", tc.name) self._publish_status("bridge_offline", @@ -582,11 +333,18 @@ def _dispatch_tool_call(self, tc: ToolCall) -> ToolResult: return ToolResult(request_id=tc.request_id, ok=False, error="bridge_offline") # voice_confirmed is the orchestrator's authoritative signal — - # overwrite unconditionally on arm so a VLM-hallucinated value - # can't bypass the bridge's spoken-arm gate. + # overwrite unconditionally on arm so a VLA-emitted value can't + # bypass the bridge's spoken-arm gate. args = dict(tc.args) if tc.args else {} if tc.name == "arm": - args["voice_confirmed"] = self._current_arm_authorized + with self._arm_authorized_lock: + args["voice_confirmed"] = self._arm_authorized + # Disarm clears the arm-authorization latch on the orchestrator + # side too (the bridge clears its own armed_with_voice via + # subscribe_armed; this keeps the two in step). + if tc.name == "disarm": + with self._arm_authorized_lock: + self._arm_authorized = False try: req = self._ensure_req_socket() @@ -597,10 +355,6 @@ def _dispatch_tool_call(self, tc: ToolCall) -> ToolResult: }).encode("utf-8")) reply = req.recv() data = json.loads(reply.decode("utf-8")) - # The bridge always replies with a JSON object, but a wire-level - # bug (or someone connecting another producer to the REP socket) - # could send a list/string. data.get on those raises - # AttributeError, which the catch-block widened. if not isinstance(data, dict): raise TypeError(f"non-dict reply: {type(data).__name__}") result = ToolResult( @@ -609,14 +363,12 @@ def _dispatch_tool_call(self, tc: ToolCall) -> ToolResult: error=data.get("error"), data=data.get("data"), ) - # Any reply (ok or error) means the bridge is alive; only - # timeouts leave _bridge_offline_flag set. with self._bridge_lock: self._bridge_offline_flag = False if not result.ok: self.log.warning("tool %s failed: %s", tc.name, result.error) else: - self.log.info("tool %s ok", tc.name) + self.log.debug("tool %s ok", tc.name) return result except zmq.error.Again: self.log.warning("tool %s timed out", tc.name) @@ -627,8 +379,6 @@ def _dispatch_tool_call(self, tc: ToolCall) -> ToolResult: f"{tc.name} timed out — bridge unreachable") return ToolResult(request_id=tc.request_id, ok=False, error="timeout") except zmq.ZMQError as e: - # Transport-level failure (wedged REQ state, ENOTCONN, ...). - # Flag offline so the next dispatch short-circuits at the gate. self.log.warning("tool %s zmq error: %r", tc.name, e) self._reset_req_socket() with self._bridge_lock: @@ -656,14 +406,11 @@ def _publish_status(self, state: str, note: str): try: self.status_pub.send(encode_header(msg)) except zmq.ZMQError as e: - # Status PUB is non-load-bearing for control flow, but a - # send failure here means the TUI has stopped seeing us — - # surface rather than swallow. self.log.warning("status_pub send failed (%s): %r", e, msg) def cli() -> int: - p = argparse.ArgumentParser(description="Lexaire orchestrator") + p = argparse.ArgumentParser(description="Lexaire orchestrator (VLA)") p.add_argument("--config", help="path to config.yaml") args = p.parse_args() diff --git a/python/services/orchestrator/tools.py b/python/services/orchestrator/tools.py index 6587389..5fbcd35 100644 --- a/python/services/orchestrator/tools.py +++ b/python/services/orchestrator/tools.py @@ -1,8 +1,13 @@ """ -Tool schema exposed to the VLM. Mirrors the handler set in +Tool schema exposed to the VLA. Mirrors the handler set in `src/flight_bridge/handlers.cpp` — keep the two in sync. Safety limits are -enforced in the flight bridge; the VLM sees them as context, not as gates +enforced in the flight bridge; the VLA sees them as context, not as gates it can negotiate. + +Pure-VLA action space: continuous velocity setpoints + a small set of +discrete state transitions and emergency primitives. No `takeoff`, +`goto_ned`, `return_to_launch`, or `hold` — those are emergent from +velocity setpoints under closed-loop control. """ from __future__ import annotations @@ -23,62 +28,10 @@ def tool_schemas() -> list[dict[str, Any]]: "description": "Disarm the vehicle. Only on the ground.", "parameters": {"type": "object", "properties": {}, "required": []}, }, - { - "name": "takeoff", - "description": "Take off to the given altitude (meters above home).", - "parameters": { - "type": "object", - "properties": { - "altitude_m": { - "type": "number", - "description": "Target altitude AGL in meters. Must be <= safety.max_altitude_m.", - }, - }, - "required": ["altitude_m"], - }, - }, - { - "name": "land", - "description": "Land in place at the current position.", - "parameters": {"type": "object", "properties": {}, "required": []}, - }, - { - "name": "return_to_launch", - "description": "Return to the home position and land.", - "parameters": {"type": "object", "properties": {}, "required": []}, - }, - { - "name": "hold", - "description": "Switch to HOLD — hover at the current position.", - "parameters": {"type": "object", "properties": {}, "required": []}, - }, - { - "name": "goto_ned", - "description": ( - "Fly to a position in the local NED (north-east-down) frame relative to home. " - "Negative `d` means above home (d = -height_m)." - ), - "parameters": { - "type": "object", - "properties": { - "n": {"type": "number", "description": "North offset, meters."}, - "e": {"type": "number", "description": "East offset, meters."}, - "d": {"type": "number", "description": "Down offset, meters (negative = above)."}, - "yaw_deg": { - "type": "number", - "description": ( - "Absolute heading in degrees (0=North, CW positive). " - "Omit to hold current heading." - ), - }, - }, - "required": ["n", "e", "d"], - }, - }, { "name": "set_velocity_ned", "description": ( - "Command a NED velocity setpoint (m/s) for smooth motion. " + "Continuous control setpoint: NED-frame velocity (m/s). " "At least one of vx/vy/vz must be set — the bridge rejects " "a no-arg call to avoid silently stopping an active " "autonomous mode. {vx:0, vy:0, vz:0} is a legitimate hover." @@ -105,6 +58,11 @@ def tool_schemas() -> list[dict[str, Any]]: ], }, }, + { + "name": "land", + "description": "Controlled landing in place (kept as a safety primitive).", + "parameters": {"type": "object", "properties": {}, "required": []}, + }, { "name": "abort", "description": ( diff --git a/python/services/orchestrator/vla.py b/python/services/orchestrator/vla.py new file mode 100644 index 0000000..33d7f0d --- /dev/null +++ b/python/services/orchestrator/vla.py @@ -0,0 +1,111 @@ +""" +VLA backend. + +Concrete VLA model wired in here. Inputs: latest RGB frame, operator's +most recent instruction (held in context), telemetry, safety envelope. +Output: at most one tool call per tick. + +This file is intentionally a thin wrapper. The orchestrator handles the +control loop, voice intake, and dispatch. The VLA's only job is to map +(observation, language) -> action. + +Pick your model: + + OpenVLA — github.com/openvla/openvla (HF transformers, ~7B params) + π0 — github.com/Physical-Intelligence/openpi + Octo — github.com/octo-models/octo (~93M, lightweight) + RT-2 family — closed-source; you'd need a custom inference server + custom — your fine-tune + +Local vs remote inference is your call: instantiate the model in +__init__ for local, or treat decide() as a thin RPC for remote. +""" + +from __future__ import annotations + +import logging +from typing import Optional + +from lexaire.messages import ToolCall + +from .vla_base import VLA, VlaContext, VlaDecision + +log = logging.getLogger(__name__) + + +class _NoOpVLA(VLA): + """Default backend until a real VLA is wired in. Always returns no + action. Lets the orchestrator's control loop exercise end-to-end on + bench without a model loaded.""" + + def decide(self, ctx: VlaContext) -> VlaDecision: + return VlaDecision(thought="no_op", tool_call=None) + + +class GenericVlaBackend(VLA): + """Concrete VLA wrapper. + + Replace `_load_model` and `_infer` with your model's load/forward + calls. The control loop's tick rate, frame staleness gate, and + voice-arm wiring are owned by the orchestrator — this class only + needs to be deterministic given (rgb, instruction, telemetry). + """ + + def __init__(self, model_path: str, device: str = "auto", + control_rate_hz: float = 10.0): + self._model_path = model_path + self._device = device + self._control_rate_hz = control_rate_hz + self._model = self._load_model() + log.info("VLA backend ready model=%s device=%s rate=%.1fHz", + model_path, device, control_rate_hz) + + def _load_model(self): + # Replace with the chosen VLA's loader (HF transformers, local + # checkpoint, gRPC client, etc.). Returning None for now so the + # class is a clear "TODO: drop the model in here" landmark. + raise NotImplementedError( + "GenericVlaBackend._load_model: drop your VLA here. " + "See module docstring for candidate models." + ) + + def _infer(self, rgb, instruction: str, telemetry: dict) -> Optional[dict]: + # Replace with the chosen VLA's forward pass. Return action dict + # in the shape `{vx, vy, vz, yaw_deg}` (omit any of vx/vy/vz that + # the model doesn't emit), or None for no-op. + raise NotImplementedError("GenericVlaBackend._infer: forward pass goes here.") + + def decide(self, ctx: VlaContext) -> VlaDecision: + # No instruction yet → idle. Avoids commanding random setpoints + # before the operator has said anything. + if not ctx.instruction: + return VlaDecision(thought="idle (no instruction)", tool_call=None) + # No fresh frame → idle. The orchestrator already age-gates rgb; + # if it's None, the publisher is dead and the VLA shouldn't + # extrapolate from staleness. + if ctx.rgb is None: + return VlaDecision(thought="idle (no fresh frame)", tool_call=None) + action = self._infer(ctx.rgb, ctx.instruction, ctx.telemetry) + if action is None: + return VlaDecision(thought="model emitted no-op", tool_call=None) + return VlaDecision( + thought=f"vla -> {action}", + tool_call=ToolCall( + request_id=self.new_request_id(), + name="set_velocity_ned", + args=action, + ), + ) + + +def build_vla(cfg) -> VLA: + """Factory. Reads `perception.vla.*` from config; defaults to no-op + so a config without a model path doesn't crash startup.""" + model_path = cfg.get("perception.vla.model_path") + if not model_path: + log.warning("perception.vla.model_path not set — using no-op VLA") + return _NoOpVLA() + device = cfg.get("perception.vla.device", "auto") + rate = float(cfg.get("orchestrator.control_hz", 10.0)) + return GenericVlaBackend(model_path=model_path, device=device, + control_rate_hz=rate) diff --git a/python/services/orchestrator/vla_base.py b/python/services/orchestrator/vla_base.py new file mode 100644 index 0000000..edf7499 --- /dev/null +++ b/python/services/orchestrator/vla_base.py @@ -0,0 +1,59 @@ +"""VLA backend interface. + +Backends subclass `VLA`, receive a `VlaContext` per tick, and return a +`VlaDecision` (zero or one tool call). Continuous control: one decide() +per orchestrator control tick (config: orchestrator.control_hz). + +The shape is deliberately narrow — the orchestrator does not run a +mission state machine. The VLA holds whatever planning state it needs +internally (or operates purely reactively). +""" + +from __future__ import annotations + +import abc +import uuid +from dataclasses import dataclass, field +from typing import Optional + +import numpy as np + +from lexaire.messages import ToolCall + + +@dataclass +class VlaContext: + """Per-tick input to the VLA. + + `instruction` is the operator's most recent voice/text command, held + in context across ticks until the operator updates it. None means the + VLA is idle (no instruction → emit no tool call). + """ + + instruction: Optional[str] + rgb: Optional[np.ndarray] # (H, W, 3) BGR; None when frame is stale + telemetry: dict # latest TelemetryHeader + safety: dict # safety envelope (max alt / geofence / max v) + + +@dataclass +class VlaDecision: + """Per-tick output. `tool_call` is None when the VLA elects to do + nothing this tick (operator hasn't said anything, or model is still + warming up, or VLA explicitly emits a no-op). + + `thought` is opaque debug text surfaced in the TUI — backends may + leave it empty. + """ + + thought: str = "" + tool_call: Optional[ToolCall] = None + + +class VLA(abc.ABC): + @abc.abstractmethod + def decide(self, ctx: VlaContext) -> VlaDecision: ... + + @staticmethod + def new_request_id() -> str: + return uuid.uuid4().hex[:12] diff --git a/python/services/orchestrator/vlm_base.py b/python/services/orchestrator/vlm_base.py deleted file mode 100644 index 2cc0bdb..0000000 --- a/python/services/orchestrator/vlm_base.py +++ /dev/null @@ -1,45 +0,0 @@ -"""VLM interface. Backends subclass this.""" - -from __future__ import annotations - -import abc -import uuid -from dataclasses import dataclass, field -from typing import Optional - -import numpy as np - -from lexaire.messages import ToolCall - - -@dataclass -class VlmContext: - """Everything the VLM sees when asked to make a decision.""" - - user_command: str - telemetry: dict # decoded TelemetryHeader (latest sample) - scene: dict # decoded SceneHeader (latest) - rgb: Optional[np.ndarray] = None # (H, W, 3) BGR, current frame — may be None - safety: dict = field(default_factory=dict) - # Recent telemetry samples (oldest -> newest) so the VLM can reason about - # trends — battery dropping, approaching geofence, altitude unstable. - # Depth is bounded by orchestrator.telemetry_history_seconds. - telemetry_history: list[dict] = field(default_factory=list) - # Active multi-step mission, if any. None when the orchestrator is idle; - # populated for each re-prompt while a mission is in flight. - mission: Optional[dict] = None - - -@dataclass -class VlmDecision: - thought: str - tool_calls: list[ToolCall] = field(default_factory=list) - - -class VLM(abc.ABC): - @abc.abstractmethod - def decide(self, ctx: VlmContext) -> VlmDecision: ... - - @staticmethod - def new_request_id() -> str: - return uuid.uuid4().hex[:12] diff --git a/python/services/orchestrator/vlm_gemini.py b/python/services/orchestrator/vlm_gemini.py deleted file mode 100644 index c57debd..0000000 --- a/python/services/orchestrator/vlm_gemini.py +++ /dev/null @@ -1,188 +0,0 @@ -""" -Gemini VLM backend. - -Calls a Gemini multimodal model with the current RGB frame, structured -scene+telemetry context, and the tool schema. Emits function-call responses -as ToolCall objects. - -Install: `pip install 'lexaire[gemini]'`. Model is set by -`perception.vlm.model` in config.yaml (default `gemini-2.5-flash`). -""" - -from __future__ import annotations - -import io -import json as _json -import logging -import os -from typing import Optional - -import numpy as np -from PIL import Image - -from lexaire.messages import ToolCall - -from .tools import tool_schemas -from .vlm_base import VLM, VlmContext, VlmDecision - -log = logging.getLogger(__name__) - - -class GeminiVLM(VLM): - def __init__(self, model_name: str, api_key: str, temperature: float = 0.2): - try: - from google import genai # type: ignore - from google.genai import types as genai_types # type: ignore - except ImportError as e: - raise RuntimeError( - "google-genai not installed. " - "pip install 'lexaire[gemini]' or `pip install google-genai`." - ) from e - - if not api_key: - raise RuntimeError("GEMINI_API_KEY is unset — cannot start Gemini backend") - - self._types = genai_types - self._client = genai.Client(api_key=api_key) - self._model_name = model_name - - fn_decls = [ - genai_types.FunctionDeclaration( - name=t["name"], - description=t["description"], - parameters=t["parameters"], - ) - for t in tool_schemas() - ] - self._config = genai_types.GenerateContentConfig( - system_instruction=_SYSTEM_PROMPT, - temperature=temperature, - tools=[genai_types.Tool(function_declarations=fn_decls)], - ) - - def decide(self, ctx: VlmContext) -> VlmDecision: - parts: list = [] - img = self._encode_rgb(ctx.rgb) - if img is not None: - parts.append(img) - - parts.append(self._build_prompt(ctx)) - - thought_parts: list[str] = [] - calls: list[ToolCall] = [] - - try: - resp = self._client.models.generate_content( - model=self._model_name, - contents=parts, - config=self._config, - ) - # Cap at the first candidate — if candidate_count >1, alternates' - # tool_calls would concatenate and dispatch together. - for candidate in (getattr(resp, "candidates", []) or [])[:1]: - content = getattr(candidate, "content", None) - if content is None: - continue - for part in getattr(content, "parts", []) or []: - fn = getattr(part, "function_call", None) - if fn is not None: - args = dict(fn.args) if fn.args is not None else {} - calls.append(ToolCall( - request_id=self.new_request_id(), - name=str(fn.name), - args=args, - )) - continue - text = getattr(part, "text", None) - if text: - thought_parts.append(str(text)) - except Exception as e: - log.exception("Gemini call failed: %s", e) - # str(e) on google-genai errors can include the request URL - # with the API-key tail; this string flows over the status - # PUB to the TUI. Publish only the class name. - return VlmDecision(thought=f"vlm_error: {type(e).__name__}", tool_calls=[]) - - thought = " ".join(thought_parts).strip() or "(no text)" - return VlmDecision(thought=thought, tool_calls=calls) - - # -- helpers -------------------------------------------------------------- - - def _encode_rgb(self, rgb: Optional[np.ndarray]): - if rgb is None: - return None - # BGR -> RGB, JPEG-in-memory for Gemini's inline image input. - pil = Image.fromarray(rgb[..., ::-1].copy()) - buf = io.BytesIO() - pil.save(buf, format="JPEG", quality=85) - return self._types.Part.from_bytes(data=buf.getvalue(), mime_type="image/jpeg") - - def _build_prompt(self, ctx: VlmContext) -> str: - scene_compact = [] - for d in (ctx.scene or {}).get("detections", []) or []: - scene_compact.append({ - "label": d.get("label"), - "bbox": d.get("bbox_xywh"), - "depth_m": d.get("depth_m"), - "xyz_cam_m": d.get("xyz_cam_m"), - }) - - telem = ctx.telemetry or {} - safety = ctx.safety or {} - - # Compact recent telemetry trend — only the fields that matter for - # decision-making, downsampled to ~5 samples to keep the prompt small. - trend_keys = ("rel_alt_m", "ground_speed_mps", "battery_pct", "flight_mode", "armed") - history = ctx.telemetry_history or [] - if len(history) > 5: - step = max(1, len(history) // 5) - history = history[::step] - trend = [ - {k: h.get(k) for k in trend_keys if h.get(k) is not None} - for h in history - ] - - parts = [ - "You are the pilot. Use the tools to satisfy the pilot's spoken command " - "while respecting the safety envelope (the flight bridge enforces it " - "below you; ignore it at your peril).", - "", - f"Pilot command: {ctx.user_command!r}", - "", - f"Scene (local detector): {_json.dumps(scene_compact)}", - "", - f"Telemetry now: {_json.dumps({k: telem.get(k) for k in ('connected','armed','flight_mode','rel_alt_m','yaw_deg','ground_speed_mps','battery_pct')})}", - ] - if trend: - parts.append(f"Telemetry trend (oldest→newest): {_json.dumps(trend)}") - if ctx.mission: - parts.append(f"Active mission: {_json.dumps(ctx.mission)}") - parts.extend([ - "", - f"Safety envelope: {_json.dumps(safety)}", - "", - "If the command is ambiguous or unsafe, call `hold`. If the pilot says anything that " - "sounds like an emergency stop, call `abort`. Otherwise, reason about the RGB frame " - "plus the scene list and emit the tool calls needed to satisfy the command. " - "When a mission is active, prefer one tool call per turn — you'll be re-prompted " - "after each call so you can react to the result before the next step.", - ]) - return "\n".join(parts) - - -_SYSTEM_PROMPT = ( - "You are the in-flight pilot of a quadcopter. Input to you: an RGB frame from the " - "drone's forward camera, a structured list of detections with camera-frame 3D " - "coordinates, the current telemetry, and the pilot's latest spoken command. " - "Output: function calls that satisfy the command. Prefer smaller, incremental " - "maneuvers over single large jumps. Never emit a command you aren't certain of — " - "`hold` is always a safe default." -) - - -def build_gemini(cfg) -> GeminiVLM: - model = cfg.get("perception.vlm.model", "gemini-2.5-flash") - api_key_var = cfg.get("perception.vlm.api_key_env", "GEMINI_API_KEY") - api_key = os.environ.get(api_key_var, "") - temp = float(cfg.get("perception.vlm.temperature", 0.2)) - return GeminiVLM(model_name=model, api_key=api_key, temperature=temp) diff --git a/python/services/perception/detectors.py b/python/services/perception/detectors.py index 04cc8a7..c44a085 100644 --- a/python/services/perception/detectors.py +++ b/python/services/perception/detectors.py @@ -1,6 +1,7 @@ """ Detectors emit a per-tick scene graph (label, bbox, depth, camera-frame xyz) -that the orchestrator consumes alongside its own VLM reasoning. +for observability. The orchestrator's VLA does not subscribe to scene +messages — perception is independent and used by the TUI / replay tooling. Currently shipped: YoloDetector (Ultralytics YOLO11, COCO-80). Requires `pip install 'lexaire[detector-yolo]'`; downloads weights on first run. diff --git a/python/services/perception/main.py b/python/services/perception/main.py index af6f652..7ce0a64 100644 --- a/python/services/perception/main.py +++ b/python/services/perception/main.py @@ -5,8 +5,8 @@ throttles to `perception.tick_hz`, runs a detector on the latest synchronized frameset, and publishes a SceneHeader on `services.perception_scene_pub`. Provides a cheap, always-available scene -graph at a stable tick rate; the orchestrator may still reason on pixels -directly via its own VLM. +graph at a stable tick rate for the TUI and replay tooling. The +orchestrator's VLA reasons on pixels directly and does not subscribe here. """ from __future__ import annotations diff --git a/python/services/stt/main.py b/python/services/stt/main.py index 86dc485..3f5361c 100644 --- a/python/services/stt/main.py +++ b/python/services/stt/main.py @@ -17,7 +17,7 @@ The abort keyword from `stt.abort_keyword` (default `"abort"`) flips `is_abort=True` on the outgoing command so the orchestrator can short-circuit -straight to the abort tool without waiting on the VLM. +straight to the abort tool without waiting on the VLA. """ from __future__ import annotations diff --git a/scripts/preflight.sh b/scripts/preflight.sh index 05dfc65..6c0f70e 100644 --- a/scripts/preflight.sh +++ b/scripts/preflight.sh @@ -39,18 +39,16 @@ section "Local config" ENV_FILE="$REPO_ROOT/.env" if [ -f "$ENV_FILE" ]; then ok ".env present" - if grep -Eq '^GEMINI_API_KEY=.+' "$ENV_FILE"; then - ok "GEMINI_API_KEY set in .env" + if grep -Eq '^DRONE_PI_IP=.+' "$ENV_FILE"; then + ok "DRONE_PI_IP set in .env" else - fail "GEMINI_API_KEY missing or empty in .env" + warn "DRONE_PI_IP missing or empty in .env (compose extra_hosts won't resolve drone.local)" fi else - # Fall back to the live process env — works when the script runs inside - # a container that received GEMINI_API_KEY via env_file or -e. - if [ -n "${GEMINI_API_KEY:-}" ]; then - warn ".env file not found at $ENV_FILE, but GEMINI_API_KEY is set in environment" + if [ -n "${DRONE_PI_IP:-}" ]; then + warn ".env file not found at $ENV_FILE, but DRONE_PI_IP is set in environment" else - fail ".env not found at $ENV_FILE and GEMINI_API_KEY not in environment" + warn ".env not found at $ENV_FILE — compose will use the :- fallback for DRONE_PI_IP" fi fi diff --git a/src/flight_bridge/handlers.cpp b/src/flight_bridge/handlers.cpp index 5582a5e..f48dffa 100644 --- a/src/flight_bridge/handlers.cpp +++ b/src/flight_bridge/handlers.cpp @@ -14,8 +14,8 @@ namespace { // `nlohmann::json::value(key, default)` returns the default when the key is // missing OR when the stored value can't be converted to the requested type. -// That silently masks typos and wrong-typed args from the VLM (e.g. a -// hallucinated "alt_m" on a takeoff would have triggered a 1.5 m takeoff +// That silently masks typos and wrong-typed args from the VLA (e.g. a +// hallucinated "vz" on a set_velocity_ned would have been read as 0.0 // instead of erroring out). The helpers below give "either the value or a // clear error string" semantics so wrong types fail loud at the handler. @@ -198,7 +198,7 @@ ToolResult handle_kill(const ToolCall& c, FlightCtx& ctx) { // ---- Operator-only handlers ---- // The dispatch table includes the handlers below, but tool_schemas() in -// python/services/orchestrator/tools.py does NOT, so the VLM can't reach +// python/services/orchestrator/tools.py does NOT, so the VLA can't reach // them. They're the bridge's escape hatch for talking to the REQ socket // directly. Do not add to tool_schemas() without thinking through the // safety implications. diff --git a/src/main.cpp b/src/main.cpp index b6a7b73..1b26fda 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -290,9 +290,9 @@ static void render(const AppContext& ctx) { // Force the orchestrator row red on alarm states even if the // orchestrator itself is publishing healthily. const bool bridge_offline = (snap.orch_state == "bridge_offline"); - const bool vlm_error = (snap.orch_state == "vlm_error"); + const bool vla_error = (snap.orch_state == "vla_error"); const bool abort_failed = (snap.orch_state == "abort_failed"); - const bool orch_alarm = bridge_offline || vlm_error || abort_failed; + const bool orch_alarm = bridge_offline || vla_error || abort_failed; auto row = [&](const char* name, long long ms, const std::string& detail, int forced_pair = 0) { @@ -321,9 +321,9 @@ static void render(const AppContext& ctx) { attron(COLOR_PAIR(3) | A_BOLD); mvprintw(r++, 4, ">> FLIGHT BRIDGE OFFLINE - tool calls suspended"); attroff(COLOR_PAIR(3) | A_BOLD); - } else if (vlm_error) { + } else if (vla_error) { attron(COLOR_PAIR(3) | A_BOLD); - mvprintw(r++, 4, ">> VLM ERROR - check API key, quota, network"); + mvprintw(r++, 4, ">> VLA ERROR - check model load, inference path"); attroff(COLOR_PAIR(3) | A_BOLD); } else if (abort_failed) { attron(COLOR_PAIR(3) | A_BOLD);