diff --git a/crates/larql-server/Cargo.toml b/crates/larql-server/Cargo.toml
index 0447fd01..1f19e9b4 100644
--- a/crates/larql-server/Cargo.toml
+++ b/crates/larql-server/Cargo.toml
@@ -46,6 +46,8 @@ serde = { workspace = true, features = ["derive"] }
 serde_json = { workspace = true }
 thiserror = { workspace = true }
 base64 = "0.22"
+utoipa = { version = "5", features = ["axum_extras", "preserve_order"] }
+utoipa-swagger-ui = { version = "9", features = ["axum"] }
 
 [features]
 default = []
diff --git a/crates/larql-server/src/bootstrap.rs b/crates/larql-server/src/bootstrap.rs
index 81557952..5201df0c 100644
--- a/crates/larql-server/src/bootstrap.rs
+++ b/crates/larql-server/src/bootstrap.rs
@@ -523,6 +523,10 @@ pub struct Cli {
     #[arg(long)]
     pub cors: bool,
 
+    /// Disable the built-in Swagger UI and /v1/openapi.json endpoint.
+    #[arg(long)]
+    pub no_docs: bool,
+
     /// API key for authentication (clients send Authorization: Bearer <key>).
     #[arg(long)]
     pub api_key: Option<String>,
@@ -869,6 +873,14 @@ pub async fn serve(cli: Cli) -> Result<(), BoxError> {
         }
     }
 
+    // OpenAPI / Swagger UI. Mounted before auth so the docs stay reachable
+    // without the API key — consistent with --cors behavior. Flip the
+    // ordering if operators want docs gated.
+    if !cli.no_docs {
+        app = app.merge(crate::openapi::swagger_router());
+        info!("OpenAPI: /swagger-ui and /v1/openapi.json enabled");
+    }
+
     // Auth middleware.
     if cli.api_key.is_some() {
         app = app.layer(middleware::from_fn_with_state(
diff --git a/crates/larql-server/src/error.rs b/crates/larql-server/src/error.rs
index 3de32e98..fd821885 100644
--- a/crates/larql-server/src/error.rs
+++ b/crates/larql-server/src/error.rs
@@ -2,6 +2,15 @@
 
 use axum::http::StatusCode;
 use axum::response::{IntoResponse, Response};
+use serde::Serialize;
+use utoipa::ToSchema;
+
+/// JSON body returned for every error response.
+#[derive(Debug, Serialize, ToSchema)]
+pub struct ErrorBody {
+    /// Human-readable error message.
+    pub error: String,
+}
 
 #[derive(Debug, thiserror::Error)]
 pub enum ServerError {
@@ -30,7 +39,6 @@ impl IntoResponse for ServerError {
             ServerError::Internal(msg) => (StatusCode::INTERNAL_SERVER_ERROR, msg.clone()),
         };
 
-        let body = serde_json::json!({ "error": message });
-        (status, axum::Json(body)).into_response()
+        (status, axum::Json(ErrorBody { error: message })).into_response()
     }
 }
diff --git a/crates/larql-server/src/lib.rs b/crates/larql-server/src/lib.rs
index bcab84ff..d97a0177 100644
--- a/crates/larql-server/src/lib.rs
+++ b/crates/larql-server/src/lib.rs
@@ -17,6 +17,7 @@ pub mod ffn_l2_cache;
 pub mod grpc;
 pub mod grpc_expert;
 pub mod http;
+pub mod openapi;
 pub mod ratelimit;
 pub mod routes;
 pub mod session;
diff --git a/crates/larql-server/src/openapi.rs b/crates/larql-server/src/openapi.rs
new file mode 100644
index 00000000..73fb9758
--- /dev/null
+++ b/crates/larql-server/src/openapi.rs
@@ -0,0 +1,581 @@
+//! OpenAPI / Swagger UI aggregation.
+//!
+//! Spec JSON is served at `/v1/openapi.json` and the browse-friendly
+//! Swagger UI at `/swagger-ui`. Both can be disabled with `--no-docs`.
+//!
+//! Handlers are annotated in place with `#[utoipa::path]`. This module
+//! owns:
+//! - `ApiDoc` — the aggregator `#[derive(OpenApi)]` struct.
+//! - `schemas` — synthetic response structs for handlers that return
+//!   `Json<serde_json::Value>` (most of the browse/inference surface).
+//! - `params` — shared request parameters (e.g. `model_id`).
+//! - `swagger_router()` — helper that returns a ready-to-merge router
+//!   hosting both the UI and the spec JSON.
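+//!
+//! A minimal wiring sketch (mirrors the `bootstrap.rs` hookup in this
+//! change; assumes the crate's lib name is `larql_server`, and the bare
+//! `Router::new()` stands in for the real app router):
+//!
+//! ```no_run
+//! use larql_server::openapi;
+//!
+//! // Merge before any auth layer so the docs stay reachable without a key.
+//! let app: axum::Router = axum::Router::new().merge(openapi::swagger_router());
+//! ```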
+
+use utoipa::OpenApi;
+use utoipa_swagger_ui::SwaggerUi;
+
+use crate::error::ErrorBody;
+
+pub mod params {
+    use utoipa::IntoParams;
+
+    /// Path parameter selecting which vindex to target in multi-model mode.
+    #[derive(IntoParams)]
+    #[into_params(parameter_in = Path)]
+    #[allow(dead_code)]
+    pub struct ModelIdParam {
+        /// The id of a loaded vindex, e.g. `gemma-3-1b-it`.
+        pub model_id: String,
+    }
+}
+
+pub mod schemas {
+    //! Synthetic response schemas.
+    //!
+    //! Populated as each handler group is annotated. Structs here are
+    //! `Serialize + ToSchema` mirrors of the actual JSON the handlers
+    //! emit via `Json<serde_json::Value>`. They are never constructed at
+    //! runtime — they exist purely for spec generation.
+
+    use serde::Serialize;
+    use utoipa::ToSchema;
+
+    // ---- browse ------------------------------------------------------
+
+    /// One knowledge edge returned from `/v1/describe`.
+    #[derive(Serialize, ToSchema)]
+    pub struct DescribeEdge {
+        /// Top token at this feature (trimmed).
+        pub target: String,
+        /// Gate activation score (rounded to 0.1).
+        pub gate_score: f32,
+        /// Layer the feature lives on.
+        pub layer: usize,
+        /// Feature index within the layer.
+        pub feature: usize,
+        /// Relation label (present when a probe-confirmed label exists).
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub relation: Option<String>,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct DescribeResponse {
+        pub entity: String,
+        pub model: String,
+        pub edges: Vec<DescribeEdge>,
+        pub latency_ms: f64,
+    }
+
+    /// One walk hit returned from `/v1/walk`.
+    #[derive(Serialize, ToSchema)]
+    pub struct WalkHit {
+        pub layer: usize,
+        pub feature: usize,
+        pub gate_score: f32,
+        pub target: String,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub relation: Option<String>,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct WalkResponse {
+        pub prompt: String,
+        pub hits: Vec<WalkHit>,
+        pub latency_ms: f64,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct RelationEntry {
+        pub name: String,
+        pub count: usize,
+        pub max_score: f32,
+        pub min_layer: usize,
+        pub max_layer: usize,
+        pub examples: Vec<String>,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct RelationsResponse {
+        pub relations: Vec<RelationEntry>,
+        pub total: usize,
+        pub latency_ms: f64,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct LayerBands {
+        pub syntax: [usize; 2],
+        pub knowledge: [usize; 2],
+        pub output: [usize; 2],
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct LoadedCapabilities {
+        pub browse: bool,
+        pub inference: bool,
+        pub ffn_service: bool,
+        pub embed_service: bool,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct StatsResponse {
+        pub model: String,
+        pub family: String,
+        pub layers: usize,
+        pub features: usize,
+        pub features_per_layer: usize,
+        pub hidden_size: usize,
+        pub vocab_size: usize,
+        pub extract_level: String,
+        pub dtype: String,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub mode: Option<String>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub layer_bands: Option<LayerBands>,
+        pub loaded: LoadedCapabilities,
+    }
+
+    /// One entry in the OpenAI-compatible `/v1/models` list.
+    #[derive(Serialize, ToSchema)]
+    pub struct ModelEntry {
+        pub id: String,
+        pub object: String,
+        pub created: u64,
+        pub owned_by: String,
+        /// Route prefix for this model. `/v1/{id}` in multi-model mode, `/v1` otherwise.
+        pub path: String,
+        /// Total features across all layers.
+        pub features: usize,
+        pub loaded: bool,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct ModelsListResponse {
+        pub object: String,
+        pub data: Vec<ModelEntry>,
+    }
+
+    // ---- inference ---------------------------------------------------
+
+    #[derive(Serialize, ToSchema)]
+    pub struct SelectRow {
+        pub layer: usize,
+        pub feature: usize,
+        pub target: String,
+        pub confidence: f32,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub relation: Option<String>,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct SelectResponse {
+        pub rows: Vec<SelectRow>,
+        pub total: usize,
+        pub latency_ms: f64,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct Prediction {
+        pub token: String,
+        pub probability: f64,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct InferResponse {
+        pub prompt: String,
+        pub mode: String,
+        /// Single-mode (`walk` or `dense`).
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub predictions: Option<Vec<Prediction>>,
+        /// Populated in `compare` mode.
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub walk: Option<Vec<Prediction>>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub dense: Option<Vec<Prediction>>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub walk_ms: Option<f64>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub dense_ms: Option<f64>,
+        pub latency_ms: f64,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct ExplainLayerEntry {
+        pub layer: usize,
+        pub top_features: Vec<serde_json::Value>,
+        pub top_tokens: Vec<(String, f64)>,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct ExplainResponse {
+        pub prompt: String,
+        pub predictions: Vec<Prediction>,
+        pub layers: Vec<ExplainLayerEntry>,
+        pub latency_ms: f64,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct InsertResponse {
+        pub success: bool,
+        pub entity: String,
+        pub relation: String,
+        pub target: String,
+        pub layers_written: Vec<usize>,
+        pub latency_ms: f64,
+    }
+
+    // ---- patches -----------------------------------------------------
+
+    /// Request body for `POST /v1/patches/apply`. Provide either a `url`
+    /// pointing at a `.vlp` file (local path, `http(s)://`, or `hf://` URL)
+    /// or an inline `patch` object. One of the two is required.
+    #[derive(Serialize, ToSchema)]
+    pub struct ApplyPatchBody {
+        /// Local path, `http(s)://`, or `hf://` URL to a `.vlp` patch file.
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub url: Option<String>,
+        /// Inline patch payload. See VindexPatch docs for schema; includes
+        /// `description`, `base_model`, and `operations` (INSERT / DELETE).
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub patch: Option<serde_json::Value>,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct ApplyPatchResponse {
+        pub applied: String,
+        pub operations: usize,
+        pub active_patches: usize,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub session: Option<String>,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct PatchEntry {
+        pub name: String,
+        pub operations: usize,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub base_model: Option<String>,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct ListPatchesResponse {
+        pub patches: Vec<PatchEntry>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub session: Option<String>,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct RemovePatchResponse {
+        pub removed: String,
+        pub active_patches: usize,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub session: Option<String>,
+    }
+
+    // ---- admin -------------------------------------------------------
+
+    #[derive(Serialize, ToSchema)]
+    pub struct HealthResponse {
+        pub status: String,
+        pub uptime_seconds: u64,
+        pub requests_served: u64,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct TokenEncodeResponse {
+        pub token_ids: Vec<u32>,
+        pub text: String,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct TokenDecodeResponse {
+        pub text: String,
+        pub token_ids: Vec<u32>,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct EmbedSingleJsonResponse {
+        pub token_id: u32,
+        pub embedding: Vec<f32>,
+        pub hidden_size: usize,
+    }
+
+    // ---- openai ------------------------------------------------------
+    //
+    // These mirror the OpenAI wire contract at a high level.
+    // Full nested types (tools, tool_calls, logprobs, usage) are documented
+    // inline as open JSON objects to avoid a deep ToSchema tree.
+
+    /// Subset of the OpenAI `POST /v1/embeddings` request body.
+    #[derive(Serialize, ToSchema)]
+    pub struct OpenAiEmbeddingsRequest {
+        /// Model id. Required in multi-model mode; ignored otherwise.
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub model: Option<String>,
+        /// String, string[], int[] (single sequence), or int[][] (batch of sequences).
+        pub input: serde_json::Value,
+        /// `"float"` (default) or `"base64"`.
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub encoding_format: Option<String>,
+        /// Requested output dimensionality (ignored; returns native hidden size).
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub dimensions: Option<usize>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub user: Option<String>,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct OpenAiEmbeddingObject {
+        pub object: String,
+        pub index: usize,
+        /// `[f32]` when `encoding_format = "float"`, or a base64 string otherwise.
+        pub embedding: serde_json::Value,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct OpenAiEmbeddingsResponse {
+        pub object: String,
+        pub data: Vec<OpenAiEmbeddingObject>,
+        pub model: String,
+        pub usage: serde_json::Value,
+    }
+
+    /// OpenAI `POST /v1/completions` request.
+    #[derive(Serialize, ToSchema)]
+    pub struct OpenAiCompletionsRequest {
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub model: Option<String>,
+        /// Prompt — string or string[].
+        pub prompt: serde_json::Value,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub max_tokens: Option<usize>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub temperature: Option<f32>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub top_p: Option<f32>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub stream: Option<bool>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub n: Option<usize>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub stop: Option<serde_json::Value>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub echo: Option<bool>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub logprobs: Option<usize>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub seed: Option<u64>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub user: Option<String>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub frequency_penalty: Option<f32>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub presence_penalty: Option<f32>,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct OpenAiCompletionsResponse {
+        pub id: String,
+        pub object: String,
+        pub created: u64,
+        pub model: String,
+        pub choices: Vec<serde_json::Value>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub usage: Option<serde_json::Value>,
+    }
+
+    /// OpenAI `POST /v1/chat/completions` request. `messages` is an array
+    /// of `{role: "system"|"user"|"assistant"|"tool", content, ...}`; tools
+    /// and structured output are open JSON (see OpenAI docs).
+    #[derive(Serialize, ToSchema)]
+    pub struct OpenAiChatRequest {
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub model: Option<String>,
+        pub messages: Vec<serde_json::Value>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub max_tokens: Option<usize>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub temperature: Option<f32>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub top_p: Option<f32>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub stream: Option<bool>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub n: Option<usize>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub stop: Option<serde_json::Value>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub logprobs: Option<bool>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub top_logprobs: Option<usize>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub seed: Option<u64>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub user: Option<String>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub frequency_penalty: Option<f32>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub presence_penalty: Option<f32>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub response_format: Option<serde_json::Value>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub tools: Option<serde_json::Value>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub tool_choice: Option<serde_json::Value>,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct OpenAiChatResponse {
+        pub id: String,
+        pub object: String,
+        pub created: u64,
+        pub model: String,
+        pub choices: Vec<serde_json::Value>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub usage: Option<serde_json::Value>,
+    }
+}
+
+#[derive(OpenApi)]
+#[openapi(
+    info(
+        title = "larql-server",
+        version = env!("CARGO_PKG_VERSION"),
+        description = "HTTP API for vindex knowledge queries, inference, and remote MoE expert shards.",
+    ),
+    tags(
+        (name = "browse", description = "Knowledge graph browse (no weights required)"),
+        (name = "inference", description = "Forward passes, explain, insert, warmup"),
+        (name = "openai", description = "OpenAI-compatible endpoints"),
+        (name = "expert", description = "Remote MoE shard endpoints (binary wire)"),
+        (name = "patches", description = "Runtime patch overlay"),
overlay"), + (name = "admin", description = "Health, models, embed, tokens, WebSocket"), + ), + paths( + // browse + crate::routes::describe::handle_describe, + crate::routes::walk::handle_walk, + crate::routes::relations::handle_relations, + crate::routes::stats::handle_stats, + crate::routes::topology::handle_topology, + crate::routes::models::handle_models, + // inference + crate::routes::select::handle_select, + crate::routes::infer::handle_infer, + crate::routes::explain::handle_explain, + crate::routes::insert::handle_insert, + crate::routes::warmup::handle_warmup, + // patches + crate::routes::patches::handle_apply_patch, + crate::routes::patches::handle_list_patches, + crate::routes::patches::handle_remove_patch, + // admin + crate::routes::health::handle_health, + crate::routes::embed::handle_embed, + crate::routes::embed::handle_embed_single, + crate::routes::embed::handle_logits, + crate::routes::embed::handle_token_encode, + crate::routes::embed::handle_token_decode, + crate::routes::stream::handle_stream, + // openai + crate::routes::openai::embeddings::handle_embeddings, + crate::routes::openai::completions::handle_completions, + crate::routes::openai::chat::handle_chat_completions, + // expert + crate::routes::walk_ffn::handle_walk_ffn, + crate::routes::walk_ffn::handle_walk_ffn_q8k, + crate::routes::expert::single::handle_expert, + crate::routes::expert::batch_legacy::handle_expert_batch, + crate::routes::expert::layer_batch::handle_experts_layer_batch, + crate::routes::expert::layer_batch::handle_experts_layer_batch_f16, + crate::routes::expert::multi_layer_batch::handle_experts_multi_layer_batch, + crate::routes::expert::multi_layer_batch::handle_experts_multi_layer_batch_q8k, + // multi-model variants — same handlers with a `{model_id}` path prefix + crate::routes::describe::handle_describe_multi, + crate::routes::walk::handle_walk_multi, + crate::routes::relations::handle_relations_multi, + crate::routes::stats::handle_stats_multi, + crate::routes::select::handle_select_multi, + crate::routes::infer::handle_infer_multi, + crate::routes::explain::handle_explain_multi, + crate::routes::insert::handle_insert_multi, + crate::routes::patches::handle_apply_patch_multi, + crate::routes::patches::handle_list_patches_multi, + crate::routes::patches::handle_remove_patch_multi, + crate::routes::embed::handle_embed_multi, + crate::routes::embed::handle_embed_single_multi, + crate::routes::embed::handle_logits_multi, + crate::routes::embed::handle_token_encode_multi, + crate::routes::embed::handle_token_decode_multi, + ), + components(schemas( + ErrorBody, + // browse + schemas::DescribeEdge, + schemas::DescribeResponse, + schemas::WalkHit, + schemas::WalkResponse, + schemas::RelationEntry, + schemas::RelationsResponse, + schemas::LayerBands, + schemas::LoadedCapabilities, + schemas::StatsResponse, + schemas::ModelEntry, + schemas::ModelsListResponse, + crate::routes::topology::TopologyResponse, + // inference + crate::routes::select::SelectRequest, + schemas::SelectRow, + schemas::SelectResponse, + crate::routes::infer::InferRequest, + schemas::Prediction, + schemas::InferResponse, + crate::routes::explain::ExplainRequest, + schemas::ExplainLayerEntry, + schemas::ExplainResponse, + crate::routes::insert::InsertRequest, + schemas::InsertResponse, + crate::routes::warmup::WarmupRequest, + crate::routes::warmup::WarmupResponse, + // patches + schemas::ApplyPatchBody, + schemas::ApplyPatchResponse, + schemas::PatchEntry, + schemas::ListPatchesResponse, + schemas::RemovePatchResponse, + 
+        // admin
+        schemas::HealthResponse,
+        schemas::TokenEncodeResponse,
+        schemas::TokenDecodeResponse,
+        schemas::EmbedSingleJsonResponse,
+        crate::routes::embed::EmbedRequest,
+        crate::routes::embed::EmbedResponse,
+        crate::routes::embed::LogitsRequest,
+        crate::routes::embed::LogitsResponse,
+        crate::routes::embed::TokenProb,
+        // openai
+        schemas::OpenAiEmbeddingsRequest,
+        schemas::OpenAiEmbeddingObject,
+        schemas::OpenAiEmbeddingsResponse,
+        schemas::OpenAiCompletionsRequest,
+        schemas::OpenAiCompletionsResponse,
+        schemas::OpenAiChatRequest,
+        schemas::OpenAiChatResponse,
+        // expert
+        crate::routes::expert::SingleExpertRequest,
+        crate::routes::expert::SingleExpertResponse,
+        crate::routes::expert::BatchExpertItem,
+        crate::routes::expert::BatchExpertRequest,
+        crate::routes::expert::BatchExpertResult,
+        crate::routes::expert::BatchExpertResponse,
+    )),
+)]
+pub struct ApiDoc;
+
+/// Build a router hosting Swagger UI at `/swagger-ui` and the spec at
+/// `/v1/openapi.json`. Merge into the main app router.
+pub fn swagger_router() -> axum::Router {
+    SwaggerUi::new("/swagger-ui")
+        .url("/v1/openapi.json", ApiDoc::openapi())
+        .into()
+}
diff --git a/crates/larql-server/src/routes/describe.rs b/crates/larql-server/src/routes/describe.rs
index 77b69686..2af653ed 100644
--- a/crates/larql-server/src/routes/describe.rs
+++ b/crates/larql-server/src/routes/describe.rs
@@ -18,15 +18,21 @@ use crate::state::{elapsed_ms, AppState, LoadedModel};
 
 const DESCRIBE_CACHE_CONTROL: &str = "public, max-age=86400";
 
-#[derive(Deserialize)]
+#[derive(Deserialize, utoipa::IntoParams)]
+#[into_params(parameter_in = Query)]
 pub struct DescribeParams {
+    /// Entity to describe, e.g. `France`.
     pub entity: String,
+    /// Layer band to scan: `knowledge` (default), `syntax`, `output`, or `all`.
     #[serde(default = "default_band")]
     pub band: String,
+    /// Include low-score edges in the response.
     #[serde(default)]
     pub verbose: bool,
+    /// Maximum number of edges to return.
     #[serde(default = "default_limit")]
     pub limit: usize,
+    /// Minimum gate score to include an edge.
#[serde(default = "default_min_score")] pub min_score: f32, } @@ -257,6 +263,19 @@ async fn describe_with_cache( .into_response()) } +#[utoipa::path( + get, + path = "/v1/describe", + tag = "browse", + params(DescribeParams), + responses( + (status = 200, description = "Edges for the queried entity", body = crate::openapi::schemas::DescribeResponse), + (status = 304, description = "Not modified (ETag match)"), + (status = 400, body = crate::error::ErrorBody), + (status = 404, body = crate::error::ErrorBody), + (status = 500, body = crate::error::ErrorBody), + ), +)] pub async fn handle_describe( State(state): State>, headers: HeaderMap, @@ -267,6 +286,22 @@ pub async fn handle_describe( describe_with_cache(&state, model, &headers, params).await } +#[utoipa::path( + get, + path = "/v1/{model_id}/describe", + tag = "browse", + params( + ("model_id" = String, Path, description = "Id of a loaded vindex."), + DescribeParams, + ), + responses( + (status = 200, body = crate::openapi::schemas::DescribeResponse), + (status = 304), + (status = 400, body = crate::error::ErrorBody), + (status = 404, body = crate::error::ErrorBody), + (status = 500, body = crate::error::ErrorBody), + ), +)] pub async fn handle_describe_multi( State(state): State>, Path(model_id): Path, diff --git a/crates/larql-server/src/routes/embed.rs b/crates/larql-server/src/routes/embed.rs index 605c3596..f463f635 100644 --- a/crates/larql-server/src/routes/embed.rs +++ b/crates/larql-server/src/routes/embed.rs @@ -15,6 +15,7 @@ use axum::http::header; use axum::response::{IntoResponse, Response}; use axum::Json; use serde::{Deserialize, Serialize}; +use utoipa::ToSchema; use larql_inference::forward::predict::logits_to_predictions_pub; use larql_vindex::ndarray::Array2; @@ -28,12 +29,12 @@ use crate::state::{AppState, LoadedModel}; // ── Request / response types ────────────────────────────────────────────────── -#[derive(Deserialize)] +#[derive(Deserialize, ToSchema)] pub struct EmbedRequest { pub token_ids: Vec, } -#[derive(Serialize)] +#[derive(Serialize, ToSchema)] pub struct EmbedResponse { /// Row-major: seq_len × hidden_size f32 values. pub residual: Vec>, @@ -42,7 +43,7 @@ pub struct EmbedResponse { pub latency_ms: f32, } -#[derive(Deserialize)] +#[derive(Deserialize, ToSchema)] pub struct LogitsRequest { /// Flat f32 residual of length hidden_size (one position, post-all-layers). pub residual: Vec, @@ -105,14 +106,14 @@ fn parse_binary_logits_request(bytes: &[u8]) -> Result, ServerError> { .collect()) } -#[derive(Serialize)] +#[derive(Serialize, ToSchema)] pub struct TokenProb { pub token_id: u32, pub token: String, pub prob: f32, } -#[derive(Serialize)] +#[derive(Serialize, ToSchema)] pub struct LogitsResponse { pub top_k: Vec, pub latency_ms: f32, @@ -183,6 +184,25 @@ pub(crate) fn embed_tokens( /// /// JSON response: `{"residual": [[f32, ...], ...], "seq_len": N, ...}`. /// Binary response: seq_len×hidden_size f32 LE, prefixed by two u32 headers. 
+#[utoipa::path(
+    post,
+    path = "/v1/embed",
+    tag = "admin",
+    request_body(
+        content = EmbedRequest,
+        description = "JSON `{token_ids: [u32]}` OR binary `application/x-larql-ffn`: \
+                       `[num_tokens u32 LE][token_ids u32 LE...]`.",
+    ),
+    responses(
+        (status = 200, description = "JSON response", body = EmbedResponse),
+        (status = 200, content_type = "application/x-larql-ffn",
+         body = Vec<u8>,
+         description = "Binary response when the request used `Content-Type: application/x-larql-ffn`: \
+                        `[seq_len u32][hidden u32][seq_len × hidden f32 LE]`."),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 404, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_embed(
     State(state): State<Arc<AppState>>,
     headers: axum::http::HeaderMap,
@@ -191,6 +211,19 @@
     handle_embed_inner(&state, None, headers, body).await
 }
 
+#[utoipa::path(
+    post,
+    path = "/v1/{model_id}/embed",
+    tag = "admin",
+    params(("model_id" = String, Path, description = "Id of a loaded vindex.")),
+    request_body(content = EmbedRequest, description = "JSON or binary `application/x-larql-ffn`."),
+    responses(
+        (status = 200, body = EmbedResponse),
+        (status = 200, content_type = "application/x-larql-ffn", body = Vec<u8>),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 404, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_embed_multi(
     State(state): State<Arc<AppState>>,
     Path(model_id): Path<String>,
@@ -281,6 +314,21 @@ async fn handle_embed_inner(
 /// Accepts JSON (`{"residual": [...], "top_k": 5, "temperature": 1.0}`) or
 /// binary (`Content-Type: application/x-larql-ffn`, raw hidden_size f32 LE
 /// bytes). Returns JSON top-k tokens.
+#[utoipa::path(
+    post,
+    path = "/v1/logits",
+    tag = "admin",
+    request_body(
+        content = LogitsRequest,
+        description = "JSON `{residual: [f32; hidden_size], top_k, temperature}` OR binary \
+                       `application/x-larql-ffn` with raw hidden_size f32 LE bytes.",
+    ),
+    responses(
+        (status = 200, description = "Top-K tokens from lm_head", body = LogitsResponse),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 500, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_logits(
     State(state): State<Arc<AppState>>,
     headers: axum::http::HeaderMap,
@@ -289,6 +337,18 @@
     handle_logits_inner(&state, None, headers, body).await
 }
 
+#[utoipa::path(
+    post,
+    path = "/v1/{model_id}/logits",
+    tag = "admin",
+    params(("model_id" = String, Path, description = "Id of a loaded vindex.")),
+    request_body(content = LogitsRequest),
+    responses(
+        (status = 200, body = LogitsResponse),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 404, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_logits_multi(
     State(state): State<Arc<AppState>>,
     Path(model_id): Path<String>,
@@ -380,6 +440,18 @@ async fn handle_logits_inner(
 // ─────────────────────────────────────────────────────────────────────────────
 
 /// `GET /v1/token/encode?text=Paris`
+#[utoipa::path(
+    get,
+    path = "/v1/token/encode",
+    tag = "admin",
+    params(
+        ("text" = String, Query, description = "Text to tokenize."),
+    ),
+    responses(
+        (status = 200, description = "Token IDs for the text", body = crate::openapi::schemas::TokenEncodeResponse),
+        (status = 500, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_token_encode(
     State(state): State<Arc<AppState>>,
     Query(q): Query,
@@ -387,6 +459,19 @@
     handle_token_encode_inner(&state, None, q)
 }
 
+#[utoipa::path(
+    get,
+    path = "/v1/{model_id}/token/encode",
+    tag = "admin",
+    params(
description = "Id of a loaded vindex."), + ("text" = String, Query, description = "Text to tokenize."), + ), + responses( + (status = 200, body = crate::openapi::schemas::TokenEncodeResponse), + (status = 404, body = crate::error::ErrorBody), + ), +)] pub async fn handle_token_encode_multi( State(state): State>, Path(model_id): Path, @@ -418,6 +503,19 @@ fn handle_token_encode_inner( // ───────────────────────────────────────────────────────────────────────────── /// `GET /v1/token/decode?ids=9515,235,1234` +#[utoipa::path( + get, + path = "/v1/token/decode", + tag = "admin", + params( + ("ids" = String, Query, description = "Comma-separated token IDs, e.g. `9515,235,1234`."), + ), + responses( + (status = 200, description = "Decoded text", body = crate::openapi::schemas::TokenDecodeResponse), + (status = 400, body = crate::error::ErrorBody), + (status = 500, body = crate::error::ErrorBody), + ), +)] pub async fn handle_token_decode( State(state): State>, Query(q): Query, @@ -425,6 +523,20 @@ pub async fn handle_token_decode( handle_token_decode_inner(&state, None, q) } +#[utoipa::path( + get, + path = "/v1/{model_id}/token/decode", + tag = "admin", + params( + ("model_id" = String, Path, description = "Id of a loaded vindex."), + ("ids" = String, Query, description = "Comma-separated token IDs."), + ), + responses( + (status = 200, body = crate::openapi::schemas::TokenDecodeResponse), + (status = 400, body = crate::error::ErrorBody), + (status = 404, body = crate::error::ErrorBody), + ), +)] pub async fn handle_token_decode_multi( State(state): State>, Path(model_id): Path, @@ -482,6 +594,20 @@ fn handle_token_decode_inner( /// /// Response (JSON, if Accept: application/json): /// {"token_id": N, "embedding": [f32, ...], "hidden_size": N} +#[utoipa::path( + get, + path = "/v1/embed/{token_id}", + tag = "admin", + params( + ("token_id" = u32, Path, description = "Vocabulary token id."), + ), + responses( + (status = 200, description = "Binary f32 LE embedding (default)", content_type = "application/x-larql-ffn", body = Vec), + (status = 200, description = "JSON response when `Accept: application/json`", body = crate::openapi::schemas::EmbedSingleJsonResponse), + (status = 400, body = crate::error::ErrorBody), + (status = 404, body = crate::error::ErrorBody), + ), +)] pub async fn handle_embed_single( State(state): State>, Path(token_id): Path, @@ -490,6 +616,21 @@ pub async fn handle_embed_single( handle_embed_single_inner(&state, None, token_id, headers) } +#[utoipa::path( + get, + path = "/v1/{model_id}/embed/{token_id}", + tag = "admin", + params( + ("model_id" = String, Path, description = "Id of a loaded vindex."), + ("token_id" = u32, Path), + ), + responses( + (status = 200, content_type = "application/x-larql-ffn", body = Vec), + (status = 200, body = crate::openapi::schemas::EmbedSingleJsonResponse), + (status = 400, body = crate::error::ErrorBody), + (status = 404, body = crate::error::ErrorBody), + ), +)] pub async fn handle_embed_single_multi( State(state): State>, Path((model_id, token_id)): Path<(String, u32)>, diff --git a/crates/larql-server/src/routes/expert/batch_legacy.rs b/crates/larql-server/src/routes/expert/batch_legacy.rs index a5a90e0c..ee987129 100644 --- a/crates/larql-server/src/routes/expert/batch_legacy.rs +++ b/crates/larql-server/src/routes/expert/batch_legacy.rs @@ -28,6 +28,20 @@ use crate::state::AppState; use super::single::run_expert; use super::{BatchExpertRequest, BatchExpertResponse, BatchExpertResult}; +#[utoipa::path( + post, + path = 
"/v1/expert/batch", + tag = "expert", + request_body( + content = crate::routes::expert::BatchExpertRequest, + description = "JSON `{requests:[{layer, expert_id, residual}]}` OR binary \ + `application/x-larql-expert` wire (see docs/server-spec.md).", + ), + responses( + (status = 200, description = "Per-item expert outputs", body = crate::routes::expert::BatchExpertResponse), + (status = 400, body = crate::error::ErrorBody), + ), +)] pub async fn handle_expert_batch( State(state): State>, headers: axum::http::HeaderMap, diff --git a/crates/larql-server/src/routes/expert/layer_batch.rs b/crates/larql-server/src/routes/expert/layer_batch.rs index 492471c7..79e3ac71 100644 --- a/crates/larql-server/src/routes/expert/layer_batch.rs +++ b/crates/larql-server/src/routes/expert/layer_batch.rs @@ -57,6 +57,21 @@ fn compute_semaphore() -> &'static Semaphore { }) } +#[utoipa::path( + post, + path = "/v1/experts/layer-batch", + tag = "expert", + request_body( + content_type = "application/octet-stream", + description = "Binary wire: one pre-norm residual + K (expert_id, weight) pairs for a single layer. \ + Router-weighted sum is returned as f32. See `docs/server-spec.md` for the exact packed layout.", + ), + responses( + (status = 200, content_type = "application/x-larql-ffn", + description = "Weighted-sum f32 output", body = Vec), + (status = 400, body = crate::error::ErrorBody), + ), +)] pub async fn handle_experts_layer_batch( State(state): State>, body: Bytes, @@ -123,6 +138,20 @@ pub async fn handle_experts_layer_batch( Ok(resp) } +#[utoipa::path( + post, + path = "/v1/experts/layer-batch-f16", + tag = "expert", + request_body( + content_type = "application/octet-stream", + description = "Same shape as `/v1/experts/layer-batch` but residual is f16 to cut upload bandwidth by 2×.", + ), + responses( + (status = 200, content_type = "application/x-larql-ffn", + description = "Weighted-sum f32 output", body = Vec), + (status = 400, body = crate::error::ErrorBody), + ), +)] pub async fn handle_experts_layer_batch_f16( State(state): State>, body: Bytes, diff --git a/crates/larql-server/src/routes/expert/mod.rs b/crates/larql-server/src/routes/expert/mod.rs index 5fd01da5..a5fc5e60 100644 --- a/crates/larql-server/src/routes/expert/mod.rs +++ b/crates/larql-server/src/routes/expert/mod.rs @@ -25,6 +25,7 @@ //! Metal expert buffer cache, called from boot. use serde::{Deserialize, Serialize}; +use utoipa::ToSchema; pub mod batch_legacy; pub mod cpu; @@ -57,37 +58,37 @@ pub use warmup::warmup_metal_expert_cache; // Kept in `mod.rs` because they're shared across the single + batch_legacy // handlers and trivially small. 
-#[derive(Deserialize)]
+#[derive(Deserialize, ToSchema)]
 pub struct SingleExpertRequest {
     pub residual: Vec<f32>,
 }
 
-#[derive(Serialize)]
+#[derive(Serialize, ToSchema)]
 pub struct SingleExpertResponse {
     pub output: Vec<f32>,
     pub latency_ms: f64,
 }
 
-#[derive(Deserialize)]
+#[derive(Deserialize, ToSchema)]
 pub struct BatchExpertItem {
     pub layer: usize,
     pub expert_id: usize,
     pub residual: Vec<f32>,
 }
 
-#[derive(Deserialize)]
+#[derive(Deserialize, ToSchema)]
 pub struct BatchExpertRequest {
     pub requests: Vec<BatchExpertItem>,
 }
 
-#[derive(Serialize)]
+#[derive(Serialize, ToSchema)]
 pub struct BatchExpertResult {
     pub layer: usize,
     pub expert_id: usize,
     pub output: Vec<f32>,
 }
 
-#[derive(Serialize)]
+#[derive(Serialize, ToSchema)]
 pub struct BatchExpertResponse {
     pub results: Vec<BatchExpertResult>,
     pub latency_ms: f64,
diff --git a/crates/larql-server/src/routes/expert/multi_layer_batch.rs b/crates/larql-server/src/routes/expert/multi_layer_batch.rs
index 1f8de791..67560515 100644
--- a/crates/larql-server/src/routes/expert/multi_layer_batch.rs
+++ b/crates/larql-server/src/routes/expert/multi_layer_batch.rs
@@ -29,6 +29,21 @@ use crate::state::AppState;
 
 use super::cpu::{run_experts_cpu_batch, run_experts_cpu_batch_q8k_prenormed};
 
+#[utoipa::path(
+    post,
+    path = "/v1/experts/multi-layer-batch",
+    tag = "expert",
+    request_body(
+        content_type = "application/octet-stream",
+        description = "N packed layer tasks — each `(layer, residual, expert_ids, weights)`. \
+                       Server runs every task in parallel via rayon and returns N per-layer f32 outputs in one response.",
+    ),
+    responses(
+        (status = 200, content_type = "application/x-larql-ffn-multi",
+         description = "N per-layer f32 outputs, one per task", body = Vec<u8>),
+        (status = 400, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_experts_multi_layer_batch(
     State(state): State<Arc<AppState>>,
     body: Bytes,
@@ -85,6 +100,21 @@ pub async fn handle_experts_multi_layer_batch(
 /// Q8K-prenormed variant: client pre-quantises h_norm, server skips
 /// `pre_experts_norm` and `quantize_h_norm_for_q4k` — just the matvec.
 /// 4× smaller upload; response is standard f32.
+#[utoipa::path(
+    post,
+    path = "/v1/experts/multi-layer-batch-q8k",
+    tag = "expert",
+    request_body(
+        content_type = "application/octet-stream",
+        description = "Same shape as `/v1/experts/multi-layer-batch` but the client has already applied `pre_experts_norm` \
+                       and quantised `h_norm` to Q8K, saving ~4× upload bandwidth. \
Response is standard f32.", + ), + responses( + (status = 200, content_type = "application/x-larql-ffn-multi", + description = "N per-layer f32 outputs, one per task", body = Vec), + (status = 400, body = crate::error::ErrorBody), + ), +)] pub async fn handle_experts_multi_layer_batch_q8k( State(state): State>, body: Bytes, diff --git a/crates/larql-server/src/routes/expert/single.rs b/crates/larql-server/src/routes/expert/single.rs index 33508888..48a8b901 100644 --- a/crates/larql-server/src/routes/expert/single.rs +++ b/crates/larql-server/src/routes/expert/single.rs @@ -137,6 +137,21 @@ pub fn run_expert( Ok(output) } +#[utoipa::path( + post, + path = "/v1/expert/{layer}/{expert_id}", + tag = "expert", + params( + ("layer" = usize, Path), + ("expert_id" = usize, Path), + ), + request_body = SingleExpertRequest, + responses( + (status = 200, description = "Expert output vector", body = SingleExpertResponse), + (status = 400, body = crate::error::ErrorBody), + (status = 404, body = crate::error::ErrorBody), + ), +)] pub async fn handle_expert( State(state): State>, Path((layer, expert_id)): Path<(usize, usize)>, diff --git a/crates/larql-server/src/routes/explain.rs b/crates/larql-server/src/routes/explain.rs index 93cc92c8..dd882a65 100644 --- a/crates/larql-server/src/routes/explain.rs +++ b/crates/larql-server/src/routes/explain.rs @@ -10,7 +10,7 @@ use crate::band_utils::{get_layer_bands, BAND_KNOWLEDGE, BAND_OUTPUT, BAND_SYNTA use crate::error::ServerError; use crate::state::{elapsed_ms, AppState, LoadedModel}; -#[derive(Deserialize)] +#[derive(Deserialize, utoipa::ToSchema)] pub struct ExplainRequest { pub prompt: String, #[serde(default = "default_top")] @@ -287,6 +287,17 @@ fn explain_infer( Ok(body) } +#[utoipa::path( + post, + path = "/v1/explain-infer", + tag = "inference", + request_body = ExplainRequest, + responses( + (status = 200, description = "Predictions with per-layer feature traces", body = crate::openapi::schemas::ExplainResponse), + (status = 400, body = crate::error::ErrorBody), + (status = 500, body = crate::error::ErrorBody), + ), +)] pub async fn handle_explain( State(state): State>, Json(req): Json, @@ -299,6 +310,18 @@ pub async fn handle_explain( Ok(Json(result)) } +#[utoipa::path( + post, + path = "/v1/{model_id}/explain-infer", + tag = "inference", + params(("model_id" = String, Path, description = "Id of a loaded vindex.")), + request_body = ExplainRequest, + responses( + (status = 200, body = crate::openapi::schemas::ExplainResponse), + (status = 400, body = crate::error::ErrorBody), + (status = 404, body = crate::error::ErrorBody), + ), +)] pub async fn handle_explain_multi( State(state): State>, Path(model_id): Path, diff --git a/crates/larql-server/src/routes/health.rs b/crates/larql-server/src/routes/health.rs index 3f776905..20916a5b 100644 --- a/crates/larql-server/src/routes/health.rs +++ b/crates/larql-server/src/routes/health.rs @@ -8,6 +8,14 @@ use axum::Json; use crate::band_utils::HEALTH_STATUS_OK; use crate::state::AppState; +#[utoipa::path( + get, + path = "/v1/health", + tag = "admin", + responses( + (status = 200, description = "Server is alive", body = crate::openapi::schemas::HealthResponse), + ), +)] pub async fn handle_health(State(state): State>) -> Json { state.bump_requests(); let uptime = state.started_at.elapsed().as_secs(); diff --git a/crates/larql-server/src/routes/infer.rs b/crates/larql-server/src/routes/infer.rs index 51975193..2bdbffb0 100644 --- a/crates/larql-server/src/routes/infer.rs +++ 
@@ -12,11 +12,14 @@ use crate::error::ServerError;
 use crate::session::extract_session_id;
 use crate::state::{elapsed_ms, AppState, LoadedModel};
 
-#[derive(Deserialize)]
+#[derive(Deserialize, utoipa::ToSchema)]
 pub struct InferRequest {
+    /// Prompt to run inference on.
     pub prompt: String,
+    /// Top-K next-token predictions to return.
     #[serde(default = "default_top")]
     pub top: usize,
+    /// Inference mode: `walk` (default), `dense`, or `compare`.
     #[serde(default = "default_mode")]
     pub mode: String,
 }
@@ -187,6 +190,18 @@ fn run_infer(
     Ok(serde_json::Value::Object(result))
 }
 
+#[utoipa::path(
+    post,
+    path = "/v1/infer",
+    tag = "inference",
+    request_body = InferRequest,
+    responses(
+        (status = 200, description = "Next-token predictions", body = crate::openapi::schemas::InferResponse),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 503, body = crate::error::ErrorBody, description = "Inference weights unavailable"),
+        (status = 500, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_infer(
     State(state): State<Arc<AppState>>,
     headers: HeaderMap,
@@ -203,6 +218,19 @@
     Ok(Json(result))
 }
 
+#[utoipa::path(
+    post,
+    path = "/v1/{model_id}/infer",
+    tag = "inference",
+    params(("model_id" = String, Path, description = "Id of a loaded vindex.")),
+    request_body = InferRequest,
+    responses(
+        (status = 200, body = crate::openapi::schemas::InferResponse),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 404, body = crate::error::ErrorBody),
+        (status = 503, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_infer_multi(
     State(state): State<Arc<AppState>>,
     Path(model_id): Path<String>,
diff --git a/crates/larql-server/src/routes/insert.rs b/crates/larql-server/src/routes/insert.rs
index 5d692a62..7b80cfad 100644
--- a/crates/larql-server/src/routes/insert.rs
+++ b/crates/larql-server/src/routes/insert.rs
@@ -16,7 +16,7 @@ use crate::error::ServerError;
 use crate::session::extract_session_id;
 use crate::state::{elapsed_ms, AppState, LoadedModel};
 
-#[derive(Deserialize)]
+#[derive(Deserialize, utoipa::ToSchema)]
 pub struct InsertRequest {
     pub entity: String,
     pub relation: String,
@@ -237,6 +237,17 @@ fn run_insert(
     }))
 }
 
+#[utoipa::path(
+    post,
+    path = "/v1/insert",
+    tag = "inference",
+    request_body = InsertRequest,
+    responses(
+        (status = 200, description = "Constellation insert result", body = crate::openapi::schemas::InsertResponse),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 500, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_insert(
     State(state): State<Arc<AppState>>,
     headers: HeaderMap,
@@ -253,6 +264,18 @@
     Ok(Json(result))
 }
 
+#[utoipa::path(
+    post,
+    path = "/v1/{model_id}/insert",
+    tag = "inference",
+    params(("model_id" = String, Path, description = "Id of a loaded vindex.")),
+    request_body = InsertRequest,
+    responses(
+        (status = 200, body = crate::openapi::schemas::InsertResponse),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 404, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_insert_multi(
     State(state): State<Arc<AppState>>,
     Path(model_id): Path<String>,
diff --git a/crates/larql-server/src/routes/models.rs b/crates/larql-server/src/routes/models.rs
index 6dd8491d..8936f647 100644
--- a/crates/larql-server/src/routes/models.rs
+++ b/crates/larql-server/src/routes/models.rs
@@ -48,6 +48,14 @@ fn server_boot_unix_secs(state: &AppState) -> u64 {
     now_unix.saturating_sub(uptime)
 }
 
+#[utoipa::path(
+    get,
+    path = "/v1/models",
+    tag = "browse",
+    responses(
+        (status = 200, description = "OpenAI-compatible list of loaded models", body = crate::openapi::schemas::ModelsListResponse),
+    ),
+)]
 pub async fn handle_models(State(state): State<Arc<AppState>>) -> Json<serde_json::Value> {
     state.bump_requests();
 
diff --git a/crates/larql-server/src/routes/openai/chat.rs b/crates/larql-server/src/routes/openai/chat.rs
index 2cd9fe27..6951e5ec 100644
--- a/crates/larql-server/src/routes/openai/chat.rs
+++ b/crates/larql-server/src/routes/openai/chat.rs
@@ -225,6 +225,20 @@ pub struct ChatCompletionsResponse {
     pub usage: ChatUsage,
 }
 
+#[utoipa::path(
+    post,
+    path = "/v1/chat/completions",
+    tag = "openai",
+    request_body = crate::openapi::schemas::OpenAiChatRequest,
+    responses(
+        (status = 200, description = "Non-streaming JSON response.",
+         body = crate::openapi::schemas::OpenAiChatResponse),
+        (status = 200, description = "SSE stream when `stream: true`. Each event is `data: <json>\\n\\n`, terminated by `data: [DONE]`.",
+         content_type = "text/event-stream", body = String),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 500, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_chat_completions(
     State(state): State<Arc<AppState>>,
     Json(req): Json<ChatCompletionsRequest>,
diff --git a/crates/larql-server/src/routes/openai/completions.rs b/crates/larql-server/src/routes/openai/completions.rs
index 46afbd0e..2f71afc5 100644
--- a/crates/larql-server/src/routes/openai/completions.rs
+++ b/crates/larql-server/src/routes/openai/completions.rs
@@ -163,6 +163,20 @@ pub struct CompletionsResponse {
     pub usage: CompletionsUsage,
 }
 
+#[utoipa::path(
+    post,
+    path = "/v1/completions",
+    tag = "openai",
+    request_body = crate::openapi::schemas::OpenAiCompletionsRequest,
+    responses(
+        (status = 200, description = "Non-streaming JSON response.",
+         body = crate::openapi::schemas::OpenAiCompletionsResponse),
+        (status = 200, description = "SSE stream when `stream: true`. Each event is `data: <json>\\n\\n`, terminated by `data: [DONE]`.",
+         content_type = "text/event-stream", body = String),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 500, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_completions(
     State(state): State<Arc<AppState>>,
     Json(req): Json<CompletionsRequest>,
diff --git a/crates/larql-server/src/routes/openai/embeddings.rs b/crates/larql-server/src/routes/openai/embeddings.rs
index 85aa49a8..dfba9ae5 100644
--- a/crates/larql-server/src/routes/openai/embeddings.rs
+++ b/crates/larql-server/src/routes/openai/embeddings.rs
@@ -118,6 +118,18 @@ pub struct EmbeddingsResponse {
     pub usage: EmbeddingsUsage,
 }
 
+#[utoipa::path(
+    post,
+    path = "/v1/embeddings",
+    tag = "openai",
+    request_body = crate::openapi::schemas::OpenAiEmbeddingsRequest,
+    responses(
+        (status = 200, description = "Mean-pooled embeddings (not contrastively trained — use at your own risk).",
+         body = crate::openapi::schemas::OpenAiEmbeddingsResponse),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 500, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_embeddings(
     State(state): State<Arc<AppState>>,
     Json(req): Json<EmbeddingsRequest>,
diff --git a/crates/larql-server/src/routes/patches.rs b/crates/larql-server/src/routes/patches.rs
index 5c70439b..f58c7bc9 100644
--- a/crates/larql-server/src/routes/patches.rs
+++ b/crates/larql-server/src/routes/patches.rs
@@ -163,6 +163,21 @@ async fn apply_patch_to_model(
     }
 }
 
+#[utoipa::path(
+    post,
+    path = "/v1/patches/apply",
+    tag = "patches",
+    request_body(
+        content = crate::openapi::schemas::ApplyPatchBody,
+        description = "Provide either a `url` (path / http(s):// / hf://) or an inline `patch` object. \
+                       Use the `X-Session-Id` header to scope the apply to a session.",
+    ),
+    responses(
+        (status = 200, description = "Patch applied", body = crate::openapi::schemas::ApplyPatchResponse),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 500, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_apply_patch(
     State(state): State<Arc<AppState>>,
     headers: HeaderMap,
@@ -172,6 +187,18 @@
     apply_patch_to_model(&state, None, &headers, req).await
 }
 
+#[utoipa::path(
+    post,
+    path = "/v1/{model_id}/patches/apply",
+    tag = "patches",
+    params(("model_id" = String, Path, description = "Id of a loaded vindex.")),
+    request_body(content = crate::openapi::schemas::ApplyPatchBody),
+    responses(
+        (status = 200, body = crate::openapi::schemas::ApplyPatchResponse),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 404, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_apply_patch_multi(
     State(state): State<Arc<AppState>>,
     Path(model_id): Path<String>,
@@ -214,6 +241,16 @@ async fn list_patches_for_model(
     Ok(Json(serde_json::json!({ "patches": patches })))
 }
 
+#[utoipa::path(
+    get,
+    path = "/v1/patches",
+    tag = "patches",
+    responses(
+        (status = 200, description = "Active patches for the current session or global state",
+         body = crate::openapi::schemas::ListPatchesResponse),
+        (status = 404, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_list_patches(
     State(state): State<Arc<AppState>>,
     headers: HeaderMap,
@@ -222,6 +259,16 @@
     list_patches_for_model(&state, None, &headers).await
 }
 
+#[utoipa::path(
+    get,
+    path = "/v1/{model_id}/patches",
+    tag = "patches",
+    params(("model_id" = String, Path, description = "Id of a loaded vindex.")),
+    responses(
+        (status = 200, body = crate::openapi::schemas::ListPatchesResponse),
+        (status = 404, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_list_patches_multi(
     State(state): State<Arc<AppState>>,
     Path(model_id): Path<String>,
@@ -268,6 +315,18 @@ async fn remove_patch_from_model(
     })))
 }
 
+#[utoipa::path(
+    delete,
+    path = "/v1/patches/{name}",
+    tag = "patches",
+    params(
+        ("name" = String, Path, description = "Patch description/name (or `inline-patch` if it was inlined without one)."),
+    ),
+    responses(
+        (status = 200, description = "Patch removed", body = crate::openapi::schemas::RemovePatchResponse),
+        (status = 404, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_remove_patch(
     State(state): State<Arc<AppState>>,
     headers: HeaderMap,
@@ -277,6 +336,19 @@
     remove_patch_from_model(&state, None, &headers, &name).await
 }
 
+#[utoipa::path(
+    delete,
+    path = "/v1/{model_id}/patches/{name}",
+    tag = "patches",
+    params(
+        ("model_id" = String, Path, description = "Id of a loaded vindex."),
+        ("name" = String, Path, description = "Patch description/name."),
+    ),
+    responses(
+        (status = 200, body = crate::openapi::schemas::RemovePatchResponse),
+        (status = 404, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_remove_patch_multi(
     State(state): State<Arc<AppState>>,
     Path((model_id, name)): Path<(String, String)>,
diff --git a/crates/larql-server/src/routes/relations.rs b/crates/larql-server/src/routes/relations.rs
index 32aa8b83..15deceea 100644
--- a/crates/larql-server/src/routes/relations.rs
+++ b/crates/larql-server/src/routes/relations.rs
@@ -136,7 +136,8 @@ fn is_content_token(tok: &str) -> bool {
     )
 }
 
-#[derive(Deserialize, Default)]
+#[derive(Deserialize, Default, utoipa::IntoParams)]
+#[into_params(parameter_in = Query)]
 pub struct RelationsParams {
     /// Filter by label source (future use).
     #[serde(default)]
@@ -250,6 +251,16 @@ fn list_relations(model: &LoadedModel) -> Result<serde_json::Value, ServerError> {
     }))
 }
 
+#[utoipa::path(
+    get,
+    path = "/v1/relations",
+    tag = "browse",
+    params(RelationsParams),
+    responses(
+        (status = 200, description = "Known relation types across the knowledge band", body = crate::openapi::schemas::RelationsResponse),
+        (status = 500, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_relations(
     State(state): State<Arc<AppState>>,
     Query(_params): Query<RelationsParams>,
@@ -262,6 +273,20 @@
     Ok(Json(result))
 }
 
+#[utoipa::path(
+    get,
+    path = "/v1/{model_id}/relations",
+    tag = "browse",
+    params(
+        ("model_id" = String, Path, description = "Id of a loaded vindex."),
+        RelationsParams,
+    ),
+    responses(
+        (status = 200, body = crate::openapi::schemas::RelationsResponse),
+        (status = 404, body = crate::error::ErrorBody),
+        (status = 500, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_relations_multi(
     State(state): State<Arc<AppState>>,
     Path(model_id): Path<String>,
diff --git a/crates/larql-server/src/routes/select.rs b/crates/larql-server/src/routes/select.rs
index 495ee5e7..f26292fd 100644
--- a/crates/larql-server/src/routes/select.rs
+++ b/crates/larql-server/src/routes/select.rs
@@ -9,7 +9,7 @@ use serde::Deserialize;
 
 use crate::error::ServerError;
 use crate::state::{elapsed_ms, AppState, LoadedModel};
 
-#[derive(Deserialize)]
+#[derive(Deserialize, utoipa::ToSchema)]
 pub struct SelectRequest {
     #[serde(default)]
     pub entity: Option<String>,
@@ -157,6 +157,17 @@ fn select_edges(
     }))
 }
 
+#[utoipa::path(
+    post,
+    path = "/v1/select",
+    tag = "inference",
+    request_body = SelectRequest,
+    responses(
+        (status = 200, description = "Selected edges", body = crate::openapi::schemas::SelectResponse),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 500, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_select(
     State(state): State<Arc<AppState>>,
     Json(req): Json<SelectRequest>,
@@ -169,6 +180,18 @@
     Ok(Json(result))
 }
 
+#[utoipa::path(
+    post,
+    path = "/v1/{model_id}/select",
+    tag = "inference",
+    params(("model_id" = String, Path, description = "Id of a loaded vindex.")),
+    request_body = SelectRequest,
+    responses(
+        (status = 200, body = crate::openapi::schemas::SelectResponse),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 404, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_select_multi(
     State(state): State<Arc<AppState>>,
     Path(model_id): Path<String>,
diff --git a/crates/larql-server/src/routes/stats.rs b/crates/larql-server/src/routes/stats.rs
index 545abb2a..dbe2f0c6 100644
--- a/crates/larql-server/src/routes/stats.rs
+++ b/crates/larql-server/src/routes/stats.rs
@@ -79,6 +79,15 @@ async fn add_q4k_ffn(model: &LoadedModel, mut stats: serde_json::Value) -> serde
     stats
 }
 
+#[utoipa::path(
+    get,
+    path = "/v1/stats",
+    tag = "browse",
+    responses(
+        (status = 200, description = "Model + vindex statistics", body = crate::openapi::schemas::StatsResponse),
+        (status = 404, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_stats(
     State(state): State<Arc<AppState>>,
 ) -> Result<Json<serde_json::Value>, ServerError> {
@@ -88,6 +97,16 @@
     Ok(Json(add_q4k_ffn(model, stats).await))
 }
 
+#[utoipa::path(
+    get,
+    path = "/v1/{model_id}/stats",
+    tag = "browse",
+    params(("model_id" = String, Path, description = "Id of a loaded vindex.")),
+    responses(
+        (status = 200, body = crate::openapi::schemas::StatsResponse),
+        (status = 404, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_stats_multi(
     State(state): State<Arc<AppState>>,
     Path(model_id): Path<String>,
diff --git a/crates/larql-server/src/routes/stream.rs b/crates/larql-server/src/routes/stream.rs
index c28cb7ea..fb8b805f 100644
--- a/crates/larql-server/src/routes/stream.rs
+++ b/crates/larql-server/src/routes/stream.rs
@@ -101,6 +101,25 @@ fn ws_infer_done(
     })
 }
 
+#[utoipa::path(
+    get,
+    path = "/v1/stream",
+    tag = "admin",
+    responses(
+        (status = 101, description = "\
+WebSocket upgrade. After upgrade:\n\n\
+**Client → server** (text frames, JSON):\n\
+- `{\"type\":\"describe\", \"entity\":\"France\", \"band\":\"all\"}` — streams layer-by-layer describe.\n\
+- `{\"type\":\"infer\", \"prompt\":\"The capital of France is\", \"top\":5, \"mode\":\"walk\"}` — \
+streams top-K predictions one at a time.\n\n\
+**Server → client** (text frames, JSON):\n\
+- `{\"type\":\"layer\", \"layer\":N, \"edges\":[...]}` — per-layer describe output.\n\
+- `{\"type\":\"done\", \"entity\":..., \"total_edges\":N, \"latency_ms\":M}` — describe finished.\n\
+- `{\"type\":\"prediction\", \"rank\":I, \"token\":..., \"probability\":P}` — inference result.\n\
+- `{\"type\":\"infer_done\", \"prompt\":..., \"mode\":..., \"predictions\":N, \"latency_ms\":M}` — inference finished.\n\
+- `{\"type\":\"error\", \"message\":...}` — protocol or runtime error.\n"),
+    ),
+)]
 pub async fn handle_stream(State(state): State<Arc<AppState>>, ws: WebSocketUpgrade) -> Response {
     ws.on_upgrade(move |socket| handle_socket(socket, state))
 }
diff --git a/crates/larql-server/src/routes/topology.rs b/crates/larql-server/src/routes/topology.rs
index ae4e8f52..5d2d045c 100644
--- a/crates/larql-server/src/routes/topology.rs
+++ b/crates/larql-server/src/routes/topology.rs
@@ -14,10 +14,11 @@ use axum::extract::State;
 use axum::http::StatusCode;
 use axum::Json;
 use serde::Serialize;
+use utoipa::ToSchema;
 
 use crate::state::AppState;
 
-#[derive(Serialize)]
+#[derive(Serialize, ToSchema)]
 pub struct TopologyResponse {
     /// Model identifier (e.g. `"google/gemma-4-26B-A4B-it"`).
     pub model_id: String,
@@ -31,6 +32,15 @@
     pub owned_end: usize,
 }
 
+#[utoipa::path(
+    get,
+    path = "/v1/expert/topology",
+    tag = "browse",
+    responses(
+        (status = 200, description = "Expert ownership range for this shard", body = TopologyResponse),
+        (status = 404, description = "Server was not launched with --experts"),
+    ),
+)]
 pub async fn handle_topology(
     State(state): State<Arc<AppState>>,
 ) -> Result<Json<TopologyResponse>, StatusCode> {
diff --git a/crates/larql-server/src/routes/walk.rs b/crates/larql-server/src/routes/walk.rs
index 5ade4f2f..31e5ba4a 100644
--- a/crates/larql-server/src/routes/walk.rs
+++ b/crates/larql-server/src/routes/walk.rs
@@ -9,11 +9,15 @@ use serde::Deserialize;
 
 use crate::error::ServerError;
 use crate::state::{elapsed_ms, AppState, LoadedModel};
 
-#[derive(Deserialize)]
+#[derive(Deserialize, utoipa::IntoParams)]
+#[into_params(parameter_in = Query)]
 pub struct WalkParams {
+    /// Prompt text to scan for features.
     pub prompt: String,
+    /// Top-K features per layer.
     #[serde(default = "default_top")]
     pub top: usize,
+    /// Restrict scan to these layers — either a range (`"24-33"`) or a comma list (`"14,26,27"`).
     #[serde(default)]
     pub layers: Option<String>,
 }
@@ -88,6 +92,17 @@ fn walk_prompt(model: &LoadedModel, params: &WalkParams) -> Result<serde_json::Value, ServerError> {
 
+#[utoipa::path(
+    get,
+    path = "/v1/walk",
+    tag = "browse",
+    params(WalkParams),
+    responses(
+        (status = 200, description = "Feature hits for the prompt", body = crate::openapi::schemas::WalkResponse),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 500, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_walk(
     State(state): State<Arc<AppState>>,
     Query(params): Query<WalkParams>,
@@ -100,6 +115,21 @@ pub async fn handle_walk(
     Ok(Json(result))
 }
 
+#[utoipa::path(
+    get,
+    path = "/v1/{model_id}/walk",
+    tag = "browse",
+    params(
+        ("model_id" = String, Path, description = "Id of a loaded vindex."),
+        WalkParams,
+    ),
+    responses(
+        (status = 200, body = crate::openapi::schemas::WalkResponse),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 404, body = crate::error::ErrorBody),
+        (status = 500, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_walk_multi(
     State(state): State<Arc<AppState>>,
     Path(model_id): Path<String>,
diff --git a/crates/larql-server/src/routes/walk_ffn.rs b/crates/larql-server/src/routes/walk_ffn.rs
index 0e7ed79f..e3145163 100644
--- a/crates/larql-server/src/routes/walk_ffn.rs
+++ b/crates/larql-server/src/routes/walk_ffn.rs
@@ -684,6 +684,25 @@ fn run_walk_ffn(state: &AppState, req: &WalkFfnRequest) -> Result
 
+#[utoipa::path(
+    post,
+    path = "/v1/walk-ffn",
+    tag = "expert",
+    request_body(
+        description = "JSON walk-FFN request OR binary `application/x-larql-ffn` packed batch (see docs/server-spec.md).",
+    ),
+    responses(
+        (status = 200, description = "JSON response"),
+        (status = 200, description = "Binary packed output when the request was `application/x-larql-ffn`",
+         content_type = "application/x-larql-ffn", body = Vec<u8>),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 404, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_walk_ffn(
     State(state): State<Arc<AppState>>,
     request: axum::extract::Request,
@@ -774,6 +793,22 @@ pub(crate) const Q8K_BATCH_CT: &str = "application/x-larql-ffn-q8k-batch";
 ///
 /// Returns 404 if the vindex doesn't have interleaved Q4K data (ffn-only
 /// servers without Q4K weights can't serve this endpoint).
+#[utoipa::path(
+    post,
+    path = "/v1/walk-ffn-q8k",
+    tag = "expert",
+    request_body(
+        content_type = "application/x-larql-ffn-q8k-batch",
+        description = "Q8K-prenormed dense-FFN batch: client has applied FFN input norm + Q8 quantisation. \
+                       404 if the vindex lacks interleaved Q4K data.",
+    ),
+    responses(
+        (status = 200, content_type = "application/x-larql-ffn-q8k-batch",
+         description = "Per-layer FFN delta as f32", body = Vec<u8>),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 404, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_walk_ffn_q8k(
     State(state): State<Arc<AppState>>,
     request: axum::extract::Request,
diff --git a/crates/larql-server/src/routes/warmup.rs b/crates/larql-server/src/routes/warmup.rs
index a575ea02..1c4db6cb 100644
--- a/crates/larql-server/src/routes/warmup.rs
+++ b/crates/larql-server/src/routes/warmup.rs
@@ -26,7 +26,7 @@ use tracing::info;
 
 use crate::error::ServerError;
 use crate::state::{AppState, LoadedModel};
 
-#[derive(Default, Deserialize)]
+#[derive(Default, Deserialize, utoipa::ToSchema)]
 pub struct WarmupRequest {
     /// Specific layers to prefetch (`madvise WILLNEED`). Defaults to
     /// every owned layer when omitted — the typical case for boot
@@ -48,7 +48,7 @@ pub struct WarmupRequest {
     pub warmup_hnsw: bool,
 }
 
-#[derive(Serialize)]
+#[derive(Serialize, utoipa::ToSchema)]
 pub struct WarmupResponse {
     pub model: String,
     pub weights_loaded: bool,
@@ -164,6 +164,16 @@ pub async fn warmup_model_async(model: Arc<LoadedModel>, req: WarmupRequest) ->
         .expect("warmup spawn_blocking")
 }
 
+#[utoipa::path(
+    post,
+    path = "/v1/warmup",
+    tag = "inference",
+    request_body = WarmupRequest,
+    responses(
+        (status = 200, description = "Warmup completed", body = WarmupResponse),
+        (status = 500, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_warmup(
     State(state): State<Arc<AppState>>,
     body: Option<Json<WarmupRequest>>,