diff --git a/crates/larql-server/Cargo.toml b/crates/larql-server/Cargo.toml
index 0447fd01..1f19e9b4 100644
--- a/crates/larql-server/Cargo.toml
+++ b/crates/larql-server/Cargo.toml
@@ -46,6 +46,8 @@ serde = { workspace = true, features = ["derive"] }
 serde_json = { workspace = true }
 thiserror = { workspace = true }
 base64 = "0.22"
+utoipa = { version = "5", features = ["axum_extras", "preserve_order"] }
+utoipa-swagger-ui = { version = "9", features = ["axum"] }
 
 [features]
 default = []
diff --git a/crates/larql-server/src/bootstrap.rs b/crates/larql-server/src/bootstrap.rs
index 81557952..5201df0c 100644
--- a/crates/larql-server/src/bootstrap.rs
+++ b/crates/larql-server/src/bootstrap.rs
@@ -523,6 +523,10 @@ pub struct Cli {
     #[arg(long)]
     pub cors: bool,
 
+    /// Disable the built-in Swagger UI and /v1/openapi.json endpoint.
+    #[arg(long)]
+    pub no_docs: bool,
+
     /// API key for authentication (clients send Authorization: Bearer <key>).
     #[arg(long)]
     pub api_key: Option<String>,
@@ -869,6 +873,14 @@ pub async fn serve(cli: Cli) -> Result<(), BoxError> {
         }
     }
 
+    // OpenAPI / Swagger UI. Mounted before auth so the docs stay reachable
+    // without the API key — consistent with --cors behavior. Flip the
+    // ordering if operators want docs gated.
+    if !cli.no_docs {
+        app = app.merge(crate::openapi::swagger_router());
+        info!("OpenAPI: /swagger-ui and /v1/openapi.json enabled");
+    }
+
     // Auth middleware.
     if cli.api_key.is_some() {
         app = app.layer(middleware::from_fn_with_state(
diff --git a/crates/larql-server/src/error.rs b/crates/larql-server/src/error.rs
index 3de32e98..fd821885 100644
--- a/crates/larql-server/src/error.rs
+++ b/crates/larql-server/src/error.rs
@@ -2,6 +2,15 @@
 
 use axum::http::StatusCode;
 use axum::response::{IntoResponse, Response};
+use serde::Serialize;
+use utoipa::ToSchema;
+
+/// JSON body returned for every error response.
+#[derive(Debug, Serialize, ToSchema)]
+pub struct ErrorBody {
+    /// Human-readable error message.
+    pub error: String,
+}
 
 #[derive(Debug, thiserror::Error)]
 pub enum ServerError {
@@ -30,7 +39,6 @@ impl IntoResponse for ServerError {
             ServerError::Internal(msg) => (StatusCode::INTERNAL_SERVER_ERROR, msg.clone()),
         };
 
-        let body = serde_json::json!({ "error": message });
-        (status, axum::Json(body)).into_response()
+        (status, axum::Json(ErrorBody { error: message })).into_response()
     }
 }
diff --git a/crates/larql-server/src/lib.rs b/crates/larql-server/src/lib.rs
index bcab84ff..d97a0177 100644
--- a/crates/larql-server/src/lib.rs
+++ b/crates/larql-server/src/lib.rs
@@ -17,6 +17,7 @@ pub mod ffn_l2_cache;
 pub mod grpc;
 pub mod grpc_expert;
 pub mod http;
+pub mod openapi;
 pub mod ratelimit;
 pub mod routes;
 pub mod session;
diff --git a/crates/larql-server/src/openapi.rs b/crates/larql-server/src/openapi.rs
new file mode 100644
index 00000000..73fb9758
--- /dev/null
+++ b/crates/larql-server/src/openapi.rs
@@ -0,0 +1,581 @@
+//! OpenAPI / Swagger UI aggregation.
+//!
+//! Spec JSON is served at `/v1/openapi.json` and the browse-friendly
+//! Swagger UI at `/swagger-ui`. Both can be disabled with `--no-docs`.
+//!
+//! Handlers are annotated in place with `#[utoipa::path]`. This module
+//! owns:
+//! - `ApiDoc` — the aggregator `#[derive(OpenApi)]` struct.
+//! - `schemas` — synthetic response structs for handlers that return
+//!   `Json<serde_json::Value>` (most of the browse/inference surface).
+//! - `params` — shared request parameters (e.g. `model_id`).
+//! - `swagger_router()` — helper that returns a ready-to-merge router
+//!   hosting both the UI and the spec JSON.
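+//!
+//! A minimal wiring sketch (mirrors the `bootstrap.rs` hookup in this
+//! change; assumes the crate's lib name is `larql_server`, and the bare
+//! `Router::new()` stands in for the real app router):
+//!
+//! ```no_run
+//! use larql_server::openapi;
+//!
+//! // Merge before any auth layer so the docs stay reachable without a key.
+//! let app: axum::Router = axum::Router::new().merge(openapi::swagger_router());
+//! ```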
+
+use utoipa::OpenApi;
+use utoipa_swagger_ui::SwaggerUi;
+
+use crate::error::ErrorBody;
+
+pub mod params {
+    use utoipa::IntoParams;
+
+    /// Path parameter selecting which vindex to target in multi-model mode.
+    #[derive(IntoParams)]
+    #[into_params(parameter_in = Path)]
+    #[allow(dead_code)]
+    pub struct ModelIdParam {
+        /// The id of a loaded vindex, e.g. `gemma-3-1b-it`.
+        pub model_id: String,
+    }
+}
+
+pub mod schemas {
+    //! Synthetic response schemas.
+    //!
+    //! Populated as each handler group is annotated. Structs here are
+    //! `Serialize + ToSchema` mirrors of the actual JSON the handlers
+    //! emit via `Json<serde_json::Value>`. They are never constructed at
+    //! runtime — they exist purely for spec generation.
+
+    use serde::Serialize;
+    use utoipa::ToSchema;
+
+    // ---- browse ------------------------------------------------------
+
+    /// One knowledge edge returned from `/v1/describe`.
+    #[derive(Serialize, ToSchema)]
+    pub struct DescribeEdge {
+        /// Top token at this feature (trimmed).
+        pub target: String,
+        /// Gate activation score (rounded to 0.1).
+        pub gate_score: f32,
+        /// Layer the feature lives on.
+        pub layer: usize,
+        /// Feature index within the layer.
+        pub feature: usize,
+        /// Relation label (present when a probe-confirmed label exists).
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub relation: Option<String>,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct DescribeResponse {
+        pub entity: String,
+        pub model: String,
+        pub edges: Vec<DescribeEdge>,
+        pub latency_ms: f64,
+    }
+
+    /// One walk hit returned from `/v1/walk`.
+    #[derive(Serialize, ToSchema)]
+    pub struct WalkHit {
+        pub layer: usize,
+        pub feature: usize,
+        pub gate_score: f32,
+        pub target: String,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub relation: Option<String>,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct WalkResponse {
+        pub prompt: String,
+        pub hits: Vec<WalkHit>,
+        pub latency_ms: f64,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct RelationEntry {
+        pub name: String,
+        pub count: usize,
+        pub max_score: f32,
+        pub min_layer: usize,
+        pub max_layer: usize,
+        pub examples: Vec<String>,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct RelationsResponse {
+        pub relations: Vec<RelationEntry>,
+        pub total: usize,
+        pub latency_ms: f64,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct LayerBands {
+        pub syntax: [usize; 2],
+        pub knowledge: [usize; 2],
+        pub output: [usize; 2],
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct LoadedCapabilities {
+        pub browse: bool,
+        pub inference: bool,
+        pub ffn_service: bool,
+        pub embed_service: bool,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct StatsResponse {
+        pub model: String,
+        pub family: String,
+        pub layers: usize,
+        pub features: usize,
+        pub features_per_layer: usize,
+        pub hidden_size: usize,
+        pub vocab_size: usize,
+        pub extract_level: String,
+        pub dtype: String,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub mode: Option<String>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub layer_bands: Option<LayerBands>,
+        pub loaded: LoadedCapabilities,
+    }
+
+    /// One entry in the OpenAI-compatible `/v1/models` list.
+    #[derive(Serialize, ToSchema)]
+    pub struct ModelEntry {
+        pub id: String,
+        pub object: String,
+        pub created: u64,
+        pub owned_by: String,
+        /// Route prefix for this model. `/v1/{id}` in multi-model mode, `/v1` otherwise.
+        pub path: String,
+        /// Total features across all layers.
+        pub features: usize,
+        pub loaded: bool,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct ModelsListResponse {
+        pub object: String,
+        pub data: Vec<ModelEntry>,
+    }
+
+    // ---- inference ---------------------------------------------------
+
+    #[derive(Serialize, ToSchema)]
+    pub struct SelectRow {
+        pub layer: usize,
+        pub feature: usize,
+        pub target: String,
+        pub confidence: f32,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub relation: Option<String>,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct SelectResponse {
+        pub rows: Vec<SelectRow>,
+        pub total: usize,
+        pub latency_ms: f64,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct Prediction {
+        pub token: String,
+        pub probability: f64,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct InferResponse {
+        pub prompt: String,
+        pub mode: String,
+        /// Single-mode (`walk` or `dense`).
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub predictions: Option<Vec<Prediction>>,
+        /// Populated in `compare` mode.
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub walk: Option<Vec<Prediction>>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub dense: Option<Vec<Prediction>>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub walk_ms: Option<f64>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub dense_ms: Option<f64>,
+        pub latency_ms: f64,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct ExplainLayerEntry {
+        pub layer: usize,
+        pub top_features: Vec<serde_json::Value>,
+        pub top_tokens: Vec<(String, f64)>,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct ExplainResponse {
+        pub prompt: String,
+        pub predictions: Vec<Prediction>,
+        pub layers: Vec<ExplainLayerEntry>,
+        pub latency_ms: f64,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct InsertResponse {
+        pub success: bool,
+        pub entity: String,
+        pub relation: String,
+        pub target: String,
+        pub layers_written: Vec<usize>,
+        pub latency_ms: f64,
+    }
+
+    // ---- patches -----------------------------------------------------
+
+    /// Request body for `POST /v1/patches/apply`. Provide either a `url`
+    /// pointing at a `.vlp` file (local path, `http(s)://`, or `hf://` URL)
+    /// or an inline `patch` object. One of the two is required.
+    #[derive(Serialize, ToSchema)]
+    pub struct ApplyPatchBody {
+        /// Local path, `http(s)://`, or `hf://` URL to a `.vlp` patch file.
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub url: Option<String>,
+        /// Inline patch payload. See VindexPatch docs for schema; includes
+        /// `description`, `base_model`, and `operations` (INSERT / DELETE).
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub patch: Option<serde_json::Value>,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct ApplyPatchResponse {
+        pub applied: String,
+        pub operations: usize,
+        pub active_patches: usize,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub session: Option<String>,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct PatchEntry {
+        pub name: String,
+        pub operations: usize,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub base_model: Option<String>,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct ListPatchesResponse {
+        pub patches: Vec<PatchEntry>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub session: Option<String>,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct RemovePatchResponse {
+        pub removed: String,
+        pub active_patches: usize,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub session: Option<String>,
+    }
+
+    // ---- admin -------------------------------------------------------
+
+    #[derive(Serialize, ToSchema)]
+    pub struct HealthResponse {
+        pub status: String,
+        pub uptime_seconds: u64,
+        pub requests_served: u64,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct TokenEncodeResponse {
+        pub token_ids: Vec<u32>,
+        pub text: String,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct TokenDecodeResponse {
+        pub text: String,
+        pub token_ids: Vec<u32>,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct EmbedSingleJsonResponse {
+        pub token_id: u32,
+        pub embedding: Vec<f32>,
+        pub hidden_size: usize,
+    }
+
+    // ---- openai ------------------------------------------------------
+    //
+    // These mirror the OpenAI wire contract at a high level.
+    // Full nested types (tools, tool_calls, logprobs, usage) are documented
+    // inline as open JSON objects to avoid a deep ToSchema tree.
+
+    /// Subset of the OpenAI `POST /v1/embeddings` request body.
+    #[derive(Serialize, ToSchema)]
+    pub struct OpenAiEmbeddingsRequest {
+        /// Model id. Required in multi-model mode; ignored otherwise.
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub model: Option<String>,
+        /// String, string[], int[] (single sequence), or int[][] (batch of sequences).
+        pub input: serde_json::Value,
+        /// `"float"` (default) or `"base64"`.
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub encoding_format: Option<String>,
+        /// Requested output dimensionality (ignored; returns native hidden size).
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub dimensions: Option<usize>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub user: Option<String>,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct OpenAiEmbeddingObject {
+        pub object: String,
+        pub index: usize,
+        /// `[f32]` when `encoding_format = "float"`, or a base64 string otherwise.
+        pub embedding: serde_json::Value,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct OpenAiEmbeddingsResponse {
+        pub object: String,
+        pub data: Vec<OpenAiEmbeddingObject>,
+        pub model: String,
+        pub usage: serde_json::Value,
+    }
+
+    /// OpenAI `POST /v1/completions` request.
+    #[derive(Serialize, ToSchema)]
+    pub struct OpenAiCompletionsRequest {
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub model: Option<String>,
+        /// Prompt — string or string[].
+        pub prompt: serde_json::Value,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub max_tokens: Option<usize>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub temperature: Option<f32>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub top_p: Option<f32>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub stream: Option<bool>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub n: Option<usize>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub stop: Option<serde_json::Value>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub echo: Option<bool>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub logprobs: Option<usize>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub seed: Option<u64>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub user: Option<String>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub frequency_penalty: Option<f32>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub presence_penalty: Option<f32>,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct OpenAiCompletionsResponse {
+        pub id: String,
+        pub object: String,
+        pub created: u64,
+        pub model: String,
+        pub choices: Vec<serde_json::Value>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub usage: Option<serde_json::Value>,
+    }
+
+    /// OpenAI `POST /v1/chat/completions` request. `messages` is an array
+    /// of `{role: "system"|"user"|"assistant"|"tool", content, ...}`; tools
+    /// and structured output are open JSON (see OpenAI docs).
+    #[derive(Serialize, ToSchema)]
+    pub struct OpenAiChatRequest {
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub model: Option<String>,
+        pub messages: Vec<serde_json::Value>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub max_tokens: Option<usize>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub temperature: Option<f32>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub top_p: Option<f32>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub stream: Option<bool>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub n: Option<usize>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub stop: Option<serde_json::Value>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub logprobs: Option<bool>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub top_logprobs: Option<usize>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub seed: Option<u64>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub user: Option<String>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub frequency_penalty: Option<f32>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub presence_penalty: Option<f32>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub response_format: Option<serde_json::Value>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub tools: Option<serde_json::Value>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub tool_choice: Option<serde_json::Value>,
+    }
+
+    #[derive(Serialize, ToSchema)]
+    pub struct OpenAiChatResponse {
+        pub id: String,
+        pub object: String,
+        pub created: u64,
+        pub model: String,
+        pub choices: Vec<serde_json::Value>,
+        #[serde(skip_serializing_if = "Option::is_none")]
+        pub usage: Option<serde_json::Value>,
+    }
+}
+
+#[derive(OpenApi)]
+#[openapi(
+    info(
+        title = "larql-server",
+        version = env!("CARGO_PKG_VERSION"),
+        description = "HTTP API for vindex knowledge queries, inference, and remote MoE expert shards.",
+    ),
+    tags(
+        (name = "browse", description = "Knowledge graph browse (no weights required)"),
+        (name = "inference", description = "Forward passes, explain, insert, warmup"),
+        (name = "openai", description = "OpenAI-compatible endpoints"),
+        (name = "expert", description = "Remote MoE shard endpoints (binary wire)"),
+        (name = "patches", description = "Runtime patch overlay"),
overlay"), + (name = "admin", description = "Health, models, embed, tokens, WebSocket"), + ), + paths( + // browse + crate::routes::describe::handle_describe, + crate::routes::walk::handle_walk, + crate::routes::relations::handle_relations, + crate::routes::stats::handle_stats, + crate::routes::topology::handle_topology, + crate::routes::models::handle_models, + // inference + crate::routes::select::handle_select, + crate::routes::infer::handle_infer, + crate::routes::explain::handle_explain, + crate::routes::insert::handle_insert, + crate::routes::warmup::handle_warmup, + // patches + crate::routes::patches::handle_apply_patch, + crate::routes::patches::handle_list_patches, + crate::routes::patches::handle_remove_patch, + // admin + crate::routes::health::handle_health, + crate::routes::embed::handle_embed, + crate::routes::embed::handle_embed_single, + crate::routes::embed::handle_logits, + crate::routes::embed::handle_token_encode, + crate::routes::embed::handle_token_decode, + crate::routes::stream::handle_stream, + // openai + crate::routes::openai::embeddings::handle_embeddings, + crate::routes::openai::completions::handle_completions, + crate::routes::openai::chat::handle_chat_completions, + // expert + crate::routes::walk_ffn::handle_walk_ffn, + crate::routes::walk_ffn::handle_walk_ffn_q8k, + crate::routes::expert::single::handle_expert, + crate::routes::expert::batch_legacy::handle_expert_batch, + crate::routes::expert::layer_batch::handle_experts_layer_batch, + crate::routes::expert::layer_batch::handle_experts_layer_batch_f16, + crate::routes::expert::multi_layer_batch::handle_experts_multi_layer_batch, + crate::routes::expert::multi_layer_batch::handle_experts_multi_layer_batch_q8k, + // multi-model variants — same handlers with a `{model_id}` path prefix + crate::routes::describe::handle_describe_multi, + crate::routes::walk::handle_walk_multi, + crate::routes::relations::handle_relations_multi, + crate::routes::stats::handle_stats_multi, + crate::routes::select::handle_select_multi, + crate::routes::infer::handle_infer_multi, + crate::routes::explain::handle_explain_multi, + crate::routes::insert::handle_insert_multi, + crate::routes::patches::handle_apply_patch_multi, + crate::routes::patches::handle_list_patches_multi, + crate::routes::patches::handle_remove_patch_multi, + crate::routes::embed::handle_embed_multi, + crate::routes::embed::handle_embed_single_multi, + crate::routes::embed::handle_logits_multi, + crate::routes::embed::handle_token_encode_multi, + crate::routes::embed::handle_token_decode_multi, + ), + components(schemas( + ErrorBody, + // browse + schemas::DescribeEdge, + schemas::DescribeResponse, + schemas::WalkHit, + schemas::WalkResponse, + schemas::RelationEntry, + schemas::RelationsResponse, + schemas::LayerBands, + schemas::LoadedCapabilities, + schemas::StatsResponse, + schemas::ModelEntry, + schemas::ModelsListResponse, + crate::routes::topology::TopologyResponse, + // inference + crate::routes::select::SelectRequest, + schemas::SelectRow, + schemas::SelectResponse, + crate::routes::infer::InferRequest, + schemas::Prediction, + schemas::InferResponse, + crate::routes::explain::ExplainRequest, + schemas::ExplainLayerEntry, + schemas::ExplainResponse, + crate::routes::insert::InsertRequest, + schemas::InsertResponse, + crate::routes::warmup::WarmupRequest, + crate::routes::warmup::WarmupResponse, + // patches + schemas::ApplyPatchBody, + schemas::ApplyPatchResponse, + schemas::PatchEntry, + schemas::ListPatchesResponse, + schemas::RemovePatchResponse, + 
+        // admin
+        schemas::HealthResponse,
+        schemas::TokenEncodeResponse,
+        schemas::TokenDecodeResponse,
+        schemas::EmbedSingleJsonResponse,
+        crate::routes::embed::EmbedRequest,
+        crate::routes::embed::EmbedResponse,
+        crate::routes::embed::LogitsRequest,
+        crate::routes::embed::LogitsResponse,
+        crate::routes::embed::TokenProb,
+        // openai
+        schemas::OpenAiEmbeddingsRequest,
+        schemas::OpenAiEmbeddingObject,
+        schemas::OpenAiEmbeddingsResponse,
+        schemas::OpenAiCompletionsRequest,
+        schemas::OpenAiCompletionsResponse,
+        schemas::OpenAiChatRequest,
+        schemas::OpenAiChatResponse,
+        // expert
+        crate::routes::expert::SingleExpertRequest,
+        crate::routes::expert::SingleExpertResponse,
+        crate::routes::expert::BatchExpertItem,
+        crate::routes::expert::BatchExpertRequest,
+        crate::routes::expert::BatchExpertResult,
+        crate::routes::expert::BatchExpertResponse,
+    )),
+)]
+pub struct ApiDoc;
+
+/// Build a router hosting Swagger UI at `/swagger-ui` and the spec at
+/// `/v1/openapi.json`. Merge into the main app router.
+pub fn swagger_router() -> axum::Router {
+    SwaggerUi::new("/swagger-ui")
+        .url("/v1/openapi.json", ApiDoc::openapi())
+        .into()
+}
diff --git a/crates/larql-server/src/routes/describe.rs b/crates/larql-server/src/routes/describe.rs
index 77b69686..2af653ed 100644
--- a/crates/larql-server/src/routes/describe.rs
+++ b/crates/larql-server/src/routes/describe.rs
@@ -18,15 +18,21 @@ use crate::state::{elapsed_ms, AppState, LoadedModel};
 
 const DESCRIBE_CACHE_CONTROL: &str = "public, max-age=86400";
 
-#[derive(Deserialize)]
+#[derive(Deserialize, utoipa::IntoParams)]
+#[into_params(parameter_in = Query)]
 pub struct DescribeParams {
+    /// Entity to describe, e.g. `France`.
     pub entity: String,
+    /// Layer band to scan: `knowledge` (default), `syntax`, `output`, or `all`.
     #[serde(default = "default_band")]
     pub band: String,
+    /// Include low-score edges in the response.
     #[serde(default)]
     pub verbose: bool,
+    /// Maximum number of edges to return.
     #[serde(default = "default_limit")]
     pub limit: usize,
+    /// Minimum gate score to include an edge.
#[serde(default = "default_min_score")] pub min_score: f32, } @@ -257,6 +263,19 @@ async fn describe_with_cache( .into_response()) } +#[utoipa::path( + get, + path = "/v1/describe", + tag = "browse", + params(DescribeParams), + responses( + (status = 200, description = "Edges for the queried entity", body = crate::openapi::schemas::DescribeResponse), + (status = 304, description = "Not modified (ETag match)"), + (status = 400, body = crate::error::ErrorBody), + (status = 404, body = crate::error::ErrorBody), + (status = 500, body = crate::error::ErrorBody), + ), +)] pub async fn handle_describe( State(state): State>, headers: HeaderMap, @@ -267,6 +286,22 @@ pub async fn handle_describe( describe_with_cache(&state, model, &headers, params).await } +#[utoipa::path( + get, + path = "/v1/{model_id}/describe", + tag = "browse", + params( + ("model_id" = String, Path, description = "Id of a loaded vindex."), + DescribeParams, + ), + responses( + (status = 200, body = crate::openapi::schemas::DescribeResponse), + (status = 304), + (status = 400, body = crate::error::ErrorBody), + (status = 404, body = crate::error::ErrorBody), + (status = 500, body = crate::error::ErrorBody), + ), +)] pub async fn handle_describe_multi( State(state): State>, Path(model_id): Path, diff --git a/crates/larql-server/src/routes/embed.rs b/crates/larql-server/src/routes/embed.rs index 605c3596..f463f635 100644 --- a/crates/larql-server/src/routes/embed.rs +++ b/crates/larql-server/src/routes/embed.rs @@ -15,6 +15,7 @@ use axum::http::header; use axum::response::{IntoResponse, Response}; use axum::Json; use serde::{Deserialize, Serialize}; +use utoipa::ToSchema; use larql_inference::forward::predict::logits_to_predictions_pub; use larql_vindex::ndarray::Array2; @@ -28,12 +29,12 @@ use crate::state::{AppState, LoadedModel}; // ── Request / response types ────────────────────────────────────────────────── -#[derive(Deserialize)] +#[derive(Deserialize, ToSchema)] pub struct EmbedRequest { pub token_ids: Vec, } -#[derive(Serialize)] +#[derive(Serialize, ToSchema)] pub struct EmbedResponse { /// Row-major: seq_len × hidden_size f32 values. pub residual: Vec>, @@ -42,7 +43,7 @@ pub struct EmbedResponse { pub latency_ms: f32, } -#[derive(Deserialize)] +#[derive(Deserialize, ToSchema)] pub struct LogitsRequest { /// Flat f32 residual of length hidden_size (one position, post-all-layers). pub residual: Vec, @@ -105,14 +106,14 @@ fn parse_binary_logits_request(bytes: &[u8]) -> Result, ServerError> { .collect()) } -#[derive(Serialize)] +#[derive(Serialize, ToSchema)] pub struct TokenProb { pub token_id: u32, pub token: String, pub prob: f32, } -#[derive(Serialize)] +#[derive(Serialize, ToSchema)] pub struct LogitsResponse { pub top_k: Vec, pub latency_ms: f32, @@ -183,6 +184,25 @@ pub(crate) fn embed_tokens( /// /// JSON response: `{"residual": [[f32, ...], ...], "seq_len": N, ...}`. /// Binary response: seq_len×hidden_size f32 LE, prefixed by two u32 headers. 
+#[utoipa::path(
+    post,
+    path = "/v1/embed",
+    tag = "admin",
+    request_body(
+        content = EmbedRequest,
+        description = "JSON `{token_ids: [u32]}` OR binary `application/x-larql-ffn`: \
+                       `[num_tokens u32 LE][token_ids u32 LE...]`.",
+    ),
+    responses(
+        (status = 200, description = "JSON response", body = EmbedResponse),
+        (status = 200, content_type = "application/x-larql-ffn",
+         body = Vec<u8>,
+         description = "Binary response when the request used `Content-Type: application/x-larql-ffn`: \
+                        `[seq_len u32][hidden u32][seq_len × hidden f32 LE]`."),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 404, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_embed(
     State(state): State<Arc<AppState>>,
     headers: axum::http::HeaderMap,
@@ -191,6 +211,19 @@
     handle_embed_inner(&state, None, headers, body).await
 }
 
+#[utoipa::path(
+    post,
+    path = "/v1/{model_id}/embed",
+    tag = "admin",
+    params(("model_id" = String, Path, description = "Id of a loaded vindex.")),
+    request_body(content = EmbedRequest, description = "JSON or binary `application/x-larql-ffn`."),
+    responses(
+        (status = 200, body = EmbedResponse),
+        (status = 200, content_type = "application/x-larql-ffn", body = Vec<u8>),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 404, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_embed_multi(
     State(state): State<Arc<AppState>>,
     Path(model_id): Path<String>,
@@ -281,6 +314,21 @@ async fn handle_embed_inner(
 /// Accepts JSON (`{"residual": [...], "top_k": 5, "temperature": 1.0}`) or
 /// binary (`Content-Type: application/x-larql-ffn`, raw hidden_size f32 LE
 /// bytes). Returns JSON top-k tokens.
+#[utoipa::path(
+    post,
+    path = "/v1/logits",
+    tag = "admin",
+    request_body(
+        content = LogitsRequest,
+        description = "JSON `{residual: [f32; hidden_size], top_k, temperature}` OR binary \
+                       `application/x-larql-ffn` with raw hidden_size f32 LE bytes.",
+    ),
+    responses(
+        (status = 200, description = "Top-K tokens from lm_head", body = LogitsResponse),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 500, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_logits(
     State(state): State<Arc<AppState>>,
     headers: axum::http::HeaderMap,
@@ -289,6 +337,18 @@
     handle_logits_inner(&state, None, headers, body).await
 }
 
+#[utoipa::path(
+    post,
+    path = "/v1/{model_id}/logits",
+    tag = "admin",
+    params(("model_id" = String, Path, description = "Id of a loaded vindex.")),
+    request_body(content = LogitsRequest),
+    responses(
+        (status = 200, body = LogitsResponse),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 404, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_logits_multi(
     State(state): State<Arc<AppState>>,
     Path(model_id): Path<String>,
@@ -380,6 +440,18 @@ async fn handle_logits_inner(
 // ─────────────────────────────────────────────────────────────────────────────
 
 /// `GET /v1/token/encode?text=Paris`
+#[utoipa::path(
+    get,
+    path = "/v1/token/encode",
+    tag = "admin",
+    params(
+        ("text" = String, Query, description = "Text to tokenize."),
+    ),
+    responses(
+        (status = 200, description = "Token IDs for the text", body = crate::openapi::schemas::TokenEncodeResponse),
+        (status = 500, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_token_encode(
     State(state): State<Arc<AppState>>,
     Query(q): Query,
@@ -387,6 +459,19 @@
     handle_token_encode_inner(&state, None, q)
 }
 
+#[utoipa::path(
+    get,
+    path = "/v1/{model_id}/token/encode",
+    tag = "admin",
+    params(
description = "Id of a loaded vindex."), + ("text" = String, Query, description = "Text to tokenize."), + ), + responses( + (status = 200, body = crate::openapi::schemas::TokenEncodeResponse), + (status = 404, body = crate::error::ErrorBody), + ), +)] pub async fn handle_token_encode_multi( State(state): State>, Path(model_id): Path, @@ -418,6 +503,19 @@ fn handle_token_encode_inner( // ───────────────────────────────────────────────────────────────────────────── /// `GET /v1/token/decode?ids=9515,235,1234` +#[utoipa::path( + get, + path = "/v1/token/decode", + tag = "admin", + params( + ("ids" = String, Query, description = "Comma-separated token IDs, e.g. `9515,235,1234`."), + ), + responses( + (status = 200, description = "Decoded text", body = crate::openapi::schemas::TokenDecodeResponse), + (status = 400, body = crate::error::ErrorBody), + (status = 500, body = crate::error::ErrorBody), + ), +)] pub async fn handle_token_decode( State(state): State>, Query(q): Query, @@ -425,6 +523,20 @@ pub async fn handle_token_decode( handle_token_decode_inner(&state, None, q) } +#[utoipa::path( + get, + path = "/v1/{model_id}/token/decode", + tag = "admin", + params( + ("model_id" = String, Path, description = "Id of a loaded vindex."), + ("ids" = String, Query, description = "Comma-separated token IDs."), + ), + responses( + (status = 200, body = crate::openapi::schemas::TokenDecodeResponse), + (status = 400, body = crate::error::ErrorBody), + (status = 404, body = crate::error::ErrorBody), + ), +)] pub async fn handle_token_decode_multi( State(state): State>, Path(model_id): Path, @@ -482,6 +594,20 @@ fn handle_token_decode_inner( /// /// Response (JSON, if Accept: application/json): /// {"token_id": N, "embedding": [f32, ...], "hidden_size": N} +#[utoipa::path( + get, + path = "/v1/embed/{token_id}", + tag = "admin", + params( + ("token_id" = u32, Path, description = "Vocabulary token id."), + ), + responses( + (status = 200, description = "Binary f32 LE embedding (default)", content_type = "application/x-larql-ffn", body = Vec), + (status = 200, description = "JSON response when `Accept: application/json`", body = crate::openapi::schemas::EmbedSingleJsonResponse), + (status = 400, body = crate::error::ErrorBody), + (status = 404, body = crate::error::ErrorBody), + ), +)] pub async fn handle_embed_single( State(state): State>, Path(token_id): Path, @@ -490,6 +616,21 @@ pub async fn handle_embed_single( handle_embed_single_inner(&state, None, token_id, headers) } +#[utoipa::path( + get, + path = "/v1/{model_id}/embed/{token_id}", + tag = "admin", + params( + ("model_id" = String, Path, description = "Id of a loaded vindex."), + ("token_id" = u32, Path), + ), + responses( + (status = 200, content_type = "application/x-larql-ffn", body = Vec), + (status = 200, body = crate::openapi::schemas::EmbedSingleJsonResponse), + (status = 400, body = crate::error::ErrorBody), + (status = 404, body = crate::error::ErrorBody), + ), +)] pub async fn handle_embed_single_multi( State(state): State>, Path((model_id, token_id)): Path<(String, u32)>, diff --git a/crates/larql-server/src/routes/expert/batch_legacy.rs b/crates/larql-server/src/routes/expert/batch_legacy.rs index a5a90e0c..ee987129 100644 --- a/crates/larql-server/src/routes/expert/batch_legacy.rs +++ b/crates/larql-server/src/routes/expert/batch_legacy.rs @@ -28,6 +28,20 @@ use crate::state::AppState; use super::single::run_expert; use super::{BatchExpertRequest, BatchExpertResponse, BatchExpertResult}; +#[utoipa::path( + post, + path = 
"/v1/expert/batch", + tag = "expert", + request_body( + content = crate::routes::expert::BatchExpertRequest, + description = "JSON `{requests:[{layer, expert_id, residual}]}` OR binary \ + `application/x-larql-expert` wire (see docs/server-spec.md).", + ), + responses( + (status = 200, description = "Per-item expert outputs", body = crate::routes::expert::BatchExpertResponse), + (status = 400, body = crate::error::ErrorBody), + ), +)] pub async fn handle_expert_batch( State(state): State>, headers: axum::http::HeaderMap, diff --git a/crates/larql-server/src/routes/expert/layer_batch.rs b/crates/larql-server/src/routes/expert/layer_batch.rs index 492471c7..79e3ac71 100644 --- a/crates/larql-server/src/routes/expert/layer_batch.rs +++ b/crates/larql-server/src/routes/expert/layer_batch.rs @@ -57,6 +57,21 @@ fn compute_semaphore() -> &'static Semaphore { }) } +#[utoipa::path( + post, + path = "/v1/experts/layer-batch", + tag = "expert", + request_body( + content_type = "application/octet-stream", + description = "Binary wire: one pre-norm residual + K (expert_id, weight) pairs for a single layer. \ + Router-weighted sum is returned as f32. See `docs/server-spec.md` for the exact packed layout.", + ), + responses( + (status = 200, content_type = "application/x-larql-ffn", + description = "Weighted-sum f32 output", body = Vec), + (status = 400, body = crate::error::ErrorBody), + ), +)] pub async fn handle_experts_layer_batch( State(state): State>, body: Bytes, @@ -123,6 +138,20 @@ pub async fn handle_experts_layer_batch( Ok(resp) } +#[utoipa::path( + post, + path = "/v1/experts/layer-batch-f16", + tag = "expert", + request_body( + content_type = "application/octet-stream", + description = "Same shape as `/v1/experts/layer-batch` but residual is f16 to cut upload bandwidth by 2×.", + ), + responses( + (status = 200, content_type = "application/x-larql-ffn", + description = "Weighted-sum f32 output", body = Vec), + (status = 400, body = crate::error::ErrorBody), + ), +)] pub async fn handle_experts_layer_batch_f16( State(state): State>, body: Bytes, diff --git a/crates/larql-server/src/routes/expert/mod.rs b/crates/larql-server/src/routes/expert/mod.rs index 5fd01da5..a5fc5e60 100644 --- a/crates/larql-server/src/routes/expert/mod.rs +++ b/crates/larql-server/src/routes/expert/mod.rs @@ -25,6 +25,7 @@ //! Metal expert buffer cache, called from boot. use serde::{Deserialize, Serialize}; +use utoipa::ToSchema; pub mod batch_legacy; pub mod cpu; @@ -57,37 +58,37 @@ pub use warmup::warmup_metal_expert_cache; // Kept in `mod.rs` because they're shared across the single + batch_legacy // handlers and trivially small. 
-#[derive(Deserialize)]
+#[derive(Deserialize, ToSchema)]
 pub struct SingleExpertRequest {
     pub residual: Vec<f32>,
 }
 
-#[derive(Serialize)]
+#[derive(Serialize, ToSchema)]
 pub struct SingleExpertResponse {
     pub output: Vec<f32>,
     pub latency_ms: f64,
 }
 
-#[derive(Deserialize)]
+#[derive(Deserialize, ToSchema)]
 pub struct BatchExpertItem {
     pub layer: usize,
     pub expert_id: usize,
     pub residual: Vec<f32>,
 }
 
-#[derive(Deserialize)]
+#[derive(Deserialize, ToSchema)]
 pub struct BatchExpertRequest {
     pub requests: Vec<BatchExpertItem>,
 }
 
-#[derive(Serialize)]
+#[derive(Serialize, ToSchema)]
 pub struct BatchExpertResult {
     pub layer: usize,
     pub expert_id: usize,
     pub output: Vec<f32>,
 }
 
-#[derive(Serialize)]
+#[derive(Serialize, ToSchema)]
 pub struct BatchExpertResponse {
     pub results: Vec<BatchExpertResult>,
     pub latency_ms: f64,
diff --git a/crates/larql-server/src/routes/expert/multi_layer_batch.rs b/crates/larql-server/src/routes/expert/multi_layer_batch.rs
index 1f8de791..67560515 100644
--- a/crates/larql-server/src/routes/expert/multi_layer_batch.rs
+++ b/crates/larql-server/src/routes/expert/multi_layer_batch.rs
@@ -29,6 +29,21 @@ use crate::state::AppState;
 
 use super::cpu::{run_experts_cpu_batch, run_experts_cpu_batch_q8k_prenormed};
 
+#[utoipa::path(
+    post,
+    path = "/v1/experts/multi-layer-batch",
+    tag = "expert",
+    request_body(
+        content_type = "application/octet-stream",
+        description = "N packed layer tasks — each `(layer, residual, expert_ids, weights)`. \
+                       Server runs every task in parallel via rayon and returns N per-layer f32 outputs in one response.",
+    ),
+    responses(
+        (status = 200, content_type = "application/x-larql-ffn-multi",
+         description = "N per-layer f32 outputs, one per task", body = Vec<u8>),
+        (status = 400, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_experts_multi_layer_batch(
     State(state): State<Arc<AppState>>,
     body: Bytes,
@@ -85,6 +100,21 @@ pub async fn handle_experts_multi_layer_batch(
 /// Q8K-prenormed variant: client pre-quantises h_norm, server skips
 /// `pre_experts_norm` and `quantize_h_norm_for_q4k` — just the matvec.
 /// 4× smaller upload; response is standard f32.
+#[utoipa::path(
+    post,
+    path = "/v1/experts/multi-layer-batch-q8k",
+    tag = "expert",
+    request_body(
+        content_type = "application/octet-stream",
+        description = "Same shape as `/v1/experts/multi-layer-batch` but the client has already applied `pre_experts_norm` \
+                       and quantised `h_norm` to Q8K, saving ~4× upload bandwidth. \
Response is standard f32.", + ), + responses( + (status = 200, content_type = "application/x-larql-ffn-multi", + description = "N per-layer f32 outputs, one per task", body = Vec), + (status = 400, body = crate::error::ErrorBody), + ), +)] pub async fn handle_experts_multi_layer_batch_q8k( State(state): State>, body: Bytes, diff --git a/crates/larql-server/src/routes/expert/single.rs b/crates/larql-server/src/routes/expert/single.rs index 33508888..48a8b901 100644 --- a/crates/larql-server/src/routes/expert/single.rs +++ b/crates/larql-server/src/routes/expert/single.rs @@ -137,6 +137,21 @@ pub fn run_expert( Ok(output) } +#[utoipa::path( + post, + path = "/v1/expert/{layer}/{expert_id}", + tag = "expert", + params( + ("layer" = usize, Path), + ("expert_id" = usize, Path), + ), + request_body = SingleExpertRequest, + responses( + (status = 200, description = "Expert output vector", body = SingleExpertResponse), + (status = 400, body = crate::error::ErrorBody), + (status = 404, body = crate::error::ErrorBody), + ), +)] pub async fn handle_expert( State(state): State>, Path((layer, expert_id)): Path<(usize, usize)>, diff --git a/crates/larql-server/src/routes/explain.rs b/crates/larql-server/src/routes/explain.rs index 93cc92c8..dd882a65 100644 --- a/crates/larql-server/src/routes/explain.rs +++ b/crates/larql-server/src/routes/explain.rs @@ -10,7 +10,7 @@ use crate::band_utils::{get_layer_bands, BAND_KNOWLEDGE, BAND_OUTPUT, BAND_SYNTA use crate::error::ServerError; use crate::state::{elapsed_ms, AppState, LoadedModel}; -#[derive(Deserialize)] +#[derive(Deserialize, utoipa::ToSchema)] pub struct ExplainRequest { pub prompt: String, #[serde(default = "default_top")] @@ -287,6 +287,17 @@ fn explain_infer( Ok(body) } +#[utoipa::path( + post, + path = "/v1/explain-infer", + tag = "inference", + request_body = ExplainRequest, + responses( + (status = 200, description = "Predictions with per-layer feature traces", body = crate::openapi::schemas::ExplainResponse), + (status = 400, body = crate::error::ErrorBody), + (status = 500, body = crate::error::ErrorBody), + ), +)] pub async fn handle_explain( State(state): State>, Json(req): Json, @@ -299,6 +310,18 @@ pub async fn handle_explain( Ok(Json(result)) } +#[utoipa::path( + post, + path = "/v1/{model_id}/explain-infer", + tag = "inference", + params(("model_id" = String, Path, description = "Id of a loaded vindex.")), + request_body = ExplainRequest, + responses( + (status = 200, body = crate::openapi::schemas::ExplainResponse), + (status = 400, body = crate::error::ErrorBody), + (status = 404, body = crate::error::ErrorBody), + ), +)] pub async fn handle_explain_multi( State(state): State>, Path(model_id): Path, diff --git a/crates/larql-server/src/routes/health.rs b/crates/larql-server/src/routes/health.rs index 3f776905..20916a5b 100644 --- a/crates/larql-server/src/routes/health.rs +++ b/crates/larql-server/src/routes/health.rs @@ -8,6 +8,14 @@ use axum::Json; use crate::band_utils::HEALTH_STATUS_OK; use crate::state::AppState; +#[utoipa::path( + get, + path = "/v1/health", + tag = "admin", + responses( + (status = 200, description = "Server is alive", body = crate::openapi::schemas::HealthResponse), + ), +)] pub async fn handle_health(State(state): State>) -> Json { state.bump_requests(); let uptime = state.started_at.elapsed().as_secs(); diff --git a/crates/larql-server/src/routes/infer.rs b/crates/larql-server/src/routes/infer.rs index 51975193..2bdbffb0 100644 --- a/crates/larql-server/src/routes/infer.rs +++ 
@@ -12,11 +12,14 @@ use crate::error::ServerError;
 use crate::session::extract_session_id;
 use crate::state::{elapsed_ms, AppState, LoadedModel};
 
-#[derive(Deserialize)]
+#[derive(Deserialize, utoipa::ToSchema)]
 pub struct InferRequest {
+    /// Prompt to run inference on.
     pub prompt: String,
+    /// Top-K next-token predictions to return.
     #[serde(default = "default_top")]
     pub top: usize,
+    /// Inference mode: `walk` (default), `dense`, or `compare`.
     #[serde(default = "default_mode")]
     pub mode: String,
 }
@@ -187,6 +190,18 @@ fn run_infer(
     Ok(serde_json::Value::Object(result))
 }
 
+#[utoipa::path(
+    post,
+    path = "/v1/infer",
+    tag = "inference",
+    request_body = InferRequest,
+    responses(
+        (status = 200, description = "Next-token predictions", body = crate::openapi::schemas::InferResponse),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 503, body = crate::error::ErrorBody, description = "Inference weights unavailable"),
+        (status = 500, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_infer(
     State(state): State<Arc<AppState>>,
     headers: HeaderMap,
@@ -203,6 +218,19 @@
     Ok(Json(result))
 }
 
+#[utoipa::path(
+    post,
+    path = "/v1/{model_id}/infer",
+    tag = "inference",
+    params(("model_id" = String, Path, description = "Id of a loaded vindex.")),
+    request_body = InferRequest,
+    responses(
+        (status = 200, body = crate::openapi::schemas::InferResponse),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 404, body = crate::error::ErrorBody),
+        (status = 503, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_infer_multi(
     State(state): State<Arc<AppState>>,
     Path(model_id): Path<String>,
diff --git a/crates/larql-server/src/routes/insert.rs b/crates/larql-server/src/routes/insert.rs
index 5d692a62..7b80cfad 100644
--- a/crates/larql-server/src/routes/insert.rs
+++ b/crates/larql-server/src/routes/insert.rs
@@ -16,7 +16,7 @@ use crate::error::ServerError;
 use crate::session::extract_session_id;
 use crate::state::{elapsed_ms, AppState, LoadedModel};
 
-#[derive(Deserialize)]
+#[derive(Deserialize, utoipa::ToSchema)]
 pub struct InsertRequest {
     pub entity: String,
     pub relation: String,
@@ -237,6 +237,17 @@ fn run_insert(
     }))
 }
 
+#[utoipa::path(
+    post,
+    path = "/v1/insert",
+    tag = "inference",
+    request_body = InsertRequest,
+    responses(
+        (status = 200, description = "Constellation insert result", body = crate::openapi::schemas::InsertResponse),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 500, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_insert(
     State(state): State<Arc<AppState>>,
     headers: HeaderMap,
@@ -253,6 +264,18 @@
     Ok(Json(result))
 }
 
+#[utoipa::path(
+    post,
+    path = "/v1/{model_id}/insert",
+    tag = "inference",
+    params(("model_id" = String, Path, description = "Id of a loaded vindex.")),
+    request_body = InsertRequest,
+    responses(
+        (status = 200, body = crate::openapi::schemas::InsertResponse),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 404, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_insert_multi(
     State(state): State<Arc<AppState>>,
     Path(model_id): Path<String>,
diff --git a/crates/larql-server/src/routes/models.rs b/crates/larql-server/src/routes/models.rs
index 6dd8491d..8936f647 100644
--- a/crates/larql-server/src/routes/models.rs
+++ b/crates/larql-server/src/routes/models.rs
@@ -48,6 +48,14 @@ fn server_boot_unix_secs(state: &AppState) -> u64 {
     now_unix.saturating_sub(uptime)
 }
 
+#[utoipa::path(
+    get,
+    path = "/v1/models",
+    tag = "browse",
+    responses(
+        (status = 200, description = "OpenAI-compatible list of loaded models", body = crate::openapi::schemas::ModelsListResponse),
+    ),
+)]
 pub async fn handle_models(State(state): State<Arc<AppState>>) -> Json<serde_json::Value> {
     state.bump_requests();
 
diff --git a/crates/larql-server/src/routes/openai/chat.rs b/crates/larql-server/src/routes/openai/chat.rs
index 2cd9fe27..6951e5ec 100644
--- a/crates/larql-server/src/routes/openai/chat.rs
+++ b/crates/larql-server/src/routes/openai/chat.rs
@@ -225,6 +225,20 @@ pub struct ChatCompletionsResponse {
     pub usage: ChatUsage,
 }
 
+#[utoipa::path(
+    post,
+    path = "/v1/chat/completions",
+    tag = "openai",
+    request_body = crate::openapi::schemas::OpenAiChatRequest,
+    responses(
+        (status = 200, description = "Non-streaming JSON response.",
+         body = crate::openapi::schemas::OpenAiChatResponse),
+        (status = 200, description = "SSE stream when `stream: true`. Each event is `data: <json>\\n\\n`, terminated by `data: [DONE]`.",
+         content_type = "text/event-stream", body = String),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 500, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_chat_completions(
     State(state): State<Arc<AppState>>,
     Json(req): Json<ChatCompletionsRequest>,
diff --git a/crates/larql-server/src/routes/openai/completions.rs b/crates/larql-server/src/routes/openai/completions.rs
index 46afbd0e..2f71afc5 100644
--- a/crates/larql-server/src/routes/openai/completions.rs
+++ b/crates/larql-server/src/routes/openai/completions.rs
@@ -163,6 +163,20 @@ pub struct CompletionsResponse {
     pub usage: CompletionsUsage,
 }
 
+#[utoipa::path(
+    post,
+    path = "/v1/completions",
+    tag = "openai",
+    request_body = crate::openapi::schemas::OpenAiCompletionsRequest,
+    responses(
+        (status = 200, description = "Non-streaming JSON response.",
+         body = crate::openapi::schemas::OpenAiCompletionsResponse),
+        (status = 200, description = "SSE stream when `stream: true`. Each event is `data: <json>\\n\\n`, terminated by `data: [DONE]`.",
+         content_type = "text/event-stream", body = String),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 500, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_completions(
     State(state): State<Arc<AppState>>,
     Json(req): Json<CompletionsRequest>,
diff --git a/crates/larql-server/src/routes/openai/embeddings.rs b/crates/larql-server/src/routes/openai/embeddings.rs
index 85aa49a8..dfba9ae5 100644
--- a/crates/larql-server/src/routes/openai/embeddings.rs
+++ b/crates/larql-server/src/routes/openai/embeddings.rs
@@ -118,6 +118,18 @@ pub struct EmbeddingsResponse {
     pub usage: EmbeddingsUsage,
 }
 
+#[utoipa::path(
+    post,
+    path = "/v1/embeddings",
+    tag = "openai",
+    request_body = crate::openapi::schemas::OpenAiEmbeddingsRequest,
+    responses(
+        (status = 200, description = "Mean-pooled embeddings (not contrastively trained — use at your own risk).",
+         body = crate::openapi::schemas::OpenAiEmbeddingsResponse),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 500, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_embeddings(
     State(state): State<Arc<AppState>>,
     Json(req): Json<EmbeddingsRequest>,
diff --git a/crates/larql-server/src/routes/patches.rs b/crates/larql-server/src/routes/patches.rs
index 5c70439b..f58c7bc9 100644
--- a/crates/larql-server/src/routes/patches.rs
+++ b/crates/larql-server/src/routes/patches.rs
@@ -163,6 +163,21 @@ async fn apply_patch_to_model(
     }
 }
 
+#[utoipa::path(
+    post,
+    path = "/v1/patches/apply",
+    tag = "patches",
+    request_body(
+        content = crate::openapi::schemas::ApplyPatchBody,
+        description = "Provide either a `url` (path / http(s):// / hf://) or an inline `patch` object. \
+                       Use the `X-Session-Id` header to scope the apply to a session.",
+    ),
+    responses(
+        (status = 200, description = "Patch applied", body = crate::openapi::schemas::ApplyPatchResponse),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 500, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_apply_patch(
     State(state): State<Arc<AppState>>,
     headers: HeaderMap,
@@ -172,6 +187,18 @@
     apply_patch_to_model(&state, None, &headers, req).await
 }
 
+#[utoipa::path(
+    post,
+    path = "/v1/{model_id}/patches/apply",
+    tag = "patches",
+    params(("model_id" = String, Path, description = "Id of a loaded vindex.")),
+    request_body(content = crate::openapi::schemas::ApplyPatchBody),
+    responses(
+        (status = 200, body = crate::openapi::schemas::ApplyPatchResponse),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 404, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_apply_patch_multi(
     State(state): State<Arc<AppState>>,
     Path(model_id): Path<String>,
@@ -214,6 +241,16 @@ async fn list_patches_for_model(
     Ok(Json(serde_json::json!({ "patches": patches })))
 }
 
+#[utoipa::path(
+    get,
+    path = "/v1/patches",
+    tag = "patches",
+    responses(
+        (status = 200, description = "Active patches for the current session or global state",
+         body = crate::openapi::schemas::ListPatchesResponse),
+        (status = 404, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_list_patches(
     State(state): State<Arc<AppState>>,
     headers: HeaderMap,
@@ -222,6 +259,16 @@
     list_patches_for_model(&state, None, &headers).await
 }
 
+#[utoipa::path(
+    get,
+    path = "/v1/{model_id}/patches",
+    tag = "patches",
+    params(("model_id" = String, Path, description = "Id of a loaded vindex.")),
+    responses(
+        (status = 200, body = crate::openapi::schemas::ListPatchesResponse),
+        (status = 404, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_list_patches_multi(
     State(state): State<Arc<AppState>>,
     Path(model_id): Path<String>,
@@ -268,6 +315,18 @@ async fn remove_patch_from_model(
     })))
 }
 
+#[utoipa::path(
+    delete,
+    path = "/v1/patches/{name}",
+    tag = "patches",
+    params(
+        ("name" = String, Path, description = "Patch description/name (or `inline-patch` if it was inlined without one)."),
+    ),
+    responses(
+        (status = 200, description = "Patch removed", body = crate::openapi::schemas::RemovePatchResponse),
+        (status = 404, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_remove_patch(
     State(state): State<Arc<AppState>>,
     headers: HeaderMap,
@@ -277,6 +336,19 @@
     remove_patch_from_model(&state, None, &headers, &name).await
 }
 
+#[utoipa::path(
+    delete,
+    path = "/v1/{model_id}/patches/{name}",
+    tag = "patches",
+    params(
+        ("model_id" = String, Path, description = "Id of a loaded vindex."),
+        ("name" = String, Path, description = "Patch description/name."),
+    ),
+    responses(
+        (status = 200, body = crate::openapi::schemas::RemovePatchResponse),
+        (status = 404, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_remove_patch_multi(
     State(state): State<Arc<AppState>>,
     Path((model_id, name)): Path<(String, String)>,
diff --git a/crates/larql-server/src/routes/relations.rs b/crates/larql-server/src/routes/relations.rs
index 32aa8b83..15deceea 100644
--- a/crates/larql-server/src/routes/relations.rs
+++ b/crates/larql-server/src/routes/relations.rs
@@ -136,7 +136,8 @@ fn is_content_token(tok: &str) -> bool {
     )
 }
 
-#[derive(Deserialize, Default)]
+#[derive(Deserialize, Default, utoipa::IntoParams)]
+#[into_params(parameter_in = Query)]
 pub struct RelationsParams {
     /// Filter by label source (future use).
     #[serde(default)]
@@ -250,6 +251,16 @@ fn list_relations(model: &LoadedModel) -> Result<serde_json::Value, ServerError> {
     }))
 }
 
+#[utoipa::path(
+    get,
+    path = "/v1/relations",
+    tag = "browse",
+    params(RelationsParams),
+    responses(
+        (status = 200, description = "Known relation types across the knowledge band", body = crate::openapi::schemas::RelationsResponse),
+        (status = 500, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_relations(
     State(state): State<Arc<AppState>>,
     Query(_params): Query<RelationsParams>,
@@ -262,6 +273,20 @@
     Ok(Json(result))
 }
 
+#[utoipa::path(
+    get,
+    path = "/v1/{model_id}/relations",
+    tag = "browse",
+    params(
+        ("model_id" = String, Path, description = "Id of a loaded vindex."),
+        RelationsParams,
+    ),
+    responses(
+        (status = 200, body = crate::openapi::schemas::RelationsResponse),
+        (status = 404, body = crate::error::ErrorBody),
+        (status = 500, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_relations_multi(
     State(state): State<Arc<AppState>>,
     Path(model_id): Path<String>,
diff --git a/crates/larql-server/src/routes/select.rs b/crates/larql-server/src/routes/select.rs
index 495ee5e7..f26292fd 100644
--- a/crates/larql-server/src/routes/select.rs
+++ b/crates/larql-server/src/routes/select.rs
@@ -9,7 +9,7 @@ use serde::Deserialize;
 
 use crate::error::ServerError;
 use crate::state::{elapsed_ms, AppState, LoadedModel};
 
-#[derive(Deserialize)]
+#[derive(Deserialize, utoipa::ToSchema)]
 pub struct SelectRequest {
     #[serde(default)]
     pub entity: Option<String>,
@@ -157,6 +157,17 @@ fn select_edges(
     }))
 }
 
+#[utoipa::path(
+    post,
+    path = "/v1/select",
+    tag = "inference",
+    request_body = SelectRequest,
+    responses(
+        (status = 200, description = "Selected edges", body = crate::openapi::schemas::SelectResponse),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 500, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_select(
     State(state): State<Arc<AppState>>,
     Json(req): Json<SelectRequest>,
@@ -169,6 +180,18 @@
     Ok(Json(result))
 }
 
+#[utoipa::path(
+    post,
+    path = "/v1/{model_id}/select",
+    tag = "inference",
+    params(("model_id" = String, Path, description = "Id of a loaded vindex.")),
+    request_body = SelectRequest,
+    responses(
+        (status = 200, body = crate::openapi::schemas::SelectResponse),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 404, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_select_multi(
     State(state): State<Arc<AppState>>,
     Path(model_id): Path<String>,
diff --git a/crates/larql-server/src/routes/stats.rs b/crates/larql-server/src/routes/stats.rs
index 545abb2a..dbe2f0c6 100644
--- a/crates/larql-server/src/routes/stats.rs
+++ b/crates/larql-server/src/routes/stats.rs
@@ -79,6 +79,15 @@ async fn add_q4k_ffn(model: &LoadedModel, mut stats: serde_json::Value) -> serde
     stats
 }
 
+#[utoipa::path(
+    get,
+    path = "/v1/stats",
+    tag = "browse",
+    responses(
+        (status = 200, description = "Model + vindex statistics", body = crate::openapi::schemas::StatsResponse),
+        (status = 404, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_stats(
     State(state): State<Arc<AppState>>,
 ) -> Result<Json<serde_json::Value>, ServerError> {
@@ -88,6 +97,16 @@
     Ok(Json(add_q4k_ffn(model, stats).await))
 }
 
+#[utoipa::path(
+    get,
+    path = "/v1/{model_id}/stats",
+    tag = "browse",
+    params(("model_id" = String, Path, description = "Id of a loaded vindex.")),
+    responses(
+        (status = 200, body = crate::openapi::schemas::StatsResponse),
+        (status = 404, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_stats_multi(
     State(state): State<Arc<AppState>>,
     Path(model_id): Path<String>,
diff --git a/crates/larql-server/src/routes/stream.rs b/crates/larql-server/src/routes/stream.rs
index c28cb7ea..fb8b805f 100644
--- a/crates/larql-server/src/routes/stream.rs
+++ b/crates/larql-server/src/routes/stream.rs
@@ -101,6 +101,25 @@ fn ws_infer_done(
     })
 }
 
+#[utoipa::path(
+    get,
+    path = "/v1/stream",
+    tag = "admin",
+    responses(
+        (status = 101, description = "\
+WebSocket upgrade. After upgrade:\n\n\
+**Client → server** (text frames, JSON):\n\
+- `{\"type\":\"describe\", \"entity\":\"France\", \"band\":\"all\"}` — streams layer-by-layer describe.\n\
+- `{\"type\":\"infer\", \"prompt\":\"The capital of France is\", \"top\":5, \"mode\":\"walk\"}` — \
+streams top-K predictions one at a time.\n\n\
+**Server → client** (text frames, JSON):\n\
+- `{\"type\":\"layer\", \"layer\":N, \"edges\":[...]}` — per-layer describe output.\n\
+- `{\"type\":\"done\", \"entity\":..., \"total_edges\":N, \"latency_ms\":M}` — describe finished.\n\
+- `{\"type\":\"prediction\", \"rank\":I, \"token\":..., \"probability\":P}` — inference result.\n\
+- `{\"type\":\"infer_done\", \"prompt\":..., \"mode\":..., \"predictions\":N, \"latency_ms\":M}` — inference finished.\n\
+- `{\"type\":\"error\", \"message\":...}` — protocol or runtime error.\n"),
+    ),
+)]
 pub async fn handle_stream(State(state): State<Arc<AppState>>, ws: WebSocketUpgrade) -> Response {
     ws.on_upgrade(move |socket| handle_socket(socket, state))
 }
diff --git a/crates/larql-server/src/routes/topology.rs b/crates/larql-server/src/routes/topology.rs
index ae4e8f52..5d2d045c 100644
--- a/crates/larql-server/src/routes/topology.rs
+++ b/crates/larql-server/src/routes/topology.rs
@@ -14,10 +14,11 @@ use axum::extract::State;
 use axum::http::StatusCode;
 use axum::Json;
 use serde::Serialize;
+use utoipa::ToSchema;
 
 use crate::state::AppState;
 
-#[derive(Serialize)]
+#[derive(Serialize, ToSchema)]
 pub struct TopologyResponse {
     /// Model identifier (e.g. `"google/gemma-4-26B-A4B-it"`).
     pub model_id: String,
@@ -31,6 +32,15 @@
     pub owned_end: usize,
 }
 
+#[utoipa::path(
+    get,
+    path = "/v1/expert/topology",
+    tag = "browse",
+    responses(
+        (status = 200, description = "Expert ownership range for this shard", body = TopologyResponse),
+        (status = 404, description = "Server was not launched with --experts"),
+    ),
+)]
 pub async fn handle_topology(
     State(state): State<Arc<AppState>>,
 ) -> Result<Json<TopologyResponse>, StatusCode> {
diff --git a/crates/larql-server/src/routes/walk.rs b/crates/larql-server/src/routes/walk.rs
index 5ade4f2f..31e5ba4a 100644
--- a/crates/larql-server/src/routes/walk.rs
+++ b/crates/larql-server/src/routes/walk.rs
@@ -9,11 +9,15 @@ use serde::Deserialize;
 
 use crate::error::ServerError;
 use crate::state::{elapsed_ms, AppState, LoadedModel};
 
-#[derive(Deserialize)]
+#[derive(Deserialize, utoipa::IntoParams)]
+#[into_params(parameter_in = Query)]
 pub struct WalkParams {
+    /// Prompt text to scan for features.
     pub prompt: String,
+    /// Top-K features per layer.
     #[serde(default = "default_top")]
     pub top: usize,
+    /// Restrict scan to these layers — either a range (`"24-33"`) or a comma list (`"14,26,27"`).
     #[serde(default)]
     pub layers: Option<String>,
 }
@@ -88,6 +92,17 @@ fn walk_prompt(model: &LoadedModel, params: &WalkParams) -> Result<serde_json::Value, ServerError> {
 
+#[utoipa::path(
+    get,
+    path = "/v1/walk",
+    tag = "browse",
+    params(WalkParams),
+    responses(
+        (status = 200, description = "Feature hits for the prompt", body = crate::openapi::schemas::WalkResponse),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 500, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_walk(
     State(state): State<Arc<AppState>>,
     Query(params): Query<WalkParams>,
@@ -100,6 +115,21 @@ pub async fn handle_walk(
     Ok(Json(result))
 }
 
+#[utoipa::path(
+    get,
+    path = "/v1/{model_id}/walk",
+    tag = "browse",
+    params(
+        ("model_id" = String, Path, description = "Id of a loaded vindex."),
+        WalkParams,
+    ),
+    responses(
+        (status = 200, body = crate::openapi::schemas::WalkResponse),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 404, body = crate::error::ErrorBody),
+        (status = 500, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_walk_multi(
     State(state): State<Arc<AppState>>,
     Path(model_id): Path<String>,
diff --git a/crates/larql-server/src/routes/walk_ffn.rs b/crates/larql-server/src/routes/walk_ffn.rs
index 0e7ed79f..e3145163 100644
--- a/crates/larql-server/src/routes/walk_ffn.rs
+++ b/crates/larql-server/src/routes/walk_ffn.rs
@@ -684,6 +684,25 @@ fn run_walk_ffn(state: &AppState, req: &WalkFfnRequest) -> Result
 
+#[utoipa::path(
+    post,
+    path = "/v1/walk-ffn",
+    tag = "expert",
+    request_body(
+        description = "JSON walk-FFN request OR binary `application/x-larql-ffn` packed batch (see docs/server-spec.md).",
+    ),
+    responses(
+        (status = 200, description = "JSON response"),
+        (status = 200, description = "Binary packed output when the request was `application/x-larql-ffn`",
+         content_type = "application/x-larql-ffn", body = Vec<u8>),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 404, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_walk_ffn(
     State(state): State<Arc<AppState>>,
     request: axum::extract::Request,
@@ -774,6 +793,22 @@ pub(crate) const Q8K_BATCH_CT: &str = "application/x-larql-ffn-q8k-batch";
 ///
 /// Returns 404 if the vindex doesn't have interleaved Q4K data (ffn-only
 /// servers without Q4K weights can't serve this endpoint).
+#[utoipa::path(
+    post,
+    path = "/v1/walk-ffn-q8k",
+    tag = "expert",
+    request_body(
+        content_type = "application/x-larql-ffn-q8k-batch",
+        description = "Q8K-prenormed dense-FFN batch: client has applied FFN input norm + Q8 quantisation. \
+                       404 if the vindex lacks interleaved Q4K data.",
+    ),
+    responses(
+        (status = 200, content_type = "application/x-larql-ffn-q8k-batch",
+         description = "Per-layer FFN delta as f32", body = Vec<u8>),
+        (status = 400, body = crate::error::ErrorBody),
+        (status = 404, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_walk_ffn_q8k(
     State(state): State<Arc<AppState>>,
     request: axum::extract::Request,
diff --git a/crates/larql-server/src/routes/warmup.rs b/crates/larql-server/src/routes/warmup.rs
index a575ea02..1c4db6cb 100644
--- a/crates/larql-server/src/routes/warmup.rs
+++ b/crates/larql-server/src/routes/warmup.rs
@@ -26,7 +26,7 @@ use tracing::info;
 
 use crate::error::ServerError;
 use crate::state::{AppState, LoadedModel};
 
-#[derive(Default, Deserialize)]
+#[derive(Default, Deserialize, utoipa::ToSchema)]
 pub struct WarmupRequest {
     /// Specific layers to prefetch (`madvise WILLNEED`). Defaults to
     /// every owned layer when omitted — the typical case for boot
@@ -48,7 +48,7 @@ pub struct WarmupRequest {
     pub warmup_hnsw: bool,
 }
 
-#[derive(Serialize)]
+#[derive(Serialize, utoipa::ToSchema)]
 pub struct WarmupResponse {
     pub model: String,
     pub weights_loaded: bool,
@@ -164,6 +164,16 @@ pub async fn warmup_model_async(model: Arc<LoadedModel>, req: WarmupRequest) ->
         .expect("warmup spawn_blocking")
 }
 
+#[utoipa::path(
+    post,
+    path = "/v1/warmup",
+    tag = "inference",
+    request_body = WarmupRequest,
+    responses(
+        (status = 200, description = "Warmup completed", body = WarmupResponse),
+        (status = 500, body = crate::error::ErrorBody),
+    ),
+)]
 pub async fn handle_warmup(
     State(state): State<Arc<AppState>>,
     body: Option<Json<WarmupRequest>>,