From 0a26cf6a4a523ff8c97d10c7551731a7d88bb1bc Mon Sep 17 00:00:00 2001 From: eureka928 Date: Fri, 6 Mar 2026 02:27:12 +0100 Subject: [PATCH 1/7] feat(backend): pass JSONSchema and ResponseFormat through gRPC Add JSONSchema field to ModelConfig to carry the raw JSON schema string alongside the GBNF Grammar. Pass both JSONSchema and ResponseFormat through gRPCPredictOpts to backends via the new proto fields. This allows backends like vLLM to receive the original JSON schema for native structured output support. Ref: #6857 Signed-off-by: eureka928 --- core/backend/options.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/core/backend/options.go b/core/backend/options.go index b09782ce2ca7..afa2ab20095a 100644 --- a/core/backend/options.go +++ b/core/backend/options.go @@ -282,6 +282,8 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions TensorSplit: c.TensorSplit, TailFreeSamplingZ: float32(*c.TFZ), TypicalP: float32(*c.TypicalP), + JSONSchema: c.JSONSchema, + ResponseFormat: c.ResponseFormat, } metadata := map[string]string{} From bbb32ac24437d16df6e63fd97d8b069eac7a3b25 Mon Sep 17 00:00:00 2001 From: eureka928 Date: Fri, 6 Mar 2026 02:27:56 +0100 Subject: [PATCH 2/7] feat(endpoints): extract raw JSON schema for structured output In chat and completion endpoints, when response_format is json_schema, extract the raw JSON schema and store it on config.JSONSchema alongside the GBNF grammar. Also set config.ResponseFormat to the format type. This allows backends that support native structured output (like vLLM) to use the JSON schema directly instead of the GBNF grammar. Ref: #6857 Signed-off-by: eureka928 --- core/http/endpoints/openai/chat.go | 9 +++++++++ core/http/endpoints/openai/completion.go | 16 +++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index ab715d8b166d..bb0f87ba96b1 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -593,7 +593,9 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator switch d.Type { case "json_object": input.Grammar = functions.JSONBNF + config.ResponseFormat = "json_object" case "json_schema": + config.ResponseFormat = "json_schema" d := schema.JsonSchemaRequest{} dat, err := json.Marshal(config.ResponseFormatMap) if err != nil { @@ -603,6 +605,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator if err != nil { return err } + + // Pass raw JSON schema to backends that support native structured output + schemaBytes, err := json.Marshal(d.JsonSchema.Schema) + if err == nil { + config.JSONSchema = string(schemaBytes) + } + fs := &functions.JSONFunctionStructure{ AnyOf: []functions.Item{d.JsonSchema.Schema}, } diff --git a/core/http/endpoints/openai/completion.go b/core/http/endpoints/openai/completion.go index 069bc33a60f5..693dfd7d2edb 100644 --- a/core/http/endpoints/openai/completion.go +++ b/core/http/endpoints/openai/completion.go @@ -88,8 +88,22 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva d := schema.ChatCompletionResponseFormat{} dat, _ := json.Marshal(config.ResponseFormatMap) _ = json.Unmarshal(dat, &d) - if d.Type == "json_object" { + switch d.Type { + case "json_object": input.Grammar = functions.JSONBNF + config.ResponseFormat = "json_object" + case "json_schema": + config.ResponseFormat = "json_schema" + jsr := schema.JsonSchemaRequest{} + dat, err := json.Marshal(config.ResponseFormatMap) + if err == nil { + if err := json.Unmarshal(dat, &jsr); err == nil { + schemaBytes, err := json.Marshal(jsr.JsonSchema.Schema) + if err == nil { + config.JSONSchema = string(schemaBytes) + } + } + } } } From 3617e2aba48a45ed9680a9a38d487bb3a3a8bd4d Mon Sep 17 00:00:00 2001 From: eureka928 Date: Fri, 6 Mar 2026 02:28:27 +0100 Subject: [PATCH 3/7] feat(vllm): add structured output support via guided decoding Update the vLLM backend to support structured output: - Import GuidedDecodingParams from vllm.sampling_params - Handle JSONSchema: parse and pass as GuidedDecodingParams(json_schema=...) - Handle json_object response format: GuidedDecodingParams(json_object=True) - Fall back to Grammar (GBNF) via GuidedDecodingParams(grammar=...) - Remove phantom GuidedDecoding mapping (field doesn't exist in proto) - Fix missing 'import time' and 'import json' for load_video and schema parsing Priority: JSONSchema > json_object > Grammar (GBNF fallback) Ref: #6857 Signed-off-by: eureka928 --- backend/python/vllm/backend.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py index 95ae95a9d4e6..c83926c1de32 100644 --- a/backend/python/vllm/backend.py +++ b/backend/python/vllm/backend.py @@ -2,6 +2,7 @@ import asyncio from concurrent import futures import argparse +import json import signal import sys import os @@ -21,7 +22,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.sampling_params import SamplingParams +from vllm.sampling_params import SamplingParams, GuidedDecodingParams from vllm.utils import random_uuid from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.multimodal.utils import fetch_image From 7a4d0a6bcb9d5198080ed5dc8027a48a2e9df36a Mon Sep 17 00:00:00 2001 From: eureka928 Date: Fri, 6 Mar 2026 02:36:37 +0100 Subject: [PATCH 4/7] fix: refine vLLM structured output implementation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Make GuidedDecodingParams import conditional (try/except) for backwards compatibility with older vLLM versions - Remove GBNF grammar fallback — vLLM expects EBNF, not GBNF, so passing LocalAI's GBNF grammar would produce confusing errors - Pass JSONSchema as string directly instead of parsing to dict (safer across vLLM versions) - Add GBNF grammar generation for json_schema in completion endpoint so non-vLLM backends (llama.cpp) also get grammar enforcement Ref: #6857 Signed-off-by: eureka928 --- backend/python/vllm/backend.py | 6 +++++- core/http/endpoints/openai/completion.go | 9 +++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py index c83926c1de32..8bbbf1fda34c 100644 --- a/backend/python/vllm/backend.py +++ b/backend/python/vllm/backend.py @@ -22,7 +22,11 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine -from vllm.sampling_params import SamplingParams, GuidedDecodingParams +from vllm.sampling_params import SamplingParams +try: + from vllm.sampling_params import GuidedDecodingParams +except ImportError: + GuidedDecodingParams = None from vllm.utils import random_uuid from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.multimodal.utils import fetch_image diff --git a/core/http/endpoints/openai/completion.go b/core/http/endpoints/openai/completion.go index 693dfd7d2edb..208fc52aff88 100644 --- a/core/http/endpoints/openai/completion.go +++ b/core/http/endpoints/openai/completion.go @@ -102,6 +102,15 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva if err == nil { config.JSONSchema = string(schemaBytes) } + fs := &functions.JSONFunctionStructure{ + AnyOf: []functions.Item{jsr.JsonSchema.Schema}, + } + g, err := fs.Grammar(config.FunctionsConfig.GrammarOptions()...) + if err == nil { + input.Grammar = g + } else { + xlog.Error("Failed generating grammar", "error", err) + } } } } From b19d7f23eda0723d71370ac3e04c3790586a0f79 Mon Sep 17 00:00:00 2001 From: eureka928 Date: Fri, 6 Mar 2026 03:21:58 +0100 Subject: [PATCH 5/7] fix(vllm): support both vLLM API versions and add grammar passthrough MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Handle both StructuredOutputsParams (vLLM latest) and GuidedDecodingParams (vLLM <=0.8.x) with graceful fallback - Use the correct SamplingParams field name for each version (structured_outputs vs guided_decoding) - Use 'json' parameter (not 'json_schema') matching both APIs - Re-add grammar (GBNF/BNF) passthrough — both vLLM APIs accept a 'grammar' parameter handled by xgrammar which supports GBNF - Priority: JSONSchema > json_object > Grammar Ref: #6857 Signed-off-by: eureka928 --- backend/python/vllm/backend.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py index 8bbbf1fda34c..adfd440cfbbd 100644 --- a/backend/python/vllm/backend.py +++ b/backend/python/vllm/backend.py @@ -23,10 +23,21 @@ from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.sampling_params import SamplingParams + +# vLLM renamed GuidedDecodingParams to StructuredOutputsParams in newer versions. +# The corresponding SamplingParams field also changed from guided_decoding to structured_outputs. try: - from vllm.sampling_params import GuidedDecodingParams + from vllm.sampling_params import StructuredOutputsParams + _structured_output_cls = StructuredOutputsParams + _structured_output_field = "structured_outputs" except ImportError: - GuidedDecodingParams = None + try: + from vllm.sampling_params import GuidedDecodingParams + _structured_output_cls = GuidedDecodingParams + _structured_output_field = "guided_decoding" + except ImportError: + _structured_output_cls = None + _structured_output_field = None from vllm.utils import random_uuid from vllm.transformers_utils.tokenizer import get_tokenizer from vllm.multimodal.utils import fetch_image From 480bcd8ed164d90b1b57ec3dd9c137c2761bd717 Mon Sep 17 00:00:00 2001 From: eureka928 Date: Fri, 6 Mar 2026 03:22:42 +0100 Subject: [PATCH 6/7] docs: update constrained grammars with vLLM structured output support Update the compatibility notice to include vLLM alongside llama.cpp. Add a vLLM-specific section with examples for all three supported methods: json_schema, json_object, and grammar (via xgrammar). Ref: #6857 Signed-off-by: eureka928 --- docs/content/features/constrained_grammars.md | 59 ++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/docs/content/features/constrained_grammars.md b/docs/content/features/constrained_grammars.md index 5cf39c438b4f..c594f79d8875 100644 --- a/docs/content/features/constrained_grammars.md +++ b/docs/content/features/constrained_grammars.md @@ -10,7 +10,11 @@ url = "/features/constrained_grammars/" The `chat` endpoint supports the `grammar` parameter, which allows users to specify a grammar in Backus-Naur Form (BNF). This feature enables the Large Language Model (LLM) to generate outputs adhering to a user-defined schema, such as `JSON`, `YAML`, or any other format that can be defined using BNF. For more details about BNF, see [Backus-Naur Form on Wikipedia](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form). {{% notice note %}} -**Compatibility Notice:** This feature is only supported by models that use the [llama.cpp](https://github.com/ggerganov/llama.cpp) backend. For a complete list of compatible models, refer to the [Model Compatibility]({{%relref "reference/compatibility-table" %}}) page. For technical details, see the related pull requests: [PR #1773](https://github.com/ggerganov/llama.cpp/pull/1773) and [PR #1887](https://github.com/ggerganov/llama.cpp/pull/1887). +**Compatibility Notice:** Grammar and structured output support is available for the following backends: +- **llama.cpp** — supports the `grammar` parameter (GBNF syntax) and `response_format` with `json_schema`/`json_object` +- **vLLM** — supports the `grammar` parameter (via xgrammar), `response_format` with `json_schema` (native JSON schema enforcement), and `json_object` + +For a complete list of compatible models, refer to the [Model Compatibility]({{%relref "reference/compatibility-table" %}}) page. {{% /notice %}} ## Setup @@ -66,6 +70,59 @@ For more complex grammars, you can define multi-line BNF rules. The grammar pars - Character classes (`[a-z]`) - String literals (`"text"`) +## vLLM Backend + +The vLLM backend supports structured output via three methods: + +### JSON Schema (recommended) + +Use the OpenAI-compatible `response_format` parameter with `json_schema` to enforce a specific JSON structure: + +```bash +curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "my-vllm-model", + "messages": [{"role": "user", "content": "Generate a person object"}], + "response_format": { + "type": "json_schema", + "json_schema": { + "name": "person", + "schema": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "integer"} + }, + "required": ["name", "age"] + } + } + } +}' +``` + +### JSON Object + +Force the model to output valid JSON (without a specific schema): + +```bash +curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "my-vllm-model", + "messages": [{"role": "user", "content": "Generate a person as JSON"}], + "response_format": {"type": "json_object"} +}' +``` + +### Grammar + +The `grammar` parameter also works with vLLM via xgrammar: + +```bash +curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{ + "model": "my-vllm-model", + "messages": [{"role": "user", "content": "Do you like apples?"}], + "grammar": "root ::= (\"yes\" | \"no\")" +}' +``` + ## Related Features - [OpenAI Functions]({{%relref "features/openai-functions" %}}) - Function calling with structured outputs From c703a03247463170026c1c4f6be29dbced4a956e Mon Sep 17 00:00:00 2001 From: eureka928 Date: Fri, 13 Mar 2026 19:14:07 +0100 Subject: [PATCH 7/7] refactor: use Metadata map instead of dedicated proto fields for structured output Address review feedback: - Remove JSONSchema and ResponseFormat proto fields; pass them via the existing Metadata map instead, avoiding proto changes - vLLM backend reads json_schema and response_format from request.Metadata - Add structured output support (json_schema, json_object) to Open Responses API via text_format parameter - Update docs with Open Responses structured output examples Ref: #6857 Signed-off-by: eureka928 --- core/backend/options.go | 8 +++- core/http/endpoints/openai/chat.go | 7 +++- core/http/endpoints/openai/completion.go | 5 ++- .../http/endpoints/openresponses/responses.go | 37 ++++++++++++++++++- docs/content/features/constrained_grammars.md | 37 +++++++++++++++++++ 5 files changed, 87 insertions(+), 7 deletions(-) diff --git a/core/backend/options.go b/core/backend/options.go index afa2ab20095a..17adbceb458b 100644 --- a/core/backend/options.go +++ b/core/backend/options.go @@ -282,8 +282,6 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions TensorSplit: c.TensorSplit, TailFreeSamplingZ: float32(*c.TFZ), TypicalP: float32(*c.TypicalP), - JSONSchema: c.JSONSchema, - ResponseFormat: c.ResponseFormat, } metadata := map[string]string{} @@ -294,6 +292,12 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions metadata["enable_thinking"] = "true" } } + if c.ResponseFormat != "" { + metadata["response_format"] = c.ResponseFormat + } + for k, v := range c.RequestMetadata { + metadata[k] = v + } pbOpts.Metadata = metadata // Logprobs and TopLogprobs are set by the caller if provided diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go index bb0f87ba96b1..8f8f79d84a39 100644 --- a/core/http/endpoints/openai/chat.go +++ b/core/http/endpoints/openai/chat.go @@ -606,10 +606,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator return err } - // Pass raw JSON schema to backends that support native structured output + // Pass raw JSON schema via metadata for backends that support native structured output schemaBytes, err := json.Marshal(d.JsonSchema.Schema) if err == nil { - config.JSONSchema = string(schemaBytes) + if config.RequestMetadata == nil { + config.RequestMetadata = map[string]string{} + } + config.RequestMetadata["json_schema"] = string(schemaBytes) } fs := &functions.JSONFunctionStructure{ diff --git a/core/http/endpoints/openai/completion.go b/core/http/endpoints/openai/completion.go index 208fc52aff88..6e5aae07423e 100644 --- a/core/http/endpoints/openai/completion.go +++ b/core/http/endpoints/openai/completion.go @@ -100,7 +100,10 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva if err := json.Unmarshal(dat, &jsr); err == nil { schemaBytes, err := json.Marshal(jsr.JsonSchema.Schema) if err == nil { - config.JSONSchema = string(schemaBytes) + if config.RequestMetadata == nil { + config.RequestMetadata = map[string]string{} + } + config.RequestMetadata["json_schema"] = string(schemaBytes) } fs := &functions.JSONFunctionStructure{ AnyOf: []functions.Item{jsr.JsonSchema.Schema}, diff --git a/core/http/endpoints/openresponses/responses.go b/core/http/endpoints/openresponses/responses.go index 764156d4da1b..45f18edc5de8 100644 --- a/core/http/endpoints/openresponses/responses.go +++ b/core/http/endpoints/openresponses/responses.go @@ -173,9 +173,42 @@ func ResponsesEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eval Functions: funcs, } - // Handle text_format -> response_format conversion + // Handle text_format -> response_format conversion and structured output if input.TextFormat != nil { - openAIReq.ResponseFormat = convertTextFormatToResponseFormat(input.TextFormat) + responseFormat := convertTextFormatToResponseFormat(input.TextFormat) + openAIReq.ResponseFormat = responseFormat + + // Generate grammar and pass schema for structured output (like OpenAI chat/completion) + if rfMap, ok := responseFormat.(map[string]interface{}); ok { + if rfType, _ := rfMap["type"].(string); rfType == "json_object" { + cfg.Grammar = functions.JSONBNF + cfg.ResponseFormat = "json_object" + } else if rfType == "json_schema" { + cfg.ResponseFormat = "json_schema" + d := schema.JsonSchemaRequest{} + dat, err := json.Marshal(rfMap) + if err == nil { + if err := json.Unmarshal(dat, &d); err == nil { + schemaBytes, err := json.Marshal(d.JsonSchema.Schema) + if err == nil { + if cfg.RequestMetadata == nil { + cfg.RequestMetadata = map[string]string{} + } + cfg.RequestMetadata["json_schema"] = string(schemaBytes) + } + fs := &functions.JSONFunctionStructure{ + AnyOf: []functions.Item{d.JsonSchema.Schema}, + } + g, err := fs.Grammar(cfg.FunctionsConfig.GrammarOptions()...) + if err == nil { + cfg.Grammar = g + } else { + xlog.Error("Open Responses - Failed generating grammar for json_schema", "error", err) + } + } + } + } + } } // Generate grammar for function calling (similar to OpenAI chat endpoint) diff --git a/docs/content/features/constrained_grammars.md b/docs/content/features/constrained_grammars.md index c594f79d8875..0ba4e4d43b51 100644 --- a/docs/content/features/constrained_grammars.md +++ b/docs/content/features/constrained_grammars.md @@ -123,6 +123,43 @@ curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/jso }' ``` +## Open Responses API + +The Open Responses API (`/v1/responses`) also supports structured output via the `text_format` parameter: + +### JSON Schema + +```bash +curl http://localhost:8080/v1/responses -H "Content-Type: application/json" -d '{ + "model": "my-model", + "input": "Generate a person object", + "text_format": { + "type": "json_schema", + "json_schema": { + "name": "person", + "schema": { + "type": "object", + "properties": { + "name": {"type": "string"}, + "age": {"type": "integer"} + }, + "required": ["name", "age"] + } + } + } +}' +``` + +### JSON Object + +```bash +curl http://localhost:8080/v1/responses -H "Content-Type: application/json" -d '{ + "model": "my-model", + "input": "Generate a person as JSON", + "text_format": {"type": "json_object"} +}' +``` + ## Related Features - [OpenAI Functions]({{%relref "features/openai-functions" %}}) - Function calling with structured outputs