16 changes: 16 additions & 0 deletions backend/python/vllm/backend.py
@@ -2,6 +2,7 @@
import asyncio
from concurrent import futures
import argparse
import json
import signal
import sys
import os
@@ -22,6 +23,21 @@
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.sampling_params import SamplingParams

# vLLM renamed GuidedDecodingParams to StructuredOutputsParams in newer versions.
# The corresponding SamplingParams field also changed from guided_decoding to structured_outputs.
try:
from vllm.sampling_params import StructuredOutputsParams
_structured_output_cls = StructuredOutputsParams
Owner: where is this consumed?
_structured_output_field = "structured_outputs"
Owner: ditto here - I'm not sure what we're doing here: just importing and assigning to a variable, but it's never used?

except ImportError:
try:
from vllm.sampling_params import GuidedDecodingParams
_structured_output_cls = GuidedDecodingParams
_structured_output_field = "guided_decoding"
except ImportError:
_structured_output_cls = None
_structured_output_field = None
Collaborator: Do we need a fallback? We usually pin the upstream version.

Contributor Author: Good point. I checked, and vLLM is actually not pinned to a specific version: requirements-after.txt just lists vllm with no version constraint, and different platform builds (CPU/CUDA/ROCm) may end up with different vLLM versions.

That said, if the project plans to pin vLLM to a specific version, I'm happy to drop the fallback and target whichever API is current. Let me know which you'd prefer.

Collaborator: OK, when you say newer versions, how new? If it's a very recent change then maybe we need this; otherwise we probably don't.

Contributor Author: The rename happened between vLLM v0.8.x and the latest releases: GuidedDecodingParams was renamed to StructuredOutputsParams, and the corresponding SamplingParams field changed from guided_decoding to structured_outputs.

Since vLLM isn't pinned (requirements-after.txt just says vllm), builds can land on either version depending on when/how the image is built. If we pin to a specific version, I can drop the fallback and target that API directly — let me know which version to target.

Also in the latest push: I've refactored to use the Metadata map instead of new proto fields, as discussed.

Collaborator: Just remove the fallback, and if it passes CI we don't need to worry.

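To the "where is this consumed?" question above: the class/field pair resolved at import time is meant to be splatted into SamplingParams when a JSON schema arrives via request metadata. A self-contained sketch of that consumption, with a stub class standing in for vLLM's StructuredOutputsParams/GuidedDecodingParams (the stub and the helper name are illustrative assumptions, not vLLM APIs):

```python
import json

# Stand-in for vLLM's StructuredOutputsParams / GuidedDecodingParams; in the
# real backend these come from the try/except import shim above.
class _StubStructuredOutputsParams:
    def __init__(self, json=None):
        self.json = json

_structured_output_cls = _StubStructuredOutputsParams
_structured_output_field = "structured_outputs"  # "guided_decoding" on older vLLM

def structured_kwargs(schema_str):
    """Build kwargs to splat into SamplingParams(**...), or {} if unsupported."""
    if _structured_output_cls is None or not schema_str:
        return {}
    params = _structured_output_cls(json=json.loads(schema_str))
    return {_structured_output_field: params}

kwargs = structured_kwargs('{"type": "object"}')
# The backend would then construct SamplingParams(temperature=..., **kwargs),
# so the field name tracks whichever vLLM API was importable.
```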
from vllm.utils import random_uuid
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.multimodal.utils import fetch_image
6 changes: 6 additions & 0 deletions core/backend/options.go
@@ -292,6 +292,12 @@
metadata["enable_thinking"] = "true"
}
}
if c.ResponseFormat != "" {
metadata["response_format"] = c.ResponseFormat
}
for k, v := range c.RequestMetadata {

Check failure on line 298 in core/backend/options.go (GitHub Actions: tests-ui-e2e, tests-apple, tests-linux / 1.26.x): c.RequestMetadata undefined (type "github.com/mudler/LocalAI/core/config".ModelConfig has no field or method RequestMetadata)
metadata[k] = v
}
pbOpts.Metadata = metadata

// Logprobs and TopLogprobs are set by the caller if provided
12 changes: 12 additions & 0 deletions core/http/endpoints/openai/chat.go
@@ -593,7 +593,9 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
switch d.Type {
case "json_object":
input.Grammar = functions.JSONBNF
config.ResponseFormat = "json_object"
case "json_schema":
config.ResponseFormat = "json_schema"
d := schema.JsonSchemaRequest{}
dat, err := json.Marshal(config.ResponseFormatMap)
if err != nil {
@@ -603,6 +605,16 @@
if err != nil {
return err
}

// Pass raw JSON schema via metadata for backends that support native structured output
schemaBytes, err := json.Marshal(d.JsonSchema.Schema)
if err == nil {
if config.RequestMetadata == nil {
config.RequestMetadata = map[string]string{}
}
config.RequestMetadata["json_schema"] = string(schemaBytes)
}

fs := &functions.JSONFunctionStructure{
AnyOf: []functions.Item{d.JsonSchema.Schema},
}
28 changes: 27 additions & 1 deletion core/http/endpoints/openai/completion.go
@@ -88,8 +88,34 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
d := schema.ChatCompletionResponseFormat{}
dat, _ := json.Marshal(config.ResponseFormatMap)
_ = json.Unmarshal(dat, &d)
if d.Type == "json_object" {
switch d.Type {
eureka0928 marked this conversation as resolved.
case "json_object":
input.Grammar = functions.JSONBNF
config.ResponseFormat = "json_object"
case "json_schema":
config.ResponseFormat = "json_schema"
jsr := schema.JsonSchemaRequest{}
dat, err := json.Marshal(config.ResponseFormatMap)
if err == nil {
if err := json.Unmarshal(dat, &jsr); err == nil {
schemaBytes, err := json.Marshal(jsr.JsonSchema.Schema)
if err == nil {
if config.RequestMetadata == nil {
config.RequestMetadata = map[string]string{}
}
config.RequestMetadata["json_schema"] = string(schemaBytes)
}
fs := &functions.JSONFunctionStructure{
AnyOf: []functions.Item{jsr.JsonSchema.Schema},
}
g, err := fs.Grammar(config.FunctionsConfig.GrammarOptions()...)
if err == nil {
input.Grammar = g
} else {
xlog.Error("Failed generating grammar", "error", err)
}
}
}
}
}

37 changes: 35 additions & 2 deletions core/http/endpoints/openresponses/responses.go
@@ -173,9 +173,42 @@ func ResponsesEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eval
Functions: funcs,
}

// Handle text_format -> response_format conversion
// Handle text_format -> response_format conversion and structured output
if input.TextFormat != nil {
openAIReq.ResponseFormat = convertTextFormatToResponseFormat(input.TextFormat)
responseFormat := convertTextFormatToResponseFormat(input.TextFormat)
openAIReq.ResponseFormat = responseFormat

// Generate grammar and pass schema for structured output (like OpenAI chat/completion)
if rfMap, ok := responseFormat.(map[string]interface{}); ok {
if rfType, _ := rfMap["type"].(string); rfType == "json_object" {
cfg.Grammar = functions.JSONBNF
cfg.ResponseFormat = "json_object"
} else if rfType == "json_schema" {
cfg.ResponseFormat = "json_schema"
d := schema.JsonSchemaRequest{}
dat, err := json.Marshal(rfMap)
if err == nil {
if err := json.Unmarshal(dat, &d); err == nil {
schemaBytes, err := json.Marshal(d.JsonSchema.Schema)
if err == nil {
if cfg.RequestMetadata == nil {
cfg.RequestMetadata = map[string]string{}
}
cfg.RequestMetadata["json_schema"] = string(schemaBytes)
}
fs := &functions.JSONFunctionStructure{
AnyOf: []functions.Item{d.JsonSchema.Schema},
}
g, err := fs.Grammar(cfg.FunctionsConfig.GrammarOptions()...)
if err == nil {
cfg.Grammar = g
} else {
xlog.Error("Open Responses - Failed generating grammar for json_schema", "error", err)
}
}
}
}
}
}

// Generate grammar for function calling (similar to OpenAI chat endpoint)
96 changes: 95 additions & 1 deletion docs/content/features/constrained_grammars.md
@@ -10,7 +10,11 @@ url = "/features/constrained_grammars/"
The `chat` endpoint supports the `grammar` parameter, which allows users to specify a grammar in Backus-Naur Form (BNF). This feature enables the Large Language Model (LLM) to generate outputs adhering to a user-defined schema, such as `JSON`, `YAML`, or any other format that can be defined using BNF. For more details about BNF, see [Backus-Naur Form on Wikipedia](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form).

{{% notice note %}}
**Compatibility Notice:** This feature is only supported by models that use the [llama.cpp](https://github.com/ggerganov/llama.cpp) backend. For a complete list of compatible models, refer to the [Model Compatibility]({{%relref "reference/compatibility-table" %}}) page. For technical details, see the related pull requests: [PR #1773](https://github.com/ggerganov/llama.cpp/pull/1773) and [PR #1887](https://github.com/ggerganov/llama.cpp/pull/1887).
**Compatibility Notice:** Grammar and structured output support is available for the following backends:
- **llama.cpp** — supports the `grammar` parameter (GBNF syntax) and `response_format` with `json_schema`/`json_object`
- **vLLM** — supports the `grammar` parameter (via xgrammar), `response_format` with `json_schema` (native JSON schema enforcement), and `json_object`

For a complete list of compatible models, refer to the [Model Compatibility]({{%relref "reference/compatibility-table" %}}) page.
{{% /notice %}}

## Setup
@@ -66,6 +70,96 @@ For more complex grammars, you can define multi-line BNF rules. The grammar pars
- Character classes (`[a-z]`)
- String literals (`"text"`)
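A minimal GBNF fragment that exercises these constructs (illustrative only; any grammar expressible in this notation works):

```
root   ::= answer ("," ws answer)*
answer ::= "yes" | "no"
ws     ::= [ \t]?
```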

## vLLM Backend

The vLLM backend supports structured output via three methods:

### JSON Schema (recommended)

Use the OpenAI-compatible `response_format` parameter with `json_schema` to enforce a specific JSON structure:

```bash
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "my-vllm-model",
"messages": [{"role": "user", "content": "Generate a person object"}],
"response_format": {
"type": "json_schema",
"json_schema": {
"name": "person",
"schema": {
"type": "object",
"properties": {
"name": {"type": "string"},
"age": {"type": "integer"}
},
"required": ["name", "age"]
}
}
}
}'
```
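With `json_schema`, the backend enforces that the reply parses as JSON matching the schema; a quick client-side sanity check with only the standard library might look like this (the reply string is a made-up example of model output):

```python
import json

# Hypothetical model reply for the "person" schema above; real output varies.
reply = '{"name": "Ada Lovelace", "age": 36}'

person = json.loads(reply)              # the reply must parse as JSON
assert isinstance(person["name"], str)  # schema: "name" is a string
assert isinstance(person["age"], int)   # schema: "age" is an integer
```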

### JSON Object

Force the model to output valid JSON (without a specific schema):

```bash
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "my-vllm-model",
"messages": [{"role": "user", "content": "Generate a person as JSON"}],
"response_format": {"type": "json_object"}
}'
```

### Grammar

The `grammar` parameter also works with vLLM via xgrammar:

```bash
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "my-vllm-model",
"messages": [{"role": "user", "content": "Do you like apples?"}],
"grammar": "root ::= (\"yes\" | \"no\")"
}'
```

## Open Responses API

The Open Responses API (`/v1/responses`) also supports structured output via the `text_format` parameter:

### JSON Schema

```bash
curl http://localhost:8080/v1/responses -H "Content-Type: application/json" -d '{
"model": "my-model",
"input": "Generate a person object",
"text_format": {
"type": "json_schema",
"json_schema": {
"name": "person",
"schema": {
"type": "object",
"properties": {
"name": {"type": "string"},
"age": {"type": "integer"}
},
"required": ["name", "age"]
}
}
}
}'
```

### JSON Object

```bash
curl http://localhost:8080/v1/responses -H "Content-Type: application/json" -d '{
"model": "my-model",
"input": "Generate a person as JSON",
"text_format": {"type": "json_object"}
}'
```

## Related Features

- [OpenAI Functions]({{%relref "features/openai-functions" %}}) - Function calling with structured outputs