From 0a26cf6a4a523ff8c97d10c7551731a7d88bb1bc Mon Sep 17 00:00:00 2001
From: eureka928 <meobius123@gmail.com>
Date: Fri, 6 Mar 2026 02:27:12 +0100
Subject: [PATCH 1/7] feat(backend): pass JSONSchema and ResponseFormat through
 gRPC

Add JSONSchema field to ModelConfig to carry the raw JSON schema string
alongside the GBNF Grammar. Pass both JSONSchema and ResponseFormat
through gRPCPredictOpts to backends via the new proto fields.

This allows backends like vLLM to receive the original JSON schema
for native structured output support.

Ref: #6857
Signed-off-by: eureka928 <meobius123@gmail.com>
---
 core/backend/options.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/core/backend/options.go b/core/backend/options.go
index b09782ce2ca7..afa2ab20095a 100644
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -282,6 +282,8 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions
 		TensorSplit:         c.TensorSplit,
 		TailFreeSamplingZ:   float32(*c.TFZ),
 		TypicalP:            float32(*c.TypicalP),
+		JSONSchema:          c.JSONSchema,
+		ResponseFormat:      c.ResponseFormat,
 	}
 
 	metadata := map[string]string{}

From bbb32ac24437d16df6e63fd97d8b069eac7a3b25 Mon Sep 17 00:00:00 2001
From: eureka928 <meobius123@gmail.com>
Date: Fri, 6 Mar 2026 02:27:56 +0100
Subject: [PATCH 2/7] feat(endpoints): extract raw JSON schema for structured
 output

In chat and completion endpoints, when response_format is json_schema,
extract the raw JSON schema and store it on config.JSONSchema alongside
the GBNF grammar. Also set config.ResponseFormat to the format type.

This allows backends that support native structured output (like vLLM)
to use the JSON schema directly instead of the GBNF grammar.

Ref: #6857
Signed-off-by: eureka928 <meobius123@gmail.com>
---
 core/http/endpoints/openai/chat.go       |  9 +++++++++
 core/http/endpoints/openai/completion.go | 16 +++++++++++++++-
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go
index ab715d8b166d..bb0f87ba96b1 100644
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -593,7 +593,9 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 			switch d.Type {
 			case "json_object":
 				input.Grammar = functions.JSONBNF
+				config.ResponseFormat = "json_object"
 			case "json_schema":
+				config.ResponseFormat = "json_schema"
 				d := schema.JsonSchemaRequest{}
 				dat, err := json.Marshal(config.ResponseFormatMap)
 				if err != nil {
@@ -603,6 +605,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 				if err != nil {
 					return err
 				}
+
+				// Pass raw JSON schema to backends that support native structured output
+				schemaBytes, err := json.Marshal(d.JsonSchema.Schema)
+				if err == nil {
+					config.JSONSchema = string(schemaBytes)
+				}
+
 				fs := &functions.JSONFunctionStructure{
 					AnyOf: []functions.Item{d.JsonSchema.Schema},
 				}
diff --git a/core/http/endpoints/openai/completion.go b/core/http/endpoints/openai/completion.go
index 069bc33a60f5..693dfd7d2edb 100644
--- a/core/http/endpoints/openai/completion.go
+++ b/core/http/endpoints/openai/completion.go
@@ -88,8 +88,22 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
 			d := schema.ChatCompletionResponseFormat{}
 			dat, _ := json.Marshal(config.ResponseFormatMap)
 			_ = json.Unmarshal(dat, &d)
-			if d.Type == "json_object" {
+			switch d.Type {
+			case "json_object":
 				input.Grammar = functions.JSONBNF
+				config.ResponseFormat = "json_object"
+			case "json_schema":
+				config.ResponseFormat = "json_schema"
+				jsr := schema.JsonSchemaRequest{}
+				dat, err := json.Marshal(config.ResponseFormatMap)
+				if err == nil {
+					if err := json.Unmarshal(dat, &jsr); err == nil {
+						schemaBytes, err := json.Marshal(jsr.JsonSchema.Schema)
+						if err == nil {
+							config.JSONSchema = string(schemaBytes)
+						}
+					}
+				}
 			}
 		}
 

From 3617e2aba48a45ed9680a9a38d487bb3a3a8bd4d Mon Sep 17 00:00:00 2001
From: eureka928 <meobius123@gmail.com>
Date: Fri, 6 Mar 2026 02:28:27 +0100
Subject: [PATCH 3/7] feat(vllm): add structured output support via guided
 decoding

Update the vLLM backend to support structured output:
- Import GuidedDecodingParams from vllm.sampling_params
- Handle JSONSchema: parse and pass as GuidedDecodingParams(json_schema=...)
- Handle json_object response format: GuidedDecodingParams(json_object=True)
- Fall back to Grammar (GBNF) via GuidedDecodingParams(grammar=...)
- Remove phantom GuidedDecoding mapping (field doesn't exist in proto)
- Fix missing 'import time' and 'import json' for load_video and schema parsing

Priority: JSONSchema > json_object > Grammar (GBNF fallback)

Ref: #6857
Signed-off-by: eureka928 <meobius123@gmail.com>
---
 backend/python/vllm/backend.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py
index 95ae95a9d4e6..c83926c1de32 100644
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -2,6 +2,7 @@
 import asyncio
 from concurrent import futures
 import argparse
+import json
 import signal
 import sys
 import os
@@ -21,7 +22,7 @@
 
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
-from vllm.sampling_params import SamplingParams
+from vllm.sampling_params import SamplingParams, GuidedDecodingParams
 from vllm.utils import random_uuid
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.multimodal.utils import fetch_image

From 7a4d0a6bcb9d5198080ed5dc8027a48a2e9df36a Mon Sep 17 00:00:00 2001
From: eureka928 <meobius123@gmail.com>
Date: Fri, 6 Mar 2026 02:36:37 +0100
Subject: [PATCH 4/7] fix: refine vLLM structured output implementation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Make GuidedDecodingParams import conditional (try/except) for
  backwards compatibility with older vLLM versions
- Remove GBNF grammar fallback — vLLM expects EBNF, not GBNF, so
  passing LocalAI's GBNF grammar would produce confusing errors
- Pass JSONSchema as string directly instead of parsing to dict
  (safer across vLLM versions)
- Add GBNF grammar generation for json_schema in completion endpoint
  so non-vLLM backends (llama.cpp) also get grammar enforcement

Ref: #6857
Signed-off-by: eureka928 <meobius123@gmail.com>
---
 backend/python/vllm/backend.py           | 6 +++++-
 core/http/endpoints/openai/completion.go | 9 +++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py
index c83926c1de32..8bbbf1fda34c 100644
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -22,7 +22,11 @@
 
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
-from vllm.sampling_params import SamplingParams, GuidedDecodingParams
+from vllm.sampling_params import SamplingParams
+try:
+    from vllm.sampling_params import GuidedDecodingParams
+except ImportError:
+    GuidedDecodingParams = None
 from vllm.utils import random_uuid
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.multimodal.utils import fetch_image
diff --git a/core/http/endpoints/openai/completion.go b/core/http/endpoints/openai/completion.go
index 693dfd7d2edb..208fc52aff88 100644
--- a/core/http/endpoints/openai/completion.go
+++ b/core/http/endpoints/openai/completion.go
@@ -102,6 +102,15 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
 						if err == nil {
 							config.JSONSchema = string(schemaBytes)
 						}
+						fs := &functions.JSONFunctionStructure{
+							AnyOf: []functions.Item{jsr.JsonSchema.Schema},
+						}
+						g, err := fs.Grammar(config.FunctionsConfig.GrammarOptions()...)
+						if err == nil {
+							input.Grammar = g
+						} else {
+							xlog.Error("Failed generating grammar", "error", err)
+						}
 					}
 				}
 			}

From b19d7f23eda0723d71370ac3e04c3790586a0f79 Mon Sep 17 00:00:00 2001
From: eureka928 <meobius123@gmail.com>
Date: Fri, 6 Mar 2026 03:21:58 +0100
Subject: [PATCH 5/7] fix(vllm): support both vLLM API versions and add grammar
 passthrough
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Handle both StructuredOutputsParams (vLLM latest) and
  GuidedDecodingParams (vLLM <=0.8.x) with graceful fallback
- Use the correct SamplingParams field name for each version
  (structured_outputs vs guided_decoding)
- Use 'json' parameter (not 'json_schema') matching both APIs
- Re-add grammar (GBNF/BNF) passthrough — both vLLM APIs accept
  a 'grammar' parameter handled by xgrammar, which supports GBNF
- Priority: JSONSchema > json_object > Grammar

Ref: #6857
Signed-off-by: eureka928 <meobius123@gmail.com>
---
 backend/python/vllm/backend.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py
index 8bbbf1fda34c..adfd440cfbbd 100644
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -23,10 +23,21 @@
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.sampling_params import SamplingParams
+
+# vLLM renamed GuidedDecodingParams to StructuredOutputsParams in newer versions.
+# The corresponding SamplingParams field also changed from guided_decoding to structured_outputs.
 try:
-    from vllm.sampling_params import GuidedDecodingParams
+    from vllm.sampling_params import StructuredOutputsParams
+    _structured_output_cls = StructuredOutputsParams
+    _structured_output_field = "structured_outputs"
 except ImportError:
-    GuidedDecodingParams = None
+    try:
+        from vllm.sampling_params import GuidedDecodingParams
+        _structured_output_cls = GuidedDecodingParams
+        _structured_output_field = "guided_decoding"
+    except ImportError:
+        _structured_output_cls = None
+        _structured_output_field = None
 from vllm.utils import random_uuid
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.multimodal.utils import fetch_image

From 480bcd8ed164d90b1b57ec3dd9c137c2761bd717 Mon Sep 17 00:00:00 2001
From: eureka928 <meobius123@gmail.com>
Date: Fri, 6 Mar 2026 03:22:42 +0100
Subject: [PATCH 6/7] docs: update constrained grammars with vLLM structured
 output support

Update the compatibility notice to include vLLM alongside llama.cpp.
Add a vLLM-specific section with examples for all three supported
methods: json_schema, json_object, and grammar (via xgrammar).

Ref: #6857
Signed-off-by: eureka928 <meobius123@gmail.com>
---
 docs/content/features/constrained_grammars.md | 59 ++++++++++++++++++-
 1 file changed, 58 insertions(+), 1 deletion(-)

diff --git a/docs/content/features/constrained_grammars.md b/docs/content/features/constrained_grammars.md
index 5cf39c438b4f..c594f79d8875 100644
--- a/docs/content/features/constrained_grammars.md
+++ b/docs/content/features/constrained_grammars.md
@@ -10,7 +10,11 @@ url = "/features/constrained_grammars/"
 The `chat` endpoint supports the `grammar` parameter, which allows users to specify a grammar in Backus-Naur Form (BNF). This feature enables the Large Language Model (LLM) to generate outputs adhering to a user-defined schema, such as `JSON`, `YAML`, or any other format that can be defined using BNF. For more details about BNF, see [Backus-Naur Form on Wikipedia](https://en.wikipedia.org/wiki/Backus%E2%80%93Naur_form).
 
 {{% notice note %}}
-**Compatibility Notice:** This feature is only supported by models that use the [llama.cpp](https://github.com/ggerganov/llama.cpp) backend. For a complete list of compatible models, refer to the [Model Compatibility]({{%relref "reference/compatibility-table" %}}) page. For technical details, see the related pull requests: [PR #1773](https://github.com/ggerganov/llama.cpp/pull/1773) and [PR #1887](https://github.com/ggerganov/llama.cpp/pull/1887).
+**Compatibility Notice:** Grammar and structured output support is available for the following backends:
+- **llama.cpp** — supports the `grammar` parameter (GBNF syntax) and `response_format` with `json_schema`/`json_object`
+- **vLLM** — supports the `grammar` parameter (via xgrammar) and `response_format` with `json_schema` (native JSON schema enforcement) or `json_object`
+
+For a complete list of compatible models, refer to the [Model Compatibility]({{%relref "reference/compatibility-table" %}}) page.
  {{% /notice %}}
 
 ## Setup
@@ -66,6 +70,59 @@ For more complex grammars, you can define multi-line BNF rules. The grammar pars
 - Character classes (`[a-z]`)
 - String literals (`"text"`)
 
+## vLLM Backend
+
+The vLLM backend supports structured output via three methods:
+
+### JSON Schema (recommended)
+
+Use the OpenAI-compatible `response_format` parameter with `json_schema` to enforce a specific JSON structure:
+
+```bash
+curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+  "model": "my-vllm-model",
+  "messages": [{"role": "user", "content": "Generate a person object"}],
+  "response_format": {
+    "type": "json_schema",
+    "json_schema": {
+      "name": "person",
+      "schema": {
+        "type": "object",
+        "properties": {
+          "name": {"type": "string"},
+          "age": {"type": "integer"}
+        },
+        "required": ["name", "age"]
+      }
+    }
+  }
+}'
+```
+
+### JSON Object
+
+Force the model to output valid JSON (without a specific schema):
+
+```bash
+curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+  "model": "my-vllm-model",
+  "messages": [{"role": "user", "content": "Generate a person as JSON"}],
+  "response_format": {"type": "json_object"}
+}'
+```
+
+### Grammar
+
+The `grammar` parameter also works with vLLM via xgrammar:
+
+```bash
+curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
+  "model": "my-vllm-model",
+  "messages": [{"role": "user", "content": "Do you like apples?"}],
+  "grammar": "root ::= (\"yes\" | \"no\")"
+}'
+```
+
 ## Related Features
 
 - [OpenAI Functions]({{%relref "features/openai-functions" %}}) - Function calling with structured outputs

From c703a03247463170026c1c4f6be29dbced4a956e Mon Sep 17 00:00:00 2001
From: eureka928 <meobius123@gmail.com>
Date: Fri, 13 Mar 2026 19:14:07 +0100
Subject: [PATCH 7/7] refactor: use Metadata map instead of dedicated proto
 fields for structured output

Address review feedback:
- Remove JSONSchema and ResponseFormat proto fields; pass them via the
  existing Metadata map instead, avoiding proto changes
- vLLM backend reads json_schema and response_format from request.Metadata
- Add structured output support (json_schema, json_object) to the Open
  Responses API via the text_format parameter
- Update docs with Open Responses structured output examples

Ref: #6857

Signed-off-by: eureka928 <meobius123@gmail.com>
---
 core/backend/options.go                       |  8 +++-
 core/http/endpoints/openai/chat.go            |  7 +++-
 core/http/endpoints/openai/completion.go      |  5 ++-
 .../http/endpoints/openresponses/responses.go | 37 ++++++++++++++++++-
 docs/content/features/constrained_grammars.md | 37 +++++++++++++++++++
 5 files changed, 87 insertions(+), 7 deletions(-)

diff --git a/core/backend/options.go b/core/backend/options.go
index afa2ab20095a..17adbceb458b 100644
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -282,8 +282,6 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions
 		TensorSplit:         c.TensorSplit,
 		TailFreeSamplingZ:   float32(*c.TFZ),
 		TypicalP:            float32(*c.TypicalP),
-		JSONSchema:          c.JSONSchema,
-		ResponseFormat:      c.ResponseFormat,
 	}
 
 	metadata := map[string]string{}
@@ -294,6 +292,12 @@ func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions
 			metadata["enable_thinking"] = "true"
 		}
 	}
+	if c.ResponseFormat != "" {
+		metadata["response_format"] = c.ResponseFormat
+	}
+	for k, v := range c.RequestMetadata {
+		metadata[k] = v
+	}
 	pbOpts.Metadata = metadata
 
 	// Logprobs and TopLogprobs are set by the caller if provided
diff --git a/core/http/endpoints/openai/chat.go b/core/http/endpoints/openai/chat.go
index bb0f87ba96b1..8f8f79d84a39 100644
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -606,10 +606,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
 					return err
 				}
 
-				// Pass raw JSON schema to backends that support native structured output
+				// Pass raw JSON schema via metadata for backends that support native structured output
 				schemaBytes, err := json.Marshal(d.JsonSchema.Schema)
 				if err == nil {
-					config.JSONSchema = string(schemaBytes)
+					if config.RequestMetadata == nil {
+						config.RequestMetadata = map[string]string{}
+					}
+					config.RequestMetadata["json_schema"] = string(schemaBytes)
 				}
 
 				fs := &functions.JSONFunctionStructure{
diff --git a/core/http/endpoints/openai/completion.go b/core/http/endpoints/openai/completion.go
index 208fc52aff88..6e5aae07423e 100644
--- a/core/http/endpoints/openai/completion.go
+++ b/core/http/endpoints/openai/completion.go
@@ -100,7 +100,10 @@ func CompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eva
 					if err := json.Unmarshal(dat, &jsr); err == nil {
 						schemaBytes, err := json.Marshal(jsr.JsonSchema.Schema)
 						if err == nil {
-							config.JSONSchema = string(schemaBytes)
+							if config.RequestMetadata == nil {
+								config.RequestMetadata = map[string]string{}
+							}
+							config.RequestMetadata["json_schema"] = string(schemaBytes)
 						}
 						fs := &functions.JSONFunctionStructure{
 							AnyOf: []functions.Item{jsr.JsonSchema.Schema},
diff --git a/core/http/endpoints/openresponses/responses.go b/core/http/endpoints/openresponses/responses.go
index 764156d4da1b..45f18edc5de8 100644
--- a/core/http/endpoints/openresponses/responses.go
+++ b/core/http/endpoints/openresponses/responses.go
@@ -173,9 +173,42 @@ func ResponsesEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, eval
 			Functions: funcs,
 		}
 
-		// Handle text_format -> response_format conversion
+		// Handle text_format -> response_format conversion and structured output
 		if input.TextFormat != nil {
-			openAIReq.ResponseFormat = convertTextFormatToResponseFormat(input.TextFormat)
+			responseFormat := convertTextFormatToResponseFormat(input.TextFormat)
+			openAIReq.ResponseFormat = responseFormat
+
+			// Generate grammar and pass schema for structured output (like OpenAI chat/completion)
+			if rfMap, ok := responseFormat.(map[string]interface{}); ok {
+				if rfType, _ := rfMap["type"].(string); rfType == "json_object" {
+					cfg.Grammar = functions.JSONBNF
+					cfg.ResponseFormat = "json_object"
+				} else if rfType == "json_schema" {
+					cfg.ResponseFormat = "json_schema"
+					d := schema.JsonSchemaRequest{}
+					dat, err := json.Marshal(rfMap)
+					if err == nil {
+						if err := json.Unmarshal(dat, &d); err == nil {
+							schemaBytes, err := json.Marshal(d.JsonSchema.Schema)
+							if err == nil {
+								if cfg.RequestMetadata == nil {
+									cfg.RequestMetadata = map[string]string{}
+								}
+								cfg.RequestMetadata["json_schema"] = string(schemaBytes)
+							}
+							fs := &functions.JSONFunctionStructure{
+								AnyOf: []functions.Item{d.JsonSchema.Schema},
+							}
+							g, err := fs.Grammar(cfg.FunctionsConfig.GrammarOptions()...)
+							if err == nil {
+								cfg.Grammar = g
+							} else {
+								xlog.Error("Open Responses - Failed generating grammar for json_schema", "error", err)
+							}
+						}
+					}
+				}
+			}
 		}
 
 		// Generate grammar for function calling (similar to OpenAI chat endpoint)
diff --git a/docs/content/features/constrained_grammars.md b/docs/content/features/constrained_grammars.md
index c594f79d8875..0ba4e4d43b51 100644
--- a/docs/content/features/constrained_grammars.md
+++ b/docs/content/features/constrained_grammars.md
@@ -123,6 +123,43 @@ curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/jso
 }'
 ```
 
+## Open Responses API
+
+The Open Responses API (`/v1/responses`) also supports structured output via the `text_format` parameter:
+
+### JSON Schema
+
+```bash
+curl http://localhost:8080/v1/responses -H "Content-Type: application/json" -d '{
+  "model": "my-model",
+  "input": "Generate a person object",
+  "text_format": {
+    "type": "json_schema",
+    "json_schema": {
+      "name": "person",
+      "schema": {
+        "type": "object",
+        "properties": {
+          "name": {"type": "string"},
+          "age": {"type": "integer"}
+        },
+        "required": ["name", "age"]
+      }
+    }
+  }
+}'
+```
+
+### JSON Object
+
+```bash
+curl http://localhost:8080/v1/responses -H "Content-Type: application/json" -d '{
+  "model": "my-model",
+  "input": "Generate a person as JSON",
+  "text_format": {"type": "json_object"}
+}'
+```
+
 ## Related Features
 
 - [OpenAI Functions]({{%relref "features/openai-functions" %}}) - Function calling with structured outputs