From 7d35b7b09743f88fab3526ed4d835b24939b10ca Mon Sep 17 00:00:00 2001
From: r266-tech <r2668940489@gmail.com>
Date: Sun, 21 Jun 2026 16:48:49 +0800
Subject: [PATCH 1/3] fix(vlm): default max_tokens fallback exceeds gpt-4o-mini
 completion cap (#2751)

The unset-fallback default of 32768 exceeds the 16384 completion-token cap of the
OpenAI VLM backend's own default model (gpt-4o-mini), so a default-configured
deployment gets an HTTP 400. The memory-extraction path swallows that error, which
results in a silent 0-memory extraction. Lower the fallback to a named
_DEFAULT_MAX_TOKENS = 16384 and fall back only when max_tokens is genuinely unset
(None), leaving explicit values untouched.
---
 openviking/models/vlm/backends/openai_vlm.py | 27 ++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/openviking/models/vlm/backends/openai_vlm.py b/openviking/models/vlm/backends/openai_vlm.py
index 012adcbf3d..b3f1f9b58d 100644
--- a/openviking/models/vlm/backends/openai_vlm.py
+++ b/openviking/models/vlm/backends/openai_vlm.py
@@ -35,6 +35,19 @@
 _REASONING_MODEL_PREFIXES = ("gpt-5", "o1", "o3", "o4")
 
 
+# Default completion-token cap used when the VLM config does not set ``max_tokens``.
+# It must not exceed the completion-token limit of this backend's own default model
+# (``gpt-4o-mini`` / ``gpt-4o``, which cap completion at 16384 tokens). The previous
+# fallback of 32768 is rejected by those models with an HTTP 400 ("max_tokens is too
+# large ... supports at most 16384 completion tokens"); the memory-extraction path
+# swallows that 400 and returns 0 extracted memories with no surfaced error, so a
+# default-configured deployment silently extracts nothing (issue #2751). Memory
+# extraction emits small JSON, so 16384 introduces no real truncation while still
+# guarding against runaway generation. Callers that need a larger budget set
+# ``max_tokens`` explicitly, which is honored unchanged.
+_DEFAULT_MAX_TOKENS = 16384
+
+
 def _is_reasoning_model(model: Optional[str]) -> bool:
     """OpenAI reasoning-model families reject `max_tokens` and non-default `temperature`.
 
@@ -233,7 +246,12 @@ def _build_text_kwargs(
         else:
             kwargs["temperature"] = self.temperature
         self._apply_provider_specific_extra_body(kwargs, effective_thinking)
-        max_tokens = self.max_tokens or 32768
+        # Fall back to the default only when max_tokens is genuinely unset (None);
+        # an explicitly configured value (including a degenerate 0) is passed through
+        # so bad config surfaces loudly instead of being silently rewritten.
+        max_tokens = (
+            self.max_tokens if self.max_tokens is not None else _DEFAULT_MAX_TOKENS
+        )
         kwargs["max_completion_tokens" if is_reasoning else "max_tokens"] = max_tokens
         if tools:
             kwargs["tools"] = tools
@@ -271,7 +289,12 @@ def _build_vision_kwargs(
         else:
             kwargs["temperature"] = self.temperature
         self._apply_provider_specific_extra_body(kwargs, effective_thinking)
-        max_tokens = self.max_tokens or 32768
+        # Fall back to the default only when max_tokens is genuinely unset (None);
+        # an explicitly configured value (including a degenerate 0) is passed through
+        # so bad config surfaces loudly instead of being silently rewritten.
+        max_tokens = (
+            self.max_tokens if self.max_tokens is not None else _DEFAULT_MAX_TOKENS
+        )
         kwargs["max_completion_tokens" if is_reasoning else "max_tokens"] = max_tokens
         if tools:
             kwargs["tools"] = tools

From 5beabc50c80af22692766c475e07fb5d44ce6118 Mon Sep 17 00:00:00 2001
From: r266-tech <r2668940489@gmail.com>
Date: Sun, 21 Jun 2026 16:48:50 +0800
Subject: [PATCH 2/3] test(vlm): regression for default max_tokens fallback
 within model cap (#2751)

Pins the unset default to 16384 (<= gpt-4o-mini cap), and checks that an explicit
value is honored and that an explicit falsy 0 is not silently replaced by the default.
---
 tests/unit/test_vlm_default_max_tokens.py | 58 +++++++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 tests/unit/test_vlm_default_max_tokens.py

diff --git a/tests/unit/test_vlm_default_max_tokens.py b/tests/unit/test_vlm_default_max_tokens.py
new file mode 100644
index 0000000000..06f43e6305
--- /dev/null
+++ b/tests/unit/test_vlm_default_max_tokens.py
@@ -0,0 +1,58 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
+"""Regression tests for the OpenAI VLM default ``max_tokens`` fallback (issue #2751).
+
+When ``max_tokens`` is not configured, the OpenAI VLM backend falls back to a default
+that must not exceed the completion-token cap of the backend's own default model
+(``gpt-4o-mini`` / ``gpt-4o``, capped at 16384 completion tokens). The previous
+fallback of 32768 produced an HTTP 400 ("max_tokens is too large ... supports at most
+16384 completion tokens") that the memory-extraction path swallowed, silently yielding
+0 extracted memories for default-configured deployments.
+"""
+
+from openviking.models.vlm.backends.openai_vlm import _DEFAULT_MAX_TOKENS, OpenAIVLM
+
+# gpt-4o / gpt-4o-mini (the backend default model) cap completion at 16384 tokens.
+_GPT_4O_COMPLETION_CAP = 16384
+
+
+def _make_vlm(**overrides):
+    config = {
+        "api_key": "sk-test",
+        "api_base": "https://api.openai.com/v1",
+    }
+    config.update(overrides)
+    return OpenAIVLM(config)
+
+
+class TestDefaultMaxTokensFallback:
+    """Unset ``max_tokens`` must fall back to a value the default model accepts."""
+
+    def test_default_fallback_within_default_model_cap(self):
+        assert _DEFAULT_MAX_TOKENS <= _GPT_4O_COMPLETION_CAP
+
+    def test_text_kwargs_default_model_unset_max_tokens(self):
+        # No model -> backend default gpt-4o-mini; no max_tokens -> fallback default.
+        kwargs = _make_vlm()._build_text_kwargs(prompt="hi")
+        assert kwargs["model"] == "gpt-4o-mini"
+        assert kwargs["max_tokens"] == _DEFAULT_MAX_TOKENS
+        assert kwargs["max_tokens"] <= _GPT_4O_COMPLETION_CAP
+
+    def test_vision_kwargs_default_model_unset_max_tokens(self):
+        kwargs = _make_vlm()._build_vision_kwargs(prompt="describe this")
+        assert kwargs["model"] == "gpt-4o-mini"
+        assert kwargs["max_tokens"] == _DEFAULT_MAX_TOKENS
+        assert kwargs["max_tokens"] <= _GPT_4O_COMPLETION_CAP
+
+    def test_explicit_max_tokens_is_respected(self):
+        # An explicitly configured max_tokens must override the fallback unchanged.
+        vlm = _make_vlm(max_tokens=512)
+        assert vlm._build_text_kwargs(prompt="hi")["max_tokens"] == 512
+        assert vlm._build_vision_kwargs(prompt="x")["max_tokens"] == 512
+
+    def test_explicit_zero_max_tokens_not_replaced_by_default(self):
+        # The fallback fires only when max_tokens is unset (None); an explicit value
+        # is passed through unchanged, so the default never silently overrides config.
+        vlm = _make_vlm(max_tokens=0)
+        assert vlm._build_text_kwargs(prompt="hi")["max_tokens"] == 0
+        assert vlm._build_vision_kwargs(prompt="x")["max_tokens"] == 0

From 3fc83809ed557bfe83d8b2d70a88987d1b45b382 Mon Sep 17 00:00:00 2001
From: r266-tech <r2668940489@gmail.com>
Date: Wed, 24 Jun 2026 00:54:11 +0800
Subject: [PATCH 3/3] fix(vlm): keep reasoning-model unset max_tokens at prior
 32768 default

Addresses @chenjw review: the original fix lowered the unset max_tokens
fallback to 16384 for ALL models, but #2751 is specific to the gpt-4o
family (gpt-4o / gpt-4o-mini cap completion at 16384). Reasoning models
(gpt-5/o1/o3/o4) advertise larger completion limits and spend a hidden
reasoning-token budget out of max_completion_tokens, so the 16384 cap
would needlessly truncate them.

Make the unset fallback conditional on is_reasoning: reasoning models keep
their prior 32768 default (sent as max_completion_tokens); all non-reasoning
models, including the gpt-4o family, default to 16384 (sent as max_tokens).
Explicitly configured max_tokens (including 0) is still passed through
unchanged. Adds reasoning-model regression tests.
---
 openviking/models/vlm/backends/openai_vlm.py | 46 ++++++++++++++------
 tests/unit/test_vlm_default_max_tokens.py    | 27 +++++++++++-
 2 files changed, 58 insertions(+), 15 deletions(-)

diff --git a/openviking/models/vlm/backends/openai_vlm.py b/openviking/models/vlm/backends/openai_vlm.py
index b3f1f9b58d..6a7a728b18 100644
--- a/openviking/models/vlm/backends/openai_vlm.py
+++ b/openviking/models/vlm/backends/openai_vlm.py
@@ -35,18 +35,26 @@
 _REASONING_MODEL_PREFIXES = ("gpt-5", "o1", "o3", "o4")
 
 
-# Default completion-token cap used when the VLM config does not set ``max_tokens``.
-# It must not exceed the completion-token limit of this backend's own default model
-# (``gpt-4o-mini`` / ``gpt-4o``, which cap completion at 16384 tokens). The previous
-# fallback of 32768 is rejected by those models with an HTTP 400 ("max_tokens is too
-# large ... supports at most 16384 completion tokens"); the memory-extraction path
-# swallows that 400 and returns 0 extracted memories with no surfaced error, so a
-# default-configured deployment silently extracts nothing (issue #2751). Memory
-# extraction emits small JSON, so 16384 introduces no real truncation while still
-# guarding against runaway generation. Callers that need a larger budget set
-# ``max_tokens`` explicitly, which is honored unchanged.
+# Default completion-token cap used when the VLM config does not set ``max_tokens``,
+# for this backend's own default *non-reasoning* model. It must not exceed the
+# completion-token limit of ``gpt-4o-mini`` / ``gpt-4o`` (which cap completion at
+# 16384 tokens). The previous fallback of 32768 is rejected by those models with an
+# HTTP 400 ("max_tokens is too large ... supports at most 16384 completion tokens");
+# the memory-extraction path swallows that 400 and returns 0 extracted memories with
+# no surfaced error, so a default-configured deployment silently extracts nothing
+# (issue #2751). Memory extraction emits small JSON, so 16384 introduces no real
+# truncation while still guarding against runaway generation. Callers that need a
+# larger budget set ``max_tokens`` explicitly, which is honored unchanged.
 _DEFAULT_MAX_TOKENS = 16384
 
+# Reasoning models (``gpt-5`` / ``o1`` / ``o3`` / ``o4``) advertise much larger
+# completion-token limits and additionally spend a hidden reasoning-token budget out
+# of ``max_completion_tokens``; the 16384 non-reasoning cap would needlessly truncate
+# them. They were never affected by #2751 (that 400 is specific to the gpt-4o family's
+# 16384 cap), so the unset fallback for reasoning models is left at its prior 32768
+# value rather than lowered. Explicitly configured ``max_tokens`` is still honored.
+_DEFAULT_REASONING_MAX_TOKENS = 32768
+
 
 def _is_reasoning_model(model: Optional[str]) -> bool:
     """OpenAI reasoning-model families reject `max_tokens` and non-default `temperature`.
@@ -248,9 +256,14 @@ def _build_text_kwargs(
         self._apply_provider_specific_extra_body(kwargs, effective_thinking)
         # Fall back to the default only when max_tokens is genuinely unset (None);
         # an explicitly configured value (including a degenerate 0) is passed through
-        # so bad config surfaces loudly instead of being silently rewritten.
+        # so bad config surfaces loudly instead of being silently rewritten. Reasoning
+        # models keep their prior 32768 unset default (they were not hit by #2751 and
+        # accept larger completion budgets); all non-reasoning models default to 16384.
+        default_max_tokens = (
+            _DEFAULT_REASONING_MAX_TOKENS if is_reasoning else _DEFAULT_MAX_TOKENS
+        )
         max_tokens = (
-            self.max_tokens if self.max_tokens is not None else _DEFAULT_MAX_TOKENS
+            self.max_tokens if self.max_tokens is not None else default_max_tokens
         )
         kwargs["max_completion_tokens" if is_reasoning else "max_tokens"] = max_tokens
         if tools:
@@ -291,9 +304,14 @@ def _build_vision_kwargs(
         self._apply_provider_specific_extra_body(kwargs, effective_thinking)
         # Fall back to the default only when max_tokens is genuinely unset (None);
         # an explicitly configured value (including a degenerate 0) is passed through
-        # so bad config surfaces loudly instead of being silently rewritten.
+        # so bad config surfaces loudly instead of being silently rewritten. Reasoning
+        # models keep their prior 32768 unset default (they were not hit by #2751 and
+        # accept larger completion budgets); all non-reasoning models default to 16384.
+        default_max_tokens = (
+            _DEFAULT_REASONING_MAX_TOKENS if is_reasoning else _DEFAULT_MAX_TOKENS
+        )
         max_tokens = (
-            self.max_tokens if self.max_tokens is not None else _DEFAULT_MAX_TOKENS
+            self.max_tokens if self.max_tokens is not None else default_max_tokens
         )
         kwargs["max_completion_tokens" if is_reasoning else "max_tokens"] = max_tokens
         if tools:
diff --git a/tests/unit/test_vlm_default_max_tokens.py b/tests/unit/test_vlm_default_max_tokens.py
index 06f43e6305..f7256bc797 100644
--- a/tests/unit/test_vlm_default_max_tokens.py
+++ b/tests/unit/test_vlm_default_max_tokens.py
@@ -10,7 +10,11 @@
 0 extracted memories for default-configured deployments.
 """
 
-from openviking.models.vlm.backends.openai_vlm import _DEFAULT_MAX_TOKENS, OpenAIVLM
+from openviking.models.vlm.backends.openai_vlm import (
+    _DEFAULT_MAX_TOKENS,
+    _DEFAULT_REASONING_MAX_TOKENS,
+    OpenAIVLM,
+)
 
 # gpt-4o / gpt-4o-mini (the backend default model) cap completion at 16384 tokens.
 _GPT_4O_COMPLETION_CAP = 16384
@@ -56,3 +60,24 @@ def test_explicit_zero_max_tokens_not_replaced_by_default(self):
         vlm = _make_vlm(max_tokens=0)
         assert vlm._build_text_kwargs(prompt="hi")["max_tokens"] == 0
         assert vlm._build_vision_kwargs(prompt="x")["max_tokens"] == 0
+
+
+class TestReasoningModelDefaultUnchanged:
+    """Reasoning models keep their prior 32768 unset default (not lowered by #2751)."""
+
+    def test_reasoning_unset_keeps_prior_default(self):
+        # gpt-5 is a reasoning model: unset max_tokens -> prior 32768 via
+        # max_completion_tokens, NOT the lowered gpt-4o-family cap. Reasoning models
+        # advertise larger completion limits and spend hidden reasoning tokens from
+        # this budget, so the #2751 16384 cap must not apply to them.
+        for builder in ("_build_text_kwargs", "_build_vision_kwargs"):
+            kwargs = getattr(_make_vlm(model="gpt-5"), builder)(prompt="hi")
+            assert "max_tokens" not in kwargs
+            assert kwargs["max_completion_tokens"] == _DEFAULT_REASONING_MAX_TOKENS
+            assert _DEFAULT_REASONING_MAX_TOKENS > _DEFAULT_MAX_TOKENS
+
+    def test_reasoning_explicit_max_tokens_respected(self):
+        # An explicit budget on a reasoning model is still honored unchanged.
+        vlm = _make_vlm(model="o3", max_tokens=4096)
+        assert vlm._build_text_kwargs(prompt="hi")["max_completion_tokens"] == 4096
+        assert vlm._build_vision_kwargs(prompt="x")["max_completion_tokens"] == 4096