volcengine · r266-tech · Jun 21, 2026 · Jun 21, 2026 · Jun 23, 2026 · chenjw
diff --git a/openviking/models/vlm/backends/openai_vlm.py b/openviking/models/vlm/backends/openai_vlm.py
@@ -35,6 +35,27 @@
 _REASONING_MODEL_PREFIXES = ("gpt-5", "o1", "o3", "o4")
 
 
+# Default completion-token cap used when the VLM config does not set ``max_tokens``,
+# for this backend's own default *non-reasoning* model. It must not exceed the
+# completion-token limit of ``gpt-4o-mini`` / ``gpt-4o`` (which cap completion at
+# 16384 tokens). The previous fallback of 32768 is rejected by those models with an
+# HTTP 400 ("max_tokens is too large ... supports at most 16384 completion tokens");
+# the memory-extraction path swallows that 400 and returns 0 extracted memories with
+# no surfaced error, so a default-configured deployment silently extracts nothing
+# (issue #2751). Memory extraction emits small JSON, so 16384 introduces no real
+# truncation while still guarding against runaway generation. Callers that need a
+# larger budget set ``max_tokens`` explicitly, which is honored unchanged.
+_DEFAULT_MAX_TOKENS = 16384
+
+# Reasoning models (``gpt-5`` / ``o1`` / ``o3`` / ``o4``) advertise much larger
+# completion-token limits and additionally spend a hidden reasoning-token budget out
+# of ``max_completion_tokens``; the 16384 non-reasoning cap would needlessly truncate
+# them. They were never affected by #2751 (that 400 is specific to the gpt-4o family's
+# 16384 cap), so the unset fallback for reasoning models is left at its prior 32768
+# value rather than lowered. Explicitly configured ``max_tokens`` is still honored.
+_DEFAULT_REASONING_MAX_TOKENS = 32768
+
+
 def _is_reasoning_model(model: Optional[str]) -> bool:
     """OpenAI reasoning-model families reject `max_tokens` and non-default `temperature`.
 
@@ -233,7 +254,17 @@ def _build_text_kwargs(
         else:
             kwargs["temperature"] = self.temperature
         self._apply_provider_specific_extra_body(kwargs, effective_thinking)
-        max_tokens = self.max_tokens or 32768
+        # Fall back to the default only when max_tokens is genuinely unset (None);
+        # an explicitly configured value (including a degenerate 0) is passed through
+        # so bad config surfaces loudly instead of being silently rewritten. Reasoning
+        # models keep their prior 32768 unset default (they were not hit by #2751 and
+        # accept larger completion budgets); only the gpt-4o-family default is lowered.
+        default_max_tokens = (
+            _DEFAULT_REASONING_MAX_TOKENS if is_reasoning else _DEFAULT_MAX_TOKENS
+        )
+        max_tokens = (
+            self.max_tokens if self.max_tokens is not None else default_max_tokens
+        )
         kwargs["max_completion_tokens" if is_reasoning else "max_tokens"] = max_tokens
         if tools:
             kwargs["tools"] = tools
@@ -271,7 +302,17 @@ def _build_vision_kwargs(
         else:
             kwargs["temperature"] = self.temperature
         self._apply_provider_specific_extra_body(kwargs, effective_thinking)
-        max_tokens = self.max_tokens or 32768
+        # Fall back to the default only when max_tokens is genuinely unset (None);
+        # an explicitly configured value (including a degenerate 0) is passed through
+        # so bad config surfaces loudly instead of being silently rewritten. Reasoning
+        # models keep their prior 32768 unset default (they were not hit by #2751 and
+        # accept larger completion budgets); only the gpt-4o-family default is lowered.
+        default_max_tokens = (
+            _DEFAULT_REASONING_MAX_TOKENS if is_reasoning else _DEFAULT_MAX_TOKENS
+        )
+        max_tokens = (
+            self.max_tokens if self.max_tokens is not None else default_max_tokens
+        )
         kwargs["max_completion_tokens" if is_reasoning else "max_tokens"] = max_tokens
         if tools:
             kwargs["tools"] = tools

diff --git a/tests/unit/test_vlm_default_max_tokens.py b/tests/unit/test_vlm_default_max_tokens.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
+# SPDX-License-Identifier: AGPL-3.0
+"""Regression tests for the OpenAI VLM default ``max_tokens`` fallback (issue #2751).
+
+When ``max_tokens`` is not configured, the OpenAI VLM backend falls back to a default
+that must not exceed the completion-token cap of the backend's own default model
+(``gpt-4o-mini`` / ``gpt-4o``, capped at 16384 completion tokens). The previous
+fallback of 32768 produced an HTTP 400 ("max_tokens is too large ... supports at most
+16384 completion tokens") that the memory-extraction path swallowed, silently yielding
+0 extracted memories for default-configured deployments.
+"""
+
+from openviking.models.vlm.backends.openai_vlm import (
+    _DEFAULT_MAX_TOKENS,
+    _DEFAULT_REASONING_MAX_TOKENS,
+    OpenAIVLM,
+)
+
+# gpt-4o / gpt-4o-mini (the backend default model) cap completion at 16384 tokens.
+_GPT_4O_COMPLETION_CAP = 16384
+
+
+def _make_vlm(**overrides):
+    config = {
+        "api_key": "sk-test",
+        "api_base": "https://api.openai.com/v1",
+    }
+    config.update(overrides)
+    return OpenAIVLM(config)
+
+
+class TestDefaultMaxTokensFallback:
+    """Unset ``max_tokens`` must fall back to a value the default model accepts."""
+
+    def test_default_fallback_within_default_model_cap(self):
+        assert _DEFAULT_MAX_TOKENS <= _GPT_4O_COMPLETION_CAP
+
+    def test_text_kwargs_default_model_unset_max_tokens(self):
+        # No model -> backend default gpt-4o-mini; no max_tokens -> fallback default.
+        kwargs = _make_vlm()._build_text_kwargs(prompt="hi")
+        assert kwargs["model"] == "gpt-4o-mini"
+        assert kwargs["max_tokens"] == _DEFAULT_MAX_TOKENS
+        assert kwargs["max_tokens"] <= _GPT_4O_COMPLETION_CAP
+
+    def test_vision_kwargs_default_model_unset_max_tokens(self):
+        kwargs = _make_vlm()._build_vision_kwargs(prompt="describe this")
+        assert kwargs["model"] == "gpt-4o-mini"
+        assert kwargs["max_tokens"] == _DEFAULT_MAX_TOKENS
+        assert kwargs["max_tokens"] <= _GPT_4O_COMPLETION_CAP
+
+    def test_explicit_max_tokens_is_respected(self):
+        # An explicitly configured max_tokens must override the fallback unchanged.
+        vlm = _make_vlm(max_tokens=512)
+        assert vlm._build_text_kwargs(prompt="hi")["max_tokens"] == 512
+        assert vlm._build_vision_kwargs(prompt="x")["max_tokens"] == 512
+
+    def test_explicit_zero_max_tokens_not_replaced_by_default(self):
+        # The fallback fires only when max_tokens is unset (None); an explicit value
+        # is passed through unchanged, so the default never silently overrides config.
+        vlm = _make_vlm(max_tokens=0)
+        assert vlm._build_text_kwargs(prompt="hi")["max_tokens"] == 0
+        assert vlm._build_vision_kwargs(prompt="x")["max_tokens"] == 0
+
+
+class TestReasoningModelDefaultUnchanged:
+    """Reasoning models keep their prior 32768 unset default (not lowered by #2751)."""
+
+    def test_reasoning_unset_keeps_prior_default(self):
+        # gpt-5 is a reasoning model: unset max_tokens -> prior 32768 via
+        # max_completion_tokens, NOT the lowered gpt-4o-family cap. Reasoning models
+        # advertise larger completion limits and spend hidden reasoning tokens from
+        # this budget, so the #2751 16384 cap must not apply to them.
+        for builder in ("_build_text_kwargs", "_build_vision_kwargs"):
+            kwargs = getattr(_make_vlm(model="gpt-5"), builder)(prompt="hi")
+            assert "max_tokens" not in kwargs
+            assert kwargs["max_completion_tokens"] == _DEFAULT_REASONING_MAX_TOKENS
+            assert _DEFAULT_REASONING_MAX_TOKENS > _DEFAULT_MAX_TOKENS
+
+    def test_reasoning_explicit_max_tokens_respected(self):
+        # An explicit budget on a reasoning model is still honored unchanged.
+        vlm = _make_vlm(model="o3", max_tokens=4096)
+        assert vlm._build_text_kwargs(prompt="hi")["max_completion_tokens"] == 4096
+        assert vlm._build_vision_kwargs(prompt="x")["max_completion_tokens"] == 4096