From 7d35b7b09743f88fab3526ed4d835b24939b10ca Mon Sep 17 00:00:00 2001 From: r266-tech Date: Sun, 21 Jun 2026 16:48:49 +0800 Subject: [PATCH 1/3] fix(vlm): default max_tokens fallback exceeds gpt-4o-mini completion cap (#2751) The unset-fallback default of 32768 exceeds the 16384 completion-token cap of the OpenAI VLM backend's own default model (gpt-4o-mini), so a default-configured deployment gets an HTTP 400 that the memory-extraction path swallows -> a silent 0-memory extraction. Lower the fallback to a named _DEFAULT_MAX_TOKENS=16384 and fall back only when max_tokens is genuinely unset (None), leaving explicit values untouched. --- openviking/models/vlm/backends/openai_vlm.py | 27 ++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/openviking/models/vlm/backends/openai_vlm.py b/openviking/models/vlm/backends/openai_vlm.py index 012adcbf3d..b3f1f9b58d 100644 --- a/openviking/models/vlm/backends/openai_vlm.py +++ b/openviking/models/vlm/backends/openai_vlm.py @@ -35,6 +35,19 @@ _REASONING_MODEL_PREFIXES = ("gpt-5", "o1", "o3", "o4") +# Default completion-token cap used when the VLM config does not set ``max_tokens``. +# It must not exceed the completion-token limit of this backend's own default model +# (``gpt-4o-mini`` / ``gpt-4o``, which cap completion at 16384 tokens). The previous +# fallback of 32768 is rejected by those models with an HTTP 400 ("max_tokens is too +# large ... supports at most 16384 completion tokens"); the memory-extraction path +# swallows that 400 and returns 0 extracted memories with no surfaced error, so a +# default-configured deployment silently extracts nothing (issue #2751). Memory +# extraction emits small JSON, so 16384 introduces no real truncation while still +# guarding against runaway generation. Callers that need a larger budget set +# ``max_tokens`` explicitly, which is honored unchanged. 
+_DEFAULT_MAX_TOKENS = 16384 + + def _is_reasoning_model(model: Optional[str]) -> bool: """OpenAI reasoning-model families reject `max_tokens` and non-default `temperature`. @@ -233,7 +246,12 @@ def _build_text_kwargs( else: kwargs["temperature"] = self.temperature self._apply_provider_specific_extra_body(kwargs, effective_thinking) - max_tokens = self.max_tokens or 32768 + # Fall back to the default only when max_tokens is genuinely unset (None); + # an explicitly configured value (including a degenerate 0) is passed through + # so bad config surfaces loudly instead of being silently rewritten. + max_tokens = ( + self.max_tokens if self.max_tokens is not None else _DEFAULT_MAX_TOKENS + ) kwargs["max_completion_tokens" if is_reasoning else "max_tokens"] = max_tokens if tools: kwargs["tools"] = tools @@ -271,7 +289,12 @@ def _build_vision_kwargs( else: kwargs["temperature"] = self.temperature self._apply_provider_specific_extra_body(kwargs, effective_thinking) - max_tokens = self.max_tokens or 32768 + # Fall back to the default only when max_tokens is genuinely unset (None); + # an explicitly configured value (including a degenerate 0) is passed through + # so bad config surfaces loudly instead of being silently rewritten. + max_tokens = ( + self.max_tokens if self.max_tokens is not None else _DEFAULT_MAX_TOKENS + ) kwargs["max_completion_tokens" if is_reasoning else "max_tokens"] = max_tokens if tools: kwargs["tools"] = tools From 5beabc50c80af22692766c475e07fb5d44ce6118 Mon Sep 17 00:00:00 2001 From: r266-tech Date: Sun, 21 Jun 2026 16:48:50 +0800 Subject: [PATCH 2/3] test(vlm): regression for default max_tokens fallback within model cap (#2751) Pins the unset default to 16384 (<= gpt-4o-mini cap) and verifies that an explicit value is honored and that an explicit falsy 0 is not silently replaced by the default.
--- tests/unit/test_vlm_default_max_tokens.py | 58 +++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 tests/unit/test_vlm_default_max_tokens.py diff --git a/tests/unit/test_vlm_default_max_tokens.py b/tests/unit/test_vlm_default_max_tokens.py new file mode 100644 index 0000000000..06f43e6305 --- /dev/null +++ b/tests/unit/test_vlm_default_max_tokens.py @@ -0,0 +1,58 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 +"""Regression tests for the OpenAI VLM default ``max_tokens`` fallback (issue #2751). + +When ``max_tokens`` is not configured, the OpenAI VLM backend falls back to a default +that must not exceed the completion-token cap of the backend's own default model +(``gpt-4o-mini`` / ``gpt-4o``, capped at 16384 completion tokens). The previous +fallback of 32768 produced an HTTP 400 ("max_tokens is too large ... supports at most +16384 completion tokens") that the memory-extraction path swallowed, silently yielding +0 extracted memories for default-configured deployments. +""" + +from openviking.models.vlm.backends.openai_vlm import _DEFAULT_MAX_TOKENS, OpenAIVLM + +# gpt-4o / gpt-4o-mini (the backend default model) cap completion at 16384 tokens. +_GPT_4O_COMPLETION_CAP = 16384 + + +def _make_vlm(**overrides): + config = { + "api_key": "sk-test", + "api_base": "https://api.openai.com/v1", + } + config.update(overrides) + return OpenAIVLM(config) + + +class TestDefaultMaxTokensFallback: + """Unset ``max_tokens`` must fall back to a value the default model accepts.""" + + def test_default_fallback_within_default_model_cap(self): + assert _DEFAULT_MAX_TOKENS <= _GPT_4O_COMPLETION_CAP + + def test_text_kwargs_default_model_unset_max_tokens(self): + # No model -> backend default gpt-4o-mini; no max_tokens -> fallback default. 
+ kwargs = _make_vlm()._build_text_kwargs(prompt="hi") + assert kwargs["model"] == "gpt-4o-mini" + assert kwargs["max_tokens"] == _DEFAULT_MAX_TOKENS + assert kwargs["max_tokens"] <= _GPT_4O_COMPLETION_CAP + + def test_vision_kwargs_default_model_unset_max_tokens(self): + kwargs = _make_vlm()._build_vision_kwargs(prompt="describe this") + assert kwargs["model"] == "gpt-4o-mini" + assert kwargs["max_tokens"] == _DEFAULT_MAX_TOKENS + assert kwargs["max_tokens"] <= _GPT_4O_COMPLETION_CAP + + def test_explicit_max_tokens_is_respected(self): + # An explicitly configured max_tokens must override the fallback unchanged. + vlm = _make_vlm(max_tokens=512) + assert vlm._build_text_kwargs(prompt="hi")["max_tokens"] == 512 + assert vlm._build_vision_kwargs(prompt="x")["max_tokens"] == 512 + + def test_explicit_zero_max_tokens_not_replaced_by_default(self): + # The fallback fires only when max_tokens is unset (None); an explicit value + # is passed through unchanged, so the default never silently overrides config. + vlm = _make_vlm(max_tokens=0) + assert vlm._build_text_kwargs(prompt="hi")["max_tokens"] == 0 + assert vlm._build_vision_kwargs(prompt="x")["max_tokens"] == 0 From 3fc83809ed557bfe83d8b2d70a88987d1b45b382 Mon Sep 17 00:00:00 2001 From: r266-tech Date: Wed, 24 Jun 2026 00:54:11 +0800 Subject: [PATCH 3/3] fix(vlm): keep reasoning-model unset max_tokens at prior 32768 default Addresses @chenjw review: the original fix lowered the unset max_tokens fallback to 16384 for ALL models, but #2751 is specific to the gpt-4o family (gpt-4o / gpt-4o-mini cap completion at 16384). Reasoning models (gpt-5/o1/o3/o4) advertise larger completion limits and spend a hidden reasoning-token budget out of max_completion_tokens, so the 16384 cap would needlessly truncate them. Make the unset fallback conditional on is_reasoning: reasoning models keep their prior 32768 default (sent as max_completion_tokens); every non-reasoning model keeps the 16384 default, which is sized to the gpt-4o family's completion cap.
Explicitly configured max_tokens (including 0) is still passed through unchanged. Adds reasoning-model regression tests. --- openviking/models/vlm/backends/openai_vlm.py | 46 ++++++++++++++------ tests/unit/test_vlm_default_max_tokens.py | 27 +++++++++++- 2 files changed, 58 insertions(+), 15 deletions(-) diff --git a/openviking/models/vlm/backends/openai_vlm.py b/openviking/models/vlm/backends/openai_vlm.py index b3f1f9b58d..6a7a728b18 100644 --- a/openviking/models/vlm/backends/openai_vlm.py +++ b/openviking/models/vlm/backends/openai_vlm.py @@ -35,18 +35,26 @@ _REASONING_MODEL_PREFIXES = ("gpt-5", "o1", "o3", "o4") -# Default completion-token cap used when the VLM config does not set ``max_tokens``. -# It must not exceed the completion-token limit of this backend's own default model -# (``gpt-4o-mini`` / ``gpt-4o``, which cap completion at 16384 tokens). The previous -# fallback of 32768 is rejected by those models with an HTTP 400 ("max_tokens is too -# large ... supports at most 16384 completion tokens"); the memory-extraction path -# swallows that 400 and returns 0 extracted memories with no surfaced error, so a -# default-configured deployment silently extracts nothing (issue #2751). Memory -# extraction emits small JSON, so 16384 introduces no real truncation while still -# guarding against runaway generation. Callers that need a larger budget set -# ``max_tokens`` explicitly, which is honored unchanged. +# Default completion-token cap used when the VLM config does not set ``max_tokens``, +# for this backend's own default *non-reasoning* model. It must not exceed the +# completion-token limit of ``gpt-4o-mini`` / ``gpt-4o`` (which cap completion at +# 16384 tokens). The previous fallback of 32768 is rejected by those models with an +# HTTP 400 ("max_tokens is too large ... 
supports at most 16384 completion tokens"); +# the memory-extraction path swallows that 400 and returns 0 extracted memories with +# no surfaced error, so a default-configured deployment silently extracts nothing +# (issue #2751). Memory extraction emits small JSON, so 16384 introduces no real +# truncation while still guarding against runaway generation. Callers that need a +# larger budget set ``max_tokens`` explicitly, which is honored unchanged. _DEFAULT_MAX_TOKENS = 16384 +# Reasoning models (``gpt-5`` / ``o1`` / ``o3`` / ``o4``) advertise much larger +# completion-token limits and additionally spend a hidden reasoning-token budget out +# of ``max_completion_tokens``; the 16384 non-reasoning cap would needlessly truncate +# them. They were never affected by #2751 (that 400 is specific to the gpt-4o family's +# 16384 cap), so the unset fallback for reasoning models is left at its prior 32768 +# value rather than lowered. Explicitly configured ``max_tokens`` is still honored. +_DEFAULT_REASONING_MAX_TOKENS = 32768 + def _is_reasoning_model(model: Optional[str]) -> bool: """OpenAI reasoning-model families reject `max_tokens` and non-default `temperature`. @@ -248,9 +256,14 @@ def _build_text_kwargs( self._apply_provider_specific_extra_body(kwargs, effective_thinking) # Fall back to the default only when max_tokens is genuinely unset (None); # an explicitly configured value (including a degenerate 0) is passed through - # so bad config surfaces loudly instead of being silently rewritten. + # so bad config surfaces loudly instead of being silently rewritten. Reasoning + # models keep their prior 32768 unset default (they were not hit by #2751 and + # accept larger completion budgets); only the gpt-4o-family default is lowered. 
+ default_max_tokens = ( + _DEFAULT_REASONING_MAX_TOKENS if is_reasoning else _DEFAULT_MAX_TOKENS + ) max_tokens = ( - self.max_tokens if self.max_tokens is not None else _DEFAULT_MAX_TOKENS + self.max_tokens if self.max_tokens is not None else default_max_tokens ) kwargs["max_completion_tokens" if is_reasoning else "max_tokens"] = max_tokens if tools: @@ -291,9 +304,14 @@ def _build_vision_kwargs( self._apply_provider_specific_extra_body(kwargs, effective_thinking) # Fall back to the default only when max_tokens is genuinely unset (None); # an explicitly configured value (including a degenerate 0) is passed through - # so bad config surfaces loudly instead of being silently rewritten. + # so bad config surfaces loudly instead of being silently rewritten. Reasoning + # models keep their prior 32768 unset default (they were not hit by #2751 and + # accept larger completion budgets); only the gpt-4o-family default is lowered. + default_max_tokens = ( + _DEFAULT_REASONING_MAX_TOKENS if is_reasoning else _DEFAULT_MAX_TOKENS + ) max_tokens = ( - self.max_tokens if self.max_tokens is not None else _DEFAULT_MAX_TOKENS + self.max_tokens if self.max_tokens is not None else default_max_tokens ) kwargs["max_completion_tokens" if is_reasoning else "max_tokens"] = max_tokens if tools: diff --git a/tests/unit/test_vlm_default_max_tokens.py b/tests/unit/test_vlm_default_max_tokens.py index 06f43e6305..f7256bc797 100644 --- a/tests/unit/test_vlm_default_max_tokens.py +++ b/tests/unit/test_vlm_default_max_tokens.py @@ -10,7 +10,11 @@ 0 extracted memories for default-configured deployments. """ -from openviking.models.vlm.backends.openai_vlm import _DEFAULT_MAX_TOKENS, OpenAIVLM +from openviking.models.vlm.backends.openai_vlm import ( + _DEFAULT_MAX_TOKENS, + _DEFAULT_REASONING_MAX_TOKENS, + OpenAIVLM, +) # gpt-4o / gpt-4o-mini (the backend default model) cap completion at 16384 tokens. 
_GPT_4O_COMPLETION_CAP = 16384 @@ -56,3 +60,24 @@ def test_explicit_zero_max_tokens_not_replaced_by_default(self): vlm = _make_vlm(max_tokens=0) assert vlm._build_text_kwargs(prompt="hi")["max_tokens"] == 0 assert vlm._build_vision_kwargs(prompt="x")["max_tokens"] == 0 + + +class TestReasoningModelDefaultUnchanged: + """Reasoning models keep their prior 32768 unset default (not lowered by #2751).""" + + def test_reasoning_unset_keeps_prior_default(self): + # gpt-5 is a reasoning model: unset max_tokens -> prior 32768 via + # max_completion_tokens, NOT the lowered gpt-4o-family cap. Reasoning models + # advertise larger completion limits and spend hidden reasoning tokens from + # this budget, so the #2751 16384 cap must not apply to them. + for builder in ("_build_text_kwargs", "_build_vision_kwargs"): + kwargs = getattr(_make_vlm(model="gpt-5"), builder)(prompt="hi") + assert "max_tokens" not in kwargs + assert kwargs["max_completion_tokens"] == _DEFAULT_REASONING_MAX_TOKENS + assert _DEFAULT_REASONING_MAX_TOKENS > _DEFAULT_MAX_TOKENS + + def test_reasoning_explicit_max_tokens_respected(self): + # An explicit budget on a reasoning model is still honored unchanged. + vlm = _make_vlm(model="o3", max_tokens=4096) + assert vlm._build_text_kwargs(prompt="hi")["max_completion_tokens"] == 4096 + assert vlm._build_vision_kwargs(prompt="x")["max_completion_tokens"] == 4096