diff --git a/openviking/models/vlm/backends/openai_vlm.py b/openviking/models/vlm/backends/openai_vlm.py index 012adcbf3d..6a7a728b18 100644 --- a/openviking/models/vlm/backends/openai_vlm.py +++ b/openviking/models/vlm/backends/openai_vlm.py @@ -35,6 +35,27 @@ _REASONING_MODEL_PREFIXES = ("gpt-5", "o1", "o3", "o4") +# Default completion-token cap used when the VLM config does not set ``max_tokens``, +# for this backend's own default *non-reasoning* model. It must not exceed the +# completion-token limit of ``gpt-4o-mini`` / ``gpt-4o`` (which cap completion at +# 16384 tokens). The previous fallback of 32768 is rejected by those models with an +# HTTP 400 ("max_tokens is too large ... supports at most 16384 completion tokens"); +# the memory-extraction path swallows that 400 and returns 0 extracted memories with +# no surfaced error, so a default-configured deployment silently extracts nothing +# (issue #2751). Memory extraction emits small JSON, so 16384 introduces no real +# truncation while still guarding against runaway generation. Callers that need a +# larger budget set ``max_tokens`` explicitly, which is honored unchanged. +_DEFAULT_MAX_TOKENS = 16384 + +# Reasoning models (``gpt-5`` / ``o1`` / ``o3`` / ``o4``) advertise much larger +# completion-token limits and additionally spend a hidden reasoning-token budget out +# of ``max_completion_tokens``; the 16384 non-reasoning cap would needlessly truncate +# them. They were never affected by #2751 (that 400 is specific to the gpt-4o family's +# 16384 cap), so the unset fallback for reasoning models is left at its prior 32768 +# value rather than lowered. Explicitly configured ``max_tokens`` is still honored. +_DEFAULT_REASONING_MAX_TOKENS = 32768 + + def _is_reasoning_model(model: Optional[str]) -> bool: """OpenAI reasoning-model families reject `max_tokens` and non-default `temperature`. @@ -233,7 +254,17 @@ def _build_text_kwargs( else: kwargs["temperature"] = self.temperature self._apply_provider_specific_extra_body(kwargs, effective_thinking) - max_tokens = self.max_tokens or 32768 + # Fall back to the default only when max_tokens is genuinely unset (None); + # an explicitly configured value (including a degenerate 0) is passed through + # so bad config surfaces loudly instead of being silently rewritten. Reasoning + # models keep their prior 32768 unset default (they were not hit by #2751 and + # accept larger completion budgets); only the gpt-4o-family default is lowered. + default_max_tokens = ( + _DEFAULT_REASONING_MAX_TOKENS if is_reasoning else _DEFAULT_MAX_TOKENS + ) + max_tokens = ( + self.max_tokens if self.max_tokens is not None else default_max_tokens + ) kwargs["max_completion_tokens" if is_reasoning else "max_tokens"] = max_tokens if tools: kwargs["tools"] = tools @@ -271,7 +302,17 @@ def _build_vision_kwargs( else: kwargs["temperature"] = self.temperature self._apply_provider_specific_extra_body(kwargs, effective_thinking) - max_tokens = self.max_tokens or 32768 + # Fall back to the default only when max_tokens is genuinely unset (None); + # an explicitly configured value (including a degenerate 0) is passed through + # so bad config surfaces loudly instead of being silently rewritten. Reasoning + # models keep their prior 32768 unset default (they were not hit by #2751 and + # accept larger completion budgets); only the gpt-4o-family default is lowered. + default_max_tokens = ( + _DEFAULT_REASONING_MAX_TOKENS if is_reasoning else _DEFAULT_MAX_TOKENS + ) + max_tokens = ( + self.max_tokens if self.max_tokens is not None else default_max_tokens + ) kwargs["max_completion_tokens" if is_reasoning else "max_tokens"] = max_tokens if tools: kwargs["tools"] = tools diff --git a/tests/unit/test_vlm_default_max_tokens.py b/tests/unit/test_vlm_default_max_tokens.py new file mode 100644 index 0000000000..f7256bc797 --- /dev/null +++ b/tests/unit/test_vlm_default_max_tokens.py @@ -0,0 +1,83 @@ +# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd. +# SPDX-License-Identifier: AGPL-3.0 +"""Regression tests for the OpenAI VLM default ``max_tokens`` fallback (issue #2751). + +When ``max_tokens`` is not configured, the OpenAI VLM backend falls back to a default +that must not exceed the completion-token cap of the backend's own default model +(``gpt-4o-mini`` / ``gpt-4o``, capped at 16384 completion tokens). The previous +fallback of 32768 produced an HTTP 400 ("max_tokens is too large ... supports at most +16384 completion tokens") that the memory-extraction path swallowed, silently yielding +0 extracted memories for default-configured deployments. +""" + +from openviking.models.vlm.backends.openai_vlm import ( + _DEFAULT_MAX_TOKENS, + _DEFAULT_REASONING_MAX_TOKENS, + OpenAIVLM, +) + +# gpt-4o / gpt-4o-mini (the backend default model) cap completion at 16384 tokens. +_GPT_4O_COMPLETION_CAP = 16384 + + +def _make_vlm(**overrides): + config = { + "api_key": "sk-test", + "api_base": "https://api.openai.com/v1", + } + config.update(overrides) + return OpenAIVLM(config) + + +class TestDefaultMaxTokensFallback: + """Unset ``max_tokens`` must fall back to a value the default model accepts.""" + + def test_default_fallback_within_default_model_cap(self): + assert _DEFAULT_MAX_TOKENS <= _GPT_4O_COMPLETION_CAP + + def test_text_kwargs_default_model_unset_max_tokens(self): + # No model -> backend default gpt-4o-mini; no max_tokens -> fallback default. + kwargs = _make_vlm()._build_text_kwargs(prompt="hi") + assert kwargs["model"] == "gpt-4o-mini" + assert kwargs["max_tokens"] == _DEFAULT_MAX_TOKENS + assert kwargs["max_tokens"] <= _GPT_4O_COMPLETION_CAP + + def test_vision_kwargs_default_model_unset_max_tokens(self): + kwargs = _make_vlm()._build_vision_kwargs(prompt="describe this") + assert kwargs["model"] == "gpt-4o-mini" + assert kwargs["max_tokens"] == _DEFAULT_MAX_TOKENS + assert kwargs["max_tokens"] <= _GPT_4O_COMPLETION_CAP + + def test_explicit_max_tokens_is_respected(self): + # An explicitly configured max_tokens must override the fallback unchanged. + vlm = _make_vlm(max_tokens=512) + assert vlm._build_text_kwargs(prompt="hi")["max_tokens"] == 512 + assert vlm._build_vision_kwargs(prompt="x")["max_tokens"] == 512 + + def test_explicit_zero_max_tokens_not_replaced_by_default(self): + # The fallback fires only when max_tokens is unset (None); an explicit value + # is passed through unchanged, so the default never silently overrides config. + vlm = _make_vlm(max_tokens=0) + assert vlm._build_text_kwargs(prompt="hi")["max_tokens"] == 0 + assert vlm._build_vision_kwargs(prompt="x")["max_tokens"] == 0 + + +class TestReasoningModelDefaultUnchanged: + """Reasoning models keep their prior 32768 unset default (not lowered by #2751).""" + + def test_reasoning_unset_keeps_prior_default(self): + # gpt-5 is a reasoning model: unset max_tokens -> prior 32768 via + # max_completion_tokens, NOT the lowered gpt-4o-family cap. Reasoning models + # advertise larger completion limits and spend hidden reasoning tokens from + # this budget, so the #2751 16384 cap must not apply to them. + for builder in ("_build_text_kwargs", "_build_vision_kwargs"): + kwargs = getattr(_make_vlm(model="gpt-5"), builder)(prompt="hi") + assert "max_tokens" not in kwargs + assert kwargs["max_completion_tokens"] == _DEFAULT_REASONING_MAX_TOKENS + assert _DEFAULT_REASONING_MAX_TOKENS > _DEFAULT_MAX_TOKENS + + def test_reasoning_explicit_max_tokens_respected(self): + # An explicit budget on a reasoning model is still honored unchanged. + vlm = _make_vlm(model="o3", max_tokens=4096) + assert vlm._build_text_kwargs(prompt="hi")["max_completion_tokens"] == 4096 + assert vlm._build_vision_kwargs(prompt="x")["max_completion_tokens"] == 4096