Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 43 additions & 2 deletions openviking/models/vlm/backends/openai_vlm.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,27 @@
_REASONING_MODEL_PREFIXES = ("gpt-5", "o1", "o3", "o4")


# Default completion-token cap used when the VLM config does not set ``max_tokens``,
# for this backend's own default *non-reasoning* model. It must not exceed the
# completion-token limit of ``gpt-4o-mini`` / ``gpt-4o`` (which cap completion at
# 16384 tokens). The previous fallback of 32768 is rejected by those models with an
# HTTP 400 ("max_tokens is too large ... supports at most 16384 completion tokens");
# the memory-extraction path swallows that 400 and returns 0 extracted memories with
# no surfaced error, so a default-configured deployment silently extracts nothing
# (issue #2751). Memory extraction emits small JSON, so 16384 introduces no real
# truncation while still guarding against runaway generation. Callers that need a
# larger budget set ``max_tokens`` explicitly, which is honored unchanged.
_DEFAULT_MAX_TOKENS = 16384

# Reasoning models (``gpt-5`` / ``o1`` / ``o3`` / ``o4``) advertise much larger
# completion-token limits and additionally spend a hidden reasoning-token budget out
# of ``max_completion_tokens``; the 16384 non-reasoning cap would needlessly truncate
# them. They were never affected by #2751 (that 400 is specific to the gpt-4o family's
# 16384 cap), so the unset fallback for reasoning models is left at its prior 32768
# value rather than lowered. Explicitly configured ``max_tokens`` is still honored.
_DEFAULT_REASONING_MAX_TOKENS = 32768


def _is_reasoning_model(model: Optional[str]) -> bool:
"""OpenAI reasoning-model families reject `max_tokens` and non-default `temperature`.

Expand Down Expand Up @@ -233,7 +254,17 @@ def _build_text_kwargs(
else:
kwargs["temperature"] = self.temperature
self._apply_provider_specific_extra_body(kwargs, effective_thinking)
max_tokens = self.max_tokens or 32768
# Fall back to the default only when max_tokens is genuinely unset (None);
# an explicitly configured value (including a degenerate 0) is passed through
# so bad config surfaces loudly instead of being silently rewritten. Reasoning
# models keep their prior 32768 unset default (they were not hit by #2751 and
# accept larger completion budgets); only the gpt-4o-family default is lowered.
default_max_tokens = (
_DEFAULT_REASONING_MAX_TOKENS if is_reasoning else _DEFAULT_MAX_TOKENS
)
max_tokens = (
self.max_tokens if self.max_tokens is not None else default_max_tokens
)
kwargs["max_completion_tokens" if is_reasoning else "max_tokens"] = max_tokens

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One concern: this now gives reasoning models a default max_completion_tokens when max_tokens is unset. Existing behavior intentionally omitted both token fields for reasoning models without max_tokens (see test_reasoning_model_without_max_tokens_omits_both), and this PR currently breaks that test. Could we keep the 16384 fallback only for non-reasoning models, and only set max_completion_tokens for reasoning models when max_tokens is explicitly configured?

if tools:
kwargs["tools"] = tools
Expand Down Expand Up @@ -271,7 +302,17 @@ def _build_vision_kwargs(
else:
kwargs["temperature"] = self.temperature
self._apply_provider_specific_extra_body(kwargs, effective_thinking)
max_tokens = self.max_tokens or 32768
# Fall back to the default only when max_tokens is genuinely unset (None);
# an explicitly configured value (including a degenerate 0) is passed through
# so bad config surfaces loudly instead of being silently rewritten. Reasoning
# models keep their prior 32768 unset default (they were not hit by #2751 and
# accept larger completion budgets); only the gpt-4o-family default is lowered.
default_max_tokens = (
_DEFAULT_REASONING_MAX_TOKENS if is_reasoning else _DEFAULT_MAX_TOKENS
)
max_tokens = (
self.max_tokens if self.max_tokens is not None else default_max_tokens
)
kwargs["max_completion_tokens" if is_reasoning else "max_tokens"] = max_tokens
if tools:
kwargs["tools"] = tools
Expand Down
83 changes: 83 additions & 0 deletions tests/unit/test_vlm_default_max_tokens.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Copyright (c) 2026 Beijing Volcano Engine Technology Co., Ltd.
# SPDX-License-Identifier: AGPL-3.0
"""Regression tests for the OpenAI VLM default ``max_tokens`` fallback (issue #2751).

When ``max_tokens`` is not configured, the OpenAI VLM backend falls back to a default
that must not exceed the completion-token cap of the backend's own default model
(``gpt-4o-mini`` / ``gpt-4o``, capped at 16384 completion tokens). The previous
fallback of 32768 produced an HTTP 400 ("max_tokens is too large ... supports at most
16384 completion tokens") that the memory-extraction path swallowed, silently yielding
0 extracted memories for default-configured deployments.
"""

from openviking.models.vlm.backends.openai_vlm import (
_DEFAULT_MAX_TOKENS,
_DEFAULT_REASONING_MAX_TOKENS,
OpenAIVLM,
)

# gpt-4o / gpt-4o-mini (the backend default model) cap completion at 16384 tokens.
_GPT_4O_COMPLETION_CAP = 16384


def _make_vlm(**overrides):
config = {
"api_key": "sk-test",
"api_base": "https://api.openai.com/v1",
}
config.update(overrides)
return OpenAIVLM(config)


class TestDefaultMaxTokensFallback:
"""Unset ``max_tokens`` must fall back to a value the default model accepts."""

def test_default_fallback_within_default_model_cap(self):
assert _DEFAULT_MAX_TOKENS <= _GPT_4O_COMPLETION_CAP

def test_text_kwargs_default_model_unset_max_tokens(self):
# No model -> backend default gpt-4o-mini; no max_tokens -> fallback default.
kwargs = _make_vlm()._build_text_kwargs(prompt="hi")
assert kwargs["model"] == "gpt-4o-mini"
assert kwargs["max_tokens"] == _DEFAULT_MAX_TOKENS
assert kwargs["max_tokens"] <= _GPT_4O_COMPLETION_CAP

def test_vision_kwargs_default_model_unset_max_tokens(self):
kwargs = _make_vlm()._build_vision_kwargs(prompt="describe this")
assert kwargs["model"] == "gpt-4o-mini"
assert kwargs["max_tokens"] == _DEFAULT_MAX_TOKENS
assert kwargs["max_tokens"] <= _GPT_4O_COMPLETION_CAP

def test_explicit_max_tokens_is_respected(self):
# An explicitly configured max_tokens must override the fallback unchanged.
vlm = _make_vlm(max_tokens=512)
assert vlm._build_text_kwargs(prompt="hi")["max_tokens"] == 512
assert vlm._build_vision_kwargs(prompt="x")["max_tokens"] == 512

def test_explicit_zero_max_tokens_not_replaced_by_default(self):
# The fallback fires only when max_tokens is unset (None); an explicit value
# is passed through unchanged, so the default never silently overrides config.
vlm = _make_vlm(max_tokens=0)
assert vlm._build_text_kwargs(prompt="hi")["max_tokens"] == 0
assert vlm._build_vision_kwargs(prompt="x")["max_tokens"] == 0


class TestReasoningModelDefaultUnchanged:
"""Reasoning models keep their prior 32768 unset default (not lowered by #2751)."""

def test_reasoning_unset_keeps_prior_default(self):
# gpt-5 is a reasoning model: unset max_tokens -> prior 32768 via
# max_completion_tokens, NOT the lowered gpt-4o-family cap. Reasoning models
# advertise larger completion limits and spend hidden reasoning tokens from
# this budget, so the #2751 16384 cap must not apply to them.
for builder in ("_build_text_kwargs", "_build_vision_kwargs"):
kwargs = getattr(_make_vlm(model="gpt-5"), builder)(prompt="hi")
assert "max_tokens" not in kwargs
assert kwargs["max_completion_tokens"] == _DEFAULT_REASONING_MAX_TOKENS
assert _DEFAULT_REASONING_MAX_TOKENS > _DEFAULT_MAX_TOKENS

def test_reasoning_explicit_max_tokens_respected(self):
# An explicit budget on a reasoning model is still honored unchanged.
vlm = _make_vlm(model="o3", max_tokens=4096)
assert vlm._build_text_kwargs(prompt="hi")["max_completion_tokens"] == 4096
assert vlm._build_vision_kwargs(prompt="x")["max_completion_tokens"] == 4096
Loading