diff --git a/doc/tag-log.md b/doc/tag-log.md index 13bfa3f..4ba1018 100644 --- a/doc/tag-log.md +++ b/doc/tag-log.md @@ -17,6 +17,10 @@ Release history of mimo2codex, newest first. --- +## feat/multimodal-fallback + +- **[new]** **Multimodal fallback: auto-switch to vision model when images are detected**: when a request contains images but the active model doesn't support vision (e.g. `mimo-v2.5-pro`), the proxy now automatically rewrites the upstream model to a vision-capable one (default `mimo-v2.5`) so images are processed instead of silently dropped. Toggle and model selection are in the admin UI → Codex Integration → "Thinking & Runtime Overrides" tab. Disabled by default — enable it when your workflow mixes vision and non-vision models. + ## v0.5.21 (upcoming) - **[fix]** **Sustained 429 rate limits no longer break the session (follow-up to v0.5.20's retry)**: v0.5.20 added proxy-side 429/5xx retry, but the default budget (3 retries, ~3.5s) only outlasted sub-second blips. Real per-minute quota limits (`429 Too many requests / limitation`, often *without* a `Retry-After` header) still exhausted it, so the raw 429 was forwarded to Codex, which then burned its own retries and surfaced "exceeded retry limit, last status: 429" again. The default retry budget is now larger: **6 retries with exponential backoff capped at 12s (~28s total)**, so a multi-second quota limit clears before we give up. Still abortable, still honors `Retry-After` when present, and still tunable via `MIMO2CODEX_UPSTREAM_MAX_RETRIES` (now up to 12) / `MIMO2CODEX_UPSTREAM_RETRY_BASE_MS`. Trade-off: while rate-limited, a single request now waits up to ~28s before failing instead of ~3.5s. 
diff --git a/doc/tag-log.zh.md b/doc/tag-log.zh.md index 3eb716a..2a27c1a 100644 --- a/doc/tag-log.zh.md +++ b/doc/tag-log.zh.md @@ -17,6 +17,10 @@ mimo2codex 的版本发布历史,按 tag 倒序排列。 --- +## feat/multimodal-fallback + +- **[new]** **多模态 Fallback:检测到图片时自动切换 vision 模型**:当请求包含图片但当前模型不支持 vision(如 `mimo-v2.5-pro`)时,代理自动将 upstream model 重写为多模态模型(默认 `mimo-v2.5`),避免图片被静默丢弃。开关和模型选择在 admin UI → Codex 接入 →「思考与运行时覆盖」标签页。默认关闭——需要混合 vision / 非 vision 模型时开启。 + ## v0.5.21 (upcoming) - **[fix]** **持续型 429 限流不再中断会话(v0.5.20 重试的补强)**:v0.5.20 加了代理侧的 429/5xx 重试,但默认预算(重试 3 次、约 3.5 秒)只能扛住亚秒级抖动。真正按分钟计的配额限流(`429 Too many requests / limitation`,且经常**不带 `Retry-After` 头**)仍会把预算耗尽,于是原始 429 被透传给 Codex,Codex 再耗尽自己的重试,又报出「exceeded retry limit, last status: 429」。现在默认重试预算放大为:**重试 6 次、指数退避封顶 12 秒(总计约 28 秒)**,让几秒到几十秒的配额限流在放弃前自行解除。仍可被取消、仍尊重上游的 `Retry-After`、仍可通过 `MIMO2CODEX_UPSTREAM_MAX_RETRIES`(上限提到 12)/ `MIMO2CODEX_UPSTREAM_RETRY_BASE_MS` 调整。代价:限流期间单个请求最长会等约 28 秒才失败,而不是原来的约 3.5 秒。 diff --git a/src/admin/router.ts b/src/admin/router.ts index 46cbdf0..920127f 100644 --- a/src/admin/router.ts +++ b/src/admin/router.ts @@ -1003,6 +1003,57 @@ async function handleApi(ctx: RouteContext): Promise { return sendError(res, 405, "method_not_allowed", "use GET or PUT"); } + // GET/PUT /admin/api/vision-fallback — multimodal fallback toggle + model. + // When enabled, requests containing images are automatically routed to a + // vision-capable model even if the client's model doesn't support images. 
+  if (pathname === "/admin/api/vision-fallback") {
+    if (req.method === "GET") {
+      // Reads are best-effort: if the settings lookup throws, report the defaults.
+      const enabled = (() => {
+        try {
+          return getSetting("codex.visionFallbackEnabled") === "1";
+        } catch {
+          return false;
+        }
+      })();
+      const model = (() => {
+        try {
+          return getSetting("codex.visionFallbackModel") || "mimo-v2.5";
+        } catch {
+          return "mimo-v2.5";
+        }
+      })();
+      return sendJson(res, 200, { enabled, model });
+    }
+    if (req.method === "PUT") {
+      const body = await readJsonBody<{ enabled?: unknown; model?: unknown }>(req);
+      // Validate everything before the first write so a 400 never leaves a half-applied update.
+      const enabled = typeof body.enabled === "boolean" ? body.enabled : undefined;
+      const model = typeof body.model === "string" ? body.model.trim() : undefined;
+      if (model === "") {
+        return sendError(res, 400, "invalid_body", "model must be a non-empty string");
+      }
+      if (enabled === undefined && model === undefined) {
+        return sendError(
+          res,
+          400,
+          "invalid_body",
+          "body must include at least one of: enabled (boolean), model (string)",
+        );
+      }
+      if (enabled !== undefined) {
+        setSetting("codex.visionFallbackEnabled", enabled ? "1" : "0");
+        log.info(`codex.visionFallbackEnabled set to ${enabled} via admin UI`);
+      }
+      if (model !== undefined) {
+        setSetting("codex.visionFallbackModel", model);
+        log.info(`codex.visionFallbackModel set to "${model}" via admin UI`);
+      }
+      return sendJson(res, 200, { ok: true });
+    }
+    return sendError(res, 405, "method_not_allowed", "use GET or PUT");
+  }
+
   // GET/PUT /admin/api/log-settings — quick toggle for the "model fallback
   // applied" rewrite log. Default is silent (suppressed). env
   // MIMO2CODEX_SILENT_REWRITE, when set, overrides and disables the toggle.
diff --git a/src/server.ts b/src/server.ts
index b3b56ba..13675e2 100644
--- a/src/server.ts
+++ b/src/server.ts
@@ -2,6 +2,7 @@ import { createServer, type IncomingMessage, type Server, type ServerResponse }
 import type { Config } from "./config.js";
 import { respToResponses } from "./translate/respToResponses.js";
 import { pipeChatStreamToResponses, type StreamPipelineResult } from "./translate/streamToSse.js";
+import { modelSupportsImages } from "./translate/reqToChat.js";
 import { iterChatStreamChunks } from "./upstream/chatStream.js";
 import {
   callOpenAICompat,
@@ -386,6 +387,66 @@ function rewriteWarning(notice: { from: string; to: string; reason: string }): {
   };
 }
 
+// ---------------------------------------------------------------------------
+// Vision (multimodal) fallback
+// ---------------------------------------------------------------------------
+
+/**
+ * Resolves the vision fallback model from the persisted admin settings.
+ *
+ * @returns the configured model name (default `mimo-v2.5`), or `null` when
+ *   `cfg.adminEnabled` is false, the toggle is not "1", or the lookup throws.
+ */
+function resolveVisionFallback(cfg: Config): string | null {
+  if (!cfg.adminEnabled) return null;
+  try {
+    if (getSetting("codex.visionFallbackEnabled") !== "1") return null;
+    const model = getSetting("codex.visionFallbackModel");
+    return model || "mimo-v2.5";
+  } catch {
+    // Best effort: a failing settings lookup disables the fallback instead of failing the request.
+    return null;
+  }
+}
+
+/**
+ * Reports whether a Responses API request carries an `input_image` part, either
+ * in a `message` item's content or in a `function_call_output` item's array output.
+ *
+ * NOTE(review): only items whose `type` is exactly "message" are inspected; confirm
+ * that clients never send role-only message items without a `type`.
+ */
+function requestContainsImages(payload: ResponsesRequest): boolean {
+  if (!Array.isArray(payload.input)) return false;
+  for (const item of payload.input) {
+    if (item.type === "message" && Array.isArray(item.content)) {
+      for (const part of item.content) {
+        if (part.type === "input_image") return true;
+      }
+    }
+    // Tool results can carry images too (an image returned by a tool).
+    if (item.type === "function_call_output" && Array.isArray(item.output)) {
+      for (const part of item.output) {
+        if (part.type === "input_image") return true;
+      }
+    }
+  }
+  return false;
+}
+
+/** Reports whether a Chat Completions request carries an `image_url` content part. */
+function chatRequestContainsImages(payload: ChatRequest): boolean {
+  if (!Array.isArray(payload.messages)) return false;
+  for (const msg of payload.messages) {
+    if (Array.isArray(msg.content)) {
+      for (const part of msg.content) {
+        if (part.type === "image_url") return true;
+      }
+    }
+  }
+  return false;
+}
+
 /**
  * 从 Codex 请求的 tools 数组中提取 namespace 映射:toolName → namespaceName。
  * Codex Desktop 期望响应中的 function_call 带 namespace 字段才能路由到正确 handler。
@@ -457,6 +518,31 @@ async function handleResponses(
     cfg,
     readActiveOverrideSafely(cfg)
   );
+  // Multimodal fallback: the request has images but the selected model fails
+  // modelSupportsImages, so route it to the configured vision model instead.
+  const visionFallbackModel = resolveVisionFallback(cfg);
+  if (visionFallbackModel) {
+    const effectiveModel = selectedRaw.upstreamModel;
+    if (!modelSupportsImages(effectiveModel) && requestContainsImages(payload)) {
+      const resolved = selectedRaw.provider.resolveModel(visionFallbackModel);
+      const newModel = resolved?.id ?? visionFallbackModel;
+      // A fallback that resolves to the current model would only emit a bogus "X -> X" notice.
+      if (newModel !== effectiveModel) {
+        selectedRaw.rewriteNotice = {
+          from: effectiveModel,
+          to: newModel,
+          reason: `multimodal fallback — request contains images but model "${effectiveModel}" does not support vision`,
+        };
+        selectedRaw.upstreamModel = newModel;
+        selectedRaw.modelInfo = resolved ?? selectedRaw.modelInfo;
+        log.info("vision fallback applied", {
+          from: effectiveModel,
+          to: newModel,
+          provider: selectedRaw.provider.id,
+        });
+      }
+    }
+  }
   const { provider, upstreamModel, modelInfo, rewriteNotice } = selectedRaw;
   // BYOK: if a logged-in user has stored their own upstream API key for this
   // provider, swap it into the runtime. Local-mode / shared-key users keep
@@ -1091,6 +1177,31 @@ async function handleChatPassthrough(
     cfg,
     readActiveOverrideSafely(cfg)
   );
+  // Multimodal fallback (chat completions path): the request has images but the selected
+  // model fails modelSupportsImages, so route it to the configured vision model instead.
+  const visionFallbackModel = resolveVisionFallback(cfg);
+  if (visionFallbackModel) {
+    const effectiveModel = selectedRaw.upstreamModel;
+    if (!modelSupportsImages(effectiveModel) && chatRequestContainsImages(payload)) {
+      const resolved = selectedRaw.provider.resolveModel(visionFallbackModel);
+      const newModel = resolved?.id ?? visionFallbackModel;
+      // A fallback that resolves to the current model would only emit a bogus "X -> X" notice.
+      if (newModel !== effectiveModel) {
+        selectedRaw.rewriteNotice = {
+          from: effectiveModel,
+          to: newModel,
+          reason: `multimodal fallback — request contains images but model "${effectiveModel}" does not support vision`,
+        };
+        selectedRaw.upstreamModel = newModel;
+        selectedRaw.modelInfo = resolved ?? selectedRaw.modelInfo;
+        log.info("vision fallback applied", {
+          from: effectiveModel,
+          to: newModel,
+          provider: selectedRaw.provider.id,
+        });
+      }
+    }
+  }
   const { provider, upstreamModel, modelInfo, rewriteNotice } = selectedRaw;
   const { runtime, source: apiKeySource } = resolveRuntimeForUser(
     selectedRaw.runtime,
diff --git a/src/translate/reqToChat.ts b/src/translate/reqToChat.ts
index 850dec1..cbbe503 100644
--- a/src/translate/reqToChat.ts
+++ b/src/translate/reqToChat.ts
@@ -59,7 +59,7 @@ function materializeStrippedImage(imageUrl: string, dropDir?: string): string |
 // only `mimo-v2.5` and `mimo-v2-omni` (and *-omni* variants) accept image
 // input. The other v2.5 variants (mimo-v2.5-pro, mimo-v2-flash, …) return
 // 404 "No endpoints found that support image input" when given image_url parts.
-function modelSupportsImages(model: string): boolean { +export function modelSupportsImages(model: string): boolean { const base = model.toLowerCase(); if (base.includes("omni")) return true; if (base === "mimo-v2.5") return true; diff --git a/web/src/api/client.ts b/web/src/api/client.ts index 6635e85..1f5b013 100644 --- a/web/src/api/client.ts +++ b/web/src/api/client.ts @@ -608,6 +608,10 @@ export const api = { request<{ ok: boolean }>("PUT", "/thinking-state", { disabled }), setForceHighEffort: (forceHighEffort: boolean) => request<{ ok: boolean }>("PUT", "/thinking-state", { forceHighEffort }), + visionFallback: () => + request<{ enabled: boolean; model: string }>("GET", "/vision-fallback"), + setVisionFallback: (body: { enabled?: boolean; model?: string }) => + request<{ ok: boolean }>("PUT", "/vision-fallback", body), logSettings: () => request("GET", "/log-settings"), setSilentRewrite: (silentRewrite: boolean) => request<{ ok: boolean }>("PUT", "/log-settings", { silentRewrite }), diff --git a/web/src/i18n/locales/en-US/codexEnable.json b/web/src/i18n/locales/en-US/codexEnable.json index fd1ea08..0af010c 100644 --- a/web/src/i18n/locales/en-US/codexEnable.json +++ b/web/src/i18n/locales/en-US/codexEnable.json @@ -62,6 +62,14 @@ "hint": "Thinking ON/OFF: when OFF, every provider skips thinking (mimo / deepseek send thinking:{type:\"disabled\"}, sensenova / other generic send reasoning_effort:\"none\"). When ON, each provider follows its own default (most clients don't include reasoning effort for non-GPT-5 models, so the upstream may skip thinking on simple tasks). Force high reasoning effort: only available while Thinking is ON — if Codex didn't pass an effort, mimo2codex injects reasoning_effort=\"high\"; if Codex did pass one, that value is respected. Takes effect immediately (no restart).", "cliOverride": "Thinking ON/OFF is currently controlled by CLI flag (--disable-thinking) or env (MIMO2CODEX_DISABLE_THINKING). The switch is locked. 
Restart without those args to control via UI." }, + "visionFallback": { + "title": "Multimodal Fallback", + "statusOn": "Enabled: image requests auto-switch to vision model", + "statusOff": "Disabled", + "hint": "When enabled, if a request contains images but the current model doesn't support vision (e.g. mimo-v2.5-pro), automatically switch to the specified multimodal model (default: mimo-v2.5) to avoid images being silently dropped.", + "modelLabel": "Fallback model", + "modelPlaceholder": "e.g. mimo-v2.5" + }, "targets": { "title": "Available combinations", "externalWarn": "Your current ~/.codex/auth.json was not written by mimo2codex (probably a real OpenAI login or another tool). 'Write files and enable' will back it up first, then overwrite — restore is always available.", diff --git a/web/src/i18n/locales/zh-CN/codexEnable.json b/web/src/i18n/locales/zh-CN/codexEnable.json index b571394..ae2d29b 100644 --- a/web/src/i18n/locales/zh-CN/codexEnable.json +++ b/web/src/i18n/locales/zh-CN/codexEnable.json @@ -62,6 +62,14 @@ "hint": "「开/关思考」:关闭后所有 provider 都不思考(mimo / deepseek 发 thinking:{type:\"disabled\"},sensenova / 其他 generic 发 reasoning_effort:\"none\")。开启时各 provider 走自己的默认行为(多数客户端对非 GPT-5 模型默认不带 reasoning effort,上游可能对简单任务跳过思考)。「强制高强度思考」:仅在「开/关思考」是开时可用,Codex 没明确传 effort 时由 mimo2codex 兜底注入 reasoning_effort=\"high\";Codex 显式传过的值会被尊重,不被覆盖。修改后立即对新请求生效(无需重启)。", "cliOverride": "「开/关思考」当前由 CLI flag (--disable-thinking) 或环境变量 (MIMO2CODEX_DISABLE_THINKING) 控制,开关被锁定。如需用 UI 控制,启动时不要带这些参数。" }, + "visionFallback": { + "title": "多模态 Fallback", + "statusOn": "已启用:图片请求自动切换 vision 模型", + "statusOff": "未启用", + "hint": "开启后,当请求包含图片但当前模型不支持 vision(如 mimo-v2.5-pro),自动切换到指定的多模态模型(默认 mimo-v2.5),避免图片被静默丢弃。", + "modelLabel": "Fallback 模型", + "modelPlaceholder": "例如 mimo-v2.5" + }, "targets": { "title": "可启用组合", "externalWarn": "当前 ~/.codex/auth.json 不是 mimo2codex 写入的(可能是真 OpenAI 登录或其他工具)。点「写入文件并启用」会先自动备份再覆盖,恢复随时可做。", diff --git a/web/src/pages/codex/CodexEnable.tsx 
b/web/src/pages/codex/CodexEnable.tsx index fc331bb..4650627 100644 --- a/web/src/pages/codex/CodexEnable.tsx +++ b/web/src/pages/codex/CodexEnable.tsx @@ -5,6 +5,7 @@ import { Button, Card, Collapse, + Input, Modal, Space, Switch, @@ -62,6 +63,10 @@ export function CodexEnable() { const [forceHighEffort, setForceHighEffort] = useState(null); const [forceHighEffortSaving, setForceHighEffortSaving] = useState(false); + // visionFallback:多模态 fallback 开关 + 目标模型。null = 加载中。 + const [visionFallbackEnabled, setVisionFallbackEnabled] = useState(null); + const [visionFallbackModel, setVisionFallbackModel] = useState("mimo-v2.5"); + const [visionFallbackSaving, setVisionFallbackSaving] = useState(false); async function doProbe(target: CodexTarget) { const key = `${target.providerId}::${target.modelId}`; @@ -93,10 +98,11 @@ export function CodexEnable() { async function load() { try { setError(null); - const [s, ts, think] = await Promise.all([ + const [s, ts, think, vf] = await Promise.all([ api.codexState(), api.codexTargets(), api.thinkingState().catch(() => null), // 老后端没此端点时降级 + api.visionFallback().catch(() => null), // 老后端没此端点时降级 ]); setState(s); setTargetsResp(ts); @@ -105,6 +111,12 @@ export function CodexEnable() { setThinkingCliOverridden(think.cliOverride !== null); setForceHighEffort(think.forceHighEffort); } + if (vf) { + setVisionFallbackEnabled(vf.enabled); + setVisionFallbackModel(vf.model); + } else { + setVisionFallbackEnabled(false); + } } catch (err) { setError((err as Error).message); } @@ -134,6 +146,32 @@ export function CodexEnable() { } } + async function doToggleVisionFallback(enabled: boolean): Promise { + setVisionFallbackSaving(true); + try { + await api.setVisionFallback({ enabled }); + setVisionFallbackEnabled(enabled); + } catch (err) { + setError((err as Error).message); + } finally { + setVisionFallbackSaving(false); + } + } + + async function doSetVisionFallbackModel(model: string): Promise { + const trimmed = model.trim(); + if (!trimmed 
|| trimmed === visionFallbackModel) return; + setVisionFallbackSaving(true); + try { + await api.setVisionFallback({ model: trimmed }); + setVisionFallbackModel(trimmed); + } catch (err) { + setError((err as Error).message); + } finally { + setVisionFallbackSaving(false); + } + } + useEffect(() => { void load(); }, []); @@ -527,6 +565,62 @@ export function CodexEnable() { )} )} + {visionFallbackEnabled !== null && ( + + + + void doToggleVisionFallback(enabled) + } + checkedChildren={t("thinking.switchOn")} + unCheckedChildren={t("thinking.switchOff")} + /> + + {visionFallbackEnabled + ? t("visionFallback.statusOn") + : t("visionFallback.statusOff")} + + +
+ + {t("visionFallback.modelLabel")} + + + void doSetVisionFallbackModel(e.target.value) + } + onPressEnter={() => + void doSetVisionFallbackModel(visionFallbackModel) + } + style={{ width: 240, marginTop: 4, marginLeft: 4 }} + /> +
+ + {t("visionFallback.hint")} + +
+ )} {state && (