From 0b1e08f8138a680faf2d26e141c70de764da25a5 Mon Sep 17 00:00:00 2001
From: yabo083 <sz7372797@gmail.com>
Date: Sun, 17 May 2026 20:08:48 +0800
Subject: [PATCH 1/7] feat(service-multimodal): support MiMo audio and image
 inputs

---
 packages/service-multimodal/README.md         |  30 ++-
 packages/service-multimodal/package.json      |   4 +-
 packages/service-multimodal/src/audio.ts      | 111 +++++++++
 packages/service-multimodal/src/index.ts      |   6 +
 packages/service-multimodal/src/media.ts      |  27 +++
 .../service-multimodal/src/plugins/audio.ts   |  63 +++--
 .../service-multimodal/src/plugins/image.ts   |   6 +-
 .../src/plugins/read_files.ts                 | 220 ++++++++++++------
 .../src/read_files_schema.ts                  |  31 +++
 .../tests/audio-mimo.test.ts                  | 149 ++++++++++++
 10 files changed, 548 insertions(+), 99 deletions(-)
 create mode 100644 packages/service-multimodal/src/audio.ts
 create mode 100644 packages/service-multimodal/src/media.ts
 create mode 100644 packages/service-multimodal/src/read_files_schema.ts
 create mode 100644 packages/service-multimodal/tests/audio-mimo.test.ts

diff --git a/packages/service-multimodal/README.md b/packages/service-multimodal/README.md
index e96b4badc..a630b5799 100644
--- a/packages/service-multimodal/README.md
+++ b/packages/service-multimodal/README.md
@@ -1,7 +1,29 @@
-## koishi-plugin-chatluna-long-memory
+## koishi-plugin-chatluna-multimodal-service
 
-## [![npm](https://img.shields.io/npm/v/koishi-plugin-chatluna-long-memory)](https://www.npmjs.com/package/koishi-plugin-chatluna-long-memory) [![npm](https://img.shields.io/npm/dm/koishi-plugin-chatluna-long-memory)](https://www.npmjs.com/package//koishi-plugin-chatluna-long-memory)
+ChatLuna 多模态支持服务，提供上下文图像描述、GIF 帧处理、`read_files` 文件读取，以及语音消息转码注入能力。
 
-> 提供长期记忆支持的插件
+### MiMo 音频理解
 
-[长期记忆文档](https://chatluna.chat/ecosystem/renderer/image.html)
+MiMo 官方 OpenAI 兼容接口中，`mimo-v2.5` 与 `mimo-v2-omni` 支持音频理解。服务会将 QQ/OneBot 语音先下载到本地内存，必要时通过 `ffmpeg` 转成 MP3，再以 Base64 data URL 注入 `input_audio`：
+
+- 避免 QQ CDN 直链过期导致模型侧晚读失败。
+- 规避 AMR、Silk 等上游模型不稳定支持的格式。
+- 遵循 MiMo Base64 单音频 50 MB 上限；URL 输入的官方上限是单文件 100 MB。
+
+MiMo 官方列出的音频格式为 MP3、WAV、FLAC、M4A、OGG。实际变体较多，服务默认把语音消息转成 MP3 以提高稳定性。
+
+`read_files` 也会沿用这条路线：工具调用层如果把 `files` 传成 JSON 字符串，会先容错解析；音频 URL 即使被缓存服务误标为 MP3，也会按文件头识别 AMR/Silk 等实际格式，并在模型注入前通过 `ffmpeg` 转成 MP3。
+
+### MiMo 图片理解
+
+`mimo-v2.5` 与 `mimo-v2-omni` 也支持图片理解。即使 OpenAI 兼容适配器暂未在模型元数据中声明 `ImageInput`，服务也会把这两个 MiMo 模型视为原生图片输入模型，并使用标准 OpenAI 兼容 `image_url` 内容块注入 Base64 data URL。
+
+- 支持 JPEG、PNG、GIF、WebP、BMP。
+- MiMo Base64 单图片上限为 50 MB；URL 单图片官方上限同样为 50 MB。
+- 多图输入受模型上下文和 token 长度限制。
+
+音频消息转码需要启用：
+
+- `enableAudioFfmpegConversion`
+- `koishi-plugin-ffmpeg-path`
+- 官方 Bot/QQ Silk 语音还需要 `koishi-plugin-ffmpeg-path` 提供的 `silk` 服务
diff --git a/packages/service-multimodal/package.json b/packages/service-multimodal/package.json
index 98586c1f9..f79bab824 100644
--- a/packages/service-multimodal/package.json
+++ b/packages/service-multimodal/package.json
@@ -34,7 +34,8 @@
     },
     "homepage": "https://github.com/ChatLunaLab/chatluna/tree/v1-dev/packages/service-image#readme",
     "scripts": {
-        "build": "atsc -b"
+        "build": "atsc -b",
+        "test": "tsx --test tests/*.test.ts"
     },
     "keywords": [
         "chatbot",
@@ -57,6 +58,7 @@
         "@types/omggif": "^1.0.5",
         "atsc": "^2.1.0",
         "koishi": "^4.18.9",
+        "koishi-plugin-adapter-onebot": "^6.9.3",
         "koishi-plugin-ffmpeg-path": "^2.0.0"
     },
     "peerDependencies": {
diff --git a/packages/service-multimodal/src/audio.ts b/packages/service-multimodal/src/audio.ts
new file mode 100644
index 000000000..71aa25f00
--- /dev/null
+++ b/packages/service-multimodal/src/audio.ts
@@ -0,0 +1,151 @@
+import { MessageContentComplex } from '@langchain/core/messages'
+import { ModelCapabilities } from 'koishi-plugin-chatluna/llm-core/platform/types'
+
+/** Byte ceiling for one Base64-encoded audio payload sent to MiMo (50 MiB). */
+export const MIMO_BASE64_AUDIO_BYTES = 50 * 1024 * 1024
+/** Byte ceiling for one Base64-encoded image payload sent to MiMo (50 MiB). */
+export const MIMO_BASE64_IMAGE_BYTES = 50 * 1024 * 1024
+
+// Bare, lower-case ids of the models treated as MiMo multimodal models.
+const MIMO_MODEL_IDS: ReadonlySet<string> = new Set([
+    'mimo-v2.5',
+    'mimo-v2-omni'
+])
+
+// Audio MIME types accepted by `isMimoAudioMime`.
+const MIMO_AUDIO_MIME_TYPES: ReadonlySet<string> = new Set([
+    'audio/mpeg',
+    'audio/mp3',
+    'audio/wav',
+    'audio/flac',
+    'audio/mp4',
+    'audio/ogg'
+])
+
+// Image MIME types accepted by `isMimoImageMime`.
+const MIMO_IMAGE_MIME_TYPES: ReadonlySet<string> = new Set([
+    'image/jpeg',
+    'image/png',
+    'image/gif',
+    'image/webp',
+    'image/bmp'
+])
+
+/** Keep only the text after the last `/` and lower-case it. */
+function toBareModelId(model: string): string {
+    return model.slice(model.lastIndexOf('/') + 1).toLowerCase()
+}
+
+/**
+ * Report whether `model` is one of the known MiMo models.
+ * A `platform/` prefix and letter case are ignored; a missing or empty
+ * name yields `false`.
+ */
+export function isMimoAudioModel(model?: string): boolean {
+    return model ? MIMO_MODEL_IDS.has(toBareModelId(model)) : false
+}
+
+/**
+ * Image counterpart of the MiMo model check; it consults the same id list,
+ * ignoring any `platform/` prefix and letter case.
+ */
+export function isMimoImageModel(model?: string): boolean {
+    return model ? MIMO_MODEL_IDS.has(toBareModelId(model)) : false
+}
+
+/** Case-insensitive membership test against the MiMo audio MIME list. */
+export function isMimoAudioMime(mime: string): boolean {
+    return MIMO_AUDIO_MIME_TYPES.has(mime.toLowerCase())
+}
+
+/** Case-insensitive membership test against the MiMo image MIME list. */
+export function isMimoImageMime(mime: string): boolean {
+    return MIMO_IMAGE_MIME_TYPES.has(mime.toLowerCase())
+}
+
+/**
+ * Decide whether audio may be handed to the model directly.
+ *
+ * @param info - model lookup result; only `value.capabilities` is read
+ * @param model - model name, optionally prefixed with `platform/`
+ * @returns `true` for a MiMo model or when the capabilities list contains
+ * `AudioInput`
+ */
+export function modelCanReadAudio(
+    info:
+        | {
+              value?: {
+                  capabilities?: ModelCapabilities[]
+              }
+          }
+        | undefined,
+    model?: string
+): boolean {
+    if (isMimoAudioModel(model)) return true
+
+    const declared = info?.value?.capabilities
+    return declared != null && declared.includes(ModelCapabilities.AudioInput)
+}
+
+/**
+ * Decide whether images may be handed to the model directly.
+ *
+ * @param info - model lookup result; only `value.capabilities` is read
+ * @param model - model name, optionally prefixed with `platform/`
+ * @returns `true` for a MiMo model or when the capabilities list contains
+ * `ImageInput`
+ */
+export function modelCanReadImage(
+    info:
+        | {
+              value?: {
+                  capabilities?: ModelCapabilities[]
+              }
+          }
+        | undefined,
+    model?: string
+): boolean {
+    if (isMimoImageModel(model)) return true
+
+    const declared = info?.value?.capabilities
+    return declared != null && declared.includes(ModelCapabilities.ImageInput)
+}
+
+/**
+ * Wrap Base64 audio in the content block shape the target model expects.
+ *
+ * @param model - model name; MiMo models get an `input_audio` block, every
+ * other model an `audio_url` block
+ * @param data - Base64 payload without any `data:` prefix
+ * @param mime - MIME type embedded in the generated data URL
+ */
+export function buildAudioContent(
+    model: string | undefined,
+    data: string,
+    mime: string
+): MessageContentComplex {
+    const dataUrl = `data:${mime};base64,${data}`
+    const block = isMimoAudioModel(model)
+        ? { type: 'input_audio', input_audio: { data: dataUrl } }
+        : { type: 'audio_url', audio_url: { url: dataUrl, mimeType: mime } }
+
+    return block as unknown as MessageContentComplex
+}
+
+/**
+ * Wrap Base64 image data in an `image_url` content block.
+ *
+ * @param data - Base64 payload without any `data:` prefix
+ * @param mime - MIME type embedded in the generated data URL
+ */
+export function buildImageContent(
+    data: string,
+    mime: string
+): MessageContentComplex {
+    const block = {
+        type: 'image_url',
+        image_url: { url: `data:${mime};base64,${data}` }
+    }
+
+    return block as unknown as MessageContentComplex
+}
diff --git a/packages/service-multimodal/src/index.ts b/packages/service-multimodal/src/index.ts
index f1e7588a6..b7b402a42 100644
--- a/packages/service-multimodal/src/index.ts
+++ b/packages/service-multimodal/src/index.ts
@@ -99,4 +99,10 @@ export const usage = `
 
 ### 注意
 建议搭配 \`chatluna-storage-service\` 使用。请求中的图像、文件大小限制遵循模型平台配置（如 Gemini：PDF 单文件 50MB、其他单文件 100MB、单轮总计 100MB，以文件被编码为 Base64 后的大小为准）。
+
+### MiMo 音频理解
+\`mimo-v2.5\` 与 \`mimo-v2-omni\` 的音频理解走 OpenAI 兼容 \`input_audio\`。启用音频转换后，服务会先读取语音 URL，必要时用 ffmpeg/Silk 转为 MP3，再以 Base64 data URL 注入模型，避免 QQ CDN 过期和 AMR/Silk 等格式兼容问题。MiMo Base64 单音频上限为 50MB。
+
+### MiMo 图片理解
+\`mimo-v2.5\` 与 \`mimo-v2-omni\` 的图片理解走 OpenAI 兼容 \`image_url\`。即使适配器暂未声明 \`ImageInput\`，服务也会按 MiMo 官方能力接入 JPEG、PNG、GIF、WebP、BMP，Base64 与 URL 单图片上限均为 50MB，多图受模型上下文限制。
 `
diff --git a/packages/service-multimodal/src/media.ts b/packages/service-multimodal/src/media.ts
new file mode 100644
index 000000000..0c2cd3602
--- /dev/null
+++ b/packages/service-multimodal/src/media.ts
@@ -0,0 +1,61 @@
+/**
+ * Check whether `buffer` opens with a plausible MPEG audio frame header.
+ *
+ * A lone leading `0xff` is not enough: JPEG images start with `ff d8` and
+ * ADTS AAC with `ff f1`/`ff f9`, so the remaining sync bits and the
+ * reserved version/layer/bitrate/sample-rate values are verified too.
+ */
+function isMpegAudioFrame(buffer: Buffer): boolean {
+    if (buffer.length < 4) return false
+
+    const second = buffer.readUInt8(1)
+    const third = buffer.readUInt8(2)
+
+    return (
+        buffer.readUInt8(0) === 0xff &&
+        (second & 0xe0) === 0xe0 && // low three bits of the 11-bit sync word
+        (second & 0x18) !== 0x08 && // version id 01 is reserved
+        (second & 0x06) !== 0x00 && // layer 00 is reserved
+        (third & 0xf0) !== 0xf0 && // bitrate index 1111 is invalid
+        (third & 0x0c) !== 0x0c // sampling-rate index 11 is reserved
+    )
+}
+
+/**
+ * Identify an audio format from the first bytes of `buffer`.
+ *
+ * Signatures recognised: AMR (`#!AMR`), Silk v3 (`#!SILK_V3`, optionally
+ * preceded by one extra byte), MP3 (`ID3` tag or an MPEG audio frame
+ * header), WAV (`RIFF`...`WAVE`), FLAC (`fLaC`) and Ogg (`OggS`).
+ *
+ * @param buffer - file content; only the first 16 bytes are inspected
+ * @param declaredMimeType - MIME type reported by the server or URL
+ * @returns the sniffed `audio/*` type, else `declaredMimeType`, else `null`
+ */
+export function detectAudioMimeType(
+    buffer: Buffer,
+    declaredMimeType?: string | null
+): string | null {
+    const header = buffer.subarray(0, 16).toString('latin1')
+
+    if (header.startsWith('#!AMR')) return 'audio/amr'
+    if (
+        header.startsWith('#!SILK_V3') ||
+        buffer.subarray(1, 10).toString('latin1') === '#!SILK_V3'
+    ) {
+        return 'audio/silk'
+    }
+    if (header.startsWith('ID3') || isMpegAudioFrame(buffer)) {
+        return 'audio/mpeg'
+    }
+    if (
+        header.startsWith('RIFF') &&
+        buffer.subarray(8, 12).toString('latin1') === 'WAVE'
+    ) {
+        return 'audio/wav'
+    }
+    if (header.startsWith('fLaC')) return 'audio/flac'
+    if (header.startsWith('OggS')) return 'audio/ogg'
+
+    return declaredMimeType ?? null
+}
diff --git a/packages/service-multimodal/src/plugins/audio.ts b/packages/service-multimodal/src/plugins/audio.ts
index f8ce7fbdb..57ccf5f64 100644
--- a/packages/service-multimodal/src/plugins/audio.ts
+++ b/packages/service-multimodal/src/plugins/audio.ts
@@ -2,10 +2,15 @@ import { MessageContentComplex } from '@langchain/core/messages'
 import { Context, h, Session } from 'koishi'
 import type { OneBotBot } from 'koishi-plugin-adapter-onebot'
 import { Message } from 'koishi-plugin-chatluna'
-import { ModelCapabilities } from 'koishi-plugin-chatluna/llm-core/platform/types'
 import type {} from 'koishi-plugin-chatluna-storage-service'
 import type {} from 'koishi-plugin-ffmpeg-path'
 import { Config, logger } from '..'
+import {
+    buildAudioContent,
+    isMimoAudioModel,
+    MIMO_BASE64_AUDIO_BYTES,
+    modelCanReadAudio
+} from '../audio'
 
 const CHATLUNA_DOWNLOAD_USER_AGENT =
     'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
@@ -24,12 +29,7 @@ export function apply(ctx: Context, config: Config) {
                     ? ctx.chatluna.platform.findModel(model)
                     : undefined
 
-                // If the model doesn't accept audio input, keep fallback path unchanged.
-                if (
-                    modelInfo?.value?.capabilities?.includes(
-                        ModelCapabilities.AudioInput
-                    ) === false
-                ) {
+                if (!modelCanReadAudio(modelInfo, model)) {
                     return false
                 }
 
@@ -66,24 +66,41 @@ export function apply(ctx: Context, config: Config) {
                 element.attrs['file'] = displayFileName
                 element.attrs['filename'] = displayFileName
 
-                const audioUrl = ctx.chatluna_storage
-                    ? (element.attrs['chatluna_file_url'] = (
-                          await ctx.chatluna_storage.createTempFile(
-                              buffer,
-                              displayFileName
-                          )
-                      ).url)
-                    : ((element.attrs['chatluna_file_url'] = sourceUrl),
-                      `data:audio/mpeg;base64,${buffer.toString('base64')}`)
+                const base64 = buffer.toString('base64')
+
+                if (
+                    isMimoAudioModel(model) &&
+                    Buffer.byteLength(base64) > MIMO_BASE64_AUDIO_BYTES
+                ) {
+                    logger.warn(
+                        `Skip oversized MiMo audio after base64 encoding: ${Buffer.byteLength(base64)} bytes > ${MIMO_BASE64_AUDIO_BYTES} bytes`
+                    )
+                    return false
+                }
+
+                const audioUrl =
+                    !isMimoAudioModel(model) && ctx.chatluna_storage
+                        ? (element.attrs['chatluna_file_url'] = (
+                              await ctx.chatluna_storage.createTempFile(
+                                  buffer,
+                                  displayFileName
+                              )
+                          ).url)
+                        : ((element.attrs['chatluna_file_url'] = sourceUrl),
+                          `data:audio/mpeg;base64,${base64}`)
 
                 ensureContentArray(message, `[voice:${displayFileName}]`)
-                ;(message.content as MessageContentComplex[]).push({
-                    type: 'audio_url',
-                    audio_url: {
-                        url: audioUrl,
-                        mimeType: 'audio/mpeg'
-                    }
-                } as unknown as MessageContentComplex)
+                ;(message.content as MessageContentComplex[]).push(
+                    isMimoAudioModel(model)
+                        ? buildAudioContent(model, base64, 'audio/mpeg')
+                        : ({
+                              type: 'audio_url',
+                              audio_url: {
+                                  url: audioUrl,
+                                  mimeType: 'audio/mpeg'
+                              }
+                          } as unknown as MessageContentComplex)
+                )
 
                 logger.debug(
                     `Transcoded unsupported audio to mp3 for multimodal input: ${displayFileName}`
diff --git a/packages/service-multimodal/src/plugins/image.ts b/packages/service-multimodal/src/plugins/image.ts
index e29db3c56..560eb959a 100644
--- a/packages/service-multimodal/src/plugins/image.ts
+++ b/packages/service-multimodal/src/plugins/image.ts
@@ -11,6 +11,7 @@ import {
     processImageWithModel,
     readImage
 } from '../utils'
+import { modelCanReadImage } from '../audio'
 
 export async function apply(
     ctx: Context,
@@ -29,10 +30,7 @@ export async function apply(
                     ? ctx.chatluna.platform.findModel(model)
                     : undefined
             const modelSupportsImageInput =
-                parsedModelInfo?.value != null &&
-                parsedModelInfo.value.capabilities.includes(
-                    ModelCapabilities.ImageInput
-                )
+                modelCanReadImage(parsedModelInfo, model)
 
             let imageData: Awaited<ReturnType<typeof readImage>>
             const url = (element.attrs.url ?? element.attrs.src) as string
diff --git a/packages/service-multimodal/src/plugins/read_files.ts b/packages/service-multimodal/src/plugins/read_files.ts
index ea2e578df..896261e50 100644
--- a/packages/service-multimodal/src/plugins/read_files.ts
+++ b/packages/service-multimodal/src/plugins/read_files.ts
@@ -3,6 +3,7 @@ import { StructuredTool } from '@langchain/core/tools'
 import { HumanMessage, MessageContentComplex } from '@langchain/core/messages'
 import { Context } from 'koishi'
 import { ComputedRef, Message } from 'koishi-plugin-chatluna'
+import type {} from 'koishi-plugin-ffmpeg-path'
 import { ChatLunaChatModel } from 'koishi-plugin-chatluna/llm-core/platform/model'
 import {
     ChatLunaToolRunnable,
@@ -12,7 +13,6 @@ import { ChatLunaPlugin } from 'koishi-plugin-chatluna/services/chat'
 import {
     isMessageContentAudio,
     isMessageContentVideo,
-    type MessageContentAudio,
     type MessageContentVideo
 } from 'koishi-plugin-chatluna/utils/langchain'
 import { getBase64EncodedSize } from 'koishi-plugin-chatluna/utils/base64'
@@ -23,6 +23,18 @@ import {
     parseGifToFrames,
     processImageWithModel
 } from '../utils'
+import {
+    buildAudioContent,
+    buildImageContent,
+    isMimoAudioMime,
+    isMimoImageMime,
+    MIMO_BASE64_AUDIO_BYTES,
+    MIMO_BASE64_IMAGE_BYTES,
+    modelCanReadAudio,
+    modelCanReadImage
+} from '../audio'
+import { detectAudioMimeType } from '../media'
+import { readFilesInputSchema } from '../read_files_schema'
 import z from 'zod'
 
 // ---------------------------------------------------------------------------
@@ -97,6 +109,46 @@ function normalizeMimeType(raw: string | null): string | null {
     return mimeType || null
 }
 
+/**
+ * Look up one header on either a `Headers`-like object (anything exposing
+ * a `get(name)` method) or a plain key/value record.
+ *
+ * Plain records are searched by exact key first and then ignoring case, so
+ * a server-supplied `Content-Type` key still answers a `content-type`
+ * lookup. When the stored value is an array its first element is used.
+ *
+ * @param headers - response headers in either representation, or nullish
+ * @param name - header name to read
+ * @returns the header value, or `null` when it is missing or not a string
+ */
+function getHeaderValue(headers: unknown, name: string): string | null {
+    if (headers == null) return null
+
+    if (typeof (headers as { get?: unknown }).get === 'function') {
+        const value = (headers as { get(name: string): unknown }).get(name)
+        return typeof value === 'string' ? value : null
+    }
+
+    if (typeof headers !== 'object') return null
+
+    const record = headers as Record<string, unknown>
+    let value = record[name]
+    if (value == null) {
+        const wanted = name.toLowerCase()
+        const match = Object.keys(record).find(
+            (key) => key.toLowerCase() === wanted
+        )
+        if (match !== undefined) value = record[match]
+    }
+
+    if (typeof value === 'string') return value
+    if (Array.isArray(value) && typeof value[0] === 'string') {
+        return value[0]
+    }
+
+    return null
+}
+
 function inferMimeTypeFromPath(path: string): string | null {
     const sanitizedPath = path.toLowerCase().split(/[?#]/, 1)[0]
     const fileName = sanitizedPath.split(/[/\\]/).pop() ?? sanitizedPath
@@ -121,6 +155,44 @@ function inferMimeTypeFromUrl(url: string): string | null {
     return null
 }
 
+/**
+ * Re-encode an in-memory audio buffer as MP3 via the `ffmpeg` service.
+ *
+ * @param ctx - context that may expose the `ffmpeg` service
+ * @param buffer - encoded audio to hand to ffmpeg as its input
+ * @returns the converted bytes, or `null` when the service is not
+ * available or the conversion throws (the failure is logged as a warning)
+ */
+async function convertAudioBufferToMp3(
+    ctx: Context,
+    buffer: Buffer
+): Promise<Buffer | null> {
+    const transcoder = ctx.ffmpeg
+    if (!transcoder) return null
+
+    try {
+        // -vn drops any video stream; libmp3lame at -q:a 4 produces the MP3.
+        const job = transcoder
+            .builder()
+            .input(buffer)
+            .outputOption(
+                '-vn',
+                '-acodec',
+                'libmp3lame',
+                '-q:a',
+                '4',
+                '-f',
+                'mp3'
+            )
+
+        return await job.run('buffer')
+    } catch (error) {
+        const reason = error instanceof Error ? error.message : String(error)
+        logger.warn(`read_files audio transcoding to mp3 failed: ${reason}`)
+        return null
+    }
+}
+
 /**
  * Check whether the model natively supports a given MIME type based on its
  * capabilities and `FileHandlingConfig`.
@@ -133,9 +198,15 @@ function modelSupportsNativeMimeType(
 
     let capabilitySupportsMime = false
     if (IMAGE_MIME_TYPES.has(mimeType)) {
-        capabilitySupportsMime = caps.includes(ModelCapabilities.ImageInput)
+        capabilitySupportsMime = modelCanReadImage(
+            { value: model.modelInfo },
+            model.modelInfo.name
+        )
     } else if (mimeType.startsWith('audio/')) {
-        capabilitySupportsMime = caps.includes(ModelCapabilities.AudioInput)
+        capabilitySupportsMime = modelCanReadAudio(
+            { value: model.modelInfo },
+            model.modelInfo.name
+        )
     } else if (mimeType.startsWith('video/')) {
         capabilitySupportsMime = caps.includes(ModelCapabilities.VideoInput)
     } else if (
@@ -207,7 +278,8 @@ function buildMultimodalMessage(
         base64Data: string
         sourceUrl: string
     }[],
-    insertPrompt: string
+    insertPrompt: string,
+    model?: string
 ): HumanMessage {
     const content: MessageContentComplex[] = []
 
@@ -215,23 +287,14 @@ function buildMultimodalMessage(
         const { mimeType, base64Data } = part
 
         if (IMAGE_MIME_TYPES.has(mimeType)) {
-            content.push({
-                type: 'image_url',
-                image_url: {
-                    url: `data:${mimeType};base64,${base64Data}`
-                }
-            })
+            content.push(buildImageContent(base64Data, mimeType))
         } else if (mimeType.startsWith('audio/')) {
-            const audioContent: MessageContentAudio = {
-                type: 'audio_url',
-                audio_url: {
-                    url: `data:${mimeType};base64,${base64Data}`,
-                    mimeType
-                }
-            }
+            const audioContent = buildAudioContent(model, base64Data, mimeType)
 
             if (isMessageContentAudio(audioContent as MessageContentComplex)) {
                 content.push(audioContent as MessageContentComplex)
+            } else if (audioContent.type === 'input_audio') {
+                content.push(audioContent as MessageContentComplex)
             }
         } else if (mimeType.startsWith('video/')) {
             const videoContent: MessageContentVideo = {
@@ -274,25 +337,7 @@ export class ReadFilesTool extends StructuredTool {
     name = 'read_files'
     description: string
 
-    schema = z.object({
-        files: z
-            .union([
-                z.object({
-                    url: z.string().url()
-                }),
-                z
-                    .array(
-                        z.object({
-                            url: z.string().url()
-                        })
-                    )
-                    .min(1)
-                    .max(10)
-            ])
-            .describe(
-                'One file or a list of files to read (max 10). File format: { url: string }. MIME type is inferred from response headers, then URL extension.'
-            )
-    })
+    schema = readFilesInputSchema
 
     constructor(
         private readonly ctx: Context,
@@ -383,23 +428,21 @@ export class ReadFilesTool extends StructuredTool {
                 const buffer = Buffer.from(httpResponse.data)
 
                 // Resolve MIME type from response headers or URL
-                const headers = httpResponse.headers as unknown as
-                    | Record<string, unknown>
-                    | undefined
-                const rawCt =
-                    headers?.['content-type'] ?? headers?.['Content-Type']
-                let responseMimeType: string | null = null
-                if (typeof rawCt === 'string') {
-                    responseMimeType = normalizeMimeType(rawCt)
-                } else if (
-                    Array.isArray(rawCt) &&
-                    typeof rawCt[0] === 'string'
-                ) {
-                    responseMimeType = normalizeMimeType(rawCt[0])
-                }
+                const responseMimeType = normalizeMimeType(
+                    getHeaderValue(httpResponse.headers, 'content-type')
+                )
 
-                const mimeType =
+                const declaredMimeType =
                     responseMimeType ?? inferMimeTypeFromUrl(sourceUrl)
+                const detectedAudioMimeType = detectAudioMimeType(
+                    buffer,
+                    declaredMimeType
+                )
+                const mimeType =
+                    declaredMimeType?.startsWith('audio/') ||
+                    detectedAudioMimeType?.startsWith('audio/')
+                        ? detectedAudioMimeType
+                        : declaredMimeType
 
                 if (!mimeType) {
                     pushError(
@@ -418,23 +461,60 @@ export class ReadFilesTool extends StructuredTool {
 
                 // Check if the model supports this MIME type natively
                 const isImage = IMAGE_MIME_TYPES.has(mimeType)
+                const isAudio = mimeType.startsWith('audio/')
                 const modelSupports =
                     model != null &&
-                    modelSupportsNativeMimeType(model, mimeType)
+                    (isAudio
+                        ? modelCanReadAudio(
+                              { value: model.modelInfo },
+                              model.modelInfo.name
+                          )
+                        : modelSupportsNativeMimeType(model, mimeType))
 
                 if (modelSupports && !isImage) {
                     // Non-image file that the model supports natively -> inline inject
-                    const maxFileSize =
-                        fileConfig?.maxFileSizeBytesOverrides?.[mimeType] ??
-                        fileConfig?.maxFileSizeBytes ??
-                        DEFAULT_MAX_FILE_SIZE_BYTES
+                    let nativeBuffer: Buffer = buffer
+                    let nativeMimeType = mimeType
 
-                    const encodedSize = getBase64EncodedSize(buffer.byteLength)
+                    if (isAudio && !isMimoAudioMime(mimeType)) {
+                        const converted = await convertAudioBufferToMp3(
+                            this.ctx,
+                            buffer
+                        )
+
+                        if (!converted) {
+                            pushError(
+                                `Unsupported audio MIME type "${mimeType}" and ffmpeg conversion to MP3 failed.`,
+                                mimeType
+                            )
+                            continue
+                        }
+
+                        nativeBuffer = converted
+                        nativeMimeType = 'audio/mpeg'
+                        logger.debug(
+                            `Transcoded read_files audio from ${mimeType} to audio/mpeg for multimodal input`
+                        )
+                    }
+
+                    const maxFileSize =
+                        isMimoAudioMime(nativeMimeType) &&
+                        modelCanReadAudio(undefined, model?.modelInfo.name)
+                            ? MIMO_BASE64_AUDIO_BYTES
+                            : (fileConfig?.maxFileSizeBytesOverrides?.[
+                                  nativeMimeType
+                              ] ??
+                              fileConfig?.maxFileSizeBytes ??
+                              DEFAULT_MAX_FILE_SIZE_BYTES)
+
+                    const encodedSize = getBase64EncodedSize(
+                        nativeBuffer.byteLength
+                    )
 
                     if (encodedSize > maxFileSize) {
                         pushError(
-                            `File too large (${encodedSize} bytes after base64), max ${maxFileSize} bytes for ${mimeType}`,
-                            mimeType
+                            `File too large (${encodedSize} bytes after base64), max ${maxFileSize} bytes for ${nativeMimeType}`,
+                            nativeMimeType
                         )
                         continue
                     }
@@ -442,21 +522,21 @@ export class ReadFilesTool extends StructuredTool {
                     if (totalBase64Bytes + encodedSize > maxTotalSize) {
                         pushError(
                             `Total inline upload size too large (${totalBase64Bytes + encodedSize} bytes), max ${maxTotalSize} bytes per request`,
-                            mimeType
+                            nativeMimeType
                         )
                         continue
                     }
 
                     totalBase64Bytes += encodedSize
                     nativeParts.push({
-                        mimeType,
-                        base64Data: buffer.toString('base64'),
+                        mimeType: nativeMimeType,
+                        base64Data: nativeBuffer.toString('base64'),
                         sourceUrl
                     })
 
                     response.files.push({
                         sourceUrl,
-                        mimeType,
+                        mimeType: nativeMimeType,
                         status: 'ok'
                     })
                     response.successCount++
@@ -464,9 +544,14 @@ export class ReadFilesTool extends StructuredTool {
                     // Image that the model supports natively -> inject directly
                     // Unified per-file size check before any branching
                     const maxFileSize =
-                        fileConfig?.maxFileSizeBytesOverrides?.[mimeType] ??
-                        fileConfig?.maxFileSizeBytes ??
-                        DEFAULT_MAX_FILE_SIZE_BYTES
+                        isMimoImageMime(mimeType) &&
+                        modelCanReadImage(undefined, model?.modelInfo.name)
+                            ? MIMO_BASE64_IMAGE_BYTES
+                            : (fileConfig?.maxFileSizeBytesOverrides?.[
+                                  mimeType
+                              ] ??
+                              fileConfig?.maxFileSizeBytes ??
+                              DEFAULT_MAX_FILE_SIZE_BYTES)
 
                     const encodedSize = getBase64EncodedSize(buffer.byteLength)
 
@@ -592,7 +677,8 @@ export class ReadFilesTool extends StructuredTool {
         if (nativeParts.length > 0 && conversationId) {
             const message = buildMultimodalMessage(
                 nativeParts,
-                this.config.fileInsertPrompt
+                this.config.fileInsertPrompt,
+                model?.modelInfo.name
             )
 
             this.ctx.chatluna.contextManager.inject({
diff --git a/packages/service-multimodal/src/read_files_schema.ts b/packages/service-multimodal/src/read_files_schema.ts
new file mode 100644
index 000000000..8368f395d
--- /dev/null
+++ b/packages/service-multimodal/src/read_files_schema.ts
@@ -0,0 +1,39 @@
+import z from 'zod'
+
+// One file reference: an object carrying a syntactically valid URL.
+const fileEntrySchema = z.object({
+    url: z.string().url()
+})
+
+// `files` may be a single entry or a list of 1 to 10 entries.
+const filesSchema = z.union([
+    fileEntrySchema,
+    z.array(fileEntrySchema).min(1).max(10)
+])
+
+/**
+ * Preprocess hook for `files`: when the caller supplies a string, try to
+ * decode it as JSON so that a stringified object/array still validates.
+ * Non-string values, and strings that are not valid JSON, pass through
+ * untouched and are left for the schema to accept or reject.
+ */
+function parseJsonStringInput(value: unknown): unknown {
+    if (typeof value === 'string') {
+        try {
+            return JSON.parse(value)
+        } catch {
+            // Not JSON; fall through and return the raw string.
+        }
+    }
+
+    return value
+}
+
+/** Input schema of the `read_files` tool. */
+export const readFilesInputSchema = z.object({
+    files: z
+        .preprocess(parseJsonStringInput, filesSchema)
+        .describe(
+            'One file or a list of files to read (max 10). File format: { url: string }. MIME type is inferred from response headers, then URL extension.'
+        )
+})
diff --git a/packages/service-multimodal/tests/audio-mimo.test.ts b/packages/service-multimodal/tests/audio-mimo.test.ts
new file mode 100644
index 000000000..03b09d706
--- /dev/null
+++ b/packages/service-multimodal/tests/audio-mimo.test.ts
@@ -0,0 +1,149 @@
+import assert from 'node:assert/strict'
+import { test } from 'node:test'
+import { ModelCapabilities } from 'koishi-plugin-chatluna/llm-core/platform/types'
+import {
+    MIMO_BASE64_AUDIO_BYTES,
+    MIMO_BASE64_IMAGE_BYTES,
+    buildAudioContent,
+    buildImageContent,
+    isMimoAudioMime,
+    isMimoImageMime,
+    modelCanReadAudio,
+    modelCanReadImage
+} from '../src/audio'
+import { detectAudioMimeType } from '../src/media'
+import { readFilesInputSchema } from '../src/read_files_schema'
+
+test('recognizes MiMo audio models even when adapter metadata lacks AudioInput', () => {
+    assert.equal(
+        modelCanReadAudio(
+            { value: { capabilities: [ModelCapabilities.ToolCall] } },
+            'unifyllm/mimo-v2.5'
+        ),
+        true
+    )
+    assert.equal(
+        modelCanReadAudio(
+            { value: { capabilities: [ModelCapabilities.ToolCall] } },
+            'mimo-v2-omni'
+        ),
+        true
+    )
+    assert.equal(
+        modelCanReadAudio(
+            { value: { capabilities: [ModelCapabilities.ToolCall] } },
+            'unifyllm/deepseek-v4-flash'
+        ),
+        false
+    )
+})
+
+test('uses MiMo input_audio data URL instead of ChatLuna audio_url', () => {
+    assert.deepEqual(buildAudioContent('mimo-v2.5', 'abc', 'audio/mpeg'), {
+        type: 'input_audio',
+        input_audio: {
+            data: 'data:audio/mpeg;base64,abc'
+        }
+    })
+    assert.deepEqual(buildAudioContent('gpt-4o-audio', 'abc', 'audio/mpeg'), {
+        type: 'audio_url',
+        audio_url: {
+            url: 'data:audio/mpeg;base64,abc',
+            mimeType: 'audio/mpeg'
+        }
+    })
+})
+
+test('keeps MiMo base64 audio within the documented 50 MB limit', () => {
+    assert.equal(MIMO_BASE64_AUDIO_BYTES, 50 * 1024 * 1024)
+    assert.equal(isMimoAudioMime('audio/mpeg'), true)
+    assert.equal(isMimoAudioMime('audio/wav'), true)
+    assert.equal(isMimoAudioMime('audio/flac'), true)
+    assert.equal(isMimoAudioMime('audio/mp4'), true)
+    assert.equal(isMimoAudioMime('audio/ogg'), true)
+    assert.equal(isMimoAudioMime('audio/aac'), false)
+})
+
+test('recognizes MiMo image models even when adapter metadata lacks ImageInput', () => {
+    assert.equal(
+        modelCanReadImage(
+            { value: { capabilities: [ModelCapabilities.ToolCall] } },
+            'unifyllm/mimo-v2.5'
+        ),
+        true
+    )
+    assert.equal(
+        modelCanReadImage(
+            { value: { capabilities: [ModelCapabilities.ToolCall] } },
+            'mimo-v2-omni'
+        ),
+        true
+    )
+    assert.equal(
+        modelCanReadImage(
+            { value: { capabilities: [ModelCapabilities.ToolCall] } },
+            'unifyllm/deepseek-v4-flash'
+        ),
+        false
+    )
+})
+
+test('uses OpenAI image_url content for MiMo images', () => {
+    assert.deepEqual(buildImageContent('abc', 'image/png'), {
+        type: 'image_url',
+        image_url: {
+            url: 'data:image/png;base64,abc'
+        }
+    })
+})
+
+test('keeps MiMo base64 images within the documented 50 MB limit', () => {
+    assert.equal(MIMO_BASE64_IMAGE_BYTES, 50 * 1024 * 1024)
+    assert.equal(isMimoImageMime('image/jpeg'), true)
+    assert.equal(isMimoImageMime('image/png'), true)
+    assert.equal(isMimoImageMime('image/gif'), true)
+    assert.equal(isMimoImageMime('image/webp'), true)
+    assert.equal(isMimoImageMime('image/bmp'), true)
+    assert.equal(isMimoImageMime('image/svg+xml'), false)
+})
+
+test('accepts JSON-stringified read_files input from tool calls', () => {
+    assert.deepEqual(
+        readFilesInputSchema.parse({
+            files: '{"url":"http://127.0.0.1:5140/image.png"}'
+        }),
+        {
+            files: {
+                url: 'http://127.0.0.1:5140/image.png'
+            }
+        }
+    )
+
+    assert.deepEqual(
+        readFilesInputSchema.parse({
+            files: '[{"url":"http://127.0.0.1:5140/image.png"}]'
+        }),
+        {
+            files: [
+                {
+                    url: 'http://127.0.0.1:5140/image.png'
+                }
+            ]
+        }
+    )
+})
+
+test('detects AMR audio even when storage declares it as MP3', () => {
+    assert.equal(
+        detectAudioMimeType(Buffer.from('#!AMR\nabc'), 'audio/mp3'),
+        'audio/amr'
+    )
+    assert.equal(
+        detectAudioMimeType(Buffer.from('#!AMR\nabc'), null),
+        'audio/amr'
+    )
+    assert.equal(
+        detectAudioMimeType(Buffer.from('ID3abc'), 'audio/mp3'),
+        'audio/mpeg'
+    )
+})

From 6ac2a42ffd72b340dff1dc1c751a22cb0ede72b1 Mon Sep 17 00:00:00 2001
From: yabo083 <sz7372797@gmail.com>
Date: Mon, 18 May 2026 03:31:32 +0800
Subject: [PATCH 2/7] fix(service-multimodal): prevent JPEG from being
 misdetected as MP3

`detectAudioMimeType` checked only `buffer[0] === 0xFF` to identify
MP3 frame sync, but JPEG files also start with 0xFF (FF D8).
This caused every JPEG passed through `read_files` to be injected
into the conversation as `audio/mpeg`, crashing model APIs that
reject unsupported audio formats.

Tighten the check to require the full 11-bit MPEG frame sync (0xFF followed
by a byte whose top three bits are set):
`buffer[0] === 0xFF && (buffer[1] & 0xE0) === 0xE0`.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 packages/service-multimodal/src/media.ts      |  5 ++++-
 .../tests/audio-mimo.test.ts                  | 20 +++++++++++++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/packages/service-multimodal/src/media.ts b/packages/service-multimodal/src/media.ts
index 0c2cd3602..0e458913c 100644
--- a/packages/service-multimodal/src/media.ts
+++ b/packages/service-multimodal/src/media.ts
@@ -11,7 +11,10 @@ export function detectAudioMimeType(
     ) {
         return 'audio/silk'
     }
-    if (header.startsWith('ID3') || buffer[0] === 0xff) {
+    if (
+        header.startsWith('ID3') ||
+        (buffer[0] === 0xff && (buffer[1] & 0xe0) === 0xe0)
+    ) {
         return 'audio/mpeg'
     }
     if (
diff --git a/packages/service-multimodal/tests/audio-mimo.test.ts b/packages/service-multimodal/tests/audio-mimo.test.ts
index 03b09d706..240e67c3d 100644
--- a/packages/service-multimodal/tests/audio-mimo.test.ts
+++ b/packages/service-multimodal/tests/audio-mimo.test.ts
@@ -147,3 +147,23 @@ test('detects AMR audio even when storage declares it as MP3', () => {
         'audio/mpeg'
     )
 })
+
+test('does not misidentify JPEG as audio/mpeg', () => {
+    // JPEG starts with FF D8 FF E0 (JFIF) — 0xD8 & 0xE0 = 0xC0, not an MP3 sync
+    const jpegHeader = Buffer.from([0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46])
+    assert.equal(
+        detectAudioMimeType(jpegHeader, 'image/jpeg'),
+        'image/jpeg'
+    )
+    assert.equal(detectAudioMimeType(jpegHeader, null), null)
+})
+
+test('still detects valid MP3 frame sync', () => {
+    // MP3: FF FB (MPEG1 Layer3) — 0xFB & 0xE0 = 0xE0, valid sync
+    const mp3Header = Buffer.from([0xff, 0xfb, 0x90, 0x00])
+    assert.equal(detectAudioMimeType(mp3Header, null), 'audio/mpeg')
+
+    // MP3: FF F3 (MPEG2 Layer3)
+    const mp3v2Header = Buffer.from([0xff, 0xf3, 0x90, 0x00])
+    assert.equal(detectAudioMimeType(mp3v2Header, null), 'audio/mpeg')
+})

From 8819b136169d598df175cec72bc31fe16032eb55 Mon Sep 17 00:00:00 2001
From: dingyi <dingyi222666@foxmail.com>
Date: Mon, 18 May 2026 14:41:04 +0800
Subject: [PATCH 3/7] [Feature] support native multimodal file handling

---
 packages/adapter-claude/src/utils.ts          |  34 +-
 packages/adapter-gemini/src/utils.ts          |  41 +-
 packages/adapter-ollama/src/utils.ts          |  33 +-
 packages/adapter-openai-like/src/client.ts    |   6 +
 packages/adapter-openai/src/client.ts         |   6 +
 packages/adapter-qwen/src/utils.ts            |  52 +-
 packages/service-multimodal/README.md         |  28 +-
 packages/service-multimodal/package.json      |   4 +-
 packages/service-multimodal/src/audio.ts      | 111 ---
 packages/service-multimodal/src/index.ts      |   8 +-
 packages/service-multimodal/src/media.ts      |  30 -
 .../service-multimodal/src/plugins/audio.ts   | 410 ++------
 .../service-multimodal/src/plugins/image.ts   | 209 ++--
 .../src/plugins/read_files.ts                 | 922 +++++++-----------
 .../src/read_files_schema.ts                  |  31 -
 packages/service-multimodal/src/utils.ts      | 443 +++++----
 .../tests/audio-mimo.test.ts                  | 169 ----
 packages/shared-adapter/src/client.ts         |  81 +-
 packages/shared-adapter/src/utils.ts          | 146 ++-
 19 files changed, 1022 insertions(+), 1742 deletions(-)
 delete mode 100644 packages/service-multimodal/src/audio.ts
 delete mode 100644 packages/service-multimodal/src/media.ts
 delete mode 100644 packages/service-multimodal/src/read_files_schema.ts
 delete mode 100644 packages/service-multimodal/tests/audio-mimo.test.ts

diff --git a/packages/adapter-claude/src/utils.ts b/packages/adapter-claude/src/utils.ts
index 06c8f8e9e..b9d38a422 100644
--- a/packages/adapter-claude/src/utils.ts
+++ b/packages/adapter-claude/src/utils.ts
@@ -56,41 +56,15 @@ export async function langchainMessageToClaudeMessage(
 
     const mappedMessages = await Promise.all(
         messages.map(async (rawMessage) => {
-            let content: string | ClaudeInputContentBlockParam[] | undefined =
+            const content: string | ClaudeInputContentBlockParam[] | undefined =
                 typeof rawMessage.content === 'string'
                     ? rawMessage.content
                     : await processMessageContent(plugin, rawMessage.content)
 
-            const images = rawMessage.additional_kwargs.images as
-                | string[]
-                | null
-
-            if (
-                (model?.includes('claude-3') || model?.includes('claude-4')) &&
-                images != null
-            ) {
-                const mappedImages = await Promise.all(
-                    images.map(async (image) =>
-                        processImageContent(plugin, {
-                            type: 'image_url',
-                            image_url: { url: image }
-                        } as MessageContentImageUrl)
-                    )
+            if (rawMessage.additional_kwargs.images != null) {
+                logger.warn(
+                    'Deprecated: `additional_kwargs.images` is no longer supported. Use `image_url` content parts instead.'
                 )
-
-                const nextContent: ClaudeInputContentBlockParam[] =
-                    mappedImages.filter((item) => item != null)
-
-                if (Array.isArray(content)) {
-                    nextContent.push(...content)
-                } else if ((content?.length ?? 0) > 0) {
-                    nextContent.push({
-                        type: 'text',
-                        text: content
-                    })
-                }
-
-                content = nextContent
             }
 
             const result: ClaudeMessage = {
diff --git a/packages/adapter-gemini/src/utils.ts b/packages/adapter-gemini/src/utils.ts
index 3875a1b95..9f5a0c5ba 100644
--- a/packages/adapter-gemini/src/utils.ts
+++ b/packages/adapter-gemini/src/utils.ts
@@ -14,7 +14,6 @@ import {
     ChatCompletionResponseMessageRoleEnum,
     ChatFunctionCallingPart,
     ChatFunctionResponsePart,
-    ChatMessagePart,
     ChatPart,
     ChatResponse,
     GeminiUsageMetadata
@@ -78,9 +77,10 @@ export async function langchainMessageToGeminiMessage(
                           thoughtData
                       )
 
-            const images = message.additional_kwargs.images as string[] | null
-            if (images) {
-                processImageParts(result, images, model)
+            if (message.additional_kwargs.images != null) {
+                logger.warn(
+                    'Deprecated: `additional_kwargs.images` is no longer supported. Use `image_url` content parts instead.'
+                )
             }
 
             return result
@@ -203,39 +203,6 @@ async function processFunctionMessage(
         ]
     }
 }
-function processImageParts(
-    result: ChatCompletionResponseMessage,
-    images: string[],
-    model: string
-) {
-    if (
-        !(
-            (model.includes('vision') ||
-                model.includes('gemini') ||
-                model.includes('gemma2')) &&
-            !model.includes('gemini-1.0')
-        )
-    ) {
-        return
-    }
-
-    for (const image of images) {
-        const mineType = image.split(';')?.[0]?.split(':')?.[1] ?? 'image/jpeg'
-        const data = image.replace(/^data:image\/\w+;base64,/, '')
-
-        result.parts.push({
-            inline_data: { data, mime_type: mineType }
-        })
-    }
-
-    result.parts = result.parts.filter((uncheckedPart) => {
-        const part = partAsTypeCheck<ChatMessagePart>(
-            uncheckedPart,
-            (part) => part['text'] != null
-        )
-        return part == null || part.text.length > 0
-    })
-}
 
 async function processGeminiImageContent(
     plugin: ChatLunaPlugin,
diff --git a/packages/adapter-ollama/src/utils.ts b/packages/adapter-ollama/src/utils.ts
index 30f671cc6..946e2bab1 100644
--- a/packages/adapter-ollama/src/utils.ts
+++ b/packages/adapter-ollama/src/utils.ts
@@ -21,25 +21,24 @@ export async function langchainMessageToOllamaMessage(
 
     const mappedMessage = await Promise.all(
         messages.map(async (rawMessage) => {
-            let images: string[] = []
-
-            if (rawMessage.additional_kwargs.images != null && supportImage) {
-                images = rawMessage.additional_kwargs.images as string[]
-            } else {
-                images =
-                    typeof rawMessage.content === 'string'
-                        ? undefined
-                        : await Promise.all(
-                              rawMessage.content
-                                  .filter((part) =>
-                                      isMessageContentImageUrl(part)
-                                  )
-                                  .map((part) =>
-                                      processOllamaImageContent(plugin, part)
-                                  )
-                          )
+            if (rawMessage.additional_kwargs.images != null) {
+                logger.warn(
+                    'Deprecated: `additional_kwargs.images` is no longer supported. Use `image_url` content parts instead.'
+                )
             }
 
+            const images: string[] | undefined = supportImage
+                ? typeof rawMessage.content === 'string'
+                    ? undefined
+                    : await Promise.all(
+                          rawMessage.content
+                              .filter((part) => isMessageContentImageUrl(part))
+                              .map((part) =>
+                                  processOllamaImageContent(plugin, part)
+                              )
+                      )
+                : undefined
+
             const result = {
                 role: messageTypeToOllamaRole(rawMessage.getType()),
                 content: getMessageContent(rawMessage.content),
diff --git a/packages/adapter-openai-like/src/client.ts b/packages/adapter-openai-like/src/client.ts
index 20403fdf3..3435f9df2 100644
--- a/packages/adapter-openai-like/src/client.ts
+++ b/packages/adapter-openai-like/src/client.ts
@@ -20,10 +20,12 @@ import { OpenAIRequester } from './requester'
 import { ChatLunaPlugin } from 'koishi-plugin-chatluna/services/chat'
 import {
     getModelMaxContextSize,
+    getOpenAIFileHandlingConfig,
     isEmbeddingModel,
     isImageGenerationModel,
     isNonLLMModel,
     isRerankerModel,
+    supportAudioInput,
     supportImageInput
 } from '@chatluna/v1-shared-adapter'
 import { RunnableConfig } from '@langchain/core/runnables'
@@ -92,6 +94,9 @@ export class OpenAIClient extends PlatformModelEmbeddingsAndRerankerClient {
                         ModelCapabilities.ToolCall,
                         supportImageInput(model)
                             ? ModelCapabilities.ImageInput
+                            : null,
+                        supportAudioInput(model)
+                            ? ModelCapabilities.AudioInput
                             : null
                     ].filter(Boolean)
                 }
@@ -167,6 +172,7 @@ export class OpenAIClient extends PlatformModelEmbeddingsAndRerankerClient {
                 temperature: this._config.temperature,
                 maxRetries: this._config.maxRetries,
                 llmType: 'openai',
+                fileHandlingConfig: getOpenAIFileHandlingConfig(model),
                 isThinkModel:
                     model.includes('reasoner') ||
                     model.includes('r1') ||
diff --git a/packages/adapter-openai/src/client.ts b/packages/adapter-openai/src/client.ts
index 6a4857855..0e0f0ffc1 100644
--- a/packages/adapter-openai/src/client.ts
+++ b/packages/adapter-openai/src/client.ts
@@ -20,6 +20,8 @@ import { OpenAIRequester } from './requester'
 import { ChatLunaPlugin } from 'koishi-plugin-chatluna/services/chat'
 import {
     getModelMaxContextSize,
+    getOpenAIFileHandlingConfig,
+    supportAudioInput,
     supportImageInput
 } from '@chatluna/v1-shared-adapter'
 import { RunnableConfig } from '@langchain/core/runnables'
@@ -84,6 +86,9 @@ export class OpenAIClient extends PlatformModelAndEmbeddingsClient<ClientConfig>
                             ModelCapabilities.ToolCall,
                             supportImageInput(model)
                                 ? ModelCapabilities.ImageInput
+                                : undefined,
+                            supportAudioInput(model)
+                                ? ModelCapabilities.AudioInput
                                 : undefined
                         ].filter(Boolean)
                     } as ModelInfo
@@ -125,6 +130,7 @@ export class OpenAIClient extends PlatformModelAndEmbeddingsClient<ClientConfig>
                 timeout: this._config.timeout,
                 temperature: this._config.temperature,
                 maxRetries: this._config.maxRetries,
+                fileHandlingConfig: getOpenAIFileHandlingConfig(model),
                 llmType: 'openai'
             })
         }
diff --git a/packages/adapter-qwen/src/utils.ts b/packages/adapter-qwen/src/utils.ts
index 09201bb36..9491127cf 100644
--- a/packages/adapter-qwen/src/utils.ts
+++ b/packages/adapter-qwen/src/utils.ts
@@ -5,7 +5,6 @@ import {
     ChatMessageChunk,
     FunctionMessageChunk,
     HumanMessageChunk,
-    MessageContentImageUrl,
     MessageType,
     SystemMessageChunk,
     ToolMessage,
@@ -21,11 +20,11 @@ import {
 } from './types'
 import {
     fetchImageUrl,
-    removeAdditionalProperties,
-    supportImageInput
+    removeAdditionalProperties
 } from '@chatluna/v1-shared-adapter'
 import { ChatLunaPlugin } from 'koishi-plugin-chatluna/services/chat'
 import { isZodSchemaV3 } from '@langchain/core/utils/types'
+import { logger } from '.'
 
 export function formatToolsToQWenTools(
     tools: StructuredTool[]
@@ -113,50 +112,13 @@ export async function langchainMessageToQWenMessage(
             }
         }
 
-        const images = rawMessage.additional_kwargs.images as string[] | null
-
-        if (
-            (model?.includes('qwen-vl') ||
-                model?.includes('omni') ||
-                model?.includes('qwen2.5-vl') ||
-                model?.includes('qwen2.5-omni') ||
-                model?.includes('qwen-omni') ||
-                model?.includes('qwen2-vl') ||
-                model?.includes('qvq') ||
-                supportImageInput(model)) &&
-            images != null
-        ) {
-            msg.content = [
-                {
-                    type: 'text',
-                    text: rawMessage.content as string
-                }
-            ]
-
-            const imageContents = await Promise.all(
-                images.map(async (image) => {
-                    try {
-                        const url = await fetchImageUrl(plugin, {
-                            type: 'image_url',
-                            image_url: { url: image }
-                        } as MessageContentImageUrl)
-                        return {
-                            type: 'image_url',
-                            image_url: {
-                                url,
-                                detail: 'low'
-                            }
-                        } as const
-                    } catch {
-                        return null
-                    }
-                })
+        if (rawMessage.additional_kwargs.images != null) {
+            logger.warn(
+                'Deprecated: `additional_kwargs.images` is no longer supported. Use `image_url` content parts instead.'
             )
+        }
 
-            msg.content.push(
-                ...imageContents.filter((content) => content != null)
-            )
-        } else if (Array.isArray(msg.content) && msg.content.length > 0) {
+        if (Array.isArray(msg.content) && msg.content.length > 0) {
             const mappedContent = await Promise.all(
                 msg.content.map(async (content) => {
                     if (!isMessageContentImageUrl(content)) return content
diff --git a/packages/service-multimodal/README.md b/packages/service-multimodal/README.md
index a630b5799..84e92d7bf 100644
--- a/packages/service-multimodal/README.md
+++ b/packages/service-multimodal/README.md
@@ -1,29 +1,7 @@
 ## koishi-plugin-chatluna-multimodal-service
 
-ChatLuna 多模态支持服务，提供上下文图像描述、GIF 帧处理、`read_files` 文件读取，以及语音消息转码注入能力。
+## [![npm](https://img.shields.io/npm/v/koishi-plugin-chatluna-multimodal-service)](https://www.npmjs.com/package/koishi-plugin-chatluna-multimodal-service) [![npm](https://img.shields.io/npm/dm/koishi-plugin-chatluna-multimodal-service)](https://www.npmjs.com/package/koishi-plugin-chatluna-multimodal-service)
 
-### MiMo 音频理解
+> ChatLuna 的多模态服务插件，提供上下文图像/语音描述、GIF 处理与 `read_files` 文件读取工具。
 
-MiMo 官方 OpenAI 兼容接口中，`mimo-v2.5` 与 `mimo-v2-omni` 支持音频理解。服务会将 QQ/OneBot 语音先下载到本地内存，必要时通过 `ffmpeg` 转成 MP3，再以 Base64 data URL 注入 `input_audio`：
-
-- 避免 QQ CDN 直链过期导致模型侧晚读失败。
-- 规避 AMR、Silk 等上游模型不稳定支持的格式。
-- 遵循 MiMo Base64 单音频 50 MB 上限；URL 输入的官方上限是单文件 100 MB。
-
-MiMo 官方列出的音频格式为 MP3、WAV、FLAC、M4A、OGG。实际变体较多，服务默认把语音消息转成 MP3 以提高稳定性。
-
-`read_files` 也会沿用这条路线：工具调用层如果把 `files` 传成 JSON 字符串，会先容错解析；音频 URL 即使被缓存服务误标为 MP3，也会按文件头识别 AMR/Silk 等实际格式，并在模型注入前通过 `ffmpeg` 转成 MP3。
-
-### MiMo 图片理解
-
-`mimo-v2.5` 与 `mimo-v2-omni` 也支持图片理解。即使 OpenAI 兼容适配器暂未在模型元数据中声明 `ImageInput`，服务也会把这两个 MiMo 模型视为原生图片输入模型，并使用标准 OpenAI 兼容 `image_url` 内容块注入 Base64 data URL。
-
-- 支持 JPEG、PNG、GIF、WebP、BMP。
-- MiMo Base64 单图片上限为 50 MB；URL 单图片官方上限同样为 50 MB。
-- 多图输入受模型上下文和 token 长度限制。
-
-音频消息转码需要启用：
-
-- `enableAudioFfmpegConversion`
-- `koishi-plugin-ffmpeg-path`
-- 官方 Bot/QQ Silk 语音还需要 `koishi-plugin-ffmpeg-path` 提供的 `silk` 服务
+[多模态插件文档](https://chatluna.chat/ecosystem/plugin/multimodal-service.html)
diff --git a/packages/service-multimodal/package.json b/packages/service-multimodal/package.json
index f79bab824..98586c1f9 100644
--- a/packages/service-multimodal/package.json
+++ b/packages/service-multimodal/package.json
@@ -34,8 +34,7 @@
     },
     "homepage": "https://github.com/ChatLunaLab/chatluna/tree/v1-dev/packages/service-image#readme",
     "scripts": {
-        "build": "atsc -b",
-        "test": "tsx --test tests/*.test.ts"
+        "build": "atsc -b"
     },
     "keywords": [
         "chatbot",
@@ -58,7 +57,6 @@
         "@types/omggif": "^1.0.5",
         "atsc": "^2.1.0",
         "koishi": "^4.18.9",
-        "koishi-plugin-adapter-onebot": "^6.9.3",
         "koishi-plugin-ffmpeg-path": "^2.0.0"
     },
     "peerDependencies": {
diff --git a/packages/service-multimodal/src/audio.ts b/packages/service-multimodal/src/audio.ts
deleted file mode 100644
index 71aa25f00..000000000
--- a/packages/service-multimodal/src/audio.ts
+++ /dev/null
@@ -1,111 +0,0 @@
-import { MessageContentComplex } from '@langchain/core/messages'
-import { ModelCapabilities } from 'koishi-plugin-chatluna/llm-core/platform/types'
-
-export const MIMO_BASE64_AUDIO_BYTES = 50 * 1024 * 1024
-export const MIMO_BASE64_IMAGE_BYTES = 50 * 1024 * 1024
-
-const mimoModels = new Set(['mimo-v2.5', 'mimo-v2-omni'])
-
-const mimoAudioMimes = new Set([
-    'audio/mpeg',
-    'audio/mp3',
-    'audio/wav',
-    'audio/flac',
-    'audio/mp4',
-    'audio/ogg'
-])
-
-const mimoImageMimes = new Set([
-    'image/jpeg',
-    'image/png',
-    'image/gif',
-    'image/webp',
-    'image/bmp'
-])
-
-export function isMimoAudioModel(model?: string): boolean {
-    if (!model) return false
-    return mimoModels.has(model.split('/').pop()?.toLowerCase() ?? '')
-}
-
-export function isMimoImageModel(model?: string): boolean {
-    if (!model) return false
-    return mimoModels.has(model.split('/').pop()?.toLowerCase() ?? '')
-}
-
-export function isMimoAudioMime(mime: string): boolean {
-    return mimoAudioMimes.has(mime.toLowerCase())
-}
-
-export function isMimoImageMime(mime: string): boolean {
-    return mimoImageMimes.has(mime.toLowerCase())
-}
-
-export function modelCanReadAudio(
-    info:
-        | {
-              value?: {
-                  capabilities?: ModelCapabilities[]
-              }
-          }
-        | undefined,
-    model?: string
-): boolean {
-    return (
-        isMimoAudioModel(model) ||
-        info?.value?.capabilities?.includes(ModelCapabilities.AudioInput) ===
-            true
-    )
-}
-
-export function modelCanReadImage(
-    info:
-        | {
-              value?: {
-                  capabilities?: ModelCapabilities[]
-              }
-          }
-        | undefined,
-    model?: string
-): boolean {
-    return (
-        isMimoImageModel(model) ||
-        info?.value?.capabilities?.includes(ModelCapabilities.ImageInput) ===
-            true
-    )
-}
-
-export function buildAudioContent(
-    model: string | undefined,
-    data: string,
-    mime: string
-): MessageContentComplex {
-    if (isMimoAudioModel(model)) {
-        return {
-            type: 'input_audio',
-            input_audio: {
-                data: `data:${mime};base64,${data}`
-            }
-        } as unknown as MessageContentComplex
-    }
-
-    return {
-        type: 'audio_url',
-        audio_url: {
-            url: `data:${mime};base64,${data}`,
-            mimeType: mime
-        }
-    } as unknown as MessageContentComplex
-}
-
-export function buildImageContent(
-    data: string,
-    mime: string
-): MessageContentComplex {
-    return {
-        type: 'image_url',
-        image_url: {
-            url: `data:${mime};base64,${data}`
-        }
-    } as unknown as MessageContentComplex
-}
diff --git a/packages/service-multimodal/src/index.ts b/packages/service-multimodal/src/index.ts
index b7b402a42..62ff61556 100644
--- a/packages/service-multimodal/src/index.ts
+++ b/packages/service-multimodal/src/index.ts
@@ -83,7 +83,7 @@ export const Config: Schema<Config> = Schema.intersect([
 
 export const inject = {
     required: ['chatluna'],
-    optional: ['chatluna_storage', 'ffmpeg', 'silk']
+    optional: ['ffmpeg', 'silk']
 }
 
 export const name = 'chatluna-multimodal-service'
@@ -99,10 +99,4 @@ export const usage = `
 
 ### 注意
 建议搭配 \`chatluna-storage-service\` 使用。请求中的图像、文件大小限制遵循模型平台配置（如 Gemini：PDF 单文件 50MB、其他单文件 100MB、单轮总计 100MB，以文件被编码为 Base64 后的大小为准）。
-
-### MiMo 音频理解
-\`mimo-v2.5\` 与 \`mimo-v2-omni\` 的音频理解走 OpenAI 兼容 \`input_audio\`。启用音频转换后，服务会先读取语音 URL，必要时用 ffmpeg/Silk 转为 MP3，再以 Base64 data URL 注入模型，避免 QQ CDN 过期和 AMR/Silk 等格式兼容问题。MiMo Base64 单音频上限为 50MB。
-
-### MiMo 图片理解
-\`mimo-v2.5\` 与 \`mimo-v2-omni\` 的图片理解走 OpenAI 兼容 \`image_url\`。即使适配器暂未声明 \`ImageInput\`，服务也会按 MiMo 官方能力接入 JPEG、PNG、GIF、WebP、BMP，Base64 与 URL 单图片上限均为 50MB，多图受模型上下文限制。
 `
diff --git a/packages/service-multimodal/src/media.ts b/packages/service-multimodal/src/media.ts
deleted file mode 100644
index 0e458913c..000000000
--- a/packages/service-multimodal/src/media.ts
+++ /dev/null
@@ -1,30 +0,0 @@
-export function detectAudioMimeType(
-    buffer: Buffer,
-    declaredMimeType?: string | null
-): string | null {
-    const header = buffer.subarray(0, 16).toString('latin1')
-
-    if (header.startsWith('#!AMR')) return 'audio/amr'
-    if (
-        header.startsWith('#!SILK_V3') ||
-        buffer.subarray(1, 10).toString('latin1') === '#!SILK_V3'
-    ) {
-        return 'audio/silk'
-    }
-    if (
-        header.startsWith('ID3') ||
-        (buffer[0] === 0xff && (buffer[1] & 0xe0) === 0xe0)
-    ) {
-        return 'audio/mpeg'
-    }
-    if (
-        header.startsWith('RIFF') &&
-        buffer.subarray(8, 12).toString('latin1') === 'WAVE'
-    ) {
-        return 'audio/wav'
-    }
-    if (header.startsWith('fLaC')) return 'audio/flac'
-    if (header.startsWith('OggS')) return 'audio/ogg'
-
-    return declaredMimeType ?? null
-}
diff --git a/packages/service-multimodal/src/plugins/audio.ts b/packages/service-multimodal/src/plugins/audio.ts
index 57ccf5f64..6afad7fab 100644
--- a/packages/service-multimodal/src/plugins/audio.ts
+++ b/packages/service-multimodal/src/plugins/audio.ts
@@ -1,109 +1,93 @@
 import { MessageContentComplex } from '@langchain/core/messages'
 import { Context, h, Session } from 'koishi'
 import type { OneBotBot } from 'koishi-plugin-adapter-onebot'
-import { Message } from 'koishi-plugin-chatluna'
-import type {} from 'koishi-plugin-chatluna-storage-service'
+import { ModelCapabilities } from 'koishi-plugin-chatluna/llm-core/platform/types'
 import type {} from 'koishi-plugin-ffmpeg-path'
 import { Config, logger } from '..'
 import {
-    buildAudioContent,
-    isMimoAudioModel,
-    MIMO_BASE64_AUDIO_BYTES,
-    modelCanReadAudio
-} from '../audio'
-
-const CHATLUNA_DOWNLOAD_USER_AGENT =
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
-const MAX_AUDIO_BYTES = 25 * 1024 * 1024
+    BROWSER_UA,
+    convertAudioToMp3,
+    detectAudioMimeType,
+    ensureContentArray
+} from '../utils'
+
+// MIMEs commonly accepted by OpenAI / Gemini / MiMo audio inputs. Anything
+// else (Silk, AMR, ...) is transcoded to MP3.
+const NATIVE_AUDIO_MIMES = new Set([
+    'audio/mpeg',
+    'audio/mp3',
+    'audio/wav',
+    'audio/flac',
+    'audio/ogg',
+    'audio/mp4',
+    'audio/aac',
+    'audio/webm'
+])
+
+const MIME_TO_EXT: Record<string, string> = {
+    'audio/mpeg': 'mp3',
+    'audio/mp3': 'mp3',
+    'audio/wav': 'wav',
+    'audio/flac': 'flac',
+    'audio/ogg': 'ogg',
+    'audio/mp4': 'm4a'
+}
 
+/**
+ * Intercept voice/audio elements: download, transcode unfriendly formats
+ * (Silk/AMR/...) to MP3, then inject as a Base64 `audio_url` content part.
+ * OpenAI-compatible adapters convert the result to `input_audio` downstream.
+ */
 export function apply(ctx: Context, config: Config) {
-    if (!config.enableAudioFfmpegConversion) {
-        return
-    }
+    if (!config.enableAudioFfmpegConversion) return
 
     ctx.effect(() =>
         ctx.chatluna.messageTransformer.intercept(
             'audio',
             async (session, element, message, model) => {
-                const modelInfo = model
-                    ? ctx.chatluna.platform.findModel(model)
-                    : undefined
+                if (!modelAcceptsAudio(ctx, model)) return false
 
-                if (!modelCanReadAudio(modelInfo, model)) {
-                    return false
-                }
+                const sourceUrl = await resolveAudioSourceUrl(session, element)
+                if (!sourceUrl) return false
 
-                const sourceUrl = await resolveAudioSourceUrl(
-                    ctx,
-                    session,
-                    element
-                )
-                if (!sourceUrl) {
-                    return false
-                }
-
-                const fileName =
-                    element.attrs['file'] ??
-                    element.attrs['name'] ??
-                    element.attrs['filename']
-
-                const fileData = await readFile(ctx, sourceUrl)
-                if (!fileData.buffer) {
-                    return false
-                }
+                const buffer = await downloadAudio(ctx, sourceUrl)
+                if (!buffer) return false
 
-                const converted = await tryConvertAudioToMp3(
-                    ctx,
-                    fileData.buffer,
-                    fileName
+                const detected = detectAudioMimeType(
+                    buffer,
+                    element.attrs['mime'] as string | null
                 )
-                if (!converted) {
-                    logger.warn(`Failed to convert audio to MP3: ${sourceUrl}`)
-                    return false
-                }
-
-                const { fileName: displayFileName, buffer } = converted
-                element.attrs['file'] = displayFileName
-                element.attrs['filename'] = displayFileName
-
-                const base64 = buffer.toString('base64')
 
-                if (
-                    isMimoAudioModel(model) &&
-                    Buffer.byteLength(base64) > MIMO_BASE64_AUDIO_BYTES
-                ) {
-                    logger.warn(
-                        `Skip oversized MiMo audio after base64 encoding: ${Buffer.byteLength(base64)} bytes > ${MIMO_BASE64_AUDIO_BYTES} bytes`
-                    )
-                    return false
+                let outBuffer = buffer
+                let outMime = detected ?? 'audio/mpeg'
+
+                if (!detected || !NATIVE_AUDIO_MIMES.has(detected)) {
+                    const converted = await convertAudioToMp3(ctx, buffer)
+                    if (!converted) {
+                        logger.warn(
+                            `Skip audio: format ${detected ?? 'unknown'} not natively supported and ffmpeg conversion failed.`
+                        )
+                        return false
+                    }
+                    outBuffer = converted
+                    outMime = 'audio/mpeg'
                 }
 
-                const audioUrl =
-                    !isMimoAudioModel(model) && ctx.chatluna_storage
-                        ? (element.attrs['chatluna_file_url'] = (
-                              await ctx.chatluna_storage.createTempFile(
-                                  buffer,
-                                  displayFileName
-                              )
-                          ).url)
-                        : ((element.attrs['chatluna_file_url'] = sourceUrl),
-                          `data:audio/mpeg;base64,${base64}`)
+                const dataUrl = `data:${outMime};base64,${outBuffer.toString('base64')}`
+                // aac/webm pass natively but lack a MIME_TO_EXT entry.
+                const ext = MIME_TO_EXT[outMime] ?? outMime.split('/')[1]
+                const fileName = `${stripExtension(audioName(element))}.${ext}`
+                element.attrs['file'] = element.attrs['filename'] = fileName
+                element.attrs['chatluna_file_url'] = sourceUrl
 
-                ensureContentArray(message, `[voice:${displayFileName}]`)
-                ;(message.content as MessageContentComplex[]).push(
-                    isMimoAudioModel(model)
-                        ? buildAudioContent(model, base64, 'audio/mpeg')
-                        : ({
-                              type: 'audio_url',
-                              audio_url: {
-                                  url: audioUrl,
-                                  mimeType: 'audio/mpeg'
-                              }
-                          } as unknown as MessageContentComplex)
-                )
+                ensureContentArray(message, `[voice:${fileName}]`)
+                ;(message.content as MessageContentComplex[]).push({
+                    type: 'audio_url',
+                    audio_url: { url: dataUrl, mimeType: outMime }
+                } as unknown as MessageContentComplex)
 
                 logger.debug(
-                    `Transcoded unsupported audio to mp3 for multimodal input: ${displayFileName}`
+                    `Injected audio for ${model}: ${fileName} (${outMime}, ${outBuffer.byteLength} bytes)`
                 )
                 return true
             },
@@ -112,22 +96,28 @@ export function apply(ctx: Context, config: Config) {
     )
 }
 
+/**
+ * True only when `model` resolves to a platform model declaring AudioInput.
+ * An empty name, unknown model or missing capability list yields false.
+ */
+function modelAcceptsAudio(ctx: Context, model: string | undefined): boolean {
+    if (!model) return false
+    const caps = ctx.chatluna.platform.findModel(model)?.value?.capabilities
+    return caps?.includes(ModelCapabilities.AudioInput) === true
+}
+
 async function resolveAudioSourceUrl(
-    ctx: Context,
     session: Session,
     element: h
 ): Promise<string | null> {
-    const srcAttr = (element.attrs['src'] ?? element.attrs['url']) as
+    const src = (element.attrs['src'] ?? element.attrs['url']) as
         | string
         | undefined
-    if (srcAttr?.startsWith('http')) {
-        return srcAttr
-    }
-
-    if (session.platform !== 'onebot') return srcAttr ?? null
+    if (src?.startsWith('http')) return src
+    if (session.platform !== 'onebot') return src ?? null
 
     const fileId = element.attrs['fileId'] ?? element.attrs['fileid']
-    if (!fileId) return srcAttr ?? null
+    if (!fileId) return src ?? null
 
     try {
         const bot = session.bot as OneBotBot<Context>
@@ -136,239 +126,37 @@ async function resolveAudioSourceUrl(
             ? await bot.internal.getPrivateFileUrl(session.userId, fileId)
             : await bot.internal.getGroupFileUrl(session.guildId, fileId, busId)
     } catch {
-        return srcAttr ?? null
+        return src ?? null
     }
 }
 
-async function readFile(
+async function downloadAudio(
     ctx: Context,
     url: string
-): Promise<{ buffer: Buffer | null; mimeType: string | null }> {
-    const headers = { 'User-Agent': CHATLUNA_DOWNLOAD_USER_AGENT }
-
-    let sanitizedUrl: string
+): Promise<Buffer | null> {
     try {
-        const parsed = new URL(url)
-        sanitizedUrl = parsed.origin + parsed.pathname
-    } catch {
-        sanitizedUrl = url
-    }
-
-    let mimeTypeFromHead: string | null = null
-
-    // Try HEAD request for size check
-    try {
-        const headResponse = await ctx.http(url, { method: 'head', headers })
-        const headHeaders: Headers = headResponse?.headers
-        mimeTypeFromHead =
-            headHeaders
-                ?.get('content-type')
-                ?.split(';')[0]
-                ?.trim()
-                ?.toLowerCase() ?? null
-
-        const headContentLength = headHeaders?.get('content-length')
-            ? Number(headHeaders.get('content-length'))
-            : null
-
-        if (
-            headContentLength != null &&
-            Number.isFinite(headContentLength) &&
-            headContentLength > MAX_AUDIO_BYTES
-        ) {
-            logger.warn(
-                `Skip reading oversized audio from ${sanitizedUrl}: ${headContentLength} bytes > ${MAX_AUDIO_BYTES} bytes`
-            )
-            return { buffer: null, mimeType: mimeTypeFromHead }
-        }
-    } catch {
-        // Some endpoints do not support HEAD; continue with GET safeguards.
-    }
-
-    try {
-        const response = await fetch(url, { method: 'GET', headers })
-
-        if (!response.ok) {
-            throw new Error(`HTTP ${response.status}`)
-        }
-
-        const mimeType =
-            response.headers
-                .get('content-type')
-                ?.split(';')[0]
-                ?.trim()
-                ?.toLowerCase() ?? mimeTypeFromHead
-        const responseContentLength = response.headers.get('content-length')
-            ? Number(response.headers.get('content-length'))
-            : null
-
-        if (
-            responseContentLength != null &&
-            Number.isFinite(responseContentLength) &&
-            responseContentLength > MAX_AUDIO_BYTES
-        ) {
-            logger.warn(
-                `Skip reading oversized audio from ${sanitizedUrl}: ${responseContentLength} bytes > ${MAX_AUDIO_BYTES} bytes`
-            )
-            return { buffer: null, mimeType }
-        }
-
-        if (response.body == null) {
-            const arrayBuffer = await response.arrayBuffer()
-            if (arrayBuffer.byteLength > MAX_AUDIO_BYTES) {
-                logger.warn(
-                    `Skip reading oversized audio from ${sanitizedUrl}: ${arrayBuffer.byteLength} bytes > ${MAX_AUDIO_BYTES} bytes`
-                )
-                return { buffer: null, mimeType }
-            }
-            return { buffer: Buffer.from(arrayBuffer), mimeType }
-        }
-
-        const reader = response.body.getReader()
-        const chunks: Buffer[] = []
-        let totalBytes = 0
-
-        while (true) {
-            const { done, value } = await reader.read()
-            if (done) break
-
-            if (!value?.byteLength) continue
-
-            totalBytes += value.byteLength
-            if (totalBytes > MAX_AUDIO_BYTES) {
-                await reader.cancel('audio exceeds max size')
-                logger.warn(
-                    `Skip reading oversized audio from ${sanitizedUrl}: streamed bytes exceed ${MAX_AUDIO_BYTES} bytes`
-                )
-                return { buffer: null, mimeType }
-            }
-
-            chunks.push(Buffer.from(value))
-        }
-
-        return { buffer: Buffer.concat(chunks, totalBytes), mimeType }
-    } catch (error) {
-        logger.warn(`Failed to read audio from ${sanitizedUrl}:`, error)
-        return { buffer: null, mimeType: null }
-    }
-}
-
-function toMp3FileName(fileName?: string): string {
-    const baseName = (fileName ?? 'voice').trim()
-    const dotIndex = baseName.lastIndexOf('.')
-    return `${dotIndex <= 0 ? baseName : baseName.slice(0, dotIndex)}.mp3`
-}
-
-async function tryConvertAudioToMp3(
-    ctx: Context,
-    inputBuffer: Buffer,
-    fileName?: string
-): Promise<{ buffer: Buffer; fileName: string } | null> {
-    try {
-        let sourceBuffer = inputBuffer
-        let decodedPcmSampleRate: number | null = null
-
-        if (isSilkAudio(inputBuffer)) {
-            const decoded = await decodeSilkAudio(ctx, inputBuffer)
-            sourceBuffer = decoded.buffer
-            decodedPcmSampleRate = decoded.sampleRate
-            logger.debug('Decoded silk audio before mp3 transcoding.')
-        }
-
-        const ffmpeg = ctx.ffmpeg
-        if (!ffmpeg) {
-            throw new Error(
-                'FFmpeg service is unavailable. Please enable koishi-plugin-ffmpeg-path.'
-            )
-        }
-
-        const builder = ffmpeg.builder().input(sourceBuffer)
-        if (decodedPcmSampleRate != null) {
-            builder.inputOption(
-                '-f',
-                's16le',
-                '-ar',
-                String(decodedPcmSampleRate),
-                '-ac',
-                '1'
-            )
-        }
-
-        const outputBuffer = await builder
-            .outputOption(
-                '-vn',
-                '-acodec',
-                'libmp3lame',
-                '-q:a',
-                '4',
-                '-f',
-                'mp3'
-            )
-            .run('buffer')
-
-        return {
-            buffer: outputBuffer,
-            fileName: toMp3FileName(fileName)
-        }
+        const { data } = await ctx.http(url, {
+            responseType: 'arraybuffer',
+            method: 'get',
+            headers: { 'User-Agent': BROWSER_UA }
+        })
+        return Buffer.from(data)
     } catch (error) {
-        logger.warn(
-            `Audio transcoding to mp3 failed, fallback to original audio: ${error instanceof Error ? error.message : String(error)}`
-        )
+        logger.warn(`Failed to fetch audio from ${url.split('?')[0]}:`, error)
         return null
     }
 }
 
-function isSilkAudio(inputBuffer: Buffer): boolean {
-    if (inputBuffer.length < 9) return false
-    const sig = inputBuffer.subarray(0, 9).toString('latin1')
+function audioName(element: h): string {
     return (
-        sig === '#!SILK_V3' ||
-        inputBuffer.subarray(1, 10).toString('latin1') === '#!SILK_V3'
+        (element.attrs['file'] as string | undefined) ??
+        (element.attrs['name'] as string | undefined) ??
+        (element.attrs['filename'] as string | undefined) ??
+        'voice'
     )
 }
 
-async function decodeSilkAudio(
-    ctx: Context,
-    inputBuffer: Buffer
-): Promise<{ buffer: Buffer; sampleRate: number }> {
-    const silk = ctx.silk
-    if (!silk) {
-        throw new Error(
-            'Detected silk audio, but no silk service is available for decoding'
-        )
-    }
-    for (const sampleRate of [24000, 16000, 12000, 8000]) {
-        try {
-            const result = (await silk.decode(
-                inputBuffer,
-                sampleRate
-            )) as DecodeResult
-
-            if (result?.data != null) {
-                return { buffer: Buffer.from(result.data), sampleRate }
-            }
-        } catch {
-            continue
-        }
-    }
-
-    throw new Error('silk decode returned empty output')
-}
-
-function ensureContentArray(message: Message, fallbackText: string) {
-    if (typeof message.content === 'string') {
-        message.content = [
-            {
-                type: 'text',
-                text: message.content.trim().length
-                    ? message.content
-                    : fallbackText
-            }
-        ]
-    }
-}
-
-interface DecodeResult {
-    data: Uint8Array
-    duration: number
+/** Drop the final `.ext`; leading-dot and extensionless names pass through. */
+function stripExtension(name: string): string {
+    return name.lastIndexOf('.') > 0 ? name.replace(/\.[^.]*$/, '') : name
 }
diff --git a/packages/service-multimodal/src/plugins/image.ts b/packages/service-multimodal/src/plugins/image.ts
index 560eb959a..f1b64e51a 100644
--- a/packages/service-multimodal/src/plugins/image.ts
+++ b/packages/service-multimodal/src/plugins/image.ts
@@ -1,6 +1,6 @@
-/* eslint-disable max-len */
 import { Context } from 'koishi'
 import { Message } from 'koishi-plugin-chatluna'
+import { ChatLunaChatModel } from 'koishi-plugin-chatluna/llm-core/platform/model'
 import { ModelCapabilities } from 'koishi-plugin-chatluna/llm-core/platform/types'
 import { ChatLunaPlugin } from 'koishi-plugin-chatluna/services/chat'
 import { Config, logger } from '..'
@@ -11,8 +11,12 @@ import {
     processImageWithModel,
     readImage
 } from '../utils'
-import { modelCanReadImage } from '../audio'
 
+/**
+ * Intercept image elements. Native-capable models receive the data URL
+ * directly (GIFs are split into frames). Otherwise fall back to describing
+ * the image via the configured vision model and inject the description.
+ */
 export async function apply(
     ctx: Context,
     config: Config,
@@ -22,133 +26,106 @@ export async function apply(
         config.imageModel
     )
 
-    const disposable = ctx.chatluna.messageTransformer.intercept(
-        'img',
-        async (_session, element, message, model) => {
-            const parsedModelInfo =
-                model != null
-                    ? ctx.chatluna.platform.findModel(model)
-                    : undefined
-            const modelSupportsImageInput =
-                modelCanReadImage(parsedModelInfo, model)
+    ctx.effect(() =>
+        ctx.chatluna.messageTransformer.intercept(
+            'img',
+            async (_session, element, message, model) => {
+                const url = (element.attrs.url ?? element.attrs.src) as string
+                if (!url) return false
 
-            let imageData: Awaited<ReturnType<typeof readImage>>
-            const url = (element.attrs.url ?? element.attrs.src) as string
-
-            if (modelSupportsImageInput) {
-                imageData = await readImage(ctx, url)
-
-                if (imageData.ext == null) {
+                const native = modelAcceptsImage(ctx, model)
+                if (!native && !config.enableContextImageDescription) {
                     return false
                 }
 
-                if (imageData.ext === 'image/gif') {
-                    if (!config.enableContextGifHandling) {
-                        return false
-                    }
+                const imageData = await readImage(ctx, url)
+                if (imageData.ext == null) return false
 
-                    logger.debug(`image url: ${url.substring(0, 50)}...`)
-                    const frames = await parseGifToFrames(imageData.buffer, {
-                        strategy: config.gifStrategy,
-                        frameCount: config.gifFrameCount
-                    })
+                const isGif = imageData.ext === 'image/gif'
+                if (isGif && !config.enableContextGifHandling) return false
 
-                    logger.debug(`Extracted ${frames.length} frames from GIF`)
-
-                    for (const frame of frames) {
-                        addImageToContent(message, frame)
+                if (native) {
+                    if (isGif) {
+                        await injectGifFrames(message, imageData.buffer, config)
+                        addTextToContent(message, '[image: GIF]')
+                    } else if (imageData.base64Source) {
+                        addImageToContent(message, imageData.base64Source)
                     }
-
-                    addTextToContent(message, '[image: GIF]')
-
                     return true
                 }
 
-                if (imageData.base64Source != null) {
-                    addImageToContent(message, imageData.base64Source)
-                    return true
-                }
-            }
-
-            if (!config.enableContextImageDescription) {
-                return false
-            }
-
-            if (imageUnderstandModel.value == null) {
-                logger.warn(
-                    `The model ${config.imageModel} is not loaded, please check your chat adapter`
-                )
-                return false
-            }
-
-            if (
-                !imageUnderstandModel.value.modelInfo.capabilities.includes(
-                    ModelCapabilities.ImageInput
-                )
-            ) {
-                logger.warn(
-                    `The model ${config.imageModel} in image-service does not support image input, please check your chat adapter`
-                )
-                return false
-            }
-
-            try {
-                const fakeMessage: Message = {
-                    content: []
-                }
-
-                logger.debug(`image url: ${url}`)
-
-                imageData = imageData ?? (await readImage(ctx, url))
-
-                if (imageData.ext == null) {
-                    return false
-                }
-
-                if (imageData.ext === 'image/gif') {
-                    if (!config.enableContextGifHandling) {
-                        return false
-                    }
-
-                    const frames = await parseGifToFrames(imageData.buffer, {
-                        strategy: config.gifStrategy,
-                        frameCount: config.gifFrameCount
-                    })
-
-                    logger.debug(
-                        `Extracted ${frames.length} frames from GIF for model processing`
-                    )
-
-                    addTextToContent(
-                        fakeMessage,
-                        'This is a GIF image. See the frames below:'
-                    )
-                    for (const frame of frames) {
-                        addImageToContent(fakeMessage, frame)
-                    }
-                } else {
-                    addImageToContent(fakeMessage, imageData.base64Source)
-                }
-
-                const result = await processImageWithModel(
-                    imageUnderstandModel.value,
+                return describeAndInject(
+                    message,
+                    imageData,
+                    isGif,
                     config,
-                    fakeMessage
+                    imageUnderstandModel.value,
+                    url
                 )
+            },
+            100
+        )
+    )
+}
 
-                if (result) {
-                    addTextToContent(message, '\n\n' + result)
-                    return true
-                }
-            } catch (error) {
-                logger.warn(
-                    `Read image ${url} error, check your chat adapter`,
-                    error
-                )
-            }
-        },
-        100
+function modelAcceptsImage(ctx: Context, model: string | undefined): boolean {
+    if (!model) return false
+    return (
+        ctx.chatluna.platform
+            .findModel(model)
+            ?.value?.capabilities?.includes(ModelCapabilities.ImageInput) ===
+        true
     )
+}
 
-    ctx.effect(() => disposable)
+/** Split a GIF into frames per `config` and append each one to `message`. */
+async function injectGifFrames(
+    message: Message,
+    buffer: Buffer,
+    config: Config
+): Promise<void> {
+    const { gifStrategy: strategy, gifFrameCount: frameCount } = config
+    const frames = await parseGifToFrames(buffer, { strategy, frameCount })
+    logger.debug(`Extracted ${frames.length} frames from GIF`)
+
+    frames.forEach((frame) => addImageToContent(message, frame))
+}
+
+/**
+ * Describe the image with the fallback vision model and append the text to
+ * `message`. Resolves to true only when a non-empty description was added.
+ */
+async function describeAndInject(
+    message: Message,
+    imageData: Awaited<ReturnType<typeof readImage>>,
+    isGif: boolean,
+    config: Config,
+    imageModel: ChatLunaChatModel | undefined,
+    url: string
+): Promise<boolean> {
+    const canSee = imageModel?.modelInfo.capabilities.includes(
+        ModelCapabilities.ImageInput
+    )
+    if (imageModel == null || !canSee) {
+        logger.warn(
+            `Image-description model "${config.imageModel}" is missing or lacks image input — skip.`
+        )
+        return false
+    }
+    try {
+        const fake: Message = { content: [] }
+        if (isGif) {
+            addTextToContent(fake, 'This is a GIF image. See the frames below:')
+            await injectGifFrames(fake, imageData.buffer, config)
+        } else if (imageData.base64Source) {
+            addImageToContent(fake, imageData.base64Source)
+        }
+        const result = await processImageWithModel(imageModel, config, fake)
+        if (!result) return false
+        addTextToContent(message, '\n\n' + result)
+        return true
+    } catch (error) {
+        logger.warn(`Image describe failed for ${url}:`, error)
+        return false
+    }
 }
diff --git a/packages/service-multimodal/src/plugins/read_files.ts b/packages/service-multimodal/src/plugins/read_files.ts
index 896261e50..06fe42d33 100644
--- a/packages/service-multimodal/src/plugins/read_files.ts
+++ b/packages/service-multimodal/src/plugins/read_files.ts
@@ -3,330 +3,45 @@ import { StructuredTool } from '@langchain/core/tools'
 import { HumanMessage, MessageContentComplex } from '@langchain/core/messages'
 import { Context } from 'koishi'
 import { ComputedRef, Message } from 'koishi-plugin-chatluna'
-import type {} from 'koishi-plugin-ffmpeg-path'
 import { ChatLunaChatModel } from 'koishi-plugin-chatluna/llm-core/platform/model'
+import type { FileHandlingConfig } from 'koishi-plugin-chatluna/llm-core/platform/client'
 import {
     ChatLunaToolRunnable,
     ModelCapabilities
 } from 'koishi-plugin-chatluna/llm-core/platform/types'
 import { ChatLunaPlugin } from 'koishi-plugin-chatluna/services/chat'
-import {
-    isMessageContentAudio,
-    isMessageContentVideo,
-    type MessageContentVideo
-} from 'koishi-plugin-chatluna/utils/langchain'
 import { getBase64EncodedSize } from 'koishi-plugin-chatluna/utils/base64'
 import { Config, logger } from '..'
 import {
     addImageToContent,
     addTextToContent,
+    BROWSER_UA,
+    convertAudioToMp3,
+    detectAudioMimeType,
+    IMAGE_MIME_TYPES,
+    inferMimeTypeFromUrl,
+    normalizeMimeType,
     parseGifToFrames,
     processImageWithModel
 } from '../utils'
-import {
-    buildAudioContent,
-    buildImageContent,
-    isMimoAudioMime,
-    isMimoImageMime,
-    MIMO_BASE64_AUDIO_BYTES,
-    MIMO_BASE64_IMAGE_BYTES,
-    modelCanReadAudio,
-    modelCanReadImage
-} from '../audio'
-import { detectAudioMimeType } from '../media'
-import { readFilesInputSchema } from '../read_files_schema'
 import z from 'zod'
 
-// ---------------------------------------------------------------------------
-// Constants
-// ---------------------------------------------------------------------------
-
-const IMAGE_MIME_TYPES = new Set([
-    'image/png',
-    'image/jpeg',
-    'image/bmp',
-    'image/webp',
-    'image/gif'
-])
-
 const DEFAULT_MAX_FILE_SIZE_BYTES = 100 * 1024 * 1024
 const DEFAULT_MAX_TOTAL_SIZE_BYTES = 100 * 1024 * 1024
 
-const FILE_EXTENSION_TO_MIME_TYPE = new Map<string, string>([
-    ['.png', 'image/png'],
-    ['.jpg', 'image/jpeg'],
-    ['.jpeg', 'image/jpeg'],
-    ['.bmp', 'image/bmp'],
-    ['.webp', 'image/webp'],
-    ['.gif', 'image/gif'],
-    ['.pdf', 'application/pdf'],
-    ['.txt', 'text/plain'],
-    ['.md', 'text/markdown'],
-    ['.html', 'text/html'],
-    ['.htm', 'text/html'],
-    ['.css', 'text/css'],
-    ['.xml', 'text/xml'],
-    ['.csv', 'text/csv'],
-    ['.rtf', 'text/rtf'],
-    ['.js', 'text/javascript'],
-    ['.mjs', 'text/javascript'],
-    ['.json', 'application/json'],
-    ['.mp4', 'video/mp4'],
-    ['.mpeg', 'video/mpeg'],
-    ['.mov', 'video/mov'],
-    ['.avi', 'video/avi'],
-    ['.flv', 'video/x-flv'],
-    ['.mpg', 'video/mpg'],
-    ['.webm', 'video/webm'],
-    ['.wmv', 'video/wmv'],
-    ['.3gp', 'video/3gpp'],
-    ['.3gpp', 'video/3gpp'],
-    ['.mp3', 'audio/mpeg'],
-    ['.aiff', 'audio/aiff'],
-    ['.aac', 'audio/aac'],
-    ['.flac', 'audio/flac'],
-    ['.wav', 'audio/wav'],
-    ['.ogg', 'audio/ogg'],
-    ['.m4a', 'audio/mp4']
-])
-
-// ---------------------------------------------------------------------------
-// Helpers
-// ---------------------------------------------------------------------------
-
-function isHttpOrHttpsUrl(url: string): boolean {
-    try {
-        const parsed = new URL(url)
-        return parsed.protocol === 'http:' || parsed.protocol === 'https:'
-    } catch {
-        return false
-    }
-}
-
-function normalizeMimeType(raw: string | null): string | null {
-    if (raw == null) return null
-    const mimeType = raw.split(';')[0]?.trim()?.toLowerCase()
-    return mimeType || null
-}
-
-function getHeaderValue(headers: unknown, name: string): string | null {
-    if (headers == null) return null
-
-    if (
-        typeof (headers as { get?: unknown }).get === 'function'
-    ) {
-        const value = (headers as { get(name: string): string | null }).get(
-            name
+const fileSchema = z.object({ url: z.string().url() })
+const readFilesSchema = z.object({
+    files: z
+        .union([fileSchema, z.array(fileSchema).min(1).max(10)])
+        .describe(
+            'One file or a list of files to read (max 10). File format: { url: string }. MIME type is inferred from response headers, then URL extension.'
         )
-        return typeof value === 'string' ? value : null
-    }
-
-    const record = headers as Record<string, unknown>
-    const value = record[name] ?? record[name.toLowerCase()]
-    if (typeof value === 'string') return value
-    if (Array.isArray(value) && typeof value[0] === 'string') {
-        return value[0]
-    }
-
-    return null
-}
-
-function inferMimeTypeFromPath(path: string): string | null {
-    const sanitizedPath = path.toLowerCase().split(/[?#]/, 1)[0]
-    const fileName = sanitizedPath.split(/[/\\]/).pop() ?? sanitizedPath
-    const extensionIndex = fileName.lastIndexOf('.')
+})
 
-    if (extensionIndex < 0) {
-        return null
-    }
-
-    const extension = fileName.slice(extensionIndex)
-    return FILE_EXTENSION_TO_MIME_TYPE.get(extension) ?? null
-}
-
-function inferMimeTypeFromUrl(url: string): string | null {
-    try {
-        const pathname = new URL(url).pathname
-        return inferMimeTypeFromPath(pathname)
-    } catch {
-        // ignore
-    }
-
-    return null
-}
-
-async function convertAudioBufferToMp3(
-    ctx: Context,
-    buffer: Buffer
-): Promise<Buffer | null> {
-    const ffmpeg = ctx.ffmpeg
-    if (!ffmpeg) {
-        return null
-    }
-
-    try {
-        return await ffmpeg
-            .builder()
-            .input(buffer)
-            .outputOption(
-                '-vn',
-                '-acodec',
-                'libmp3lame',
-                '-q:a',
-                '4',
-                '-f',
-                'mp3'
-            )
-            .run('buffer')
-    } catch (error) {
-        logger.warn(
-            `read_files audio transcoding to mp3 failed: ${error instanceof Error ? error.message : String(error)}`
-        )
-        return null
-    }
-}
-
-/**
- * Check whether the model natively supports a given MIME type based on its
- * capabilities and `FileHandlingConfig`.
- */
-function modelSupportsNativeMimeType(
-    model: ChatLunaChatModel,
+interface NativePart {
     mimeType: string
-): boolean {
-    const caps = model.modelInfo.capabilities
-
-    let capabilitySupportsMime = false
-    if (IMAGE_MIME_TYPES.has(mimeType)) {
-        capabilitySupportsMime = modelCanReadImage(
-            { value: model.modelInfo },
-            model.modelInfo.name
-        )
-    } else if (mimeType.startsWith('audio/')) {
-        capabilitySupportsMime = modelCanReadAudio(
-            { value: model.modelInfo },
-            model.modelInfo.name
-        )
-    } else if (mimeType.startsWith('video/')) {
-        capabilitySupportsMime = caps.includes(ModelCapabilities.VideoInput)
-    } else if (
-        mimeType.startsWith('text/') ||
-        mimeType === 'application/json' ||
-        mimeType === 'application/pdf'
-    ) {
-        capabilitySupportsMime = caps.includes(ModelCapabilities.FileInput)
-    }
-
-    if (!capabilitySupportsMime) {
-        return false
-    }
-
-    const fileConfig = model.fileHandlingConfig
-    if (fileConfig != null) {
-        return fileConfig.supportedMimeTypes.has(mimeType)
-    }
-
-    return true
-}
-
-function isMimeTypeEnabled(config: Config, mimeType: string): boolean {
-    if (mimeType === 'image/gif') {
-        return config.enableGifReadTool
-    }
-
-    if (IMAGE_MIME_TYPES.has(mimeType)) {
-        return config.enableImageReadTool
-    }
-
-    return config.enableFileReadTool
-}
-
-function buildReadFilesDescription(config: Config): string {
-    const sections: string[] = []
-
-    if (config.enableImageReadTool) {
-        sections.push(
-            '- Image read/describe (non-GIF): image/bmp, image/jpeg, image/png, image/webp. If the model lacks native image input, fallback image description will be used.'
-        )
-    }
-
-    if (config.enableGifReadTool) {
-        sections.push(
-            '- GIF read/describe: image/gif. Native-capable models receive extracted frames; otherwise fallback image description is used.'
-        )
-    }
-
-    if (config.enableFileReadTool) {
-        sections.push(
-            '- File read: text/html, text/css, text/plain, text/markdown, text/xml, text/csv, text/rtf, text/javascript, application/json, application/pdf, audio/*, video/* (effective MIME set still depends on model capabilities and FileHandlingConfig).'
-        )
-    }
-
-    return `Read files from URL(s) and return their content.
-Enabled read_files capabilities:
-${sections.join('\n')}
-Use this tool when you need to read files from URL(s) as context.`
-}
-
-/**
- * Build a multimodal `HumanMessage` containing the file(s) as content parts,
- * suitable for injecting into the conversation context.
- */
-function buildMultimodalMessage(
-    parts: {
-        mimeType: string
-        base64Data: string
-        sourceUrl: string
-    }[],
-    insertPrompt: string,
-    model?: string
-): HumanMessage {
-    const content: MessageContentComplex[] = []
-
-    for (const part of parts) {
-        const { mimeType, base64Data } = part
-
-        if (IMAGE_MIME_TYPES.has(mimeType)) {
-            content.push(buildImageContent(base64Data, mimeType))
-        } else if (mimeType.startsWith('audio/')) {
-            const audioContent = buildAudioContent(model, base64Data, mimeType)
-
-            if (isMessageContentAudio(audioContent as MessageContentComplex)) {
-                content.push(audioContent as MessageContentComplex)
-            } else if (audioContent.type === 'input_audio') {
-                content.push(audioContent as MessageContentComplex)
-            }
-        } else if (mimeType.startsWith('video/')) {
-            const videoContent: MessageContentVideo = {
-                type: 'video_url',
-                video_url: {
-                    url: `data:${mimeType};base64,${base64Data}`,
-                    mimeType
-                }
-            }
-
-            if (isMessageContentVideo(videoContent as MessageContentComplex)) {
-                content.push(videoContent as MessageContentComplex)
-            }
-        } else {
-            // Inline data for text/pdf/etc. (Gemini-style)
-            content.push({
-                inline_data: {
-                    mime_type: mimeType,
-                    data: base64Data
-                }
-            } as unknown as MessageContentComplex)
-        }
-    }
-
-    if (content.length > 0) {
-        content.unshift({
-            type: 'text',
-            text: insertPrompt
-        })
-    }
-
-    return new HumanMessage({ content })
+    base64Data: string
+    sourceUrl: string
 }
 
 // ---------------------------------------------------------------------------
@@ -335,10 +50,9 @@ function buildMultimodalMessage(
 
 export class ReadFilesTool extends StructuredTool {
     name = 'read_files'
+    schema = readFilesSchema
     description: string
 
-    schema = readFilesInputSchema
-
     constructor(
         private readonly ctx: Context,
         private readonly config: Config,
@@ -347,7 +61,7 @@ export class ReadFilesTool extends StructuredTool {
         >
     ) {
         super({})
-        this.description = buildReadFilesDescription(config)
+        this.description = describeTool(config)
     }
 
     async _call(
@@ -359,407 +73,301 @@ export class ReadFilesTool extends StructuredTool {
         const model = runConfig?.configurable?.model
         const conversationId = runConfig?.configurable?.conversationId
         const fileConfig = model?.fileHandlingConfig
-
-        let totalBase64Bytes = 0
-        const maxTotalSize =
+        const maxTotal =
             fileConfig?.maxTotalSizeBytes ?? DEFAULT_MAX_TOTAL_SIZE_BYTES
 
-        const nativeParts: {
-            mimeType: string
-            base64Data: string
-            sourceUrl: string
-        }[] = []
-
-        const response: {
-            files: {
-                sourceUrl: string
-                mimeType?: string
-                status: 'ok' | 'described' | 'error'
-                description?: string
-                error?: string
-            }[]
-            successCount: number
-            failureCount: number
-        } = {
+        const native: NativePart[] = []
+        const report: ToolReport = {
             files: [],
             successCount: 0,
             failureCount: 0
         }
+        let totalBytes = 0
         let describedCount = 0
 
-        for (const file of files) {
-            const sourceUrl = file.url
-
-            const pushError = (errorMessage: string, mimeType?: string) => {
-                response.files.push({
+        for (const { url: sourceUrl } of files) {
+            if (!isHttp(sourceUrl)) {
+                pushError(
+                    report,
                     sourceUrl,
-                    mimeType,
-                    status: 'error',
-                    error: errorMessage
-                })
-                response.failureCount++
+                    'Only http/https URLs are supported.'
+                )
+                continue
             }
 
             try {
-                if (!isHttpOrHttpsUrl(sourceUrl)) {
-                    pushError(
-                        'Only http/https URLs are supported for read_files.'
-                    )
+                const fetched = await this._fetch(sourceUrl)
+                if (!fetched) {
+                    pushError(report, sourceUrl, 'Failed to fetch URL.')
                     continue
                 }
 
-                // Determine MIME type first by fetching with headers
-                const controller = new AbortController()
-                const timeout = setTimeout(() => controller.abort(), 60_000)
-                const httpResponse = await this.ctx
-                    .http(sourceUrl, {
-                        responseType: 'arraybuffer',
-                        method: 'get',
-                        headers: {
-                            'User-Agent':
-                                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
-                        },
-                        signal: controller.signal
-                    })
-                    .finally(() => {
-                        clearTimeout(timeout)
-                    })
-
-                const buffer = Buffer.from(httpResponse.data)
-
-                // Resolve MIME type from response headers or URL
-                const responseMimeType = normalizeMimeType(
-                    getHeaderValue(httpResponse.headers, 'content-type')
-                )
-
-                const declaredMimeType =
-                    responseMimeType ?? inferMimeTypeFromUrl(sourceUrl)
-                const detectedAudioMimeType = detectAudioMimeType(
-                    buffer,
-                    declaredMimeType
+                const declared =
+                    normalizeMimeType(fetched.contentType) ??
+                    inferMimeTypeFromUrl(sourceUrl)
+                const detectedAudio = detectAudioMimeType(
+                    fetched.buffer,
+                    declared
                 )
-                const mimeType =
-                    declaredMimeType?.startsWith('audio/') ||
-                    detectedAudioMimeType?.startsWith('audio/')
-                        ? detectedAudioMimeType
-                        : declaredMimeType
+                const mime =
+                    declared?.startsWith('audio/') || detectedAudio
+                        ? detectedAudio
+                        : declared
 
-                if (!mimeType) {
+                if (!mime) {
                     pushError(
-                        `Could not determine MIME type for ${sourceUrl}. Please ensure the URL returns a valid content type.`
+                        report,
+                        sourceUrl,
+                        `Could not determine MIME type for ${sourceUrl}.`
                     )
                     continue
                 }
-
-                if (!isMimeTypeEnabled(this.config, mimeType)) {
+                if (!mimeEnabled(this.config, mime)) {
                     pushError(
-                        `Feature disabled for MIME type "${mimeType}". Please enable the corresponding read_files switch.`,
-                        mimeType
+                        report,
+                        sourceUrl,
+                        `Feature disabled for MIME type "${mime}".`,
+                        mime
                     )
                     continue
                 }
 
-                // Check if the model supports this MIME type natively
-                const isImage = IMAGE_MIME_TYPES.has(mimeType)
-                const isAudio = mimeType.startsWith('audio/')
-                const modelSupports =
-                    model != null &&
-                    (isAudio
-                        ? modelCanReadAudio(
-                              { value: model.modelInfo },
-                              model.modelInfo.name
-                          )
-                        : modelSupportsNativeMimeType(model, mimeType))
-
-                if (modelSupports && !isImage) {
-                    // Non-image file that the model supports natively -> inline inject
-                    let nativeBuffer: Buffer = buffer
-                    let nativeMimeType = mimeType
-
-                    if (isAudio && !isMimoAudioMime(mimeType)) {
-                        const converted = await convertAudioBufferToMp3(
+                const isImage = IMAGE_MIME_TYPES.has(mime)
+                const isAudio = mime.startsWith('audio/')
+                const supportsNative =
+                    model != null && modelSupportsMime(model, mime)
+
+                // ----- Non-image native: maybe transcode audio, then inline ---
+                if (!isImage && supportsNative) {
+                    let bytes = fetched.buffer
+                    let outMime = mime
+                    if (
+                        isAudio &&
+                        fileConfig?.supportedMimeTypes &&
+                        !fileConfig.supportedMimeTypes.has(mime)
+                    ) {
+                        const converted = await convertAudioToMp3(
                             this.ctx,
-                            buffer
+                            bytes
                         )
-
                         if (!converted) {
                             pushError(
-                                `Unsupported audio MIME type "${mimeType}" and ffmpeg conversion to MP3 failed.`,
-                                mimeType
+                                report,
+                                sourceUrl,
+                                `Unsupported audio MIME "${mime}" and ffmpeg conversion failed.`,
+                                mime
                             )
                             continue
                         }
-
-                        nativeBuffer = converted
-                        nativeMimeType = 'audio/mpeg'
-                        logger.debug(
-                            `Transcoded read_files audio from ${mimeType} to audio/mpeg for multimodal input`
-                        )
+                        bytes = converted
+                        outMime = 'audio/mpeg'
                     }
 
-                    const maxFileSize =
-                        isMimoAudioMime(nativeMimeType) &&
-                        modelCanReadAudio(undefined, model?.modelInfo.name)
-                            ? MIMO_BASE64_AUDIO_BYTES
-                            : (fileConfig?.maxFileSizeBytesOverrides?.[
-                                  nativeMimeType
-                              ] ??
-                              fileConfig?.maxFileSizeBytes ??
-                              DEFAULT_MAX_FILE_SIZE_BYTES)
-
-                    const encodedSize = getBase64EncodedSize(
-                        nativeBuffer.byteLength
+                    const sizeError = checkSize(
+                        bytes,
+                        outMime,
+                        fileConfig,
+                        totalBytes,
+                        maxTotal
                     )
-
-                    if (encodedSize > maxFileSize) {
-                        pushError(
-                            `File too large (${encodedSize} bytes after base64), max ${maxFileSize} bytes for ${nativeMimeType}`,
-                            nativeMimeType
-                        )
+                    if (sizeError) {
+                        pushError(report, sourceUrl, sizeError, outMime)
                         continue
                     }
-
-                    if (totalBase64Bytes + encodedSize > maxTotalSize) {
-                        pushError(
-                            `Total inline upload size too large (${totalBase64Bytes + encodedSize} bytes), max ${maxTotalSize} bytes per request`,
-                            nativeMimeType
-                        )
-                        continue
-                    }
-
-                    totalBase64Bytes += encodedSize
-                    nativeParts.push({
-                        mimeType: nativeMimeType,
-                        base64Data: nativeBuffer.toString('base64'),
-                        sourceUrl
-                    })
-
-                    response.files.push({
+                    totalBytes += getBase64EncodedSize(bytes.byteLength)
+                    pushNative(
+                        report,
+                        native,
                         sourceUrl,
-                        mimeType: nativeMimeType,
-                        status: 'ok'
-                    })
-                    response.successCount++
-                } else if (isImage && modelSupports) {
-                    // Image that the model supports natively -> inject directly
-                    // Unified per-file size check before any branching
-                    const maxFileSize =
-                        isMimoImageMime(mimeType) &&
-                        modelCanReadImage(undefined, model?.modelInfo.name)
-                            ? MIMO_BASE64_IMAGE_BYTES
-                            : (fileConfig?.maxFileSizeBytesOverrides?.[
-                                  mimeType
-                              ] ??
-                              fileConfig?.maxFileSizeBytes ??
-                              DEFAULT_MAX_FILE_SIZE_BYTES)
-
-                    const encodedSize = getBase64EncodedSize(buffer.byteLength)
-
-                    if (encodedSize > maxFileSize) {
-                        pushError(
-                            `File too large (${encodedSize} bytes after base64, raw ${buffer.byteLength} bytes), max ${maxFileSize} bytes for ${mimeType}`,
-                            mimeType
-                        )
+                        outMime,
+                        bytes.toString('base64')
+                    )
+                    continue
+                }
+
+                // ----- Image native: inject directly (GIF splits to frames) ---
+                if (isImage && supportsNative) {
+                    const sizeError = checkSize(
+                        fetched.buffer,
+                        mime,
+                        fileConfig,
+                        totalBytes,
+                        maxTotal
+                    )
+                    if (sizeError) {
+                        pushError(report, sourceUrl, sizeError, mime)
                         continue
                     }
 
-                    // For GIF: split into frames
-                    if (mimeType === 'image/gif') {
-                        const frames = await parseGifToFrames(buffer, {
+                    if (mime === 'image/gif') {
+                        const frames = await parseGifToFrames(fetched.buffer, {
                             strategy: this.config.gifStrategy,
                             frameCount: this.config.gifFrameCount
                         })
-
-                        logger.debug(
-                            `Extracted ${frames.length} frames from GIF for native model injection`
-                        )
-
                         for (const frame of frames) {
-                            // Frames are data:image/png;base64,... strings
                             const frameBase64 = frame.split(',')[1]
                             const frameSize = getBase64EncodedSize(
                                 Buffer.from(frameBase64, 'base64').byteLength
                             )
-
-                            if (totalBase64Bytes + frameSize > maxTotalSize) {
+                            if (totalBytes + frameSize > maxTotal) {
                                 logger.warn(
                                     'Skipping remaining GIF frames due to total size limit'
                                 )
                                 break
                             }
-
-                            totalBase64Bytes += frameSize
-                            nativeParts.push({
-                                mimeType: 'image/png',
-                                base64Data: frameBase64,
-                                sourceUrl
-                            })
-                        }
-                    } else {
-                        if (totalBase64Bytes + encodedSize > maxTotalSize) {
-                            pushError(
-                                `Total inline upload size too large (${totalBase64Bytes + encodedSize} bytes), max ${maxTotalSize} bytes per request`,
-                                mimeType
+                            totalBytes += frameSize
+                            pushNative(
+                                report,
+                                native,
+                                sourceUrl,
+                                'image/png',
+                                frameBase64
                             )
-                            continue
                         }
-
-                        totalBase64Bytes += encodedSize
-                        nativeParts.push({
-                            mimeType,
-                            base64Data: buffer.toString('base64'),
-                            sourceUrl
-                        })
-                    }
-
-                    response.files.push({
-                        sourceUrl,
-                        mimeType,
-                        status: 'ok'
-                    })
-                    response.successCount++
-                } else if (isImage) {
-                    // Image but model doesn't support it natively -> describe using image model
-                    const maxFileSize =
-                        fileConfig?.maxFileSizeBytesOverrides?.[mimeType] ??
-                        fileConfig?.maxFileSizeBytes ??
-                        DEFAULT_MAX_FILE_SIZE_BYTES
-
-                    const encodedSize = getBase64EncodedSize(buffer.byteLength)
-
-                    if (encodedSize > maxFileSize) {
-                        pushError(
-                            `File too large (${encodedSize} bytes after base64, raw ${buffer.byteLength} bytes), max ${maxFileSize} bytes for ${mimeType}`,
-                            mimeType
+                    } else {
+                        totalBytes += getBase64EncodedSize(
+                            fetched.buffer.byteLength
+                        )
+                        pushNative(
+                            report,
+                            native,
+                            sourceUrl,
+                            mime,
+                            fetched.buffer.toString('base64')
                         )
-                        continue
                     }
+                    continue
+                }
 
-                    const describeResult = await this._describeImageWithModel(
+                // ----- Image without native support: describe via vision model -
+                if (isImage) {
+                    const described = await this._describeImage(
                         sourceUrl,
-                        buffer,
-                        mimeType
+                        fetched.buffer,
+                        mime
                     )
-
-                    if (describeResult) {
-                        response.files.push({
+                    if (described) {
+                        report.files.push({
                             sourceUrl,
-                            mimeType,
+                            mimeType: mime,
                             status: 'described',
-                            description: describeResult
+                            description: described
                         })
-                        response.successCount++
+                        report.successCount++
                         describedCount++
                     } else {
                         pushError(
-                            `Failed to describe image from ${sourceUrl}`,
-                            mimeType
+                            report,
+                            sourceUrl,
+                            'Failed to describe image.',
+                            mime
                         )
-                        continue
                     }
-                } else {
-                    // Non-image, model doesn't support it natively
-                    pushError(
-                        `Unsupported MIME type "${mimeType}" for the current model. The model does not natively support this file type.`,
-                        mimeType
-                    )
                     continue
                 }
+
+                pushError(
+                    report,
+                    sourceUrl,
+                    `Unsupported MIME "${mime}" for the current model.`,
+                    mime
+                )
             } catch (error) {
                 logger.warn(`read_files error for ${sourceUrl}:`, error)
-                const errorMessage =
+                pushError(
+                    report,
+                    sourceUrl,
                     error instanceof Error ? error.message : String(error)
-                pushError(errorMessage)
+                )
             }
         }
 
-        // Inject native parts into next-round context via contextManager
-        if (nativeParts.length > 0 && conversationId) {
-            const message = buildMultimodalMessage(
-                nativeParts,
-                this.config.fileInsertPrompt,
-                model?.modelInfo.name
-            )
-
+        if (native.length > 0 && conversationId) {
             this.ctx.chatluna.contextManager.inject({
                 conversationId,
                 name: 'read_files_context',
-                value: message,
+                value: buildMultimodalMessage(
+                    native,
+                    this.config.fileInsertPrompt
+                ),
                 once: true,
                 stage: 'after_scratchpad'
             })
-
             logger.debug(
-                `Injected ${nativeParts.length} file part(s) into context for conversation ${conversationId}`
+                `Injected ${native.length} file part(s) into context for conversation ${conversationId}`
             )
         }
 
         return JSON.stringify({
-            response,
+            response: report,
             note:
-                nativeParts.length > 0
-                    ? `Successfully read ${nativeParts.length} file(s). The file content has been added to the conversation context and will be available in the next turn.`
+                native.length > 0
+                    ? `Successfully read ${native.length} file(s). The file content has been added to the conversation context and will be available in the next turn.`
                     : describedCount > 0
                       ? `Described ${describedCount} image file(s) using the vision model.`
-                      : response.failureCount > 0
-                        ? `Failed to read ${response.failureCount} file(s).`
+                      : report.failureCount > 0
+                        ? `Failed to read ${report.failureCount} file(s).`
                         : 'No files were processed.'
         })
     }
 
-    /**
-     * Describe an image using the configured image model (fallback when the
-     * main model doesn't support image input).
-     */
-    private async _describeImageWithModel(
+    /** Downloads `url`; any failure is logged and reported as `null`. */
+    private async _fetch(
+        url: string
+    ): Promise<{ buffer: Buffer; contentType: string | null } | null> {
+        try {
+            const response = await this.ctx.http(url, {
+                responseType: 'arraybuffer',
+                method: 'get',
+                headers: { 'User-Agent': BROWSER_UA },
+                timeout: 60_000
+            })
+            const headers = response.headers as Headers | undefined
+            const contentType = headers?.get?.('content-type') ?? null
+            return { buffer: Buffer.from(response.data), contentType }
+        } catch (error) {
+            logger.warn(`read_files fetch failed for ${url}:`, error)
+            return null
+        }
+    }
+
+    private async _describeImage(
         url: string,
         buffer: Buffer,
         mimeType: string
     ): Promise<string | null> {
         const imageModel = this.imageModelRef().value
-        if (imageModel == null) {
-            logger.warn(
-                'Image model is not loaded, cannot describe image. Please check your chat adapter.'
-            )
-            return null
-        }
-
         if (
+            !imageModel ||
             !imageModel.modelInfo.capabilities.includes(
                 ModelCapabilities.ImageInput
             )
         ) {
-            logger.warn('Image model does not support image input.')
+            logger.warn(
+                'Image model not loaded or lacks image input; cannot describe.'
+            )
             return null
         }
 
         try {
-            const fakeMessage: Message = { content: [] }
-
+            const fake: Message = { content: [] }
             if (mimeType === 'image/gif') {
                 const frames = await parseGifToFrames(buffer, {
                     strategy: this.config.gifStrategy,
                     frameCount: this.config.gifFrameCount
                 })
-
                 addTextToContent(
-                    fakeMessage,
+                    fake,
                     'This is a GIF image. See the frames below:'
                 )
-                for (const frame of frames) {
-                    addImageToContent(fakeMessage, frame)
-                }
+                for (const frame of frames) addImageToContent(fake, frame)
             } else {
-                const base64 = buffer.toString('base64')
-                const base64Source = `data:${mimeType};base64,${base64}`
-                addImageToContent(fakeMessage, base64Source)
+                addImageToContent(
+                    fake,
+                    `data:${mimeType};base64,${buffer.toString('base64')}`
+                )
             }
-
-            return await processImageWithModel(
-                imageModel,
-                this.config,
-                fakeMessage
-            )
+            return await processImageWithModel(imageModel, this.config, fake)
         } catch (error) {
             logger.warn(`Describe image ${url} error:`, error)
             return null
@@ -767,6 +375,148 @@ export class ReadFilesTool extends StructuredTool {
     }
 }
 
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+interface ToolReport { // serialized by _call as the `response` field
+    files: {
+        sourceUrl: string
+        mimeType?: string // absent when the failure happened before MIME detection
+        status: 'ok' | 'described' | 'error'
+        description?: string // filled in for 'described' entries
+        error?: string // filled in for 'error' entries
+    }[]
+    successCount: number // bumped for each 'ok' and 'described' entry
+    failureCount: number // bumped for each 'error' entry
+}
+
+function pushError(
+    report: ToolReport, // mutated in place
+    sourceUrl: string,
+    error: string,
+    mimeType?: string // omitted by callers that fail before MIME detection
+) {
+    report.files.push({ sourceUrl, mimeType, status: 'error', error })
+    report.failureCount++ // one failure per error entry
+}
+
+function pushNative(
+    report: ToolReport, // mutated in place
+    native: NativePart[], // mutated in place
+    sourceUrl: string,
+    mimeType: string,
+    base64Data: string
+) {
+    native.push({ sourceUrl, mimeType, base64Data }) // queued for context injection
+    report.files.push({ sourceUrl, mimeType, status: 'ok' })
+    report.successCount++ // NOTE(review): _call calls this per GIF frame, so one GIF counts once per frame — confirm intended
+}
+
+function isHttp(url: string): boolean {
+    try {
+        // A string that `new URL` rejects is reported as non-http(s).
+        return ['http:', 'https:'].includes(new URL(url).protocol)
+    } catch {
+        return false
+    }
+}
+
+function modelSupportsMime(model: ChatLunaChatModel, mime: string): boolean {
+    // True when `model` can take `mime` natively: capability flag + allow-list.
+    const caps = model.modelInfo.capabilities
+    const capOk = IMAGE_MIME_TYPES.has(mime)
+        ? caps.includes(ModelCapabilities.ImageInput)
+        : mime.startsWith('audio/')
+          ? caps.includes(ModelCapabilities.AudioInput)
+          : mime.startsWith('video/')
+            ? caps.includes(ModelCapabilities.VideoInput)
+            : caps.includes(ModelCapabilities.FileInput)
+    // Audio skips the allow-list so _call's MP3 transcode branch is reachable.
+    if (!capOk || mime.startsWith('audio/')) return capOk
+    return model.fileHandlingConfig?.supportedMimeTypes.has(mime) ?? true
+}
+
+function mimeEnabled(config: Config, mime: string): boolean {
+    if (mime === 'image/gif') return config.enableGifReadTool // checked first
+    const isImage = IMAGE_MIME_TYPES.has(mime)
+    return isImage ? config.enableImageReadTool : config.enableFileReadTool
+}
+
+function checkSize(
+    buffer: Buffer,
+    mime: string,
+    fileConfig: FileHandlingConfig | undefined,
+    totalBytes: number,
+    maxTotal: number
+): string | null {
+    // Both limits are compared against the base64-encoded size; null means OK.
+    const encoded = getBase64EncodedSize(buffer.byteLength)
+    const perFileCap =
+        fileConfig?.maxFileSizeBytesOverrides?.[mime] ??
+        fileConfig?.maxFileSizeBytes ??
+        DEFAULT_MAX_FILE_SIZE_BYTES
+    const projected = totalBytes + encoded
+    if (encoded > perFileCap)
+        return `File too large (${encoded} bytes after base64, raw ${buffer.byteLength} bytes), max ${perFileCap} bytes for ${mime}.`
+    return projected > maxTotal
+        ? `Total inline upload size too large (${projected} bytes), max ${maxTotal} bytes per request.`
+        : null
+}
+
+/**
+ * Wraps the collected native parts in a single `HumanMessage`.
+ *
+ * Images become `image_url` parts, audio `audio_url` and video `video_url`
+ * (each carrying a base64 data URL); anything else is sent as `inline_data`.
+ * @param prompt Text part placed ahead of the file parts.
+ */
+function buildMultimodalMessage(
+    parts: NativePart[],
+    prompt: string
+): HumanMessage {
+    const content = parts.map(({ mimeType, base64Data }) => {
+        const url = `data:${mimeType};base64,${base64Data}`
+        if (IMAGE_MIME_TYPES.has(mimeType)) {
+            return { type: 'image_url', image_url: { url } }
+        }
+        if (mimeType.startsWith('audio/')) {
+            return { type: 'audio_url', audio_url: { url, mimeType } }
+        }
+        if (mimeType.startsWith('video/')) {
+            return { type: 'video_url', video_url: { url, mimeType } }
+        }
+        // Inline data for text/pdf/etc. (Gemini-style)
+        return { inline_data: { mime_type: mimeType, data: base64Data } }
+    }) as unknown as MessageContentComplex[]
+    // Prepend the prompt only when at least one part was produced.
+    if (content.length > 0) content.unshift({ type: 'text', text: prompt })
+    return new HumanMessage({ content })
+}
+
+function describeTool(config: Config): string {
+    // Pair every capability line with the switch that enables it.
+    const candidates = [
+        [
+            config.enableImageReadTool,
+            '- Image read/describe (non-GIF): image/bmp, image/jpeg, image/png, image/webp. If the model lacks native image input, fallback image description will be used.'
+        ],
+        [
+            config.enableGifReadTool,
+            '- GIF read/describe: image/gif. Native-capable models receive extracted frames; otherwise fallback image description is used.'
+        ],
+        [
+            config.enableFileReadTool,
+            '- File read: text/html, text/css, text/plain, text/markdown, text/xml, text/csv, text/rtf, text/javascript, application/json, application/pdf, audio/*, video/* (effective MIME set still depends on model capabilities and FileHandlingConfig).'
+        ]
+    ] as const
+    const sections = candidates.filter(([on]) => on).map(([, text]) => text)
+    return `Read files from URL(s) and return their content.
+Enabled read_files capabilities:
+${sections.join('\n')}
+Use this tool when you need to read files from URL(s) as context.`
+}
+
 // ---------------------------------------------------------------------------
 // Plugin registration
 // ---------------------------------------------------------------------------
diff --git a/packages/service-multimodal/src/read_files_schema.ts b/packages/service-multimodal/src/read_files_schema.ts
deleted file mode 100644
index 8368f395d..000000000
--- a/packages/service-multimodal/src/read_files_schema.ts
+++ /dev/null
@@ -1,31 +0,0 @@
-import z from 'zod'
-
-const READ_FILE_SCHEMA = z.object({
-    url: z.string().url()
-})
-
-function parseJsonStringInput(value: unknown): unknown {
-    if (typeof value !== 'string') {
-        return value
-    }
-
-    try {
-        return JSON.parse(value)
-    } catch {
-        return value
-    }
-}
-
-export const readFilesInputSchema = z.object({
-    files: z
-        .preprocess(
-            parseJsonStringInput,
-            z.union([
-                READ_FILE_SCHEMA,
-                z.array(READ_FILE_SCHEMA).min(1).max(10)
-            ])
-        )
-        .describe(
-            'One file or a list of files to read (max 10). File format: { url: string }. MIME type is inferred from response headers, then URL extension.'
-        )
-})
diff --git a/packages/service-multimodal/src/utils.ts b/packages/service-multimodal/src/utils.ts
index bfb0532d0..564ccb9f0 100644
--- a/packages/service-multimodal/src/utils.ts
+++ b/packages/service-multimodal/src/utils.ts
@@ -1,6 +1,5 @@
 import {
     HumanMessage,
-    MessageContent,
     MessageContentComplex,
     MessageContentText
 } from '@langchain/core/messages'
@@ -12,195 +11,310 @@ import {
     isMessageContentImageUrl
 } from 'koishi-plugin-chatluna/utils/string'
 import { Context } from 'koishi'
+import type {} from 'koishi-plugin-ffmpeg-path'
 import { Config, logger } from '.'
 import { GifReader } from 'omggif'
 import { Jimp } from 'jimp'
 
-export interface GifExtractionConfig {
-    strategy: 'first' | 'head' | 'average'
-    frameCount: number
+// ---------------------------------------------------------------------------
+// MIME helpers
+// ---------------------------------------------------------------------------
+
+export const IMAGE_MIME_TYPES = new Set([
+    'image/png',
+    'image/jpeg',
+    'image/bmp',
+    'image/webp',
+    'image/gif'
+])
+
+const FILE_EXTENSION_TO_MIME_TYPE: Record<string, string> = {
+    '.png': 'image/png',
+    '.jpg': 'image/jpeg',
+    '.jpeg': 'image/jpeg',
+    '.bmp': 'image/bmp',
+    '.webp': 'image/webp',
+    '.gif': 'image/gif',
+    '.pdf': 'application/pdf',
+    '.txt': 'text/plain',
+    '.md': 'text/markdown',
+    '.html': 'text/html',
+    '.htm': 'text/html',
+    '.css': 'text/css',
+    '.xml': 'text/xml',
+    '.csv': 'text/csv',
+    '.rtf': 'text/rtf',
+    '.js': 'text/javascript',
+    '.mjs': 'text/javascript',
+    '.json': 'application/json',
+    '.mp4': 'video/mp4',
+    '.mpeg': 'video/mpeg',
+    '.mov': 'video/mov',
+    '.avi': 'video/avi',
+    '.flv': 'video/x-flv',
+    '.webm': 'video/webm',
+    '.wmv': 'video/wmv',
+    '.3gp': 'video/3gpp',
+    '.3gpp': 'video/3gpp',
+    '.mp3': 'audio/mpeg',
+    '.aiff': 'audio/aiff',
+    '.aac': 'audio/aac',
+    '.flac': 'audio/flac',
+    '.wav': 'audio/wav',
+    '.ogg': 'audio/ogg',
+    '.m4a': 'audio/mp4'
 }
 
-/**
- * Check if any frame in the range [start, end) has complex disposal methods
- * that require resetting the canvas (disposal method 2 or 3)
- */
-function hasComplexDisposal(
-    reader: GifReader,
-    start: number,
-    end: number
-): boolean {
-    for (let i = start; i < end; i++) {
-        const disposal = reader.frameInfo(i).disposal
-        // disposal 2: restore to background color
-        // disposal 3: restore to previous (before current frame was drawn)
-        if (disposal === 2 || disposal === 3) {
-            return true
-        }
+export function inferMimeTypeFromUrl(url: string): string | null {
+    try {
+        const path = new URL(url).pathname.toLowerCase()
+        const dot = path.lastIndexOf('.')
+        return dot < 0
+            ? null
+            : (FILE_EXTENSION_TO_MIME_TYPE[path.slice(dot)] ?? null)
+    } catch {
+        return null
     }
-    return false
 }
 
-export async function extractGifFrames(
+export function normalizeMimeType(
+    raw: string | null | undefined
+): string | null {
+    return raw?.split(';')[0]?.trim()?.toLowerCase() || null // empty -> null
+}
+
+/**
+ * Detect audio MIME from buffer header. Recognises QQ Silk + AMR + common
+ * audio container magic bytes. Falls back to the declared MIME otherwise.
+ */
+export function detectAudioMimeType(
     buffer: Buffer,
-    config: GifExtractionConfig
-): Promise<Buffer[]> {
-    try {
-        const reader = new GifReader(buffer)
-        const totalFrames = reader.numFrames()
+    declared?: string | null
+): string | null {
+    const head = buffer.subarray(0, 16).toString('latin1')
+
+    if (head.startsWith('#!AMR')) return 'audio/amr'
+    if (
+        head.startsWith('#!SILK_V3') ||
+        buffer.subarray(1, 10).toString('latin1') === '#!SILK_V3'
+    ) {
+        return 'audio/silk'
+    }
+    // MPEG audio frame: 11 sync bits set and a non-zero layer field. The
+    // layer test keeps ADTS AAC (0xFFF1/0xFFF9) from being reported as MP3;
+    // the sync test already rejects JPEG (0xFFD8).
+    const b1 = buffer[1]
+    const isMpegFrame =
+        buffer[0] === 0xff && (b1 & 0xe0) === 0xe0 && (b1 & 0x06) !== 0
+    if (head.startsWith('ID3') || isMpegFrame) return 'audio/mpeg'
+    if (
+        head.startsWith('RIFF') &&
+        buffer.subarray(8, 12).toString('latin1') === 'WAVE'
+    ) {
+        return 'audio/wav'
+    }
+    if (head.startsWith('fLaC')) return 'audio/flac'
+    if (head.startsWith('OggS')) return 'audio/ogg'
 
-        if (totalFrames === 0) {
-            throw new Error('No frames found in GIF')
-        }
+    return declared ?? null
+}
 
-        const width = reader.width
-        const height = reader.height
+// ---------------------------------------------------------------------------
+// FFmpeg / Silk
+// ---------------------------------------------------------------------------
+
+export async function convertAudioToMp3(
+    ctx: Context,
+    buffer: Buffer
+): Promise<Buffer | null> {
+    if (!ctx.ffmpeg) {
+        logger.warn(
+            'FFmpeg service unavailable; install koishi-plugin-ffmpeg-path to enable audio transcoding.'
+        )
+        return null
+    }
 
-        let frameIndices: number[] = []
+    try {
+        const isSilk =
+            buffer.subarray(0, 9).toString('latin1') === '#!SILK_V3' ||
+            buffer.subarray(1, 10).toString('latin1') === '#!SILK_V3'
+
+        let source = buffer
+        let silkSampleRate: number | null = null
+        if (isSilk) {
+            const decoded = await decodeSilkToPcm(ctx, buffer)
+            if (!decoded) return null
+            source = decoded.buffer
+            silkSampleRate = decoded.sampleRate
+        }
 
-        switch (config.strategy) {
-            case 'first':
-                frameIndices = [0]
-                break
+        const builder = ctx.ffmpeg.builder().input(source)
+        if (silkSampleRate != null) {
+            builder.inputOption(
+                '-f',
+                's16le',
+                '-ar',
+                String(silkSampleRate),
+                '-ac',
+                '1'
+            )
+        }
+        return await builder
+            .outputOption(
+                '-vn',
+                '-acodec',
+                'libmp3lame',
+                '-q:a',
+                '4',
+                '-f',
+                'mp3'
+            )
+            .run('buffer')
+    } catch (error) {
+        logger.warn(`Audio transcoding to mp3 failed:`, error)
+        return null
+    }
+}
 
-            case 'head': {
-                const count = Math.min(config.frameCount, totalFrames)
-                frameIndices = Array.from({ length: count }, (_, i) => i)
-                break
+async function decodeSilkToPcm(
+    ctx: Context,
+    buffer: Buffer
+): Promise<{ buffer: Buffer; sampleRate: number } | null> {
+    if (!ctx.silk) {
+        logger.warn(
+            'Silk service unavailable; install koishi-plugin-ffmpeg-path 2.0+ for silk decoding.'
+        )
+        return null
+    }
+    for (const sampleRate of [24000, 16000, 12000, 8000]) {
+        try {
+            const result = (await ctx.silk.decode(buffer, sampleRate)) as {
+                data?: Uint8Array
             }
-
-            case 'average': {
-                const count = Math.min(config.frameCount, totalFrames)
-                if (count >= totalFrames) {
-                    frameIndices = Array.from(
-                        { length: totalFrames },
-                        (_, i) => i
-                    )
-                } else if (count === 1) {
-                    // Special case: single frame, pick the first one
-                    frameIndices = [0]
-                } else {
-                    // Use span (totalFrames - 1) to ensure first and last frames are included
-                    const step = (totalFrames - 1) / (count - 1)
-                    frameIndices = Array.from({ length: count }, (_, i) =>
-                        Math.floor(i * step)
-                    )
-                }
-                break
+            if (result?.data != null) {
+                return { buffer: Buffer.from(result.data), sampleRate }
             }
+        } catch {
+            // try next sample rate
         }
+    }
+    return null
+}
 
-        const frameBuffers: Buffer[] = []
-
-        // Build canvas incrementally, only decoding frames we need
-        const canvas = new Uint8ClampedArray(width * height * 4)
-        let lastDecodedFrame = -1
+// ---------------------------------------------------------------------------
+// GIF
+// ---------------------------------------------------------------------------
 
-        for (const frameIndex of frameIndices) {
-            // Check if we need to restart decoding from frame 0
-            // This happens when:
-            // 1. Jumping backwards in frame sequence
-            // 2. Any frames between lastDecodedFrame and current have complex disposal methods
-            //    (disposal 2 or 3) which affect how the canvas should be prepared
-            const needsFullDecode =
-                frameIndex < lastDecodedFrame ||
-                (lastDecodedFrame >= 0 &&
-                    hasComplexDisposal(reader, lastDecodedFrame, frameIndex))
+export interface GifExtractionConfig {
+    strategy: 'first' | 'head' | 'average'
+    frameCount: number
+}
 
-            if (needsFullDecode) {
-                canvas.fill(0) // Clear canvas
-                // Decode from frame 0 to current frame
-                for (let i = 0; i <= frameIndex; i++) {
-                    reader.decodeAndBlitFrameRGBA(i, canvas)
-                }
-            } else {
-                // Disposal method 0 (no disposal) or 1 (do not dispose)
-                // Just decode from last position to current frame
-                for (let i = lastDecodedFrame + 1; i <= frameIndex; i++) {
-                    reader.decodeAndBlitFrameRGBA(i, canvas)
-                }
+export async function parseGifToFrames(
+    buffer: Buffer,
+    config: GifExtractionConfig
+): Promise<string[]> {
+    const reader = new GifReader(buffer)
+    const total = reader.numFrames()
+    if (total === 0) throw new Error('No frames found in GIF')
+
+    const indices = pickGifFrameIndices(total, config)
+    const { width, height } = reader
+    const canvas = new Uint8ClampedArray(width * height * 4)
+    let lastDecoded = -1
+    const frames: string[] = []
+
+    for (const idx of indices) {
+        const needsFullDecode =
+            idx < lastDecoded ||
+            (lastDecoded >= 0 && hasComplexDisposal(reader, lastDecoded, idx))
+        if (needsFullDecode) {
+            canvas.fill(0)
+            for (let i = 0; i <= idx; i++)
+                reader.decodeAndBlitFrameRGBA(i, canvas)
+        } else {
+            for (let i = lastDecoded + 1; i <= idx; i++) {
+                reader.decodeAndBlitFrameRGBA(i, canvas)
             }
-
-            lastDecodedFrame = frameIndex
-
-            // Copy canvas to avoid reference issues
-            const frameData = new Uint8ClampedArray(canvas)
-            const image = new Jimp({
-                data: Buffer.from(frameData),
-                width,
-                height
-            })
-
-            const pngBuffer = await image.getBuffer('image/png')
-            frameBuffers.push(pngBuffer)
         }
-
-        return frameBuffers
-    } catch (error) {
-        logger.error('Failed to extract GIF frames:', error)
-        throw error
+        lastDecoded = idx
+
+        const png = await new Jimp({
+            data: Buffer.from(new Uint8ClampedArray(canvas)),
+            width,
+            height
+        }).getBuffer('image/png')
+        frames.push(`data:image/png;base64,${png.toString('base64')}`)
     }
+    return frames
 }
 
-export async function parseGifToFrames(
-    buffer: Buffer,
+function pickGifFrameIndices(
+    total: number,
     config: GifExtractionConfig
-): Promise<string[]> {
-    const frameBuffers = await extractGifFrames(buffer, config)
-    return frameBuffers.map((frameBuffer) => {
-        const base64 = frameBuffer.toString('base64')
-        return `data:image/png;base64,${base64}`
-    })
+): number[] {
+    if (config.strategy === 'first') return [0]
+    const count = Math.min(config.frameCount, total)
+    if (config.strategy === 'head') {
+        return Array.from({ length: count }, (_, i) => i)
+    }
+    // average
+    if (count >= total) return Array.from({ length: total }, (_, i) => i)
+    if (count === 1) return [0]
+    const step = (total - 1) / (count - 1)
+    return Array.from({ length: count }, (_, i) => Math.floor(i * step))
+}
+
+function hasComplexDisposal(
+    reader: GifReader,
+    start: number,
+    end: number
+): boolean {
+    for (let i = start; i < end; i++) {
+        const d = reader.frameInfo(i).disposal
+        if (d === 2 || d === 3) return true
+    }
+    return false
 }
 
+// ---------------------------------------------------------------------------
+// Image
+// ---------------------------------------------------------------------------
+
 export async function readImage(ctx: Context, url: string) {
     if (url.startsWith('data:image') && url.includes('base64')) {
         const buffer = Buffer.from(url.split(',')[1], 'base64')
-        const ext = getImageType(buffer)
-
-        return {
-            base64Source: url,
-            buffer,
-            ext
-        }
+        return { base64Source: url, buffer, ext: getImageType(buffer) }
     }
-
     try {
-        const response = await ctx.http(url, {
+        const { data } = await ctx.http(url, {
             responseType: 'arraybuffer',
             method: 'get',
-            headers: {
-                'User-Agent':
-                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
-            }
+            headers: { 'User-Agent': BROWSER_UA }
         })
-
-        const buffer = Buffer.from(response.data)
-
-        const base64 = buffer.toString('base64')
-
+        const buffer = Buffer.from(data)
         const ext = getImageType(buffer)
-
         return {
-            base64Source: `data:${ext};base64,${base64}`,
+            base64Source: `data:${ext};base64,${buffer.toString('base64')}`,
             buffer,
             ext
         }
     } catch (error) {
         logger.error(`Failed to read image from ${url}:`, error)
-        return {
-            base64Source: null,
-            buffer: null,
-            ext: null
-        }
+        return { base64Source: null, buffer: null, ext: null }
     }
 }
+
 export async function processImageWithModel(
     model: ChatLunaChatModel,
     config: Config,
     message: Message
-) {
-    const images = extractImages(message.content)
+): Promise<string | null> {
+    const images = Array.isArray(message.content)
+        ? message.content.filter((item: MessageContentComplex) =>
+              isMessageContentImageUrl(item)
+          )
+        : []
     if (images.length === 0) return null
 
     try {
@@ -208,9 +322,7 @@ export async function processImageWithModel(
             { type: 'text', text: config.imagePrompt } as MessageContentText,
             ...images
         ]
-
         const result = await model.invoke([new HumanMessage({ content })])
-
         return config.imageInsertPrompt.replace(
             '{img}',
             getMessageContent(result.content)
@@ -221,45 +333,36 @@ export async function processImageWithModel(
     }
 }
 
-export const addImageToContent = (message: Message, imageUrl: string) => {
-    if (typeof message.content === 'string') {
-        message.content = [
-            {
-                type: 'text',
-                text: message.content
-            }
-        ]
-    }
+export function addImageToContent(message: Message, imageUrl: string) {
+    ensureContentArray(message)
     ;(message.content as MessageContentComplex[]).push({
         type: 'image_url',
-        image_url: {
-            url: imageUrl
-        }
+        image_url: { url: imageUrl }
     })
 }
 
-export const addTextToContent = (message: Message, text: string) => {
+export function addTextToContent(message: Message, text: string) {
     if (typeof message.content === 'string') {
         message.content += text
         return
     }
-
     const content = message.content as MessageContentComplex[]
-    const lastItem = content[content.length - 1]
-
-    if (lastItem && lastItem.type === 'text') {
-        lastItem.text += text
+    const last = content[content.length - 1]
+    if (last && last.type === 'text') {
+        last.text += text
     } else {
-        content.push({
-            type: 'text',
-            text
-        })
+        content.push({ type: 'text', text })
     }
 }
 
-export const extractImages = (content: MessageContent) =>
-    Array.isArray(content)
-        ? content.filter((item: MessageContentComplex) =>
-              isMessageContentImageUrl(item)
-          )
-        : []
+/** In place: replace string `message.content` with a one-element text-part
+ *  array, using `fallbackText` when the content is empty and `[]` when both
+ *  are empty. Content that is already an array is left untouched. */
+export function ensureContentArray(message: Message, fallbackText = '') {
+    if (typeof message.content !== 'string') return
+    const text = message.content || fallbackText
+    message.content = text ? [{ type: 'text', text }] : []
+}
+
+export const BROWSER_UA =
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
diff --git a/packages/service-multimodal/tests/audio-mimo.test.ts b/packages/service-multimodal/tests/audio-mimo.test.ts
deleted file mode 100644
index 240e67c3d..000000000
--- a/packages/service-multimodal/tests/audio-mimo.test.ts
+++ /dev/null
@@ -1,169 +0,0 @@
-import assert from 'node:assert/strict'
-import { test } from 'node:test'
-import { ModelCapabilities } from 'koishi-plugin-chatluna/llm-core/platform/types'
-import {
-    MIMO_BASE64_AUDIO_BYTES,
-    MIMO_BASE64_IMAGE_BYTES,
-    buildAudioContent,
-    buildImageContent,
-    isMimoAudioMime,
-    isMimoImageMime,
-    modelCanReadAudio,
-    modelCanReadImage
-} from '../src/audio'
-import { detectAudioMimeType } from '../src/media'
-import { readFilesInputSchema } from '../src/read_files_schema'
-
-test('recognizes MiMo audio models even when adapter metadata lacks AudioInput', () => {
-    assert.equal(
-        modelCanReadAudio(
-            { value: { capabilities: [ModelCapabilities.ToolCall] } },
-            'unifyllm/mimo-v2.5'
-        ),
-        true
-    )
-    assert.equal(
-        modelCanReadAudio(
-            { value: { capabilities: [ModelCapabilities.ToolCall] } },
-            'mimo-v2-omni'
-        ),
-        true
-    )
-    assert.equal(
-        modelCanReadAudio(
-            { value: { capabilities: [ModelCapabilities.ToolCall] } },
-            'unifyllm/deepseek-v4-flash'
-        ),
-        false
-    )
-})
-
-test('uses MiMo input_audio data URL instead of ChatLuna audio_url', () => {
-    assert.deepEqual(buildAudioContent('mimo-v2.5', 'abc', 'audio/mpeg'), {
-        type: 'input_audio',
-        input_audio: {
-            data: 'data:audio/mpeg;base64,abc'
-        }
-    })
-    assert.deepEqual(buildAudioContent('gpt-4o-audio', 'abc', 'audio/mpeg'), {
-        type: 'audio_url',
-        audio_url: {
-            url: 'data:audio/mpeg;base64,abc',
-            mimeType: 'audio/mpeg'
-        }
-    })
-})
-
-test('keeps MiMo base64 audio within the documented 50 MB limit', () => {
-    assert.equal(MIMO_BASE64_AUDIO_BYTES, 50 * 1024 * 1024)
-    assert.equal(isMimoAudioMime('audio/mpeg'), true)
-    assert.equal(isMimoAudioMime('audio/wav'), true)
-    assert.equal(isMimoAudioMime('audio/flac'), true)
-    assert.equal(isMimoAudioMime('audio/mp4'), true)
-    assert.equal(isMimoAudioMime('audio/ogg'), true)
-    assert.equal(isMimoAudioMime('audio/aac'), false)
-})
-
-test('recognizes MiMo image models even when adapter metadata lacks ImageInput', () => {
-    assert.equal(
-        modelCanReadImage(
-            { value: { capabilities: [ModelCapabilities.ToolCall] } },
-            'unifyllm/mimo-v2.5'
-        ),
-        true
-    )
-    assert.equal(
-        modelCanReadImage(
-            { value: { capabilities: [ModelCapabilities.ToolCall] } },
-            'mimo-v2-omni'
-        ),
-        true
-    )
-    assert.equal(
-        modelCanReadImage(
-            { value: { capabilities: [ModelCapabilities.ToolCall] } },
-            'unifyllm/deepseek-v4-flash'
-        ),
-        false
-    )
-})
-
-test('uses OpenAI image_url content for MiMo images', () => {
-    assert.deepEqual(buildImageContent('abc', 'image/png'), {
-        type: 'image_url',
-        image_url: {
-            url: 'data:image/png;base64,abc'
-        }
-    })
-})
-
-test('keeps MiMo base64 images within the documented 50 MB limit', () => {
-    assert.equal(MIMO_BASE64_IMAGE_BYTES, 50 * 1024 * 1024)
-    assert.equal(isMimoImageMime('image/jpeg'), true)
-    assert.equal(isMimoImageMime('image/png'), true)
-    assert.equal(isMimoImageMime('image/gif'), true)
-    assert.equal(isMimoImageMime('image/webp'), true)
-    assert.equal(isMimoImageMime('image/bmp'), true)
-    assert.equal(isMimoImageMime('image/svg+xml'), false)
-})
-
-test('accepts JSON-stringified read_files input from tool calls', () => {
-    assert.deepEqual(
-        readFilesInputSchema.parse({
-            files: '{"url":"http://127.0.0.1:5140/image.png"}'
-        }),
-        {
-            files: {
-                url: 'http://127.0.0.1:5140/image.png'
-            }
-        }
-    )
-
-    assert.deepEqual(
-        readFilesInputSchema.parse({
-            files: '[{"url":"http://127.0.0.1:5140/image.png"}]'
-        }),
-        {
-            files: [
-                {
-                    url: 'http://127.0.0.1:5140/image.png'
-                }
-            ]
-        }
-    )
-})
-
-test('detects AMR audio even when storage declares it as MP3', () => {
-    assert.equal(
-        detectAudioMimeType(Buffer.from('#!AMR\nabc'), 'audio/mp3'),
-        'audio/amr'
-    )
-    assert.equal(
-        detectAudioMimeType(Buffer.from('#!AMR\nabc'), null),
-        'audio/amr'
-    )
-    assert.equal(
-        detectAudioMimeType(Buffer.from('ID3abc'), 'audio/mp3'),
-        'audio/mpeg'
-    )
-})
-
-test('does not misidentify JPEG as audio/mpeg', () => {
-    // JPEG starts with FF D8 FF E0 (JFIF) — 0xD8 & 0xE0 = 0xC0, not an MP3 sync
-    const jpegHeader = Buffer.from([0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46])
-    assert.equal(
-        detectAudioMimeType(jpegHeader, 'image/jpeg'),
-        'image/jpeg'
-    )
-    assert.equal(detectAudioMimeType(jpegHeader, null), null)
-})
-
-test('still detects valid MP3 frame sync', () => {
-    // MP3: FF FB (MPEG1 Layer3) — 0xFB & 0xE0 = 0xE0, valid sync
-    const mp3Header = Buffer.from([0xff, 0xfb, 0x90, 0x00])
-    assert.equal(detectAudioMimeType(mp3Header, null), 'audio/mpeg')
-
-    // MP3: FF F3 (MPEG2 Layer3)
-    const mp3v2Header = Buffer.from([0xff, 0xf3, 0x90, 0x00])
-    assert.equal(detectAudioMimeType(mp3v2Header, null), 'audio/mpeg')
-})
diff --git a/packages/shared-adapter/src/client.ts b/packages/shared-adapter/src/client.ts
index 36af06aa3..1015a4350 100644
--- a/packages/shared-adapter/src/client.ts
+++ b/packages/shared-adapter/src/client.ts
@@ -1,6 +1,10 @@
+import { FileHandlingConfig } from 'koishi-plugin-chatluna/llm-core/platform/client'
 import { ModelInfo } from 'koishi-plugin-chatluna/llm-core/platform/types'
 import { getModelContextSize } from 'koishi-plugin-chatluna/llm-core/utils/count_tokens'
 
+export const DEFAULT_AUDIO_MAX_BASE64_BYTES = 50 * 1024 * 1024
+export const DEFAULT_IMAGE_MAX_BASE64_BYTES = 50 * 1024 * 1024
+
 export type OpenAIReasoningEffort =
     | 'none'
     | 'minimal'
@@ -153,7 +157,11 @@ function createGlobMatcher(pattern: string): (text: string) => boolean {
     return (text: string) => regex.test(text)
 }
 
-const imageModelMatchers = [
+function createRegexMatcher(regex: RegExp): (text: string) => boolean {
+    return (text: string) => regex.test(text) // stateful for g/y regexes
+}
+
+const imageModelMatchers: ((text: string) => boolean)[] = [
     'vision',
     'vl',
     'gpt-4o',
@@ -176,11 +184,76 @@ const imageModelMatchers = [
     'glm-*v',
     'kimi-k2.5',
     'step3',
-    'grok-4',
-    'mimo-v2.5*'
-].map((pattern) => createGlobMatcher(pattern))
+    'grok-4'
+].map(createGlobMatcher)
+
+// mimo-v2.5 supports image/audio; mimo-v2.5-pro does NOT (text only).
+imageModelMatchers.push(createRegexMatcher(/mimo-v2\.5(?!-pro)/))
 
 export function supportImageInput(modelName: string) {
     const lowerModel = normalizeOpenAIModelName(modelName).toLowerCase()
     return imageModelMatchers.some((matcher) => matcher(lowerModel))
 }
+
+// Audio-capable models; the regex admits mimo-v2.5 but not mimo-v2.5-pro.
+const audioModelMatchers: ((text: string) => boolean)[] = [
+    ...['gpt-4o-audio', 'gpt-4o-mini-audio', 'gpt-audio', 'mimo-v2-omni'].map(
+        createGlobMatcher
+    ),
+    createRegexMatcher(/mimo-v2\.5(?!-pro)/)
+]
+
+/** Whether the normalized, lower-cased `modelName` matches an audio model. */
+export function supportAudioInput(modelName: string) {
+    const name = normalizeOpenAIModelName(modelName).toLowerCase()
+    return audioModelMatchers.some((matches) => matches(name))
+}
+
+const openAIImageMimeTypes = [
+    'image/png',
+    'image/jpeg',
+    'image/gif',
+    'image/webp',
+    'image/bmp'
+]
+
+const openAIAudioMimeTypes = [
+    'audio/mpeg',
+    'audio/mp3',
+    'audio/wav',
+    'audio/flac',
+    'audio/mp4',
+    'audio/ogg'
+]
+
+/**
+ * File-handling limits for an OpenAI-compatible model, or `undefined` when
+ * its name matches neither the image nor the audio model matchers.
+ */
+export function getOpenAIFileHandlingConfig(
+    modelName: string
+): FileHandlingConfig | undefined {
+    const groups: [string[], number][] = []
+    if (supportImageInput(modelName)) {
+        groups.push([openAIImageMimeTypes, DEFAULT_IMAGE_MAX_BASE64_BYTES])
+    }
+    if (supportAudioInput(modelName)) {
+        groups.push([openAIAudioMimeTypes, DEFAULT_AUDIO_MAX_BASE64_BYTES])
+    }
+    if (groups.length === 0) return undefined
+
+    const supportedMimeTypes = new Set<string>()
+    const overrides: Record<string, number> = {}
+    for (const [mimeTypes, maxBase64Bytes] of groups) {
+        for (const mime of mimeTypes) {
+            supportedMimeTypes.add(mime)
+            overrides[mime] = maxBase64Bytes
+        }
+    }
+    return {
+        supportedMimeTypes,
+        maxTotalSizeBytes: 100 * 1024 * 1024,
+        maxFileSizeBytes: 100 * 1024 * 1024,
+        maxFileSizeBytesOverrides: overrides
+    }
+}
diff --git a/packages/shared-adapter/src/utils.ts b/packages/shared-adapter/src/utils.ts
index a171e25f6..f88b5fc4a 100644
--- a/packages/shared-adapter/src/utils.ts
+++ b/packages/shared-adapter/src/utils.ts
@@ -30,15 +30,24 @@ import {
     ResponseUsage
 } from './types'
 import { ChatLunaPlugin } from 'koishi-plugin-chatluna/services/chat'
+import { logger } from 'koishi-plugin-chatluna'
 import {
     getImageMimeType,
     getMimeTypeFromSource,
     isMessageContentImageUrl
 } from 'koishi-plugin-chatluna/utils/string'
-import { isChatLunaUserMessage } from 'koishi-plugin-chatluna/utils/langchain'
+import {
+    isChatLunaUserMessage,
+    isMessageContentAudio
+} from 'koishi-plugin-chatluna/utils/langchain'
 import { ToolCallChunk } from '@langchain/core/messages/tool'
 import { isZodSchemaV3 } from '@langchain/core/utils/types'
-import { normalizeOpenAIModelName, supportImageInput } from './client'
+import {
+    DEFAULT_AUDIO_MAX_BASE64_BYTES,
+    normalizeOpenAIModelName,
+    supportAudioInput,
+    supportImageInput
+} from './client'
 
 export function createUsageMetadata(data: {
     inputTokens: number
@@ -222,6 +231,7 @@ export function responseInputContent(
                 } satisfies ResponseInputContent
             }
 
+            // OpenAI Response API does not accept `input_audio` yet — drop it.
             return undefined
         })
         .filter((part) => part != null)
@@ -343,64 +353,58 @@ export async function langchainMessageToOpenAIMessage(
             }
         }
 
-        const images = rawMessage.additional_kwargs.images as string[] | null
-
-        const lowerModel = normalizedModel?.toLowerCase() ?? ''
-        if (
-            images != null &&
-            (supportImageInput(lowerModel) || supportImageInputType)
-        ) {
-            msg.content = [
-                {
-                    type: 'text',
-                    text: rawMessage.content as string
-                }
-            ]
-
-            const imageContents = await Promise.all(
-                images.map(async (image) => {
-                    try {
-                        const url = await fetchImageUrl(plugin, {
-                            type: 'image_url',
-                            image_url: { url: image }
-                        } as MessageContentImageUrl)
-                        return {
-                            type: 'image_url',
-                            image_url: {
-                                url,
-                                detail: 'low'
-                            }
-                        } as const
-                    } catch {
-                        return null
-                    }
-                })
+        if (rawMessage.additional_kwargs.images != null) {
+            logger.warn(
+                'Deprecated: `additional_kwargs.images` is no longer supported. Use `image_url` content parts instead.'
             )
+        }
 
-            msg.content.push(
-                ...imageContents.filter((content) => content != null)
-            )
-        } else if (Array.isArray(msg.content) && msg.content.length > 0) {
+        if (Array.isArray(msg.content) && msg.content.length > 0) {
+            const supportsAudio = supportAudioInput(normalizedModel ?? '')
+            const supportsImage =
+                supportImageInput(normalizedModel ?? '') ||
+                supportImageInputType === true
             const mappedContent = await Promise.all(
                 msg.content.map(async (content) => {
-                    if (!isMessageContentImageUrl(content)) return content
-
-                    try {
-                        const url = await fetchImageUrl(plugin, content)
-                        return {
-                            type: 'image_url',
-                            image_url: {
-                                url,
-                                detail: 'low'
+                    if (isMessageContentImageUrl(content)) {
+                        if (!supportsImage) {
+                            logger.warn(
+                                `Model ${normalizedModel} does not accept image input; dropping image content.`
+                            )
+                            return null
+                        }
+                        try {
+                            const url = await fetchImageUrl(plugin, content)
+                            return {
+                                type: 'image_url',
+                                image_url: { url, detail: 'low' }
                             }
+                        } catch {
+                            return null
+                        }
+                    }
+
+                    if (isMessageContentAudio(content)) {
+                        if (!supportsAudio) {
+                            logger.warn(
+                                `Model ${normalizedModel} does not accept audio input; dropping audio content.`
+                            )
+                            return null
+                        }
+                        try {
+                            return await fetchAudioContentPart(plugin, content)
+                        } catch {
+                            return null
                         }
-                    } catch {
-                        return null
                     }
+
+                    return content
                 })
             )
 
-            msg.content = mappedContent.filter((content) => content != null)
+            msg.content = mappedContent.filter(
+                (content) => content != null
+            ) as ChatCompletionResponseMessage['content']
         }
 
         result.push(msg)
@@ -676,6 +680,48 @@ export async function fetchFileLikeUrl(
     }
 }
 
+const AUDIO_MIME_TO_FORMAT: Record<string, string> = {
+    'audio/mpeg': 'mp3',
+    'audio/mp3': 'mp3',
+    'audio/wav': 'wav',
+    'audio/x-wav': 'wav',
+    'audio/flac': 'flac',
+    'audio/x-flac': 'flac',
+    'audio/ogg': 'ogg',
+    'audio/mp4': 'mp4',
+    'audio/aac': 'aac',
+    'audio/webm': 'webm'
+}
+
+function audioMimeToFormat(mime: string): string {
+    return AUDIO_MIME_TO_FORMAT[mime.toLowerCase()] ?? 'mp3'
+}
+
+/**
+ * Fetch an `audio_url` content part and convert it to the OpenAI-compatible
+ * `input_audio` shape used by gpt-4o-audio / MiMo. Returns `null` when the
+ * encoded payload exceeds {@link DEFAULT_AUDIO_MAX_BASE64_BYTES}.
+ */
+async function fetchAudioContentPart(
+    plugin: ChatLunaPlugin,
+    content: MessageContentFileLike & { type: 'audio_url' }
+): Promise<MessageContentComplex | null> {
+    const { buffer, mimeType } = await fetchFileLikeUrl(plugin, content)
+    const base64 = buffer.toString('base64')
+
+    if (base64.length > DEFAULT_AUDIO_MAX_BASE64_BYTES) {
+        return null
+    }
+
+    return {
+        type: 'input_audio',
+        input_audio: {
+            data: base64,
+            format: audioMimeToFormat(mimeType)
+        }
+    } as unknown as MessageContentComplex
+}
+
 export function messageTypeToOpenAIRole(
     type: MessageType
 ): ChatCompletionResponseMessageRoleEnum {

From e00ea561222642e22980ddced5f5b6018c9eb471 Mon Sep 17 00:00:00 2001
From: dingyi <dingyi222666@foxmail.com>
Date: Mon, 18 May 2026 16:12:49 +0800
Subject: [PATCH 4/7] [Fix] harden multimodal audio handling

---
 packages/adapter-openai/src/client.ts         | 12 ++++----
 .../service-multimodal/src/plugins/audio.ts   |  4 ++-
 .../src/plugins/read_files.ts                 | 29 +++++++++++++++----
 packages/service-multimodal/src/utils.ts      |  4 +++
 packages/shared-adapter/src/utils.ts          | 27 ++++++++++++++---
 5 files changed, 58 insertions(+), 18 deletions(-)

diff --git a/packages/adapter-openai/src/client.ts b/packages/adapter-openai/src/client.ts
index 0e0f0ffc1..9251a5fdc 100644
--- a/packages/adapter-openai/src/client.ts
+++ b/packages/adapter-openai/src/client.ts
@@ -67,13 +67,11 @@ export class OpenAIClient extends PlatformModelAndEmbeddingsClient<ClientConfig>
                     (model) =>
                         !(
                             model.includes('instruct') ||
-                            [
-                                'whisper',
-                                'tts',
-                                'dall-e',
-                                'audio',
-                                'realtime'
-                            ].some((keyword) => model.includes(keyword))
+                            ['whisper', 'tts', 'dall-e', 'realtime'].some(
+                                (keyword) => model.includes(keyword)
+                            ) ||
+                            (model.includes('audio') &&
+                                !supportAudioInput(model))
                         )
                 )
                 .map((model) => {
diff --git a/packages/service-multimodal/src/plugins/audio.ts b/packages/service-multimodal/src/plugins/audio.ts
index 6afad7fab..8d0a6ea15 100644
--- a/packages/service-multimodal/src/plugins/audio.ts
+++ b/packages/service-multimodal/src/plugins/audio.ts
@@ -30,7 +30,9 @@ const MIME_TO_EXT: Record<string, string> = {
     'audio/wav': 'wav',
     'audio/flac': 'flac',
     'audio/ogg': 'ogg',
-    'audio/mp4': 'm4a'
+    'audio/mp4': 'm4a',
+    'audio/aac': 'aac',
+    'audio/webm': 'webm'
 }
 
 /**
diff --git a/packages/service-multimodal/src/plugins/read_files.ts b/packages/service-multimodal/src/plugins/read_files.ts
index 06fe42d33..84280670a 100644
--- a/packages/service-multimodal/src/plugins/read_files.ts
+++ b/packages/service-multimodal/src/plugins/read_files.ts
@@ -109,10 +109,7 @@ export class ReadFilesTool extends StructuredTool {
                     fetched.buffer,
                     declared
                 )
-                const mime =
-                    declared?.startsWith('audio/') || detectedAudio
-                        ? detectedAudio
-                        : declared
+                const mime = detectedAudio ?? declared
 
                 if (!mime) {
                     pushError(
@@ -323,8 +320,7 @@ export class ReadFilesTool extends StructuredTool {
             })
             return {
                 buffer: Buffer.from(response.data),
-                contentType:
-                    (response.headers as Headers)?.get?.('content-type') ?? null
+                contentType: getHeaderValue(response.headers, 'content-type')
             }
         } catch {
             return null
@@ -413,6 +409,27 @@ function pushNative(
     report.successCount++
 }
 
+function getHeaderValue(headers: unknown, name: string): string | null {
+    if (headers == null) return null
+
+    if (typeof (headers as { get?: unknown }).get === 'function') {
+        const value = (headers as { get(name: string): string | null }).get(
+            name
+        )
+        return typeof value === 'string' ? value : null
+    }
+
+    const record = headers as Record<string, unknown>
+    const lower = name.toLowerCase()
+    for (const key of Object.keys(record)) {
+        if (key.toLowerCase() === lower) {
+            const value = record[key]
+            return typeof value === 'string' ? value : null
+        }
+    }
+    return null
+}
+
 function isHttp(url: string): boolean {
     try {
         const { protocol } = new URL(url)
diff --git a/packages/service-multimodal/src/utils.ts b/packages/service-multimodal/src/utils.ts
index 564ccb9f0..55ecfdbb4 100644
--- a/packages/service-multimodal/src/utils.ts
+++ b/packages/service-multimodal/src/utils.ts
@@ -94,6 +94,8 @@ export function detectAudioMimeType(
     const head = buffer.subarray(0, 16).toString('latin1')
 
     if (head.startsWith('#!AMR')) return 'audio/amr'
+    // QQ/OneBot ships SILK voice files with a leading flag byte before the
+    // standard `#!SILK_V3` magic, so we also check offset 1 for that variant.
     if (
         head.startsWith('#!SILK_V3') ||
         buffer.subarray(1, 10).toString('latin1') === '#!SILK_V3'
@@ -135,6 +137,8 @@ export async function convertAudioToMp3(
     }
 
     try {
+        // Match both the standard SILK magic and the QQ/OneBot variant that
+        // prepends a flag byte before `#!SILK_V3`.
         const isSilk =
             buffer.subarray(0, 9).toString('latin1') === '#!SILK_V3' ||
             buffer.subarray(1, 10).toString('latin1') === '#!SILK_V3'
diff --git a/packages/shared-adapter/src/utils.ts b/packages/shared-adapter/src/utils.ts
index f88b5fc4a..fe130f7fc 100644
--- a/packages/shared-adapter/src/utils.ts
+++ b/packages/shared-adapter/src/utils.ts
@@ -392,9 +392,22 @@ export async function langchainMessageToOpenAIMessage(
                             return null
                         }
                         try {
-                            return await fetchAudioContentPart(plugin, content)
-                        } catch {
-                            return null
+                            const part = await fetchAudioContentPart(
+                                plugin,
+                                content
+                            )
+                            if (part == null) {
+                                logger.warn(
+                                    `Audio content for model ${normalizedModel} was dropped (exceeded size limits or no data).`
+                                )
+                            }
+                            return part
+                        } catch (err) {
+                            logger.error(
+                                `Failed to fetch audio part for model ${normalizedModel}`,
+                                err
+                            )
+                            throw err
                         }
                     }
 
@@ -694,7 +707,13 @@ const AUDIO_MIME_TO_FORMAT: Record<string, string> = {
 }
 
 function audioMimeToFormat(mime: string): string {
-    return AUDIO_MIME_TO_FORMAT[mime.toLowerCase()] ?? 'mp3'
+    const format = AUDIO_MIME_TO_FORMAT[mime.toLowerCase()]
+    if (!format) {
+        throw new Error(
+            `Unsupported audio MIME for OpenAI input_audio: ${mime}`
+        )
+    }
+    return format
 }
 
 /**

From a66e7090273efcfc48de5774739e0f25534213b8 Mon Sep 17 00:00:00 2001
From: dingyi <dingyi222666@foxmail.com>
Date: Tue, 19 May 2026 03:58:30 +0800
Subject: [PATCH 5/7] fix(service-multimodal): inline read_files schema in tool
 definition

---
 .../src/plugins/read_files.ts                 | 30 ++++++++++++-------
 1 file changed, 20 insertions(+), 10 deletions(-)

diff --git a/packages/service-multimodal/src/plugins/read_files.ts b/packages/service-multimodal/src/plugins/read_files.ts
index 84280670a..f987defdc 100644
--- a/packages/service-multimodal/src/plugins/read_files.ts
+++ b/packages/service-multimodal/src/plugins/read_files.ts
@@ -29,15 +29,6 @@ import z from 'zod'
 const DEFAULT_MAX_FILE_SIZE_BYTES = 100 * 1024 * 1024
 const DEFAULT_MAX_TOTAL_SIZE_BYTES = 100 * 1024 * 1024
 
-const fileSchema = z.object({ url: z.string().url() })
-const readFilesSchema = z.object({
-    files: z
-        .union([fileSchema, z.array(fileSchema).min(1).max(10)])
-        .describe(
-            'One file or a list of files to read (max 10). File format: { url: string }. MIME type is inferred from response headers, then URL extension.'
-        )
-})
-
 interface NativePart {
     mimeType: string
     base64Data: string
@@ -50,7 +41,26 @@ interface NativePart {
 
 export class ReadFilesTool extends StructuredTool {
     name = 'read_files'
-    schema = readFilesSchema
+    schema = z.object({
+        files: z
+            .union([
+                z.object({
+                    url: z.string().url()
+                }),
+                z
+                    .array(
+                        z.object({
+                            url: z.string().url()
+                        })
+                    )
+                    .min(1)
+                    .max(10)
+            ])
+            .describe(
+                'One file or a list of files to read (max 10). File format: { url: string }. MIME type is inferred from response headers, then URL extension.'
+            )
+    })
+
     description: string
 
     constructor(

From 58b0247422d4b4331f21e508c8e2a8b5db9efbf5 Mon Sep 17 00:00:00 2001
From: dingyi <dingyi222666@foxmail.com>
Date: Tue, 19 May 2026 04:03:04 +0800
Subject: [PATCH 6/7] fix(service-multimodal): handle stringified file lists in
 read_files

---
 .../src/plugins/read_files.ts                 | 45 +++++++++++++------
 1 file changed, 32 insertions(+), 13 deletions(-)

diff --git a/packages/service-multimodal/src/plugins/read_files.ts b/packages/service-multimodal/src/plugins/read_files.ts
index f987defdc..43585655e 100644
--- a/packages/service-multimodal/src/plugins/read_files.ts
+++ b/packages/service-multimodal/src/plugins/read_files.ts
@@ -43,19 +43,38 @@ export class ReadFilesTool extends StructuredTool {
     name = 'read_files'
     schema = z.object({
         files: z
-            .union([
-                z.object({
-                    url: z.string().url()
-                }),
-                z
-                    .array(
-                        z.object({
-                            url: z.string().url()
-                        })
-                    )
-                    .min(1)
-                    .max(10)
-            ])
+            .preprocess(
+                (arg: unknown) => {
+                    if (typeof arg === 'string') {
+                        const base = JSON.parse(arg)
+                        if (
+                            typeof base === 'object' &&
+                            typeof base['files'] === 'string'
+                        ) {
+                            try {
+                                base['files'] = JSON.parse(base['files'])
+                                return base
+                            } catch {
+                                return base
+                            }
+                        }
+                    }
+                    return arg
+                },
+                z.union([
+                    z.object({
+                        url: z.string().url()
+                    }),
+                    z
+                        .array(
+                            z.object({
+                                url: z.string().url()
+                            })
+                        )
+                        .min(1)
+                        .max(10)
+                ])
+            )
             .describe(
                 'One file or a list of files to read (max 10). File format: { url: string }. MIME type is inferred from response headers, then URL extension.'
             )

From 64d3eef080018ba04bf7d21808b7c165060a8f07 Mon Sep 17 00:00:00 2001
From: dingyi <dingyi222666@foxmail.com>
Date: Tue, 19 May 2026 05:58:17 +0800
Subject: [PATCH 7/7] [Fix] handle read_files GIF injection limits

---
 .../src/plugins/read_files.ts                 | 41 +++++++++++++------
 1 file changed, 29 insertions(+), 12 deletions(-)

diff --git a/packages/service-multimodal/src/plugins/read_files.ts b/packages/service-multimodal/src/plugins/read_files.ts
index 43585655e..1d40cd135 100644
--- a/packages/service-multimodal/src/plugins/read_files.ts
+++ b/packages/service-multimodal/src/plugins/read_files.ts
@@ -226,22 +226,36 @@ export class ReadFilesTool extends StructuredTool {
                     }
 
                     if (mime === 'image/gif') {
+                        let pushed = 0
                         const frames = await parseGifToFrames(fetched.buffer, {
                             strategy: this.config.gifStrategy,
                             frameCount: this.config.gifFrameCount
                         })
                         for (const frame of frames) {
                             const frameBase64 = frame.split(',')[1]
-                            const frameSize = getBase64EncodedSize(
-                                Buffer.from(frameBase64, 'base64').byteLength
+                            const buf = Buffer.from(frameBase64, 'base64')
+                            const sizeError = checkSize(
+                                buf,
+                                'image/png',
+                                fileConfig,
+                                totalBytes,
+                                maxTotal
                             )
-                            if (totalBytes + frameSize > maxTotal) {
+                            if (sizeError) {
+                                if (pushed < 1) {
+                                    pushError(
+                                        report,
+                                        sourceUrl,
+                                        sizeError,
+                                        'image/png'
+                                    )
+                                }
                                 logger.warn(
                                     'Skipping remaining GIF frames due to total size limit'
                                 )
                                 break
                             }
-                            totalBytes += frameSize
+                            totalBytes += getBase64EncodedSize(buf.byteLength)
                             pushNative(
                                 report,
                                 native,
@@ -249,6 +263,7 @@ export class ReadFilesTool extends StructuredTool {
                                 'image/png',
                                 frameBase64
                             )
+                            pushed++
                         }
                     } else {
                         totalBytes += getBase64EncodedSize(
@@ -308,6 +323,7 @@ export class ReadFilesTool extends StructuredTool {
             }
         }
 
+        const injected = native.length > 0 && !!conversationId
         if (native.length > 0 && conversationId) {
             this.ctx.chatluna.contextManager.inject({
                 conversationId,
@@ -326,14 +342,15 @@ export class ReadFilesTool extends StructuredTool {
 
         return JSON.stringify({
             response: report,
-            note:
-                native.length > 0
-                    ? `Successfully read ${native.length} file(s). The file content has been added to the conversation context and will be available in the next turn.`
-                    : describedCount > 0
-                      ? `Described ${describedCount} image file(s) using the vision model.`
-                      : report.failureCount > 0
-                        ? `Failed to read ${report.failureCount} file(s).`
-                        : 'No files were processed.'
+            note: injected
+                ? `Successfully read ${native.length} file(s). The file content has been added to the conversation context and will be available in the next turn.`
+                : native.length > 0
+                  ? `Successfully read ${native.length} file(s), but no conversation id was available, so the file content was not added to the conversation context.`
+                  : describedCount > 0
+                    ? `Described ${describedCount} image file(s) using the vision model.`
+                    : report.failureCount > 0
+                      ? `Failed to read ${report.failureCount} file(s).`
+                      : 'No files were processed.'
         })
     }