diff --git a/packages/adapter-claude/src/utils.ts b/packages/adapter-claude/src/utils.ts
index 06c8f8e9e..b9d38a422 100644
--- a/packages/adapter-claude/src/utils.ts
+++ b/packages/adapter-claude/src/utils.ts
@@ -56,41 +56,15 @@ export async function langchainMessageToClaudeMessage(
 
     const mappedMessages = await Promise.all(
         messages.map(async (rawMessage) => {
-            let content: string | ClaudeInputContentBlockParam[] | undefined =
+            const content: string | ClaudeInputContentBlockParam[] | undefined =
                 typeof rawMessage.content === 'string'
                     ? rawMessage.content
                     : await processMessageContent(plugin, rawMessage.content)
 
-            const images = rawMessage.additional_kwargs.images as
-                | string[]
-                | null
-
-            if (
-                (model?.includes('claude-3') || model?.includes('claude-4')) &&
-                images != null
-            ) {
-                const mappedImages = await Promise.all(
-                    images.map(async (image) =>
-                        processImageContent(plugin, {
-                            type: 'image_url',
-                            image_url: { url: image }
-                        } as MessageContentImageUrl)
-                    )
+            if (rawMessage.additional_kwargs.images != null) {
+                logger.warn(
+                    'Deprecated: `additional_kwargs.images` is no longer supported. Use `image_url` content parts instead.'
                 )
-
-                const nextContent: ClaudeInputContentBlockParam[] =
-                    mappedImages.filter((item) => item != null)
-
-                if (Array.isArray(content)) {
-                    nextContent.push(...content)
-                } else if ((content?.length ?? 0) > 0) {
-                    nextContent.push({
-                        type: 'text',
-                        text: content
-                    })
-                }
-
-                content = nextContent
             }
 
             const result: ClaudeMessage = {
diff --git a/packages/adapter-gemini/src/utils.ts b/packages/adapter-gemini/src/utils.ts
index 3875a1b95..9f5a0c5ba 100644
--- a/packages/adapter-gemini/src/utils.ts
+++ b/packages/adapter-gemini/src/utils.ts
@@ -14,7 +14,6 @@ import {
     ChatCompletionResponseMessageRoleEnum,
     ChatFunctionCallingPart,
     ChatFunctionResponsePart,
-    ChatMessagePart,
     ChatPart,
     ChatResponse,
     GeminiUsageMetadata
@@ -78,9 +77,10 @@ export async function langchainMessageToGeminiMessage(
                           thoughtData
                       )
 
-            const images = message.additional_kwargs.images as string[] | null
-            if (images) {
-                processImageParts(result, images, model)
+            if (message.additional_kwargs.images != null) {
+                logger.warn(
+                    'Deprecated: `additional_kwargs.images` is no longer supported. Use `image_url` content parts instead.'
+                )
             }
 
             return result
@@ -203,39 +203,6 @@ async function processFunctionMessage(
         ]
     }
 }
-function processImageParts(
-    result: ChatCompletionResponseMessage,
-    images: string[],
-    model: string
-) {
-    if (
-        !(
-            (model.includes('vision') ||
-                model.includes('gemini') ||
-                model.includes('gemma2')) &&
-            !model.includes('gemini-1.0')
-        )
-    ) {
-        return
-    }
-
-    for (const image of images) {
-        const mineType = image.split(';')?.[0]?.split(':')?.[1] ?? 'image/jpeg'
-        const data = image.replace(/^data:image\/\w+;base64,/, '')
-
-        result.parts.push({
-            inline_data: { data, mime_type: mineType }
-        })
-    }
-
-    result.parts = result.parts.filter((uncheckedPart) => {
-        const part = partAsTypeCheck<ChatMessagePart>(
-            uncheckedPart,
-            (part) => part['text'] != null
-        )
-        return part == null || part.text.length > 0
-    })
-}
 
 async function processGeminiImageContent(
     plugin: ChatLunaPlugin,
diff --git a/packages/adapter-ollama/src/utils.ts b/packages/adapter-ollama/src/utils.ts
index 30f671cc6..946e2bab1 100644
--- a/packages/adapter-ollama/src/utils.ts
+++ b/packages/adapter-ollama/src/utils.ts
@@ -21,25 +21,24 @@ export async function langchainMessageToOllamaMessage(
 
     const mappedMessage = await Promise.all(
         messages.map(async (rawMessage) => {
-            let images: string[] = []
-
-            if (rawMessage.additional_kwargs.images != null && supportImage) {
-                images = rawMessage.additional_kwargs.images as string[]
-            } else {
-                images =
-                    typeof rawMessage.content === 'string'
-                        ? undefined
-                        : await Promise.all(
-                              rawMessage.content
-                                  .filter((part) =>
-                                      isMessageContentImageUrl(part)
-                                  )
-                                  .map((part) =>
-                                      processOllamaImageContent(plugin, part)
-                                  )
-                          )
+            if (rawMessage.additional_kwargs.images != null) {
+                logger.warn(
+                    'Deprecated: `additional_kwargs.images` is no longer supported. Use `image_url` content parts instead.'
+                )
             }
 
+            const images: string[] | undefined = supportImage
+                ? typeof rawMessage.content === 'string'
+                    ? undefined
+                    : await Promise.all(
+                          rawMessage.content
+                              .filter((part) => isMessageContentImageUrl(part))
+                              .map((part) =>
+                                  processOllamaImageContent(plugin, part)
+                              )
+                      )
+                : undefined
+
             const result = {
                 role: messageTypeToOllamaRole(rawMessage.getType()),
                 content: getMessageContent(rawMessage.content),
diff --git a/packages/adapter-openai-like/src/client.ts b/packages/adapter-openai-like/src/client.ts
index 20403fdf3..3435f9df2 100644
--- a/packages/adapter-openai-like/src/client.ts
+++ b/packages/adapter-openai-like/src/client.ts
@@ -20,10 +20,12 @@ import { OpenAIRequester } from './requester'
 import { ChatLunaPlugin } from 'koishi-plugin-chatluna/services/chat'
 import {
     getModelMaxContextSize,
+    getOpenAIFileHandlingConfig,
     isEmbeddingModel,
     isImageGenerationModel,
     isNonLLMModel,
     isRerankerModel,
+    supportAudioInput,
     supportImageInput
 } from '@chatluna/v1-shared-adapter'
 import { RunnableConfig } from '@langchain/core/runnables'
@@ -92,6 +94,9 @@ export class OpenAIClient extends PlatformModelEmbeddingsAndRerankerClient {
                         ModelCapabilities.ToolCall,
                         supportImageInput(model)
                             ? ModelCapabilities.ImageInput
+                            : null,
+                        supportAudioInput(model)
+                            ? ModelCapabilities.AudioInput
                             : null
                     ].filter(Boolean)
                 }
@@ -167,6 +172,7 @@ export class OpenAIClient extends PlatformModelEmbeddingsAndRerankerClient {
                 temperature: this._config.temperature,
                 maxRetries: this._config.maxRetries,
                 llmType: 'openai',
+                fileHandlingConfig: getOpenAIFileHandlingConfig(model),
                 isThinkModel:
                     model.includes('reasoner') ||
                     model.includes('r1') ||
diff --git a/packages/adapter-openai/src/client.ts b/packages/adapter-openai/src/client.ts
index 6a4857855..9251a5fdc 100644
--- a/packages/adapter-openai/src/client.ts
+++ b/packages/adapter-openai/src/client.ts
@@ -20,6 +20,8 @@ import { OpenAIRequester } from './requester'
 import { ChatLunaPlugin } from 'koishi-plugin-chatluna/services/chat'
 import {
     getModelMaxContextSize,
+    getOpenAIFileHandlingConfig,
+    supportAudioInput,
     supportImageInput
 } from '@chatluna/v1-shared-adapter'
 import { RunnableConfig } from '@langchain/core/runnables'
@@ -65,13 +67,11 @@ export class OpenAIClient extends PlatformModelAndEmbeddingsClient<ClientConfig>
                     (model) =>
                         !(
                             model.includes('instruct') ||
-                            [
-                                'whisper',
-                                'tts',
-                                'dall-e',
-                                'audio',
-                                'realtime'
-                            ].some((keyword) => model.includes(keyword))
+                            ['whisper', 'tts', 'dall-e', 'realtime'].some(
+                                (keyword) => model.includes(keyword)
+                            ) ||
+                            (model.includes('audio') &&
+                                !supportAudioInput(model))
                         )
                 )
                 .map((model) => {
@@ -84,6 +84,9 @@ export class OpenAIClient extends PlatformModelAndEmbeddingsClient<ClientConfig>
                             ModelCapabilities.ToolCall,
                             supportImageInput(model)
                                 ? ModelCapabilities.ImageInput
+                                : undefined,
+                            supportAudioInput(model)
+                                ? ModelCapabilities.AudioInput
                                 : undefined
                         ].filter(Boolean)
                     } as ModelInfo
@@ -125,6 +128,7 @@ export class OpenAIClient extends PlatformModelAndEmbeddingsClient<ClientConfig>
                 timeout: this._config.timeout,
                 temperature: this._config.temperature,
                 maxRetries: this._config.maxRetries,
+                fileHandlingConfig: getOpenAIFileHandlingConfig(model),
                 llmType: 'openai'
             })
         }
diff --git a/packages/adapter-qwen/src/utils.ts b/packages/adapter-qwen/src/utils.ts
index 09201bb36..9491127cf 100644
--- a/packages/adapter-qwen/src/utils.ts
+++ b/packages/adapter-qwen/src/utils.ts
@@ -5,7 +5,6 @@ import {
     ChatMessageChunk,
     FunctionMessageChunk,
     HumanMessageChunk,
-    MessageContentImageUrl,
     MessageType,
     SystemMessageChunk,
     ToolMessage,
@@ -21,11 +20,11 @@ import {
 } from './types'
 import {
     fetchImageUrl,
-    removeAdditionalProperties,
-    supportImageInput
+    removeAdditionalProperties
 } from '@chatluna/v1-shared-adapter'
 import { ChatLunaPlugin } from 'koishi-plugin-chatluna/services/chat'
 import { isZodSchemaV3 } from '@langchain/core/utils/types'
+import { logger } from '.'
 
 export function formatToolsToQWenTools(
     tools: StructuredTool[]
@@ -113,50 +112,13 @@ export async function langchainMessageToQWenMessage(
             }
         }
 
-        const images = rawMessage.additional_kwargs.images as string[] | null
-
-        if (
-            (model?.includes('qwen-vl') ||
-                model?.includes('omni') ||
-                model?.includes('qwen2.5-vl') ||
-                model?.includes('qwen2.5-omni') ||
-                model?.includes('qwen-omni') ||
-                model?.includes('qwen2-vl') ||
-                model?.includes('qvq') ||
-                supportImageInput(model)) &&
-            images != null
-        ) {
-            msg.content = [
-                {
-                    type: 'text',
-                    text: rawMessage.content as string
-                }
-            ]
-
-            const imageContents = await Promise.all(
-                images.map(async (image) => {
-                    try {
-                        const url = await fetchImageUrl(plugin, {
-                            type: 'image_url',
-                            image_url: { url: image }
-                        } as MessageContentImageUrl)
-                        return {
-                            type: 'image_url',
-                            image_url: {
-                                url,
-                                detail: 'low'
-                            }
-                        } as const
-                    } catch {
-                        return null
-                    }
-                })
+        if (rawMessage.additional_kwargs.images != null) {
+            logger.warn(
+                'Deprecated: `additional_kwargs.images` is no longer supported. Use `image_url` content parts instead.'
             )
+        }
 
-            msg.content.push(
-                ...imageContents.filter((content) => content != null)
-            )
-        } else if (Array.isArray(msg.content) && msg.content.length > 0) {
+        if (Array.isArray(msg.content) && msg.content.length > 0) {
             const mappedContent = await Promise.all(
                 msg.content.map(async (content) => {
                     if (!isMessageContentImageUrl(content)) return content
diff --git a/packages/service-multimodal/README.md b/packages/service-multimodal/README.md
index e96b4badc..84e92d7bf 100644
--- a/packages/service-multimodal/README.md
+++ b/packages/service-multimodal/README.md
@@ -1,7 +1,7 @@
-## koishi-plugin-chatluna-long-memory
+## koishi-plugin-chatluna-multimodal-service
 
-## [![npm](https://img.shields.io/npm/v/koishi-plugin-chatluna-long-memory)](https://www.npmjs.com/package/koishi-plugin-chatluna-long-memory) [![npm](https://img.shields.io/npm/dm/koishi-plugin-chatluna-long-memory)](https://www.npmjs.com/package//koishi-plugin-chatluna-long-memory)
+## [![npm](https://img.shields.io/npm/v/koishi-plugin-chatluna-multimodal-service)](https://www.npmjs.com/package/koishi-plugin-chatluna-multimodal-service) [![npm](https://img.shields.io/npm/dm/koishi-plugin-chatluna-multimodal-service)](https://www.npmjs.com/package/koishi-plugin-chatluna-multimodal-service)
 
-> 提供长期记忆支持的插件
+> ChatLuna 的多模态服务插件，提供上下文图像/语音描述、GIF 处理与 `read_files` 文件读取工具。
 
-[长期记忆文档](https://chatluna.chat/ecosystem/renderer/image.html)
+[多模态插件文档](https://chatluna.chat/ecosystem/plugin/multimodal-service.html)
diff --git a/packages/service-multimodal/src/index.ts b/packages/service-multimodal/src/index.ts
index f1e7588a6..62ff61556 100644
--- a/packages/service-multimodal/src/index.ts
+++ b/packages/service-multimodal/src/index.ts
@@ -83,7 +83,7 @@ export const Config: Schema<Config> = Schema.intersect([
 
 export const inject = {
     required: ['chatluna'],
-    optional: ['chatluna_storage', 'ffmpeg', 'silk']
+    optional: ['ffmpeg', 'silk']
 }
 
 export const name = 'chatluna-multimodal-service'
diff --git a/packages/service-multimodal/src/plugins/audio.ts b/packages/service-multimodal/src/plugins/audio.ts
index f8ce7fbdb..8d0a6ea15 100644
--- a/packages/service-multimodal/src/plugins/audio.ts
+++ b/packages/service-multimodal/src/plugins/audio.ts
@@ -1,92 +1,95 @@
 import { MessageContentComplex } from '@langchain/core/messages'
 import { Context, h, Session } from 'koishi'
 import type { OneBotBot } from 'koishi-plugin-adapter-onebot'
-import { Message } from 'koishi-plugin-chatluna'
 import { ModelCapabilities } from 'koishi-plugin-chatluna/llm-core/platform/types'
-import type {} from 'koishi-plugin-chatluna-storage-service'
 import type {} from 'koishi-plugin-ffmpeg-path'
 import { Config, logger } from '..'
+import {
+    BROWSER_UA,
+    convertAudioToMp3,
+    detectAudioMimeType,
+    ensureContentArray
+} from '../utils'
+
+// MIMEs commonly accepted by OpenAI / Gemini / MiMo audio inputs. Anything
+// else (Silk, AMR, ...) is transcoded to MP3.
+const NATIVE_AUDIO_MIMES = new Set([
+    'audio/mpeg',
+    'audio/mp3',
+    'audio/wav',
+    'audio/flac',
+    'audio/ogg',
+    'audio/mp4',
+    'audio/aac',
+    'audio/webm'
+])
+
+const MIME_TO_EXT: Record<string, string> = {
+    'audio/mpeg': 'mp3',
+    'audio/mp3': 'mp3',
+    'audio/wav': 'wav',
+    'audio/flac': 'flac',
+    'audio/ogg': 'ogg',
+    'audio/mp4': 'm4a',
+    'audio/aac': 'aac',
+    'audio/webm': 'webm'
+}
 
-const CHATLUNA_DOWNLOAD_USER_AGENT =
-    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
-const MAX_AUDIO_BYTES = 25 * 1024 * 1024
-
+/**
+ * Intercept voice/audio elements: download, transcode unfriendly formats
+ * (Silk/AMR/...) to MP3, then inject as a Base64 `audio_url` content part.
+ * OpenAI-compatible adapters convert the result to `input_audio` downstream.
+ */
 export function apply(ctx: Context, config: Config) {
-    if (!config.enableAudioFfmpegConversion) {
-        return
-    }
+    if (!config.enableAudioFfmpegConversion) return
 
     ctx.effect(() =>
         ctx.chatluna.messageTransformer.intercept(
             'audio',
             async (session, element, message, model) => {
-                const modelInfo = model
-                    ? ctx.chatluna.platform.findModel(model)
-                    : undefined
+                if (!modelAcceptsAudio(ctx, model)) return false
 
-                // If the model doesn't accept audio input, keep fallback path unchanged.
-                if (
-                    modelInfo?.value?.capabilities?.includes(
-                        ModelCapabilities.AudioInput
-                    ) === false
-                ) {
-                    return false
-                }
+                const sourceUrl = await resolveAudioSourceUrl(session, element)
+                if (!sourceUrl) return false
 
-                const sourceUrl = await resolveAudioSourceUrl(
-                    ctx,
-                    session,
-                    element
-                )
-                if (!sourceUrl) {
-                    return false
-                }
+                const buffer = await downloadAudio(ctx, sourceUrl)
+                if (!buffer) return false
 
-                const fileName =
-                    element.attrs['file'] ??
-                    element.attrs['name'] ??
-                    element.attrs['filename']
+                const detected = detectAudioMimeType(
+                    buffer,
+                    element.attrs['mime'] as string | null
+                )
 
-                const fileData = await readFile(ctx, sourceUrl)
-                if (!fileData.buffer) {
-                    return false
-                }
+                let outBuffer = buffer
+                let outMime = detected ?? 'audio/mpeg'
 
-                const converted = await tryConvertAudioToMp3(
-                    ctx,
-                    fileData.buffer,
-                    fileName
-                )
-                if (!converted) {
-                    logger.warn(`Failed to convert audio to MP3: ${sourceUrl}`)
-                    return false
+                if (!detected || !NATIVE_AUDIO_MIMES.has(detected)) {
+                    const converted = await convertAudioToMp3(ctx, buffer)
+                    if (!converted) {
+                        logger.warn(
+                            `Skip audio: format ${detected ?? 'unknown'} not natively supported and ffmpeg conversion failed.`
+                        )
+                        return false
+                    }
+                    outBuffer = converted
+                    outMime = 'audio/mpeg'
                 }
 
-                const { fileName: displayFileName, buffer } = converted
-                element.attrs['file'] = displayFileName
-                element.attrs['filename'] = displayFileName
-
-                const audioUrl = ctx.chatluna_storage
-                    ? (element.attrs['chatluna_file_url'] = (
-                          await ctx.chatluna_storage.createTempFile(
-                              buffer,
-                              displayFileName
-                          )
-                      ).url)
-                    : ((element.attrs['chatluna_file_url'] = sourceUrl),
-                      `data:audio/mpeg;base64,${buffer.toString('base64')}`)
+                const dataUrl = `data:${outMime};base64,${outBuffer.toString('base64')}`
+                const ext = MIME_TO_EXT[outMime] ?? 'mp3'
+                const fileName = `${stripExtension(audioName(element))}.${ext}`
+                element.attrs['file'] = fileName
+                element.attrs['filename'] = fileName
+                element.attrs['chatluna_file_url'] = sourceUrl
 
-                ensureContentArray(message, `[voice:${displayFileName}]`)
+                ensureContentArray(message, `[voice:${fileName}]`)
                 ;(message.content as MessageContentComplex[]).push({
                     type: 'audio_url',
-                    audio_url: {
-                        url: audioUrl,
-                        mimeType: 'audio/mpeg'
-                    }
+                    audio_url: { url: dataUrl, mimeType: outMime }
                 } as unknown as MessageContentComplex)
 
                 logger.debug(
-                    `Transcoded unsupported audio to mp3 for multimodal input: ${displayFileName}`
+                    `Injected audio for ${model}: ${fileName} (${outMime}, ${outBuffer.byteLength} bytes)`
                 )
                 return true
             },
@@ -95,22 +98,28 @@ export function apply(ctx: Context, config: Config) {
     )
 }
 
+function modelAcceptsAudio(ctx: Context, model: string | undefined): boolean {
+    if (!model) return false
+    return (
+        ctx.chatluna.platform
+            .findModel(model)
+            ?.value?.capabilities?.includes(ModelCapabilities.AudioInput) ===
+        true
+    )
+}
+
 async function resolveAudioSourceUrl(
-    ctx: Context,
     session: Session,
     element: h
 ): Promise<string | null> {
-    const srcAttr = (element.attrs['src'] ?? element.attrs['url']) as
+    const src = (element.attrs['src'] ?? element.attrs['url']) as
         | string
         | undefined
-    if (srcAttr?.startsWith('http')) {
-        return srcAttr
-    }
-
-    if (session.platform !== 'onebot') return srcAttr ?? null
+    if (src?.startsWith('http')) return src
+    if (session.platform !== 'onebot') return src ?? null
 
     const fileId = element.attrs['fileId'] ?? element.attrs['fileid']
-    if (!fileId) return srcAttr ?? null
+    if (!fileId) return src ?? null
 
     try {
         const bot = session.bot as OneBotBot<Context>
@@ -119,239 +128,37 @@ async function resolveAudioSourceUrl(
             ? await bot.internal.getPrivateFileUrl(session.userId, fileId)
             : await bot.internal.getGroupFileUrl(session.guildId, fileId, busId)
     } catch {
-        return srcAttr ?? null
+        return src ?? null
     }
 }
 
-async function readFile(
+async function downloadAudio(
     ctx: Context,
     url: string
-): Promise<{ buffer: Buffer | null; mimeType: string | null }> {
-    const headers = { 'User-Agent': CHATLUNA_DOWNLOAD_USER_AGENT }
-
-    let sanitizedUrl: string
-    try {
-        const parsed = new URL(url)
-        sanitizedUrl = parsed.origin + parsed.pathname
-    } catch {
-        sanitizedUrl = url
-    }
-
-    let mimeTypeFromHead: string | null = null
-
-    // Try HEAD request for size check
-    try {
-        const headResponse = await ctx.http(url, { method: 'head', headers })
-        const headHeaders: Headers = headResponse?.headers
-        mimeTypeFromHead =
-            headHeaders
-                ?.get('content-type')
-                ?.split(';')[0]
-                ?.trim()
-                ?.toLowerCase() ?? null
-
-        const headContentLength = headHeaders?.get('content-length')
-            ? Number(headHeaders.get('content-length'))
-            : null
-
-        if (
-            headContentLength != null &&
-            Number.isFinite(headContentLength) &&
-            headContentLength > MAX_AUDIO_BYTES
-        ) {
-            logger.warn(
-                `Skip reading oversized audio from ${sanitizedUrl}: ${headContentLength} bytes > ${MAX_AUDIO_BYTES} bytes`
-            )
-            return { buffer: null, mimeType: mimeTypeFromHead }
-        }
-    } catch {
-        // Some endpoints do not support HEAD; continue with GET safeguards.
-    }
-
-    try {
-        const response = await fetch(url, { method: 'GET', headers })
-
-        if (!response.ok) {
-            throw new Error(`HTTP ${response.status}`)
-        }
-
-        const mimeType =
-            response.headers
-                .get('content-type')
-                ?.split(';')[0]
-                ?.trim()
-                ?.toLowerCase() ?? mimeTypeFromHead
-        const responseContentLength = response.headers.get('content-length')
-            ? Number(response.headers.get('content-length'))
-            : null
-
-        if (
-            responseContentLength != null &&
-            Number.isFinite(responseContentLength) &&
-            responseContentLength > MAX_AUDIO_BYTES
-        ) {
-            logger.warn(
-                `Skip reading oversized audio from ${sanitizedUrl}: ${responseContentLength} bytes > ${MAX_AUDIO_BYTES} bytes`
-            )
-            return { buffer: null, mimeType }
-        }
-
-        if (response.body == null) {
-            const arrayBuffer = await response.arrayBuffer()
-            if (arrayBuffer.byteLength > MAX_AUDIO_BYTES) {
-                logger.warn(
-                    `Skip reading oversized audio from ${sanitizedUrl}: ${arrayBuffer.byteLength} bytes > ${MAX_AUDIO_BYTES} bytes`
-                )
-                return { buffer: null, mimeType }
-            }
-            return { buffer: Buffer.from(arrayBuffer), mimeType }
-        }
-
-        const reader = response.body.getReader()
-        const chunks: Buffer[] = []
-        let totalBytes = 0
-
-        while (true) {
-            const { done, value } = await reader.read()
-            if (done) break
-
-            if (!value?.byteLength) continue
-
-            totalBytes += value.byteLength
-            if (totalBytes > MAX_AUDIO_BYTES) {
-                await reader.cancel('audio exceeds max size')
-                logger.warn(
-                    `Skip reading oversized audio from ${sanitizedUrl}: streamed bytes exceed ${MAX_AUDIO_BYTES} bytes`
-                )
-                return { buffer: null, mimeType }
-            }
-
-            chunks.push(Buffer.from(value))
-        }
-
-        return { buffer: Buffer.concat(chunks, totalBytes), mimeType }
-    } catch (error) {
-        logger.warn(`Failed to read audio from ${sanitizedUrl}:`, error)
-        return { buffer: null, mimeType: null }
-    }
-}
-
-function toMp3FileName(fileName?: string): string {
-    const baseName = (fileName ?? 'voice').trim()
-    const dotIndex = baseName.lastIndexOf('.')
-    return `${dotIndex <= 0 ? baseName : baseName.slice(0, dotIndex)}.mp3`
-}
-
-async function tryConvertAudioToMp3(
-    ctx: Context,
-    inputBuffer: Buffer,
-    fileName?: string
-): Promise<{ buffer: Buffer; fileName: string } | null> {
+): Promise<Buffer | null> {
     try {
-        let sourceBuffer = inputBuffer
-        let decodedPcmSampleRate: number | null = null
-
-        if (isSilkAudio(inputBuffer)) {
-            const decoded = await decodeSilkAudio(ctx, inputBuffer)
-            sourceBuffer = decoded.buffer
-            decodedPcmSampleRate = decoded.sampleRate
-            logger.debug('Decoded silk audio before mp3 transcoding.')
-        }
-
-        const ffmpeg = ctx.ffmpeg
-        if (!ffmpeg) {
-            throw new Error(
-                'FFmpeg service is unavailable. Please enable koishi-plugin-ffmpeg-path.'
-            )
-        }
-
-        const builder = ffmpeg.builder().input(sourceBuffer)
-        if (decodedPcmSampleRate != null) {
-            builder.inputOption(
-                '-f',
-                's16le',
-                '-ar',
-                String(decodedPcmSampleRate),
-                '-ac',
-                '1'
-            )
-        }
-
-        const outputBuffer = await builder
-            .outputOption(
-                '-vn',
-                '-acodec',
-                'libmp3lame',
-                '-q:a',
-                '4',
-                '-f',
-                'mp3'
-            )
-            .run('buffer')
-
-        return {
-            buffer: outputBuffer,
-            fileName: toMp3FileName(fileName)
-        }
+        const { data } = await ctx.http(url, {
+            responseType: 'arraybuffer',
+            method: 'get',
+            headers: { 'User-Agent': BROWSER_UA }
+        })
+        return Buffer.from(data)
     } catch (error) {
-        logger.warn(
-            `Audio transcoding to mp3 failed, fallback to original audio: ${error instanceof Error ? error.message : String(error)}`
-        )
+        logger.warn(`Failed to fetch audio from ${url.replace(/[?#].*$/, '')}:`, error)
         return null
     }
 }
 
-function isSilkAudio(inputBuffer: Buffer): boolean {
-    if (inputBuffer.length < 9) return false
-    const sig = inputBuffer.subarray(0, 9).toString('latin1')
+function audioName(element: h): string {
     return (
-        sig === '#!SILK_V3' ||
-        inputBuffer.subarray(1, 10).toString('latin1') === '#!SILK_V3'
+        (element.attrs['file'] as string | undefined) ??
+        (element.attrs['name'] as string | undefined) ??
+        (element.attrs['filename'] as string | undefined) ??
+        'voice'
     )
 }
 
-async function decodeSilkAudio(
-    ctx: Context,
-    inputBuffer: Buffer
-): Promise<{ buffer: Buffer; sampleRate: number }> {
-    const silk = ctx.silk
-    if (!silk) {
-        throw new Error(
-            'Detected silk audio, but no silk service is available for decoding'
-        )
-    }
-    for (const sampleRate of [24000, 16000, 12000, 8000]) {
-        try {
-            const result = (await silk.decode(
-                inputBuffer,
-                sampleRate
-            )) as DecodeResult
-
-            if (result?.data != null) {
-                return { buffer: Buffer.from(result.data), sampleRate }
-            }
-        } catch {
-            continue
-        }
-    }
-
-    throw new Error('silk decode returned empty output')
-}
-
-function ensureContentArray(message: Message, fallbackText: string) {
-    if (typeof message.content === 'string') {
-        message.content = [
-            {
-                type: 'text',
-                text: message.content.trim().length
-                    ? message.content
-                    : fallbackText
-            }
-        ]
-    }
-}
-
-interface DecodeResult {
-    data: Uint8Array
-    duration: number
+function stripExtension(name: string): string {
+    const dot = name.lastIndexOf('.')
+    return dot > 0 ? name.slice(0, dot) : name
 }
diff --git a/packages/service-multimodal/src/plugins/image.ts b/packages/service-multimodal/src/plugins/image.ts
index e29db3c56..f1b64e51a 100644
--- a/packages/service-multimodal/src/plugins/image.ts
+++ b/packages/service-multimodal/src/plugins/image.ts
@@ -1,6 +1,6 @@
-/* eslint-disable max-len */
 import { Context } from 'koishi'
 import { Message } from 'koishi-plugin-chatluna'
+import { ChatLunaChatModel } from 'koishi-plugin-chatluna/llm-core/platform/model'
 import { ModelCapabilities } from 'koishi-plugin-chatluna/llm-core/platform/types'
 import { ChatLunaPlugin } from 'koishi-plugin-chatluna/services/chat'
 import { Config, logger } from '..'
@@ -12,6 +12,11 @@ import {
     readImage
 } from '../utils'
 
+/**
+ * Intercept image elements. Native-capable models receive the data URL
+ * directly (GIFs are split into frames). Otherwise fall back to describing
+ * the image via the configured vision model and inject the description.
+ */
 export async function apply(
     ctx: Context,
     config: Config,
@@ -21,136 +26,106 @@ export async function apply(
         config.imageModel
     )
 
-    const disposable = ctx.chatluna.messageTransformer.intercept(
-        'img',
-        async (_session, element, message, model) => {
-            const parsedModelInfo =
-                model != null
-                    ? ctx.chatluna.platform.findModel(model)
-                    : undefined
-            const modelSupportsImageInput =
-                parsedModelInfo?.value != null &&
-                parsedModelInfo.value.capabilities.includes(
-                    ModelCapabilities.ImageInput
-                )
-
-            let imageData: Awaited<ReturnType<typeof readImage>>
-            const url = (element.attrs.url ?? element.attrs.src) as string
-
-            if (modelSupportsImageInput) {
-                imageData = await readImage(ctx, url)
+    ctx.effect(() =>
+        ctx.chatluna.messageTransformer.intercept(
+            'img',
+            async (_session, element, message, model) => {
+                const url = (element.attrs.url ?? element.attrs.src) as string
+                if (!url) return false
 
-                if (imageData.ext == null) {
+                const native = modelAcceptsImage(ctx, model)
+                if (!native && !config.enableContextImageDescription) {
                     return false
                 }
 
-                if (imageData.ext === 'image/gif') {
-                    if (!config.enableContextGifHandling) {
-                        return false
-                    }
-
-                    logger.debug(`image url: ${url.substring(0, 50)}...`)
-                    const frames = await parseGifToFrames(imageData.buffer, {
-                        strategy: config.gifStrategy,
-                        frameCount: config.gifFrameCount
-                    })
+                const imageData = await readImage(ctx, url)
+                if (imageData.ext == null) return false
 
-                    logger.debug(`Extracted ${frames.length} frames from GIF`)
+                const isGif = imageData.ext === 'image/gif'
+                if (isGif && !config.enableContextGifHandling) return false
 
-                    for (const frame of frames) {
-                        addImageToContent(message, frame)
+                if (native) {
+                    if (isGif) {
+                        await injectGifFrames(message, imageData.buffer, config)
+                        addTextToContent(message, '[image: GIF]')
+                    } else if (imageData.base64Source) {
+                        addImageToContent(message, imageData.base64Source)
                     }
-
-                    addTextToContent(message, '[image: GIF]')
-
                     return true
                 }
 
-                if (imageData.base64Source != null) {
-                    addImageToContent(message, imageData.base64Source)
-                    return true
-                }
-            }
-
-            if (!config.enableContextImageDescription) {
-                return false
-            }
-
-            if (imageUnderstandModel.value == null) {
-                logger.warn(
-                    `The model ${config.imageModel} is not loaded, please check your chat adapter`
-                )
-                return false
-            }
-
-            if (
-                !imageUnderstandModel.value.modelInfo.capabilities.includes(
-                    ModelCapabilities.ImageInput
-                )
-            ) {
-                logger.warn(
-                    `The model ${config.imageModel} in image-service does not support image input, please check your chat adapter`
-                )
-                return false
-            }
-
-            try {
-                const fakeMessage: Message = {
-                    content: []
-                }
-
-                logger.debug(`image url: ${url}`)
-
-                imageData = imageData ?? (await readImage(ctx, url))
-
-                if (imageData.ext == null) {
-                    return false
-                }
-
-                if (imageData.ext === 'image/gif') {
-                    if (!config.enableContextGifHandling) {
-                        return false
-                    }
-
-                    const frames = await parseGifToFrames(imageData.buffer, {
-                        strategy: config.gifStrategy,
-                        frameCount: config.gifFrameCount
-                    })
-
-                    logger.debug(
-                        `Extracted ${frames.length} frames from GIF for model processing`
-                    )
-
-                    addTextToContent(
-                        fakeMessage,
-                        'This is a GIF image. See the frames below:'
-                    )
-                    for (const frame of frames) {
-                        addImageToContent(fakeMessage, frame)
-                    }
-                } else {
-                    addImageToContent(fakeMessage, imageData.base64Source)
-                }
-
-                const result = await processImageWithModel(
-                    imageUnderstandModel.value,
+                return describeAndInject(
+                    message,
+                    imageData,
+                    isGif,
                     config,
-                    fakeMessage
+                    imageUnderstandModel.value,
+                    url
                 )
+            },
+            100
+        )
+    )
+}
 
-                if (result) {
-                    addTextToContent(message, '\n\n' + result)
-                    return true
-                }
-            } catch (error) {
-                logger.warn(
-                    `Read image ${url} error, check your chat adapter`,
-                    error
-                )
-            }
-        },
-        100
+function modelAcceptsImage(ctx: Context, model: string | undefined): boolean {
+    if (!model) return false
+    return (
+        ctx.chatluna.platform
+            .findModel(model)
+            ?.value?.capabilities?.includes(ModelCapabilities.ImageInput) ===
+        true
     )
+}
 
-    ctx.effect(() => disposable)
+async function injectGifFrames(
+    message: Message,
+    buffer: Buffer,
+    config: Config
+): Promise<void> {
+    const frames = await parseGifToFrames(buffer, {
+        strategy: config.gifStrategy,
+        frameCount: config.gifFrameCount
+    })
+    logger.debug(`Extracted ${frames.length} frames from GIF`)
+    for (const frame of frames) addImageToContent(message, frame)
+}
+
+async function describeAndInject(
+    message: Message,
+    imageData: Awaited<ReturnType<typeof readImage>>,
+    isGif: boolean,
+    config: Config,
+    imageModel: ChatLunaChatModel | undefined,
+    url: string
+): Promise<boolean> {
+    if (
+        imageModel == null ||
+        !imageModel.modelInfo.capabilities.includes(
+            ModelCapabilities.ImageInput
+        )
+    ) {
+        logger.warn(
+            `Image-description model "${config.imageModel}" is missing or lacks image input — skip.`
+        )
+        return false
+    }
+
+    try {
+        const fake: Message = { content: [] }
+        if (isGif) {
+            addTextToContent(fake, 'This is a GIF image. See the frames below:')
+            await injectGifFrames(fake, imageData.buffer, config)
+        } else if (imageData.base64Source) {
+            addImageToContent(fake, imageData.base64Source)
+        }
+        const result = await processImageWithModel(imageModel, config, fake)
+        if (result) {
+            addTextToContent(message, '\n\n' + result)
+            return true
+        }
+    } catch (error) {
+        logger.warn(`Image describe failed for ${url}:`, error)
+    }
+    return false
 }
diff --git a/packages/service-multimodal/src/plugins/read_files.ts b/packages/service-multimodal/src/plugins/read_files.ts
index ea2e578df..1d40cd135 100644
--- a/packages/service-multimodal/src/plugins/read_files.ts
+++ b/packages/service-multimodal/src/plugins/read_files.ts
@@ -4,266 +4,35 @@ import { HumanMessage, MessageContentComplex } from '@langchain/core/messages'
 import { Context } from 'koishi'
 import { ComputedRef, Message } from 'koishi-plugin-chatluna'
 import { ChatLunaChatModel } from 'koishi-plugin-chatluna/llm-core/platform/model'
+import type { FileHandlingConfig } from 'koishi-plugin-chatluna/llm-core/platform/client'
 import {
     ChatLunaToolRunnable,
     ModelCapabilities
 } from 'koishi-plugin-chatluna/llm-core/platform/types'
 import { ChatLunaPlugin } from 'koishi-plugin-chatluna/services/chat'
-import {
-    isMessageContentAudio,
-    isMessageContentVideo,
-    type MessageContentAudio,
-    type MessageContentVideo
-} from 'koishi-plugin-chatluna/utils/langchain'
 import { getBase64EncodedSize } from 'koishi-plugin-chatluna/utils/base64'
 import { Config, logger } from '..'
 import {
     addImageToContent,
     addTextToContent,
+    BROWSER_UA,
+    convertAudioToMp3,
+    detectAudioMimeType,
+    IMAGE_MIME_TYPES,
+    inferMimeTypeFromUrl,
+    normalizeMimeType,
     parseGifToFrames,
     processImageWithModel
 } from '../utils'
 import z from 'zod'
 
-// ---------------------------------------------------------------------------
-// Constants
-// ---------------------------------------------------------------------------
-
-const IMAGE_MIME_TYPES = new Set([
-    'image/png',
-    'image/jpeg',
-    'image/bmp',
-    'image/webp',
-    'image/gif'
-])
-
 const DEFAULT_MAX_FILE_SIZE_BYTES = 100 * 1024 * 1024
 const DEFAULT_MAX_TOTAL_SIZE_BYTES = 100 * 1024 * 1024
 
-const FILE_EXTENSION_TO_MIME_TYPE = new Map<string, string>([
-    ['.png', 'image/png'],
-    ['.jpg', 'image/jpeg'],
-    ['.jpeg', 'image/jpeg'],
-    ['.bmp', 'image/bmp'],
-    ['.webp', 'image/webp'],
-    ['.gif', 'image/gif'],
-    ['.pdf', 'application/pdf'],
-    ['.txt', 'text/plain'],
-    ['.md', 'text/markdown'],
-    ['.html', 'text/html'],
-    ['.htm', 'text/html'],
-    ['.css', 'text/css'],
-    ['.xml', 'text/xml'],
-    ['.csv', 'text/csv'],
-    ['.rtf', 'text/rtf'],
-    ['.js', 'text/javascript'],
-    ['.mjs', 'text/javascript'],
-    ['.json', 'application/json'],
-    ['.mp4', 'video/mp4'],
-    ['.mpeg', 'video/mpeg'],
-    ['.mov', 'video/mov'],
-    ['.avi', 'video/avi'],
-    ['.flv', 'video/x-flv'],
-    ['.mpg', 'video/mpg'],
-    ['.webm', 'video/webm'],
-    ['.wmv', 'video/wmv'],
-    ['.3gp', 'video/3gpp'],
-    ['.3gpp', 'video/3gpp'],
-    ['.mp3', 'audio/mpeg'],
-    ['.aiff', 'audio/aiff'],
-    ['.aac', 'audio/aac'],
-    ['.flac', 'audio/flac'],
-    ['.wav', 'audio/wav'],
-    ['.ogg', 'audio/ogg'],
-    ['.m4a', 'audio/mp4']
-])
-
-// ---------------------------------------------------------------------------
-// Helpers
-// ---------------------------------------------------------------------------
-
-function isHttpOrHttpsUrl(url: string): boolean {
-    try {
-        const parsed = new URL(url)
-        return parsed.protocol === 'http:' || parsed.protocol === 'https:'
-    } catch {
-        return false
-    }
-}
-
-function normalizeMimeType(raw: string | null): string | null {
-    if (raw == null) return null
-    const mimeType = raw.split(';')[0]?.trim()?.toLowerCase()
-    return mimeType || null
-}
-
-function inferMimeTypeFromPath(path: string): string | null {
-    const sanitizedPath = path.toLowerCase().split(/[?#]/, 1)[0]
-    const fileName = sanitizedPath.split(/[/\\]/).pop() ?? sanitizedPath
-    const extensionIndex = fileName.lastIndexOf('.')
-
-    if (extensionIndex < 0) {
-        return null
-    }
-
-    const extension = fileName.slice(extensionIndex)
-    return FILE_EXTENSION_TO_MIME_TYPE.get(extension) ?? null
-}
-
-function inferMimeTypeFromUrl(url: string): string | null {
-    try {
-        const pathname = new URL(url).pathname
-        return inferMimeTypeFromPath(pathname)
-    } catch {
-        // ignore
-    }
-
-    return null
-}
-
-/**
- * Check whether the model natively supports a given MIME type based on its
- * capabilities and `FileHandlingConfig`.
- */
-function modelSupportsNativeMimeType(
-    model: ChatLunaChatModel,
+interface NativePart {
     mimeType: string
-): boolean {
-    const caps = model.modelInfo.capabilities
-
-    let capabilitySupportsMime = false
-    if (IMAGE_MIME_TYPES.has(mimeType)) {
-        capabilitySupportsMime = caps.includes(ModelCapabilities.ImageInput)
-    } else if (mimeType.startsWith('audio/')) {
-        capabilitySupportsMime = caps.includes(ModelCapabilities.AudioInput)
-    } else if (mimeType.startsWith('video/')) {
-        capabilitySupportsMime = caps.includes(ModelCapabilities.VideoInput)
-    } else if (
-        mimeType.startsWith('text/') ||
-        mimeType === 'application/json' ||
-        mimeType === 'application/pdf'
-    ) {
-        capabilitySupportsMime = caps.includes(ModelCapabilities.FileInput)
-    }
-
-    if (!capabilitySupportsMime) {
-        return false
-    }
-
-    const fileConfig = model.fileHandlingConfig
-    if (fileConfig != null) {
-        return fileConfig.supportedMimeTypes.has(mimeType)
-    }
-
-    return true
-}
-
-function isMimeTypeEnabled(config: Config, mimeType: string): boolean {
-    if (mimeType === 'image/gif') {
-        return config.enableGifReadTool
-    }
-
-    if (IMAGE_MIME_TYPES.has(mimeType)) {
-        return config.enableImageReadTool
-    }
-
-    return config.enableFileReadTool
-}
-
-function buildReadFilesDescription(config: Config): string {
-    const sections: string[] = []
-
-    if (config.enableImageReadTool) {
-        sections.push(
-            '- Image read/describe (non-GIF): image/bmp, image/jpeg, image/png, image/webp. If the model lacks native image input, fallback image description will be used.'
-        )
-    }
-
-    if (config.enableGifReadTool) {
-        sections.push(
-            '- GIF read/describe: image/gif. Native-capable models receive extracted frames; otherwise fallback image description is used.'
-        )
-    }
-
-    if (config.enableFileReadTool) {
-        sections.push(
-            '- File read: text/html, text/css, text/plain, text/markdown, text/xml, text/csv, text/rtf, text/javascript, application/json, application/pdf, audio/*, video/* (effective MIME set still depends on model capabilities and FileHandlingConfig).'
-        )
-    }
-
-    return `Read files from URL(s) and return their content.
-Enabled read_files capabilities:
-${sections.join('\n')}
-Use this tool when you need to read files from URL(s) as context.`
-}
-
-/**
- * Build a multimodal `HumanMessage` containing the file(s) as content parts,
- * suitable for injecting into the conversation context.
- */
-function buildMultimodalMessage(
-    parts: {
-        mimeType: string
-        base64Data: string
-        sourceUrl: string
-    }[],
-    insertPrompt: string
-): HumanMessage {
-    const content: MessageContentComplex[] = []
-
-    for (const part of parts) {
-        const { mimeType, base64Data } = part
-
-        if (IMAGE_MIME_TYPES.has(mimeType)) {
-            content.push({
-                type: 'image_url',
-                image_url: {
-                    url: `data:${mimeType};base64,${base64Data}`
-                }
-            })
-        } else if (mimeType.startsWith('audio/')) {
-            const audioContent: MessageContentAudio = {
-                type: 'audio_url',
-                audio_url: {
-                    url: `data:${mimeType};base64,${base64Data}`,
-                    mimeType
-                }
-            }
-
-            if (isMessageContentAudio(audioContent as MessageContentComplex)) {
-                content.push(audioContent as MessageContentComplex)
-            }
-        } else if (mimeType.startsWith('video/')) {
-            const videoContent: MessageContentVideo = {
-                type: 'video_url',
-                video_url: {
-                    url: `data:${mimeType};base64,${base64Data}`,
-                    mimeType
-                }
-            }
-
-            if (isMessageContentVideo(videoContent as MessageContentComplex)) {
-                content.push(videoContent as MessageContentComplex)
-            }
-        } else {
-            // Inline data for text/pdf/etc. (Gemini-style)
-            content.push({
-                inline_data: {
-                    mime_type: mimeType,
-                    data: base64Data
-                }
-            } as unknown as MessageContentComplex)
-        }
-    }
-
-    if (content.length > 0) {
-        content.unshift({
-            type: 'text',
-            text: insertPrompt
-        })
-    }
-
-    return new HumanMessage({ content })
+    base64Data: string
+    sourceUrl: string
 }
 
 // ---------------------------------------------------------------------------
@@ -272,28 +41,47 @@ function buildMultimodalMessage(
 
 export class ReadFilesTool extends StructuredTool {
     name = 'read_files'
-    description: string
-
     schema = z.object({
         files: z
-            .union([
-                z.object({
-                    url: z.string().url()
-                }),
-                z
-                    .array(
-                        z.object({
-                            url: z.string().url()
-                        })
-                    )
-                    .min(1)
-                    .max(10)
-            ])
+            .preprocess(
+                (arg: unknown) => {
+                    if (typeof arg === 'string') {
+                        const base = JSON.parse(arg)
+                        if (
+                            typeof base === 'object' &&
+                            typeof base['files'] === 'string'
+                        ) {
+                            try {
+                                base['files'] = JSON.parse(base['files'])
+                                return base
+                            } catch {
+                                return base
+                            }
+                        }
+                    }
+                    return arg
+                },
+                z.union([
+                    z.object({
+                        url: z.string().url()
+                    }),
+                    z
+                        .array(
+                            z.object({
+                                url: z.string().url()
+                            })
+                        )
+                        .min(1)
+                        .max(10)
+                ])
+            )
             .describe(
                 'One file or a list of files to read (max 10). File format: { url: string }. MIME type is inferred from response headers, then URL extension.'
             )
     })
 
+    description: string
+
     constructor(
         private readonly ctx: Context,
         private readonly config: Config,
@@ -302,7 +90,7 @@ export class ReadFilesTool extends StructuredTool {
         >
     ) {
         super({})
-        this.description = buildReadFilesDescription(config)
+        this.description = describeTool(config)
     }
 
     async _call(
@@ -314,366 +102,314 @@ export class ReadFilesTool extends StructuredTool {
         const model = runConfig?.configurable?.model
         const conversationId = runConfig?.configurable?.conversationId
         const fileConfig = model?.fileHandlingConfig
-
-        let totalBase64Bytes = 0
-        const maxTotalSize =
+        const maxTotal =
             fileConfig?.maxTotalSizeBytes ?? DEFAULT_MAX_TOTAL_SIZE_BYTES
 
-        const nativeParts: {
-            mimeType: string
-            base64Data: string
-            sourceUrl: string
-        }[] = []
-
-        const response: {
-            files: {
-                sourceUrl: string
-                mimeType?: string
-                status: 'ok' | 'described' | 'error'
-                description?: string
-                error?: string
-            }[]
-            successCount: number
-            failureCount: number
-        } = {
+        const native: NativePart[] = []
+        const report: ToolReport = {
             files: [],
             successCount: 0,
             failureCount: 0
         }
+        let totalBytes = 0
         let describedCount = 0
 
-        for (const file of files) {
-            const sourceUrl = file.url
-
-            const pushError = (errorMessage: string, mimeType?: string) => {
-                response.files.push({
+        for (const { url: sourceUrl } of files) {
+            if (!isHttp(sourceUrl)) {
+                pushError(
+                    report,
                     sourceUrl,
-                    mimeType,
-                    status: 'error',
-                    error: errorMessage
-                })
-                response.failureCount++
+                    'Only http/https URLs are supported.'
+                )
+                continue
             }
 
             try {
-                if (!isHttpOrHttpsUrl(sourceUrl)) {
-                    pushError(
-                        'Only http/https URLs are supported for read_files.'
-                    )
+                const fetched = await this._fetch(sourceUrl)
+                if (!fetched) {
+                    pushError(report, sourceUrl, 'Failed to fetch URL.')
                     continue
                 }
 
-                // Determine MIME type first by fetching with headers
-                const controller = new AbortController()
-                const timeout = setTimeout(() => controller.abort(), 60_000)
-                const httpResponse = await this.ctx
-                    .http(sourceUrl, {
-                        responseType: 'arraybuffer',
-                        method: 'get',
-                        headers: {
-                            'User-Agent':
-                                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
-                        },
-                        signal: controller.signal
-                    })
-                    .finally(() => {
-                        clearTimeout(timeout)
-                    })
-
-                const buffer = Buffer.from(httpResponse.data)
-
-                // Resolve MIME type from response headers or URL
-                const headers = httpResponse.headers as unknown as
-                    | Record<string, unknown>
-                    | undefined
-                const rawCt =
-                    headers?.['content-type'] ?? headers?.['Content-Type']
-                let responseMimeType: string | null = null
-                if (typeof rawCt === 'string') {
-                    responseMimeType = normalizeMimeType(rawCt)
-                } else if (
-                    Array.isArray(rawCt) &&
-                    typeof rawCt[0] === 'string'
-                ) {
-                    responseMimeType = normalizeMimeType(rawCt[0])
-                }
-
-                const mimeType =
-                    responseMimeType ?? inferMimeTypeFromUrl(sourceUrl)
+                const declared =
+                    normalizeMimeType(fetched.contentType) ??
+                    inferMimeTypeFromUrl(sourceUrl)
+                const detectedAudio = detectAudioMimeType(
+                    fetched.buffer,
+                    declared
+                )
+                const mime = detectedAudio ?? declared
 
-                if (!mimeType) {
+                if (!mime) {
                     pushError(
-                        `Could not determine MIME type for ${sourceUrl}. Please ensure the URL returns a valid content type.`
+                        report,
+                        sourceUrl,
+                        `Could not determine MIME type for ${sourceUrl}.`
                     )
                     continue
                 }
-
-                if (!isMimeTypeEnabled(this.config, mimeType)) {
+                if (!mimeEnabled(this.config, mime)) {
                     pushError(
-                        `Feature disabled for MIME type "${mimeType}". Please enable the corresponding read_files switch.`,
-                        mimeType
+                        report,
+                        sourceUrl,
+                        `Feature disabled for MIME type "${mime}".`,
+                        mime
                     )
                     continue
                 }
 
-                // Check if the model supports this MIME type natively
-                const isImage = IMAGE_MIME_TYPES.has(mimeType)
-                const modelSupports =
-                    model != null &&
-                    modelSupportsNativeMimeType(model, mimeType)
-
-                if (modelSupports && !isImage) {
-                    // Non-image file that the model supports natively -> inline inject
-                    const maxFileSize =
-                        fileConfig?.maxFileSizeBytesOverrides?.[mimeType] ??
-                        fileConfig?.maxFileSizeBytes ??
-                        DEFAULT_MAX_FILE_SIZE_BYTES
-
-                    const encodedSize = getBase64EncodedSize(buffer.byteLength)
-
-                    if (encodedSize > maxFileSize) {
-                        pushError(
-                            `File too large (${encodedSize} bytes after base64), max ${maxFileSize} bytes for ${mimeType}`,
-                            mimeType
+                const isImage = IMAGE_MIME_TYPES.has(mime)
+                const isAudio = mime.startsWith('audio/')
+                const supportsNative =
+                    model != null && modelSupportsMime(model, mime)
+
+                // ----- Non-image native: maybe transcode audio, then inline ---
+                if (!isImage && supportsNative) {
+                    let bytes = fetched.buffer
+                    let outMime = mime
+                    if (
+                        isAudio &&
+                        fileConfig?.supportedMimeTypes &&
+                        !fileConfig.supportedMimeTypes.has(mime)
+                    ) {
+                        const converted = await convertAudioToMp3(
+                            this.ctx,
+                            bytes
                         )
-                        continue
+                        if (!converted) {
+                            pushError(
+                                report,
+                                sourceUrl,
+                                `Unsupported audio MIME "${mime}" and ffmpeg conversion failed.`,
+                                mime
+                            )
+                            continue
+                        }
+                        bytes = converted
+                        outMime = 'audio/mpeg'
                     }
 
-                    if (totalBase64Bytes + encodedSize > maxTotalSize) {
-                        pushError(
-                            `Total inline upload size too large (${totalBase64Bytes + encodedSize} bytes), max ${maxTotalSize} bytes per request`,
-                            mimeType
-                        )
+                    const sizeError = checkSize(
+                        bytes,
+                        outMime,
+                        fileConfig,
+                        totalBytes,
+                        maxTotal
+                    )
+                    if (sizeError) {
+                        pushError(report, sourceUrl, sizeError, outMime)
                         continue
                     }
-
-                    totalBase64Bytes += encodedSize
-                    nativeParts.push({
-                        mimeType,
-                        base64Data: buffer.toString('base64'),
-                        sourceUrl
-                    })
-
-                    response.files.push({
+                    totalBytes += getBase64EncodedSize(bytes.byteLength)
+                    pushNative(
+                        report,
+                        native,
                         sourceUrl,
-                        mimeType,
-                        status: 'ok'
-                    })
-                    response.successCount++
-                } else if (isImage && modelSupports) {
-                    // Image that the model supports natively -> inject directly
-                    // Unified per-file size check before any branching
-                    const maxFileSize =
-                        fileConfig?.maxFileSizeBytesOverrides?.[mimeType] ??
-                        fileConfig?.maxFileSizeBytes ??
-                        DEFAULT_MAX_FILE_SIZE_BYTES
-
-                    const encodedSize = getBase64EncodedSize(buffer.byteLength)
-
-                    if (encodedSize > maxFileSize) {
-                        pushError(
-                            `File too large (${encodedSize} bytes after base64, raw ${buffer.byteLength} bytes), max ${maxFileSize} bytes for ${mimeType}`,
-                            mimeType
-                        )
+                        outMime,
+                        bytes.toString('base64')
+                    )
+                    continue
+                }
+
+                // ----- Image native: inject directly (GIF splits to frames) ---
+                if (isImage && supportsNative) {
+                    const sizeError = checkSize(
+                        fetched.buffer,
+                        mime,
+                        fileConfig,
+                        totalBytes,
+                        maxTotal
+                    )
+                    if (sizeError) {
+                        pushError(report, sourceUrl, sizeError, mime)
                         continue
                     }
 
-                    // For GIF: split into frames
-                    if (mimeType === 'image/gif') {
-                        const frames = await parseGifToFrames(buffer, {
+                    if (mime === 'image/gif') {
+                        let pushed = 0
+                        const frames = await parseGifToFrames(fetched.buffer, {
                             strategy: this.config.gifStrategy,
                             frameCount: this.config.gifFrameCount
                         })
-
-                        logger.debug(
-                            `Extracted ${frames.length} frames from GIF for native model injection`
-                        )
-
                         for (const frame of frames) {
-                            // Frames are data:image/png;base64,... strings
                             const frameBase64 = frame.split(',')[1]
-                            const frameSize = getBase64EncodedSize(
-                                Buffer.from(frameBase64, 'base64').byteLength
+                            const buf = Buffer.from(frameBase64, 'base64')
+                            const sizeError = checkSize(
+                                buf,
+                                'image/png',
+                                fileConfig,
+                                totalBytes,
+                                maxTotal
                             )
-
-                            if (totalBase64Bytes + frameSize > maxTotalSize) {
+                            if (sizeError) {
+                                if (pushed < 1) {
+                                    pushError(
+                                        report,
+                                        sourceUrl,
+                                        sizeError,
+                                        'image/png'
+                                    )
+                                }
                                 logger.warn(
                                     'Skipping remaining GIF frames due to total size limit'
                                 )
                                 break
                             }
-
-                            totalBase64Bytes += frameSize
-                            nativeParts.push({
-                                mimeType: 'image/png',
-                                base64Data: frameBase64,
-                                sourceUrl
-                            })
-                        }
-                    } else {
-                        if (totalBase64Bytes + encodedSize > maxTotalSize) {
-                            pushError(
-                                `Total inline upload size too large (${totalBase64Bytes + encodedSize} bytes), max ${maxTotalSize} bytes per request`,
-                                mimeType
+                            totalBytes += getBase64EncodedSize(buf.byteLength)
+                            pushNative(
+                                report,
+                                native,
+                                sourceUrl,
+                                'image/png',
+                                frameBase64
                             )
-                            continue
+                            pushed++
                         }
-
-                        totalBase64Bytes += encodedSize
-                        nativeParts.push({
-                            mimeType,
-                            base64Data: buffer.toString('base64'),
-                            sourceUrl
-                        })
-                    }
-
-                    response.files.push({
-                        sourceUrl,
-                        mimeType,
-                        status: 'ok'
-                    })
-                    response.successCount++
-                } else if (isImage) {
-                    // Image but model doesn't support it natively -> describe using image model
-                    const maxFileSize =
-                        fileConfig?.maxFileSizeBytesOverrides?.[mimeType] ??
-                        fileConfig?.maxFileSizeBytes ??
-                        DEFAULT_MAX_FILE_SIZE_BYTES
-
-                    const encodedSize = getBase64EncodedSize(buffer.byteLength)
-
-                    if (encodedSize > maxFileSize) {
-                        pushError(
-                            `File too large (${encodedSize} bytes after base64, raw ${buffer.byteLength} bytes), max ${maxFileSize} bytes for ${mimeType}`,
-                            mimeType
+                    } else {
+                        totalBytes += getBase64EncodedSize(
+                            fetched.buffer.byteLength
+                        )
+                        pushNative(
+                            report,
+                            native,
+                            sourceUrl,
+                            mime,
+                            fetched.buffer.toString('base64')
                         )
-                        continue
                     }
+                    continue
+                }
 
-                    const describeResult = await this._describeImageWithModel(
+                // ----- Image without native support: describe via vision model -
+                if (isImage) {
+                    const described = await this._describeImage(
                         sourceUrl,
-                        buffer,
-                        mimeType
+                        fetched.buffer,
+                        mime
                     )
-
-                    if (describeResult) {
-                        response.files.push({
+                    if (described) {
+                        report.files.push({
                             sourceUrl,
-                            mimeType,
+                            mimeType: mime,
                             status: 'described',
-                            description: describeResult
+                            description: described
                         })
-                        response.successCount++
+                        report.successCount++
                         describedCount++
                     } else {
                         pushError(
-                            `Failed to describe image from ${sourceUrl}`,
-                            mimeType
+                            report,
+                            sourceUrl,
+                            'Failed to describe image.',
+                            mime
                         )
-                        continue
                     }
-                } else {
-                    // Non-image, model doesn't support it natively
-                    pushError(
-                        `Unsupported MIME type "${mimeType}" for the current model. The model does not natively support this file type.`,
-                        mimeType
-                    )
                     continue
                 }
+
+                pushError(
+                    report,
+                    sourceUrl,
+                    `Unsupported MIME "${mime}" for the current model.`,
+                    mime
+                )
             } catch (error) {
                 logger.warn(`read_files error for ${sourceUrl}:`, error)
-                const errorMessage =
+                pushError(
+                    report,
+                    sourceUrl,
                     error instanceof Error ? error.message : String(error)
-                pushError(errorMessage)
+                )
             }
         }
 
-        // Inject native parts into next-round context via contextManager
-        if (nativeParts.length > 0 && conversationId) {
-            const message = buildMultimodalMessage(
-                nativeParts,
-                this.config.fileInsertPrompt
-            )
-
+        const injected = native.length > 0 && !!conversationId
+        if (native.length > 0 && conversationId) {
             this.ctx.chatluna.contextManager.inject({
                 conversationId,
                 name: 'read_files_context',
-                value: message,
+                value: buildMultimodalMessage(
+                    native,
+                    this.config.fileInsertPrompt
+                ),
                 once: true,
                 stage: 'after_scratchpad'
             })
-
             logger.debug(
-                `Injected ${nativeParts.length} file part(s) into context for conversation ${conversationId}`
+                `Injected ${native.length} file part(s) into context for conversation ${conversationId}`
             )
         }
 
         return JSON.stringify({
-            response,
-            note:
-                nativeParts.length > 0
-                    ? `Successfully read ${nativeParts.length} file(s). The file content has been added to the conversation context and will be available in the next turn.`
-                    : describedCount > 0
-                      ? `Described ${describedCount} image file(s) using the vision model.`
-                      : response.failureCount > 0
-                        ? `Failed to read ${response.failureCount} file(s).`
-                        : 'No files were processed.'
+            response: report,
+            note: injected
+                ? `Successfully read ${native.length} file(s). The file content has been added to the conversation context and will be available in the next turn.`
+                : native.length > 0
+                  ? `Successfully read ${native.length} file(s), but no conversation id was available, so the file content was not added to the conversation context.`
+                  : describedCount > 0
+                    ? `Described ${describedCount} image file(s) using the vision model.`
+                    : report.failureCount > 0
+                      ? `Failed to read ${report.failureCount} file(s).`
+                      : 'No files were processed.'
         })
     }
 
-    /**
-     * Describe an image using the configured image model (fallback when the
-     * main model doesn't support image input).
-     */
-    private async _describeImageWithModel(
+    private async _fetch(
+        url: string
+    ): Promise<{ buffer: Buffer; contentType: string | null } | null> {
+        try {
+            const response = await this.ctx.http(url, {
+                responseType: 'arraybuffer',
+                method: 'get',
+                headers: { 'User-Agent': BROWSER_UA },
+                timeout: 60_000
+            })
+            return {
+                buffer: Buffer.from(response.data),
+                contentType: getHeaderValue(response.headers, 'content-type')
+            }
+        } catch {
+            return null
+        }
+    }
+
+    private async _describeImage(
         url: string,
         buffer: Buffer,
         mimeType: string
     ): Promise<string | null> {
         const imageModel = this.imageModelRef().value
-        if (imageModel == null) {
-            logger.warn(
-                'Image model is not loaded, cannot describe image. Please check your chat adapter.'
-            )
-            return null
-        }
-
         if (
+            !imageModel ||
             !imageModel.modelInfo.capabilities.includes(
                 ModelCapabilities.ImageInput
             )
         ) {
-            logger.warn('Image model does not support image input.')
+            logger.warn(
+                'Image model not loaded or lacks image input; cannot describe.'
+            )
             return null
         }
 
         try {
-            const fakeMessage: Message = { content: [] }
-
+            const fake: Message = { content: [] }
             if (mimeType === 'image/gif') {
                 const frames = await parseGifToFrames(buffer, {
                     strategy: this.config.gifStrategy,
                     frameCount: this.config.gifFrameCount
                 })
-
                 addTextToContent(
-                    fakeMessage,
+                    fake,
                     'This is a GIF image. See the frames below:'
                 )
-                for (const frame of frames) {
-                    addImageToContent(fakeMessage, frame)
-                }
+                for (const frame of frames) addImageToContent(fake, frame)
             } else {
-                const base64 = buffer.toString('base64')
-                const base64Source = `data:${mimeType};base64,${base64}`
-                addImageToContent(fakeMessage, base64Source)
+                addImageToContent(
+                    fake,
+                    `data:${mimeType};base64,${buffer.toString('base64')}`
+                )
             }
-
-            return await processImageWithModel(
-                imageModel,
-                this.config,
-                fakeMessage
-            )
+            return await processImageWithModel(imageModel, this.config, fake)
         } catch (error) {
             logger.warn(`Describe image ${url} error:`, error)
             return null
@@ -681,6 +417,169 @@ export class ReadFilesTool extends StructuredTool {
     }
 }
 
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+interface ToolReport {
+    files: {
+        sourceUrl: string
+        mimeType?: string
+        status: 'ok' | 'described' | 'error'
+        description?: string
+        error?: string
+    }[]
+    successCount: number
+    failureCount: number
+}
+
+function pushError(
+    report: ToolReport,
+    sourceUrl: string,
+    error: string,
+    mimeType?: string
+) {
+    report.files.push({ sourceUrl, mimeType, status: 'error', error })
+    report.failureCount++
+}
+
+function pushNative(
+    report: ToolReport,
+    native: NativePart[],
+    sourceUrl: string,
+    mimeType: string,
+    base64Data: string
+) {
+    native.push({ sourceUrl, mimeType, base64Data })
+    report.files.push({ sourceUrl, mimeType, status: 'ok' })
+    report.successCount++
+}
+
+function getHeaderValue(headers: unknown, name: string): string | null {
+    if (headers == null) return null
+
+    if (typeof (headers as { get?: unknown }).get === 'function') {
+        const value = (headers as { get(name: string): string | null }).get(
+            name
+        )
+        return typeof value === 'string' ? value : null
+    }
+
+    const record = headers as Record<string, unknown>
+    const lower = name.toLowerCase()
+    for (const key of Object.keys(record)) {
+        if (key.toLowerCase() === lower) {
+            const value = record[key]
+            return typeof value === 'string' ? value : null
+        }
+    }
+    return null
+}
+
+function isHttp(url: string): boolean {
+    try {
+        const { protocol } = new URL(url)
+        return protocol === 'http:' || protocol === 'https:'
+    } catch {
+        return false
+    }
+}
+
+function modelSupportsMime(model: ChatLunaChatModel, mime: string): boolean {
+    const caps = model.modelInfo.capabilities
+    const isImage = IMAGE_MIME_TYPES.has(mime)
+    const capOk = isImage
+        ? caps.includes(ModelCapabilities.ImageInput)
+        : mime.startsWith('audio/')
+          ? caps.includes(ModelCapabilities.AudioInput)
+          : mime.startsWith('video/')
+            ? caps.includes(ModelCapabilities.VideoInput)
+            : caps.includes(ModelCapabilities.FileInput)
+    if (!capOk) return false
+    const file = model.fileHandlingConfig
+    return file == null || file.supportedMimeTypes.has(mime)
+}
+
+function mimeEnabled(config: Config, mime: string): boolean {
+    if (mime === 'image/gif') return config.enableGifReadTool
+    if (IMAGE_MIME_TYPES.has(mime)) return config.enableImageReadTool
+    return config.enableFileReadTool
+}
+
+function checkSize(
+    buffer: Buffer,
+    mime: string,
+    fileConfig: FileHandlingConfig | undefined,
+    totalBytes: number,
+    maxTotal: number
+): string | null {
+    const max =
+        fileConfig?.maxFileSizeBytesOverrides?.[mime] ??
+        fileConfig?.maxFileSizeBytes ??
+        DEFAULT_MAX_FILE_SIZE_BYTES
+    const encoded = getBase64EncodedSize(buffer.byteLength)
+    if (encoded > max) {
+        return `File too large (${encoded} bytes after base64, raw ${buffer.byteLength} bytes), max ${max} bytes for ${mime}.`
+    }
+    if (totalBytes + encoded > maxTotal) {
+        return `Total inline upload size too large (${totalBytes + encoded} bytes), max ${maxTotal} bytes per request.`
+    }
+    return null
+}
+
+function buildMultimodalMessage(
+    parts: NativePart[],
+    prompt: string
+): HumanMessage {
+    const content: MessageContentComplex[] = []
+    for (const { mimeType, base64Data } of parts) {
+        const dataUrl = `data:${mimeType};base64,${base64Data}`
+        if (IMAGE_MIME_TYPES.has(mimeType)) {
+            content.push({ type: 'image_url', image_url: { url: dataUrl } })
+        } else if (mimeType.startsWith('audio/')) {
+            content.push({
+                type: 'audio_url',
+                audio_url: { url: dataUrl, mimeType }
+            } as unknown as MessageContentComplex)
+        } else if (mimeType.startsWith('video/')) {
+            content.push({
+                type: 'video_url',
+                video_url: { url: dataUrl, mimeType }
+            } as unknown as MessageContentComplex)
+        } else {
+            // Inline data for text/pdf/etc. (Gemini-style)
+            content.push({
+                inline_data: { mime_type: mimeType, data: base64Data }
+            } as unknown as MessageContentComplex)
+        }
+    }
+    if (content.length > 0) content.unshift({ type: 'text', text: prompt })
+    return new HumanMessage({ content })
+}
+
+function describeTool(config: Config): string {
+    const sections: string[] = []
+    if (config.enableImageReadTool) {
+        sections.push(
+            '- Image read/describe (non-GIF): image/bmp, image/jpeg, image/png, image/webp. If the model lacks native image input, fallback image description will be used.'
+        )
+    }
+    if (config.enableGifReadTool) {
+        sections.push(
+            '- GIF read/describe: image/gif. Native-capable models receive extracted frames; otherwise fallback image description is used.'
+        )
+    }
+    if (config.enableFileReadTool) {
+        sections.push(
+            '- File read: text/html, text/css, text/plain, text/markdown, text/xml, text/csv, text/rtf, text/javascript, application/json, application/pdf, audio/*, video/* (effective MIME set still depends on model capabilities and FileHandlingConfig).'
+        )
+    }
+    return `Read files from URL(s) and return their content.
+Enabled read_files capabilities:
+${sections.join('\n')}
+Use this tool when you need to read files from URL(s) as context.`
+}
+
 // ---------------------------------------------------------------------------
 // Plugin registration
 // ---------------------------------------------------------------------------
diff --git a/packages/service-multimodal/src/utils.ts b/packages/service-multimodal/src/utils.ts
index bfb0532d0..55ecfdbb4 100644
--- a/packages/service-multimodal/src/utils.ts
+++ b/packages/service-multimodal/src/utils.ts
@@ -1,6 +1,5 @@
 import {
     HumanMessage,
-    MessageContent,
     MessageContentComplex,
     MessageContentText
 } from '@langchain/core/messages'
@@ -12,195 +11,314 @@ import {
     isMessageContentImageUrl
 } from 'koishi-plugin-chatluna/utils/string'
 import { Context } from 'koishi'
+import type {} from 'koishi-plugin-ffmpeg-path'
 import { Config, logger } from '.'
 import { GifReader } from 'omggif'
 import { Jimp } from 'jimp'
 
-export interface GifExtractionConfig {
-    strategy: 'first' | 'head' | 'average'
-    frameCount: number
+// ---------------------------------------------------------------------------
+// MIME helpers
+// ---------------------------------------------------------------------------
+
+export const IMAGE_MIME_TYPES = new Set([
+    'image/png',
+    'image/jpeg',
+    'image/bmp',
+    'image/webp',
+    'image/gif'
+])
+
+const FILE_EXTENSION_TO_MIME_TYPE: Record<string, string> = {
+    '.png': 'image/png',
+    '.jpg': 'image/jpeg',
+    '.jpeg': 'image/jpeg',
+    '.bmp': 'image/bmp',
+    '.webp': 'image/webp',
+    '.gif': 'image/gif',
+    '.pdf': 'application/pdf',
+    '.txt': 'text/plain',
+    '.md': 'text/markdown',
+    '.html': 'text/html',
+    '.htm': 'text/html',
+    '.css': 'text/css',
+    '.xml': 'text/xml',
+    '.csv': 'text/csv',
+    '.rtf': 'text/rtf',
+    '.js': 'text/javascript',
+    '.mjs': 'text/javascript',
+    '.json': 'application/json',
+    '.mp4': 'video/mp4',
+    '.mpeg': 'video/mpeg',
+    '.mov': 'video/mov',
+    '.avi': 'video/avi',
+    '.flv': 'video/x-flv',
+    '.webm': 'video/webm',
+    '.wmv': 'video/wmv',
+    '.3gp': 'video/3gpp',
+    '.3gpp': 'video/3gpp',
+    '.mp3': 'audio/mpeg',
+    '.aiff': 'audio/aiff',
+    '.aac': 'audio/aac',
+    '.flac': 'audio/flac',
+    '.wav': 'audio/wav',
+    '.ogg': 'audio/ogg',
+    '.m4a': 'audio/mp4'
 }
 
-/**
- * Check if any frame in the range [start, end) has complex disposal methods
- * that require resetting the canvas (disposal method 2 or 3)
- */
-function hasComplexDisposal(
-    reader: GifReader,
-    start: number,
-    end: number
-): boolean {
-    for (let i = start; i < end; i++) {
-        const disposal = reader.frameInfo(i).disposal
-        // disposal 2: restore to background color
-        // disposal 3: restore to previous (before current frame was drawn)
-        if (disposal === 2 || disposal === 3) {
-            return true
-        }
+export function inferMimeTypeFromUrl(url: string): string | null {
+    try {
+        const path = new URL(url).pathname.toLowerCase()
+        const dot = path.lastIndexOf('.')
+        return dot < 0
+            ? null
+            : (FILE_EXTENSION_TO_MIME_TYPE[path.slice(dot)] ?? null)
+    } catch {
+        return null
     }
-    return false
 }
 
-export async function extractGifFrames(
+export function normalizeMimeType(
+    raw: string | null | undefined
+): string | null {
+    return raw?.split(';')[0]?.trim()?.toLowerCase() || null
+}
+
+/**
+ * Detect the audio MIME type from the buffer's leading bytes: AMR, SILK (plain
+ * or QQ-prefixed), MP3, WAV, FLAC and Ogg. Otherwise returns `declared` or null.
+ */
+export function detectAudioMimeType(
     buffer: Buffer,
-    config: GifExtractionConfig
-): Promise<Buffer[]> {
-    try {
-        const reader = new GifReader(buffer)
-        const totalFrames = reader.numFrames()
+    declared?: string | null
+): string | null {
+    const head = buffer.subarray(0, 16).toString('latin1')
+
+    if (head.startsWith('#!AMR')) return 'audio/amr'
+    // QQ/OneBot ships SILK voice files with a leading flag byte before the
+    // standard `#!SILK_V3` magic, so we also check offset 1 for that variant.
+    if (
+        head.startsWith('#!SILK_V3') ||
+        buffer.subarray(1, 10).toString('latin1') === '#!SILK_V3'
+    ) {
+        return 'audio/silk'
+    }
+    // MPEG frame sync = 11 set bits (0xFFE0 mask): rejects JPEG 0xFFD8, still matches ADTS AAC.
+    if (
+        head.startsWith('ID3') ||
+        (buffer[0] === 0xff && (buffer[1] & 0xe0) === 0xe0)
+    ) {
+        return 'audio/mpeg'
+    }
+    if (
+        head.startsWith('RIFF') &&
+        buffer.subarray(8, 12).toString('latin1') === 'WAVE'
+    ) {
+        return 'audio/wav'
+    }
+    if (head.startsWith('fLaC')) return 'audio/flac'
+    if (head.startsWith('OggS')) return 'audio/ogg'
 
-        if (totalFrames === 0) {
-            throw new Error('No frames found in GIF')
-        }
+    return declared ?? null
+}
 
-        const width = reader.width
-        const height = reader.height
+// ---------------------------------------------------------------------------
+// FFmpeg / Silk
+// ---------------------------------------------------------------------------
+
+export async function convertAudioToMp3(
+    ctx: Context,
+    buffer: Buffer
+): Promise<Buffer | null> {
+    if (!ctx.ffmpeg) {
+        logger.warn(
+            'FFmpeg service unavailable; install koishi-plugin-ffmpeg-path to enable audio transcoding.'
+        )
+        return null
+    }
 
-        let frameIndices: number[] = []
+    try {
+        // Match both the standard SILK magic and the QQ/OneBot variant that
+        // prepends a flag byte before `#!SILK_V3`.
+        const isSilk =
+            buffer.subarray(0, 9).toString('latin1') === '#!SILK_V3' ||
+            buffer.subarray(1, 10).toString('latin1') === '#!SILK_V3'
+
+        let source = buffer
+        let silkSampleRate: number | null = null
+        if (isSilk) {
+            const decoded = await decodeSilkToPcm(ctx, buffer)
+            if (!decoded) return null
+            source = decoded.buffer
+            silkSampleRate = decoded.sampleRate
+        }
 
-        switch (config.strategy) {
-            case 'first':
-                frameIndices = [0]
-                break
+        const builder = ctx.ffmpeg.builder().input(source)
+        if (silkSampleRate != null) {
+            builder.inputOption(
+                '-f',
+                's16le',
+                '-ar',
+                String(silkSampleRate),
+                '-ac',
+                '1'
+            )
+        }
+        return await builder
+            .outputOption(
+                '-vn',
+                '-acodec',
+                'libmp3lame',
+                '-q:a',
+                '4',
+                '-f',
+                'mp3'
+            )
+            .run('buffer')
+    } catch (error) {
+        logger.warn(`Audio transcoding to mp3 failed:`, error)
+        return null
+    }
+}
 
-            case 'head': {
-                const count = Math.min(config.frameCount, totalFrames)
-                frameIndices = Array.from({ length: count }, (_, i) => i)
-                break
+async function decodeSilkToPcm(
+    ctx: Context,
+    buffer: Buffer
+): Promise<{ buffer: Buffer; sampleRate: number } | null> {
+    if (!ctx.silk) {
+        logger.warn(
+            'Silk service unavailable; install koishi-plugin-ffmpeg-path 2.0+ for silk decoding.'
+        )
+        return null
+    }
+    for (const sampleRate of [24000, 16000, 12000, 8000]) {
+        try {
+            const result = (await ctx.silk.decode(buffer, sampleRate)) as {
+                data?: Uint8Array
             }
-
-            case 'average': {
-                const count = Math.min(config.frameCount, totalFrames)
-                if (count >= totalFrames) {
-                    frameIndices = Array.from(
-                        { length: totalFrames },
-                        (_, i) => i
-                    )
-                } else if (count === 1) {
-                    // Special case: single frame, pick the first one
-                    frameIndices = [0]
-                } else {
-                    // Use span (totalFrames - 1) to ensure first and last frames are included
-                    const step = (totalFrames - 1) / (count - 1)
-                    frameIndices = Array.from({ length: count }, (_, i) =>
-                        Math.floor(i * step)
-                    )
-                }
-                break
+            if (result?.data != null) {
+                return { buffer: Buffer.from(result.data), sampleRate }
             }
+        } catch {
+            // try next sample rate
         }
+    }
+    return null
+}
 
-        const frameBuffers: Buffer[] = []
-
-        // Build canvas incrementally, only decoding frames we need
-        const canvas = new Uint8ClampedArray(width * height * 4)
-        let lastDecodedFrame = -1
+// ---------------------------------------------------------------------------
+// GIF
+// ---------------------------------------------------------------------------
 
-        for (const frameIndex of frameIndices) {
-            // Check if we need to restart decoding from frame 0
-            // This happens when:
-            // 1. Jumping backwards in frame sequence
-            // 2. Any frames between lastDecodedFrame and current have complex disposal methods
-            //    (disposal 2 or 3) which affect how the canvas should be prepared
-            const needsFullDecode =
-                frameIndex < lastDecodedFrame ||
-                (lastDecodedFrame >= 0 &&
-                    hasComplexDisposal(reader, lastDecodedFrame, frameIndex))
+export interface GifExtractionConfig {
+    strategy: 'first' | 'head' | 'average'
+    frameCount: number
+}
 
-            if (needsFullDecode) {
-                canvas.fill(0) // Clear canvas
-                // Decode from frame 0 to current frame
-                for (let i = 0; i <= frameIndex; i++) {
-                    reader.decodeAndBlitFrameRGBA(i, canvas)
-                }
-            } else {
-                // Disposal method 0 (no disposal) or 1 (do not dispose)
-                // Just decode from last position to current frame
-                for (let i = lastDecodedFrame + 1; i <= frameIndex; i++) {
-                    reader.decodeAndBlitFrameRGBA(i, canvas)
-                }
+export async function parseGifToFrames(
+    buffer: Buffer,
+    config: GifExtractionConfig
+): Promise<string[]> {
+    const reader = new GifReader(buffer)
+    const total = reader.numFrames()
+    if (total === 0) throw new Error('No frames found in GIF')
+
+    const indices = pickGifFrameIndices(total, config)
+    const { width, height } = reader
+    const canvas = new Uint8ClampedArray(width * height * 4)
+    let lastDecoded = -1
+    const frames: string[] = []
+
+    for (const idx of indices) {
+        const needsFullDecode =
+            idx < lastDecoded ||
+            (lastDecoded >= 0 && hasComplexDisposal(reader, lastDecoded, idx))
+        if (needsFullDecode) {
+            canvas.fill(0)
+            for (let i = 0; i <= idx; i++)
+                reader.decodeAndBlitFrameRGBA(i, canvas)
+        } else {
+            for (let i = lastDecoded + 1; i <= idx; i++) {
+                reader.decodeAndBlitFrameRGBA(i, canvas)
             }
-
-            lastDecodedFrame = frameIndex
-
-            // Copy canvas to avoid reference issues
-            const frameData = new Uint8ClampedArray(canvas)
-            const image = new Jimp({
-                data: Buffer.from(frameData),
-                width,
-                height
-            })
-
-            const pngBuffer = await image.getBuffer('image/png')
-            frameBuffers.push(pngBuffer)
         }
-
-        return frameBuffers
-    } catch (error) {
-        logger.error('Failed to extract GIF frames:', error)
-        throw error
+        lastDecoded = idx
+
+        const png = await new Jimp({
+            data: Buffer.from(new Uint8ClampedArray(canvas)),
+            width,
+            height
+        }).getBuffer('image/png')
+        frames.push(`data:image/png;base64,${png.toString('base64')}`)
     }
+    return frames
 }
 
-export async function parseGifToFrames(
-    buffer: Buffer,
+function pickGifFrameIndices(
+    total: number,
     config: GifExtractionConfig
-): Promise<string[]> {
-    const frameBuffers = await extractGifFrames(buffer, config)
-    return frameBuffers.map((frameBuffer) => {
-        const base64 = frameBuffer.toString('base64')
-        return `data:image/png;base64,${base64}`
-    })
+): number[] {
+    if (config.strategy === 'first') return [0]
+    const count = Math.min(config.frameCount, total)
+    if (config.strategy === 'head') {
+        return Array.from({ length: count }, (_, i) => i)
+    }
+    // average
+    if (count >= total) return Array.from({ length: total }, (_, i) => i)
+    if (count === 1) return [0]
+    const step = (total - 1) / (count - 1)
+    return Array.from({ length: count }, (_, i) => Math.floor(i * step))
+}
+
+function hasComplexDisposal(
+    reader: GifReader,
+    start: number,
+    end: number
+): boolean {
+    for (let i = start; i < end; i++) {
+        const d = reader.frameInfo(i).disposal
+        if (d === 2 || d === 3) return true
+    }
+    return false
 }
 
+// ---------------------------------------------------------------------------
+// Image
+// ---------------------------------------------------------------------------
+
 export async function readImage(ctx: Context, url: string) {
     if (url.startsWith('data:image') && url.includes('base64')) {
         const buffer = Buffer.from(url.split(',')[1], 'base64')
-        const ext = getImageType(buffer)
-
-        return {
-            base64Source: url,
-            buffer,
-            ext
-        }
+        return { base64Source: url, buffer, ext: getImageType(buffer) }
     }
-
     try {
-        const response = await ctx.http(url, {
+        const { data } = await ctx.http(url, {
             responseType: 'arraybuffer',
             method: 'get',
-            headers: {
-                'User-Agent':
-                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
-            }
+            headers: { 'User-Agent': BROWSER_UA }
         })
-
-        const buffer = Buffer.from(response.data)
-
-        const base64 = buffer.toString('base64')
-
+        const buffer = Buffer.from(data)
         const ext = getImageType(buffer)
-
         return {
-            base64Source: `data:${ext};base64,${base64}`,
+            base64Source: `data:${ext};base64,${buffer.toString('base64')}`,
             buffer,
             ext
         }
     } catch (error) {
         logger.error(`Failed to read image from ${url}:`, error)
-        return {
-            base64Source: null,
-            buffer: null,
-            ext: null
-        }
+        return { base64Source: null, buffer: null, ext: null }
     }
 }
+
 export async function processImageWithModel(
     model: ChatLunaChatModel,
     config: Config,
     message: Message
-) {
-    const images = extractImages(message.content)
+): Promise<string | null> {
+    const images = Array.isArray(message.content)
+        ? message.content.filter((item: MessageContentComplex) =>
+              isMessageContentImageUrl(item)
+          )
+        : []
     if (images.length === 0) return null
 
     try {
@@ -208,9 +326,7 @@ export async function processImageWithModel(
             { type: 'text', text: config.imagePrompt } as MessageContentText,
             ...images
         ]
-
         const result = await model.invoke([new HumanMessage({ content })])
-
         return config.imageInsertPrompt.replace(
             '{img}',
             getMessageContent(result.content)
@@ -221,45 +337,36 @@ export async function processImageWithModel(
     }
 }
 
-export const addImageToContent = (message: Message, imageUrl: string) => {
-    if (typeof message.content === 'string') {
-        message.content = [
-            {
-                type: 'text',
-                text: message.content
-            }
-        ]
-    }
+export function addImageToContent(message: Message, imageUrl: string) {
+    ensureContentArray(message)
     ;(message.content as MessageContentComplex[]).push({
         type: 'image_url',
-        image_url: {
-            url: imageUrl
-        }
+        image_url: { url: imageUrl }
     })
 }
 
-export const addTextToContent = (message: Message, text: string) => {
+export function addTextToContent(message: Message, text: string) {
     if (typeof message.content === 'string') {
         message.content += text
         return
     }
-
     const content = message.content as MessageContentComplex[]
-    const lastItem = content[content.length - 1]
-
-    if (lastItem && lastItem.type === 'text') {
-        lastItem.text += text
+    const last = content[content.length - 1]
+    if (last && last.type === 'text') {
+        last.text += text
     } else {
-        content.push({
-            type: 'text',
-            text
-        })
+        content.push({ type: 'text', text })
     }
 }
 
-export const extractImages = (content: MessageContent) =>
-    Array.isArray(content)
-        ? content.filter((item: MessageContentComplex) =>
-              isMessageContentImageUrl(item)
-          )
-        : []
+// Turn string content into a content-part array in place. An empty string
+// falls back to `fallbackText`; if that is empty too the result is `[]`.
+// Non-string content is left untouched.
+export function ensureContentArray(message: Message, fallbackText = '') {
+    if (typeof message.content !== 'string') return
+    const text = message.content || fallbackText
+    message.content = text.length ? [{ type: 'text', text }] : []
+}
+
+export const BROWSER_UA =
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
diff --git a/packages/shared-adapter/src/client.ts b/packages/shared-adapter/src/client.ts
index 36af06aa3..1015a4350 100644
--- a/packages/shared-adapter/src/client.ts
+++ b/packages/shared-adapter/src/client.ts
@@ -1,6 +1,10 @@
+import { FileHandlingConfig } from 'koishi-plugin-chatluna/llm-core/platform/client'
 import { ModelInfo } from 'koishi-plugin-chatluna/llm-core/platform/types'
 import { getModelContextSize } from 'koishi-plugin-chatluna/llm-core/utils/count_tokens'
 
+export const DEFAULT_AUDIO_MAX_BASE64_BYTES = 50 * 1024 * 1024
+export const DEFAULT_IMAGE_MAX_BASE64_BYTES = 50 * 1024 * 1024
+
 export type OpenAIReasoningEffort =
     | 'none'
     | 'minimal'
@@ -153,7 +157,11 @@ function createGlobMatcher(pattern: string): (text: string) => boolean {
     return (text: string) => regex.test(text)
 }
 
-const imageModelMatchers = [
+function createRegexMatcher(regex: RegExp): (text: string) => boolean {
+    return (text: string) => regex.test(text)
+}
+
+const imageModelMatchers: ((text: string) => boolean)[] = [
     'vision',
     'vl',
     'gpt-4o',
@@ -176,11 +184,76 @@ const imageModelMatchers = [
     'glm-*v',
     'kimi-k2.5',
     'step3',
-    'grok-4',
-    'mimo-v2.5*'
-].map((pattern) => createGlobMatcher(pattern))
+    'grok-4'
+].map(createGlobMatcher)
+
+// mimo-v2.5 supports image/audio; mimo-v2.5-pro does NOT (text only).
+imageModelMatchers.push(createRegexMatcher(/mimo-v2\.5(?!-pro)/))
 
 export function supportImageInput(modelName: string) {
     const lowerModel = normalizeOpenAIModelName(modelName).toLowerCase()
     return imageModelMatchers.some((matcher) => matcher(lowerModel))
 }
+
+const audioModelMatchers: ((text: string) => boolean)[] = [
+    'gpt-4o-audio',
+    'gpt-4o-mini-audio',
+    'gpt-audio',
+    'mimo-v2-omni'
+].map(createGlobMatcher)
+
+audioModelMatchers.push(createRegexMatcher(/mimo-v2\.5(?!-pro)/))
+
+export function supportAudioInput(modelName: string) {
+    const lowerModel = normalizeOpenAIModelName(modelName).toLowerCase()
+    return audioModelMatchers.some((matcher) => matcher(lowerModel))
+}
+
+const openAIImageMimeTypes = [
+    'image/png',
+    'image/jpeg',
+    'image/gif',
+    'image/webp',
+    'image/bmp'
+]
+
+const openAIAudioMimeTypes = [
+    'audio/mpeg',
+    'audio/mp3',
+    'audio/wav',
+    'audio/flac',
+    'audio/mp4',
+    'audio/ogg'
+]
+
+/**
+ * Build the file-handling limits for an OpenAI-compatible model, or return
+ * `undefined` when the model accepts neither image nor audio input.
+ */
+export function getOpenAIFileHandlingConfig(
+    modelName: string
+): FileHandlingConfig | undefined {
+    const hasImage = supportImageInput(modelName)
+    const hasAudio = supportAudioInput(modelName)
+    if (!(hasImage || hasAudio)) return undefined
+
+    const supportedMimeTypes = new Set<string>()
+    const overrides: Record<string, number> = {}
+    // Register every MIME type of one media kind with its per-file cap.
+    const allow = (mimes: string[], cap: number) => {
+        for (const mime of mimes) {
+            supportedMimeTypes.add(mime)
+            overrides[mime] = cap
+        }
+    }
+    if (hasImage) allow(openAIImageMimeTypes, DEFAULT_IMAGE_MAX_BASE64_BYTES)
+    if (hasAudio) allow(openAIAudioMimeTypes, DEFAULT_AUDIO_MAX_BASE64_BYTES)
+
+    const totalCap = 100 * 1024 * 1024
+    return {
+        supportedMimeTypes,
+        maxTotalSizeBytes: totalCap,
+        maxFileSizeBytes: totalCap,
+        maxFileSizeBytesOverrides: overrides
+    }
+}
diff --git a/packages/shared-adapter/src/utils.ts b/packages/shared-adapter/src/utils.ts
index a171e25f6..fe130f7fc 100644
--- a/packages/shared-adapter/src/utils.ts
+++ b/packages/shared-adapter/src/utils.ts
@@ -30,15 +30,24 @@ import {
     ResponseUsage
 } from './types'
 import { ChatLunaPlugin } from 'koishi-plugin-chatluna/services/chat'
+import { logger } from 'koishi-plugin-chatluna'
 import {
     getImageMimeType,
     getMimeTypeFromSource,
     isMessageContentImageUrl
 } from 'koishi-plugin-chatluna/utils/string'
-import { isChatLunaUserMessage } from 'koishi-plugin-chatluna/utils/langchain'
+import {
+    isChatLunaUserMessage,
+    isMessageContentAudio
+} from 'koishi-plugin-chatluna/utils/langchain'
 import { ToolCallChunk } from '@langchain/core/messages/tool'
 import { isZodSchemaV3 } from '@langchain/core/utils/types'
-import { normalizeOpenAIModelName, supportImageInput } from './client'
+import {
+    DEFAULT_AUDIO_MAX_BASE64_BYTES,
+    normalizeOpenAIModelName,
+    supportAudioInput,
+    supportImageInput
+} from './client'
 
 export function createUsageMetadata(data: {
     inputTokens: number
@@ -222,6 +231,7 @@ export function responseInputContent(
                 } satisfies ResponseInputContent
             }
 
+            // OpenAI Response API does not accept `input_audio` yet — drop it.
             return undefined
         })
         .filter((part) => part != null)
@@ -343,64 +353,71 @@ export async function langchainMessageToOpenAIMessage(
             }
         }
 
-        const images = rawMessage.additional_kwargs.images as string[] | null
-
-        const lowerModel = normalizedModel?.toLowerCase() ?? ''
-        if (
-            images != null &&
-            (supportImageInput(lowerModel) || supportImageInputType)
-        ) {
-            msg.content = [
-                {
-                    type: 'text',
-                    text: rawMessage.content as string
-                }
-            ]
-
-            const imageContents = await Promise.all(
-                images.map(async (image) => {
-                    try {
-                        const url = await fetchImageUrl(plugin, {
-                            type: 'image_url',
-                            image_url: { url: image }
-                        } as MessageContentImageUrl)
-                        return {
-                            type: 'image_url',
-                            image_url: {
-                                url,
-                                detail: 'low'
-                            }
-                        } as const
-                    } catch {
-                        return null
-                    }
-                })
+        if (rawMessage.additional_kwargs.images != null) {
+            logger.warn(
+                'Deprecated: `additional_kwargs.images` is no longer supported. Use `image_url` content parts instead.'
             )
+        }
 
-            msg.content.push(
-                ...imageContents.filter((content) => content != null)
-            )
-        } else if (Array.isArray(msg.content) && msg.content.length > 0) {
+        if (Array.isArray(msg.content) && msg.content.length > 0) {
+            const supportsAudio = supportAudioInput(normalizedModel ?? '')
+            const supportsImage =
+                supportImageInput(normalizedModel ?? '') ||
+                supportImageInputType === true
             const mappedContent = await Promise.all(
                 msg.content.map(async (content) => {
-                    if (!isMessageContentImageUrl(content)) return content
-
-                    try {
-                        const url = await fetchImageUrl(plugin, content)
-                        return {
-                            type: 'image_url',
-                            image_url: {
-                                url,
-                                detail: 'low'
+                    if (isMessageContentImageUrl(content)) {
+                        if (!supportsImage) {
+                            logger.warn(
+                                `Model ${normalizedModel} does not accept image input; dropping image content.`
+                            )
+                            return null
+                        }
+                        try {
+                            const url = await fetchImageUrl(plugin, content)
+                            return {
+                                type: 'image_url',
+                                image_url: { url, detail: 'low' }
+                            }
+                        } catch {
+                            return null
+                        }
+                    }
+
+                    if (isMessageContentAudio(content)) {
+                        if (!supportsAudio) {
+                            logger.warn(
+                                `Model ${normalizedModel} does not accept audio input; dropping audio content.`
+                            )
+                            return null
+                        }
+                        try {
+                            const part = await fetchAudioContentPart(
+                                plugin,
+                                content
+                            )
+                            if (part == null) {
+                                logger.warn(
+                                    `Audio content for model ${normalizedModel} was dropped (exceeded size limits or no data).`
+                                )
                             }
+                            return part
+                        } catch (err) {
+                            logger.error(
+                                `Failed to fetch audio part for model ${normalizedModel}`,
+                                err
+                            )
+                            throw err
                         }
-                    } catch {
-                        return null
                     }
+
+                    return content
                 })
             )
 
-            msg.content = mappedContent.filter((content) => content != null)
+            msg.content = mappedContent.filter(
+                (content) => content != null
+            ) as ChatCompletionResponseMessage['content']
         }
 
         result.push(msg)
@@ -676,6 +693,54 @@ export async function fetchFileLikeUrl(
     }
 }
 
+const AUDIO_MIME_TO_FORMAT: Record<string, string> = {
+    'audio/mpeg': 'mp3',
+    'audio/mp3': 'mp3',
+    'audio/wav': 'wav',
+    'audio/x-wav': 'wav',
+    'audio/flac': 'flac',
+    'audio/x-flac': 'flac',
+    'audio/ogg': 'ogg',
+    'audio/mp4': 'mp4',
+    'audio/aac': 'aac',
+    'audio/webm': 'webm'
+}
+
+/** Map an audio MIME type to an `input_audio` format; throws if unmapped. */
+function audioMimeToFormat(mime: string): string {
+    // Ignore MIME parameters such as `; codecs=opus` during the lookup.
+    const essence = mime.split(';')[0].trim().toLowerCase()
+    const format = AUDIO_MIME_TO_FORMAT[essence]
+    if (!format) {
+        throw new Error(
+            `Unsupported audio MIME for OpenAI input_audio: ${mime}`
+        )
+    }
+    return format
+}
+
+/**
+ * Fetch an `audio_url` content part and convert it to the OpenAI-compatible
+ * `input_audio` shape used by gpt-4o-audio / MiMo. Returns `null` when the
+ * encoded payload exceeds {@link DEFAULT_AUDIO_MAX_BASE64_BYTES}; throws
+ * when the fetched MIME type has no known `input_audio` format.
+ */
+async function fetchAudioContentPart(
+    plugin: ChatLunaPlugin,
+    content: MessageContentFileLike & { type: 'audio_url' }
+): Promise<MessageContentComplex | null> {
+    const { buffer, mimeType } = await fetchFileLikeUrl(plugin, content)
+    const base64 = buffer.toString('base64')
+    if (base64.length > DEFAULT_AUDIO_MAX_BASE64_BYTES) return null
+    return {
+        type: 'input_audio',
+        input_audio: {
+            data: base64,
+            format: audioMimeToFormat(mimeType)
+        }
+    } as unknown as MessageContentComplex
+}
+
 export function messageTypeToOpenAIRole(
     type: MessageType
 ): ChatCompletionResponseMessageRoleEnum {