From 0b1e08f8138a680faf2d26e141c70de764da25a5 Mon Sep 17 00:00:00 2001 From: yabo083 Date: Sun, 17 May 2026 20:08:48 +0800 Subject: [PATCH 1/7] feat(service-multimodal): support MiMo audio and image inputs --- packages/service-multimodal/README.md | 30 ++- packages/service-multimodal/package.json | 4 +- packages/service-multimodal/src/audio.ts | 111 +++++++++ packages/service-multimodal/src/index.ts | 6 + packages/service-multimodal/src/media.ts | 27 +++ .../service-multimodal/src/plugins/audio.ts | 63 +++-- .../service-multimodal/src/plugins/image.ts | 6 +- .../src/plugins/read_files.ts | 220 ++++++++++++------ .../src/read_files_schema.ts | 31 +++ .../tests/audio-mimo.test.ts | 149 ++++++++++++ 10 files changed, 548 insertions(+), 99 deletions(-) create mode 100644 packages/service-multimodal/src/audio.ts create mode 100644 packages/service-multimodal/src/media.ts create mode 100644 packages/service-multimodal/src/read_files_schema.ts create mode 100644 packages/service-multimodal/tests/audio-mimo.test.ts diff --git a/packages/service-multimodal/README.md b/packages/service-multimodal/README.md index e96b4badc..a630b5799 100644 --- a/packages/service-multimodal/README.md +++ b/packages/service-multimodal/README.md @@ -1,7 +1,29 @@ -## koishi-plugin-chatluna-long-memory +## koishi-plugin-chatluna-multimodal-service -## [![npm](https://img.shields.io/npm/v/koishi-plugin-chatluna-long-memory)](https://www.npmjs.com/package/koishi-plugin-chatluna-long-memory) [![npm](https://img.shields.io/npm/dm/koishi-plugin-chatluna-long-memory)](https://www.npmjs.com/package//koishi-plugin-chatluna-long-memory) +ChatLuna 多模态支持服务,提供上下文图像描述、GIF 帧处理、`read_files` 文件读取,以及语音消息转码注入能力。 -> 提供长期记忆支持的插件 +### MiMo 音频理解 -[长期记忆文档](https://chatluna.chat/ecosystem/renderer/image.html) +MiMo 官方 OpenAI 兼容接口中,`mimo-v2.5` 与 `mimo-v2-omni` 支持音频理解。服务会将 QQ/OneBot 语音先下载到本地内存,必要时通过 `ffmpeg` 转成 MP3,再以 Base64 data URL 注入 `input_audio`: + +- 避免 QQ CDN 直链过期导致模型侧晚读失败。 +- 规避 AMR、Silk 等上游模型不稳定支持的格式。 +- 遵循 MiMo 
Base64 单音频 50 MB 上限;URL 输入的官方上限是单文件 100 MB。 + +MiMo 官方列出的音频格式为 MP3、WAV、FLAC、M4A、OGG。实际变体较多,服务默认把语音消息转成 MP3 以提高稳定性。 + +`read_files` 也会沿用这条路线:工具调用层如果把 `files` 传成 JSON 字符串,会先容错解析;音频 URL 即使被缓存服务误标为 MP3,也会按文件头识别 AMR/Silk 等实际格式,并在模型注入前通过 `ffmpeg` 转成 MP3。 + +### MiMo 图片理解 + +`mimo-v2.5` 与 `mimo-v2-omni` 也支持图片理解。即使 OpenAI 兼容适配器暂未在模型元数据中声明 `ImageInput`,服务也会把这两个 MiMo 模型视为原生图片输入模型,并使用标准 OpenAI 兼容 `image_url` 内容块注入 Base64 data URL。 + +- 支持 JPEG、PNG、GIF、WebP、BMP。 +- MiMo Base64 单图片上限为 50 MB;URL 单图片官方上限同样为 50 MB。 +- 多图输入受模型上下文和 token 长度限制。 + +音频消息转码需要启用: + +- `enableAudioFfmpegConversion` +- `koishi-plugin-ffmpeg-path` +- 官方 Bot/QQ Silk 语音还需要 `koishi-plugin-ffmpeg-path` 提供的 `silk` 服务 diff --git a/packages/service-multimodal/package.json b/packages/service-multimodal/package.json index 98586c1f9..f79bab824 100644 --- a/packages/service-multimodal/package.json +++ b/packages/service-multimodal/package.json @@ -34,7 +34,8 @@ }, "homepage": "https://github.com/ChatLunaLab/chatluna/tree/v1-dev/packages/service-image#readme", "scripts": { - "build": "atsc -b" + "build": "atsc -b", + "test": "tsx --test tests/*.test.ts" }, "keywords": [ "chatbot", @@ -57,6 +58,7 @@ "@types/omggif": "^1.0.5", "atsc": "^2.1.0", "koishi": "^4.18.9", + "koishi-plugin-adapter-onebot": "^6.9.3", "koishi-plugin-ffmpeg-path": "^2.0.0" }, "peerDependencies": { diff --git a/packages/service-multimodal/src/audio.ts b/packages/service-multimodal/src/audio.ts new file mode 100644 index 000000000..71aa25f00 --- /dev/null +++ b/packages/service-multimodal/src/audio.ts @@ -0,0 +1,111 @@ +import { MessageContentComplex } from '@langchain/core/messages' +import { ModelCapabilities } from 'koishi-plugin-chatluna/llm-core/platform/types' + +export const MIMO_BASE64_AUDIO_BYTES = 50 * 1024 * 1024 +export const MIMO_BASE64_IMAGE_BYTES = 50 * 1024 * 1024 + +const mimoModels = new Set(['mimo-v2.5', 'mimo-v2-omni']) + +const mimoAudioMimes = new Set([ + 'audio/mpeg', + 'audio/mp3', + 'audio/wav', + 'audio/flac', + 'audio/mp4', 
+ 'audio/ogg' +]) + +const mimoImageMimes = new Set([ + 'image/jpeg', + 'image/png', + 'image/gif', + 'image/webp', + 'image/bmp' +]) + +export function isMimoAudioModel(model?: string): boolean { + if (!model) return false + return mimoModels.has(model.split('/').pop()?.toLowerCase() ?? '') +} + +export function isMimoImageModel(model?: string): boolean { + if (!model) return false + return mimoModels.has(model.split('/').pop()?.toLowerCase() ?? '') +} + +export function isMimoAudioMime(mime: string): boolean { + return mimoAudioMimes.has(mime.toLowerCase()) +} + +export function isMimoImageMime(mime: string): boolean { + return mimoImageMimes.has(mime.toLowerCase()) +} + +export function modelCanReadAudio( + info: + | { + value?: { + capabilities?: ModelCapabilities[] + } + } + | undefined, + model?: string +): boolean { + return ( + isMimoAudioModel(model) || + info?.value?.capabilities?.includes(ModelCapabilities.AudioInput) === + true + ) +} + +export function modelCanReadImage( + info: + | { + value?: { + capabilities?: ModelCapabilities[] + } + } + | undefined, + model?: string +): boolean { + return ( + isMimoImageModel(model) || + info?.value?.capabilities?.includes(ModelCapabilities.ImageInput) === + true + ) +} + +export function buildAudioContent( + model: string | undefined, + data: string, + mime: string +): MessageContentComplex { + if (isMimoAudioModel(model)) { + return { + type: 'input_audio', + input_audio: { + data: `data:${mime};base64,${data}` + } + } as unknown as MessageContentComplex + } + + return { + type: 'audio_url', + audio_url: { + url: `data:${mime};base64,${data}`, + mimeType: mime + } + } as unknown as MessageContentComplex +} + +export function buildImageContent( + data: string, + mime: string +): MessageContentComplex { + return { + type: 'image_url', + image_url: { + url: `data:${mime};base64,${data}` + } + } as unknown as MessageContentComplex +} diff --git a/packages/service-multimodal/src/index.ts 
b/packages/service-multimodal/src/index.ts index f1e7588a6..b7b402a42 100644 --- a/packages/service-multimodal/src/index.ts +++ b/packages/service-multimodal/src/index.ts @@ -99,4 +99,10 @@ export const usage = ` ### 注意 建议搭配 \`chatluna-storage-service\` 使用。请求中的图像、文件大小限制遵循模型平台配置(如 Gemini:PDF 单文件 50MB、其他单文件 100MB、单轮总计 100MB,以文件被编码为 Base64 后的大小为准)。 + +### MiMo 音频理解 +\`mimo-v2.5\` 与 \`mimo-v2-omni\` 的音频理解走 OpenAI 兼容 \`input_audio\`。启用音频转换后,服务会先读取语音 URL,必要时用 ffmpeg/Silk 转为 MP3,再以 Base64 data URL 注入模型,避免 QQ CDN 过期和 AMR/Silk 等格式兼容问题。MiMo Base64 单音频上限为 50MB。 + +### MiMo 图片理解 +\`mimo-v2.5\` 与 \`mimo-v2-omni\` 的图片理解走 OpenAI 兼容 \`image_url\`。即使适配器暂未声明 \`ImageInput\`,服务也会按 MiMo 官方能力接入 JPEG、PNG、GIF、WebP、BMP,Base64 与 URL 单图片上限均为 50MB,多图受模型上下文限制。 ` diff --git a/packages/service-multimodal/src/media.ts b/packages/service-multimodal/src/media.ts new file mode 100644 index 000000000..0c2cd3602 --- /dev/null +++ b/packages/service-multimodal/src/media.ts @@ -0,0 +1,27 @@ +export function detectAudioMimeType( + buffer: Buffer, + declaredMimeType?: string | null +): string | null { + const header = buffer.subarray(0, 16).toString('latin1') + + if (header.startsWith('#!AMR')) return 'audio/amr' + if ( + header.startsWith('#!SILK_V3') || + buffer.subarray(1, 10).toString('latin1') === '#!SILK_V3' + ) { + return 'audio/silk' + } + if (header.startsWith('ID3') || buffer[0] === 0xff) { + return 'audio/mpeg' + } + if ( + header.startsWith('RIFF') && + buffer.subarray(8, 12).toString('latin1') === 'WAVE' + ) { + return 'audio/wav' + } + if (header.startsWith('fLaC')) return 'audio/flac' + if (header.startsWith('OggS')) return 'audio/ogg' + + return declaredMimeType ?? 
null +} diff --git a/packages/service-multimodal/src/plugins/audio.ts b/packages/service-multimodal/src/plugins/audio.ts index f8ce7fbdb..57ccf5f64 100644 --- a/packages/service-multimodal/src/plugins/audio.ts +++ b/packages/service-multimodal/src/plugins/audio.ts @@ -2,10 +2,15 @@ import { MessageContentComplex } from '@langchain/core/messages' import { Context, h, Session } from 'koishi' import type { OneBotBot } from 'koishi-plugin-adapter-onebot' import { Message } from 'koishi-plugin-chatluna' -import { ModelCapabilities } from 'koishi-plugin-chatluna/llm-core/platform/types' import type {} from 'koishi-plugin-chatluna-storage-service' import type {} from 'koishi-plugin-ffmpeg-path' import { Config, logger } from '..' +import { + buildAudioContent, + isMimoAudioModel, + MIMO_BASE64_AUDIO_BYTES, + modelCanReadAudio +} from '../audio' const CHATLUNA_DOWNLOAD_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36' @@ -24,12 +29,7 @@ export function apply(ctx: Context, config: Config) { ? ctx.chatluna.platform.findModel(model) : undefined - // If the model doesn't accept audio input, keep fallback path unchanged. - if ( - modelInfo?.value?.capabilities?.includes( - ModelCapabilities.AudioInput - ) === false - ) { + if (!modelCanReadAudio(modelInfo, model)) { return false } @@ -66,24 +66,41 @@ export function apply(ctx: Context, config: Config) { element.attrs['file'] = displayFileName element.attrs['filename'] = displayFileName - const audioUrl = ctx.chatluna_storage - ? 
(element.attrs['chatluna_file_url'] = ( - await ctx.chatluna_storage.createTempFile( - buffer, - displayFileName - ) - ).url) - : ((element.attrs['chatluna_file_url'] = sourceUrl), - `data:audio/mpeg;base64,${buffer.toString('base64')}`) + const base64 = buffer.toString('base64') + + if ( + isMimoAudioModel(model) && + Buffer.byteLength(base64) > MIMO_BASE64_AUDIO_BYTES + ) { + logger.warn( + `Skip oversized MiMo audio after base64 encoding: ${Buffer.byteLength(base64)} bytes > ${MIMO_BASE64_AUDIO_BYTES} bytes` + ) + return false + } + + const audioUrl = + !isMimoAudioModel(model) && ctx.chatluna_storage + ? (element.attrs['chatluna_file_url'] = ( + await ctx.chatluna_storage.createTempFile( + buffer, + displayFileName + ) + ).url) + : ((element.attrs['chatluna_file_url'] = sourceUrl), + `data:audio/mpeg;base64,${base64}`) ensureContentArray(message, `[voice:${displayFileName}]`) - ;(message.content as MessageContentComplex[]).push({ - type: 'audio_url', - audio_url: { - url: audioUrl, - mimeType: 'audio/mpeg' - } - } as unknown as MessageContentComplex) + ;(message.content as MessageContentComplex[]).push( + isMimoAudioModel(model) + ? buildAudioContent(model, base64, 'audio/mpeg') + : ({ + type: 'audio_url', + audio_url: { + url: audioUrl, + mimeType: 'audio/mpeg' + } + } as unknown as MessageContentComplex) + ) logger.debug( `Transcoded unsupported audio to mp3 for multimodal input: ${displayFileName}` diff --git a/packages/service-multimodal/src/plugins/image.ts b/packages/service-multimodal/src/plugins/image.ts index e29db3c56..560eb959a 100644 --- a/packages/service-multimodal/src/plugins/image.ts +++ b/packages/service-multimodal/src/plugins/image.ts @@ -11,6 +11,7 @@ import { processImageWithModel, readImage } from '../utils' +import { modelCanReadImage } from '../audio' export async function apply( ctx: Context, @@ -29,10 +30,7 @@ export async function apply( ? 
ctx.chatluna.platform.findModel(model) : undefined const modelSupportsImageInput = - parsedModelInfo?.value != null && - parsedModelInfo.value.capabilities.includes( - ModelCapabilities.ImageInput - ) + modelCanReadImage(parsedModelInfo, model) let imageData: Awaited> const url = (element.attrs.url ?? element.attrs.src) as string diff --git a/packages/service-multimodal/src/plugins/read_files.ts b/packages/service-multimodal/src/plugins/read_files.ts index ea2e578df..896261e50 100644 --- a/packages/service-multimodal/src/plugins/read_files.ts +++ b/packages/service-multimodal/src/plugins/read_files.ts @@ -3,6 +3,7 @@ import { StructuredTool } from '@langchain/core/tools' import { HumanMessage, MessageContentComplex } from '@langchain/core/messages' import { Context } from 'koishi' import { ComputedRef, Message } from 'koishi-plugin-chatluna' +import type {} from 'koishi-plugin-ffmpeg-path' import { ChatLunaChatModel } from 'koishi-plugin-chatluna/llm-core/platform/model' import { ChatLunaToolRunnable, @@ -12,7 +13,6 @@ import { ChatLunaPlugin } from 'koishi-plugin-chatluna/services/chat' import { isMessageContentAudio, isMessageContentVideo, - type MessageContentAudio, type MessageContentVideo } from 'koishi-plugin-chatluna/utils/langchain' import { getBase64EncodedSize } from 'koishi-plugin-chatluna/utils/base64' @@ -23,6 +23,18 @@ import { parseGifToFrames, processImageWithModel } from '../utils' +import { + buildAudioContent, + buildImageContent, + isMimoAudioMime, + isMimoImageMime, + MIMO_BASE64_AUDIO_BYTES, + MIMO_BASE64_IMAGE_BYTES, + modelCanReadAudio, + modelCanReadImage +} from '../audio' +import { detectAudioMimeType } from '../media' +import { readFilesInputSchema } from '../read_files_schema' import z from 'zod' // --------------------------------------------------------------------------- @@ -97,6 +109,28 @@ function normalizeMimeType(raw: string | null): string | null { return mimeType || null } +function getHeaderValue(headers: unknown, name: 
string): string | null { + if (headers == null) return null + + if ( + typeof (headers as { get?: unknown }).get === 'function' + ) { + const value = (headers as { get(name: string): string | null }).get( + name + ) + return typeof value === 'string' ? value : null + } + + const record = headers as Record + const value = record[name] ?? record[name.toLowerCase()] + if (typeof value === 'string') return value + if (Array.isArray(value) && typeof value[0] === 'string') { + return value[0] + } + + return null +} + function inferMimeTypeFromPath(path: string): string | null { const sanitizedPath = path.toLowerCase().split(/[?#]/, 1)[0] const fileName = sanitizedPath.split(/[/\\]/).pop() ?? sanitizedPath @@ -121,6 +155,37 @@ function inferMimeTypeFromUrl(url: string): string | null { return null } +async function convertAudioBufferToMp3( + ctx: Context, + buffer: Buffer +): Promise { + const ffmpeg = ctx.ffmpeg + if (!ffmpeg) { + return null + } + + try { + return await ffmpeg + .builder() + .input(buffer) + .outputOption( + '-vn', + '-acodec', + 'libmp3lame', + '-q:a', + '4', + '-f', + 'mp3' + ) + .run('buffer') + } catch (error) { + logger.warn( + `read_files audio transcoding to mp3 failed: ${error instanceof Error ? error.message : String(error)}` + ) + return null + } +} + /** * Check whether the model natively supports a given MIME type based on its * capabilities and `FileHandlingConfig`. 
@@ -133,9 +198,15 @@ function modelSupportsNativeMimeType( let capabilitySupportsMime = false if (IMAGE_MIME_TYPES.has(mimeType)) { - capabilitySupportsMime = caps.includes(ModelCapabilities.ImageInput) + capabilitySupportsMime = modelCanReadImage( + { value: model.modelInfo }, + model.modelInfo.name + ) } else if (mimeType.startsWith('audio/')) { - capabilitySupportsMime = caps.includes(ModelCapabilities.AudioInput) + capabilitySupportsMime = modelCanReadAudio( + { value: model.modelInfo }, + model.modelInfo.name + ) } else if (mimeType.startsWith('video/')) { capabilitySupportsMime = caps.includes(ModelCapabilities.VideoInput) } else if ( @@ -207,7 +278,8 @@ function buildMultimodalMessage( base64Data: string sourceUrl: string }[], - insertPrompt: string + insertPrompt: string, + model?: string ): HumanMessage { const content: MessageContentComplex[] = [] @@ -215,23 +287,14 @@ function buildMultimodalMessage( const { mimeType, base64Data } = part if (IMAGE_MIME_TYPES.has(mimeType)) { - content.push({ - type: 'image_url', - image_url: { - url: `data:${mimeType};base64,${base64Data}` - } - }) + content.push(buildImageContent(base64Data, mimeType)) } else if (mimeType.startsWith('audio/')) { - const audioContent: MessageContentAudio = { - type: 'audio_url', - audio_url: { - url: `data:${mimeType};base64,${base64Data}`, - mimeType - } - } + const audioContent = buildAudioContent(model, base64Data, mimeType) if (isMessageContentAudio(audioContent as MessageContentComplex)) { content.push(audioContent as MessageContentComplex) + } else if (audioContent.type === 'input_audio') { + content.push(audioContent as MessageContentComplex) } } else if (mimeType.startsWith('video/')) { const videoContent: MessageContentVideo = { @@ -274,25 +337,7 @@ export class ReadFilesTool extends StructuredTool { name = 'read_files' description: string - schema = z.object({ - files: z - .union([ - z.object({ - url: z.string().url() - }), - z - .array( - z.object({ - url: z.string().url() - 
}) - ) - .min(1) - .max(10) - ]) - .describe( - 'One file or a list of files to read (max 10). File format: { url: string }. MIME type is inferred from response headers, then URL extension.' - ) - }) + schema = readFilesInputSchema constructor( private readonly ctx: Context, @@ -383,23 +428,21 @@ export class ReadFilesTool extends StructuredTool { const buffer = Buffer.from(httpResponse.data) // Resolve MIME type from response headers or URL - const headers = httpResponse.headers as unknown as - | Record - | undefined - const rawCt = - headers?.['content-type'] ?? headers?.['Content-Type'] - let responseMimeType: string | null = null - if (typeof rawCt === 'string') { - responseMimeType = normalizeMimeType(rawCt) - } else if ( - Array.isArray(rawCt) && - typeof rawCt[0] === 'string' - ) { - responseMimeType = normalizeMimeType(rawCt[0]) - } + const responseMimeType = normalizeMimeType( + getHeaderValue(httpResponse.headers, 'content-type') + ) - const mimeType = + const declaredMimeType = responseMimeType ?? inferMimeTypeFromUrl(sourceUrl) + const detectedAudioMimeType = detectAudioMimeType( + buffer, + declaredMimeType + ) + const mimeType = + declaredMimeType?.startsWith('audio/') || + detectedAudioMimeType?.startsWith('audio/') + ? detectedAudioMimeType + : declaredMimeType if (!mimeType) { pushError( @@ -418,23 +461,60 @@ export class ReadFilesTool extends StructuredTool { // Check if the model supports this MIME type natively const isImage = IMAGE_MIME_TYPES.has(mimeType) + const isAudio = mimeType.startsWith('audio/') const modelSupports = model != null && - modelSupportsNativeMimeType(model, mimeType) + (isAudio + ? modelCanReadAudio( + { value: model.modelInfo }, + model.modelInfo.name + ) + : modelSupportsNativeMimeType(model, mimeType)) if (modelSupports && !isImage) { // Non-image file that the model supports natively -> inline inject - const maxFileSize = - fileConfig?.maxFileSizeBytesOverrides?.[mimeType] ?? - fileConfig?.maxFileSizeBytes ?? 
- DEFAULT_MAX_FILE_SIZE_BYTES + let nativeBuffer: Buffer = buffer + let nativeMimeType = mimeType - const encodedSize = getBase64EncodedSize(buffer.byteLength) + if (isAudio && !isMimoAudioMime(mimeType)) { + const converted = await convertAudioBufferToMp3( + this.ctx, + buffer + ) + + if (!converted) { + pushError( + `Unsupported audio MIME type "${mimeType}" and ffmpeg conversion to MP3 failed.`, + mimeType + ) + continue + } + + nativeBuffer = converted + nativeMimeType = 'audio/mpeg' + logger.debug( + `Transcoded read_files audio from ${mimeType} to audio/mpeg for multimodal input` + ) + } + + const maxFileSize = + isMimoAudioMime(nativeMimeType) && + modelCanReadAudio(undefined, model?.modelInfo.name) + ? MIMO_BASE64_AUDIO_BYTES + : (fileConfig?.maxFileSizeBytesOverrides?.[ + nativeMimeType + ] ?? + fileConfig?.maxFileSizeBytes ?? + DEFAULT_MAX_FILE_SIZE_BYTES) + + const encodedSize = getBase64EncodedSize( + nativeBuffer.byteLength + ) if (encodedSize > maxFileSize) { pushError( - `File too large (${encodedSize} bytes after base64), max ${maxFileSize} bytes for ${mimeType}`, - mimeType + `File too large (${encodedSize} bytes after base64), max ${maxFileSize} bytes for ${nativeMimeType}`, + nativeMimeType ) continue } @@ -442,21 +522,21 @@ export class ReadFilesTool extends StructuredTool { if (totalBase64Bytes + encodedSize > maxTotalSize) { pushError( `Total inline upload size too large (${totalBase64Bytes + encodedSize} bytes), max ${maxTotalSize} bytes per request`, - mimeType + nativeMimeType ) continue } totalBase64Bytes += encodedSize nativeParts.push({ - mimeType, - base64Data: buffer.toString('base64'), + mimeType: nativeMimeType, + base64Data: nativeBuffer.toString('base64'), sourceUrl }) response.files.push({ sourceUrl, - mimeType, + mimeType: nativeMimeType, status: 'ok' }) response.successCount++ @@ -464,9 +544,14 @@ export class ReadFilesTool extends StructuredTool { // Image that the model supports natively -> inject directly // Unified per-file 
size check before any branching const maxFileSize = - fileConfig?.maxFileSizeBytesOverrides?.[mimeType] ?? - fileConfig?.maxFileSizeBytes ?? - DEFAULT_MAX_FILE_SIZE_BYTES + isMimoImageMime(mimeType) && + modelCanReadImage(undefined, model?.modelInfo.name) + ? MIMO_BASE64_IMAGE_BYTES + : (fileConfig?.maxFileSizeBytesOverrides?.[ + mimeType + ] ?? + fileConfig?.maxFileSizeBytes ?? + DEFAULT_MAX_FILE_SIZE_BYTES) const encodedSize = getBase64EncodedSize(buffer.byteLength) @@ -592,7 +677,8 @@ export class ReadFilesTool extends StructuredTool { if (nativeParts.length > 0 && conversationId) { const message = buildMultimodalMessage( nativeParts, - this.config.fileInsertPrompt + this.config.fileInsertPrompt, + model?.modelInfo.name ) this.ctx.chatluna.contextManager.inject({ diff --git a/packages/service-multimodal/src/read_files_schema.ts b/packages/service-multimodal/src/read_files_schema.ts new file mode 100644 index 000000000..8368f395d --- /dev/null +++ b/packages/service-multimodal/src/read_files_schema.ts @@ -0,0 +1,31 @@ +import z from 'zod' + +const READ_FILE_SCHEMA = z.object({ + url: z.string().url() +}) + +function parseJsonStringInput(value: unknown): unknown { + if (typeof value !== 'string') { + return value + } + + try { + return JSON.parse(value) + } catch { + return value + } +} + +export const readFilesInputSchema = z.object({ + files: z + .preprocess( + parseJsonStringInput, + z.union([ + READ_FILE_SCHEMA, + z.array(READ_FILE_SCHEMA).min(1).max(10) + ]) + ) + .describe( + 'One file or a list of files to read (max 10). File format: { url: string }. MIME type is inferred from response headers, then URL extension.' 
+ ) +}) diff --git a/packages/service-multimodal/tests/audio-mimo.test.ts b/packages/service-multimodal/tests/audio-mimo.test.ts new file mode 100644 index 000000000..03b09d706 --- /dev/null +++ b/packages/service-multimodal/tests/audio-mimo.test.ts @@ -0,0 +1,149 @@ +import assert from 'node:assert/strict' +import { test } from 'node:test' +import { ModelCapabilities } from 'koishi-plugin-chatluna/llm-core/platform/types' +import { + MIMO_BASE64_AUDIO_BYTES, + MIMO_BASE64_IMAGE_BYTES, + buildAudioContent, + buildImageContent, + isMimoAudioMime, + isMimoImageMime, + modelCanReadAudio, + modelCanReadImage +} from '../src/audio' +import { detectAudioMimeType } from '../src/media' +import { readFilesInputSchema } from '../src/read_files_schema' + +test('recognizes MiMo audio models even when adapter metadata lacks AudioInput', () => { + assert.equal( + modelCanReadAudio( + { value: { capabilities: [ModelCapabilities.ToolCall] } }, + 'unifyllm/mimo-v2.5' + ), + true + ) + assert.equal( + modelCanReadAudio( + { value: { capabilities: [ModelCapabilities.ToolCall] } }, + 'mimo-v2-omni' + ), + true + ) + assert.equal( + modelCanReadAudio( + { value: { capabilities: [ModelCapabilities.ToolCall] } }, + 'unifyllm/deepseek-v4-flash' + ), + false + ) +}) + +test('uses MiMo input_audio data URL instead of ChatLuna audio_url', () => { + assert.deepEqual(buildAudioContent('mimo-v2.5', 'abc', 'audio/mpeg'), { + type: 'input_audio', + input_audio: { + data: 'data:audio/mpeg;base64,abc' + } + }) + assert.deepEqual(buildAudioContent('gpt-4o-audio', 'abc', 'audio/mpeg'), { + type: 'audio_url', + audio_url: { + url: 'data:audio/mpeg;base64,abc', + mimeType: 'audio/mpeg' + } + }) +}) + +test('keeps MiMo base64 audio within the documented 50 MB limit', () => { + assert.equal(MIMO_BASE64_AUDIO_BYTES, 50 * 1024 * 1024) + assert.equal(isMimoAudioMime('audio/mpeg'), true) + assert.equal(isMimoAudioMime('audio/wav'), true) + assert.equal(isMimoAudioMime('audio/flac'), true) + 
assert.equal(isMimoAudioMime('audio/mp4'), true) + assert.equal(isMimoAudioMime('audio/ogg'), true) + assert.equal(isMimoAudioMime('audio/aac'), false) +}) + +test('recognizes MiMo image models even when adapter metadata lacks ImageInput', () => { + assert.equal( + modelCanReadImage( + { value: { capabilities: [ModelCapabilities.ToolCall] } }, + 'unifyllm/mimo-v2.5' + ), + true + ) + assert.equal( + modelCanReadImage( + { value: { capabilities: [ModelCapabilities.ToolCall] } }, + 'mimo-v2-omni' + ), + true + ) + assert.equal( + modelCanReadImage( + { value: { capabilities: [ModelCapabilities.ToolCall] } }, + 'unifyllm/deepseek-v4-flash' + ), + false + ) +}) + +test('uses OpenAI image_url content for MiMo images', () => { + assert.deepEqual(buildImageContent('abc', 'image/png'), { + type: 'image_url', + image_url: { + url: 'data:image/png;base64,abc' + } + }) +}) + +test('keeps MiMo base64 images within the documented 50 MB limit', () => { + assert.equal(MIMO_BASE64_IMAGE_BYTES, 50 * 1024 * 1024) + assert.equal(isMimoImageMime('image/jpeg'), true) + assert.equal(isMimoImageMime('image/png'), true) + assert.equal(isMimoImageMime('image/gif'), true) + assert.equal(isMimoImageMime('image/webp'), true) + assert.equal(isMimoImageMime('image/bmp'), true) + assert.equal(isMimoImageMime('image/svg+xml'), false) +}) + +test('accepts JSON-stringified read_files input from tool calls', () => { + assert.deepEqual( + readFilesInputSchema.parse({ + files: '{"url":"http://127.0.0.1:5140/image.png"}' + }), + { + files: { + url: 'http://127.0.0.1:5140/image.png' + } + } + ) + + assert.deepEqual( + readFilesInputSchema.parse({ + files: '[{"url":"http://127.0.0.1:5140/image.png"}]' + }), + { + files: [ + { + url: 'http://127.0.0.1:5140/image.png' + } + ] + } + ) +}) + +test('detects AMR audio even when storage declares it as MP3', () => { + assert.equal( + detectAudioMimeType(Buffer.from('#!AMR\nabc'), 'audio/mp3'), + 'audio/amr' + ) + assert.equal( + 
detectAudioMimeType(Buffer.from('#!AMR\nabc'), null), + 'audio/amr' + ) + assert.equal( + detectAudioMimeType(Buffer.from('ID3abc'), 'audio/mp3'), + 'audio/mpeg' + ) +}) From 6ac2a42ffd72b340dff1dc1c751a22cb0ede72b1 Mon Sep 17 00:00:00 2001 From: yabo083 Date: Mon, 18 May 2026 03:31:32 +0800 Subject: [PATCH 2/7] fix(service-multimodal): prevent JPEG from being misdetected as MP3 `detectAudioMimeType` checked only `buffer[0] === 0xFF` to identify MP3 frame sync, but JPEG files also start with 0xFF (FF D8). This caused every JPEG passed through `read_files` to be injected into the conversation as `audio/mpeg`, crashing model APIs that reject unsupported audio formats. Tighten the check to require the full MPEG sync word: `buffer[0] === 0xFF && (buffer[1] & 0xE0) === 0xE0`. Co-Authored-By: Claude Sonnet 4.6 --- packages/service-multimodal/src/media.ts | 5 ++++- .../tests/audio-mimo.test.ts | 20 +++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/packages/service-multimodal/src/media.ts b/packages/service-multimodal/src/media.ts index 0c2cd3602..0e458913c 100644 --- a/packages/service-multimodal/src/media.ts +++ b/packages/service-multimodal/src/media.ts @@ -11,7 +11,10 @@ export function detectAudioMimeType( ) { return 'audio/silk' } - if (header.startsWith('ID3') || buffer[0] === 0xff) { + if ( + header.startsWith('ID3') || + (buffer[0] === 0xff && (buffer[1] & 0xe0) === 0xe0) + ) { return 'audio/mpeg' } if ( diff --git a/packages/service-multimodal/tests/audio-mimo.test.ts b/packages/service-multimodal/tests/audio-mimo.test.ts index 03b09d706..240e67c3d 100644 --- a/packages/service-multimodal/tests/audio-mimo.test.ts +++ b/packages/service-multimodal/tests/audio-mimo.test.ts @@ -147,3 +147,23 @@ test('detects AMR audio even when storage declares it as MP3', () => { 'audio/mpeg' ) }) + +test('does not misidentify JPEG as audio/mpeg', () => { + // JPEG starts with FF D8 FF E0 (JFIF) — 0xD8 & 0xE0 = 0xC0, not an MP3 sync + const 
jpegHeader = Buffer.from([0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46]) + assert.equal( + detectAudioMimeType(jpegHeader, 'image/jpeg'), + 'image/jpeg' + ) + assert.equal(detectAudioMimeType(jpegHeader, null), null) +}) + +test('still detects valid MP3 frame sync', () => { + // MP3: FF FB (MPEG1 Layer3) — 0xFB & 0xE0 = 0xE0, valid sync + const mp3Header = Buffer.from([0xff, 0xfb, 0x90, 0x00]) + assert.equal(detectAudioMimeType(mp3Header, null), 'audio/mpeg') + + // MP3: FF F3 (MPEG2 Layer3) + const mp3v2Header = Buffer.from([0xff, 0xf3, 0x90, 0x00]) + assert.equal(detectAudioMimeType(mp3v2Header, null), 'audio/mpeg') +}) From 8819b136169d598df175cec72bc31fe16032eb55 Mon Sep 17 00:00:00 2001 From: dingyi Date: Mon, 18 May 2026 14:41:04 +0800 Subject: [PATCH 3/7] [Feature] support native multimodal file handling --- packages/adapter-claude/src/utils.ts | 34 +- packages/adapter-gemini/src/utils.ts | 41 +- packages/adapter-ollama/src/utils.ts | 33 +- packages/adapter-openai-like/src/client.ts | 6 + packages/adapter-openai/src/client.ts | 6 + packages/adapter-qwen/src/utils.ts | 52 +- packages/service-multimodal/README.md | 28 +- packages/service-multimodal/package.json | 4 +- packages/service-multimodal/src/audio.ts | 111 --- packages/service-multimodal/src/index.ts | 8 +- packages/service-multimodal/src/media.ts | 30 - .../service-multimodal/src/plugins/audio.ts | 410 ++------ .../service-multimodal/src/plugins/image.ts | 209 ++-- .../src/plugins/read_files.ts | 922 +++++++----------- .../src/read_files_schema.ts | 31 - packages/service-multimodal/src/utils.ts | 443 +++++---- .../tests/audio-mimo.test.ts | 169 ---- packages/shared-adapter/src/client.ts | 81 +- packages/shared-adapter/src/utils.ts | 146 ++- 19 files changed, 1022 insertions(+), 1742 deletions(-) delete mode 100644 packages/service-multimodal/src/audio.ts delete mode 100644 packages/service-multimodal/src/media.ts delete mode 100644 packages/service-multimodal/src/read_files_schema.ts delete mode 
100644 packages/service-multimodal/tests/audio-mimo.test.ts diff --git a/packages/adapter-claude/src/utils.ts b/packages/adapter-claude/src/utils.ts index 06c8f8e9e..b9d38a422 100644 --- a/packages/adapter-claude/src/utils.ts +++ b/packages/adapter-claude/src/utils.ts @@ -56,41 +56,15 @@ export async function langchainMessageToClaudeMessage( const mappedMessages = await Promise.all( messages.map(async (rawMessage) => { - let content: string | ClaudeInputContentBlockParam[] | undefined = + const content: string | ClaudeInputContentBlockParam[] | undefined = typeof rawMessage.content === 'string' ? rawMessage.content : await processMessageContent(plugin, rawMessage.content) - const images = rawMessage.additional_kwargs.images as - | string[] - | null - - if ( - (model?.includes('claude-3') || model?.includes('claude-4')) && - images != null - ) { - const mappedImages = await Promise.all( - images.map(async (image) => - processImageContent(plugin, { - type: 'image_url', - image_url: { url: image } - } as MessageContentImageUrl) - ) + if (rawMessage.additional_kwargs.images != null) { + logger.warn( + 'Deprecated: `additional_kwargs.images` is no longer supported. Use `image_url` content parts instead.' ) - - const nextContent: ClaudeInputContentBlockParam[] = - mappedImages.filter((item) => item != null) - - if (Array.isArray(content)) { - nextContent.push(...content) - } else if ((content?.length ?? 
0) > 0) { - nextContent.push({ - type: 'text', - text: content - }) - } - - content = nextContent } const result: ClaudeMessage = { diff --git a/packages/adapter-gemini/src/utils.ts b/packages/adapter-gemini/src/utils.ts index 3875a1b95..9f5a0c5ba 100644 --- a/packages/adapter-gemini/src/utils.ts +++ b/packages/adapter-gemini/src/utils.ts @@ -14,7 +14,6 @@ import { ChatCompletionResponseMessageRoleEnum, ChatFunctionCallingPart, ChatFunctionResponsePart, - ChatMessagePart, ChatPart, ChatResponse, GeminiUsageMetadata @@ -78,9 +77,10 @@ export async function langchainMessageToGeminiMessage( thoughtData ) - const images = message.additional_kwargs.images as string[] | null - if (images) { - processImageParts(result, images, model) + if (message.additional_kwargs.images != null) { + logger.warn( + 'Deprecated: `additional_kwargs.images` is no longer supported. Use `image_url` content parts instead.' + ) } return result @@ -203,39 +203,6 @@ async function processFunctionMessage( ] } } -function processImageParts( - result: ChatCompletionResponseMessage, - images: string[], - model: string -) { - if ( - !( - (model.includes('vision') || - model.includes('gemini') || - model.includes('gemma2')) && - !model.includes('gemini-1.0') - ) - ) { - return - } - - for (const image of images) { - const mineType = image.split(';')?.[0]?.split(':')?.[1] ?? 
'image/jpeg' - const data = image.replace(/^data:image\/\w+;base64,/, '') - - result.parts.push({ - inline_data: { data, mime_type: mineType } - }) - } - - result.parts = result.parts.filter((uncheckedPart) => { - const part = partAsTypeCheck( - uncheckedPart, - (part) => part['text'] != null - ) - return part == null || part.text.length > 0 - }) -} async function processGeminiImageContent( plugin: ChatLunaPlugin, diff --git a/packages/adapter-ollama/src/utils.ts b/packages/adapter-ollama/src/utils.ts index 30f671cc6..946e2bab1 100644 --- a/packages/adapter-ollama/src/utils.ts +++ b/packages/adapter-ollama/src/utils.ts @@ -21,25 +21,24 @@ export async function langchainMessageToOllamaMessage( const mappedMessage = await Promise.all( messages.map(async (rawMessage) => { - let images: string[] = [] - - if (rawMessage.additional_kwargs.images != null && supportImage) { - images = rawMessage.additional_kwargs.images as string[] - } else { - images = - typeof rawMessage.content === 'string' - ? undefined - : await Promise.all( - rawMessage.content - .filter((part) => - isMessageContentImageUrl(part) - ) - .map((part) => - processOllamaImageContent(plugin, part) - ) - ) + if (rawMessage.additional_kwargs.images != null) { + logger.warn( + 'Deprecated: `additional_kwargs.images` is no longer supported. Use `image_url` content parts instead.' + ) } + const images: string[] | undefined = supportImage + ? typeof rawMessage.content === 'string' + ? 
undefined + : await Promise.all( + rawMessage.content + .filter((part) => isMessageContentImageUrl(part)) + .map((part) => + processOllamaImageContent(plugin, part) + ) + ) + : undefined + const result = { role: messageTypeToOllamaRole(rawMessage.getType()), content: getMessageContent(rawMessage.content), diff --git a/packages/adapter-openai-like/src/client.ts b/packages/adapter-openai-like/src/client.ts index 20403fdf3..3435f9df2 100644 --- a/packages/adapter-openai-like/src/client.ts +++ b/packages/adapter-openai-like/src/client.ts @@ -20,10 +20,12 @@ import { OpenAIRequester } from './requester' import { ChatLunaPlugin } from 'koishi-plugin-chatluna/services/chat' import { getModelMaxContextSize, + getOpenAIFileHandlingConfig, isEmbeddingModel, isImageGenerationModel, isNonLLMModel, isRerankerModel, + supportAudioInput, supportImageInput } from '@chatluna/v1-shared-adapter' import { RunnableConfig } from '@langchain/core/runnables' @@ -92,6 +94,9 @@ export class OpenAIClient extends PlatformModelEmbeddingsAndRerankerClient { ModelCapabilities.ToolCall, supportImageInput(model) ? ModelCapabilities.ImageInput + : null, + supportAudioInput(model) + ? 
ModelCapabilities.AudioInput : null ].filter(Boolean) } @@ -167,6 +172,7 @@ export class OpenAIClient extends PlatformModelEmbeddingsAndRerankerClient { temperature: this._config.temperature, maxRetries: this._config.maxRetries, llmType: 'openai', + fileHandlingConfig: getOpenAIFileHandlingConfig(model), isThinkModel: model.includes('reasoner') || model.includes('r1') || diff --git a/packages/adapter-openai/src/client.ts b/packages/adapter-openai/src/client.ts index 6a4857855..0e0f0ffc1 100644 --- a/packages/adapter-openai/src/client.ts +++ b/packages/adapter-openai/src/client.ts @@ -20,6 +20,8 @@ import { OpenAIRequester } from './requester' import { ChatLunaPlugin } from 'koishi-plugin-chatluna/services/chat' import { getModelMaxContextSize, + getOpenAIFileHandlingConfig, + supportAudioInput, supportImageInput } from '@chatluna/v1-shared-adapter' import { RunnableConfig } from '@langchain/core/runnables' @@ -84,6 +86,9 @@ export class OpenAIClient extends PlatformModelAndEmbeddingsClient ModelCapabilities.ToolCall, supportImageInput(model) ? ModelCapabilities.ImageInput + : undefined, + supportAudioInput(model) + ? 
ModelCapabilities.AudioInput : undefined ].filter(Boolean) } as ModelInfo @@ -125,6 +130,7 @@ export class OpenAIClient extends PlatformModelAndEmbeddingsClient timeout: this._config.timeout, temperature: this._config.temperature, maxRetries: this._config.maxRetries, + fileHandlingConfig: getOpenAIFileHandlingConfig(model), llmType: 'openai' }) } diff --git a/packages/adapter-qwen/src/utils.ts b/packages/adapter-qwen/src/utils.ts index 09201bb36..9491127cf 100644 --- a/packages/adapter-qwen/src/utils.ts +++ b/packages/adapter-qwen/src/utils.ts @@ -5,7 +5,6 @@ import { ChatMessageChunk, FunctionMessageChunk, HumanMessageChunk, - MessageContentImageUrl, MessageType, SystemMessageChunk, ToolMessage, @@ -21,11 +20,11 @@ import { } from './types' import { fetchImageUrl, - removeAdditionalProperties, - supportImageInput + removeAdditionalProperties } from '@chatluna/v1-shared-adapter' import { ChatLunaPlugin } from 'koishi-plugin-chatluna/services/chat' import { isZodSchemaV3 } from '@langchain/core/utils/types' +import { logger } from '.' 
export function formatToolsToQWenTools( tools: StructuredTool[] @@ -113,50 +112,13 @@ export async function langchainMessageToQWenMessage( } } - const images = rawMessage.additional_kwargs.images as string[] | null - - if ( - (model?.includes('qwen-vl') || - model?.includes('omni') || - model?.includes('qwen2.5-vl') || - model?.includes('qwen2.5-omni') || - model?.includes('qwen-omni') || - model?.includes('qwen2-vl') || - model?.includes('qvq') || - supportImageInput(model)) && - images != null - ) { - msg.content = [ - { - type: 'text', - text: rawMessage.content as string - } - ] - - const imageContents = await Promise.all( - images.map(async (image) => { - try { - const url = await fetchImageUrl(plugin, { - type: 'image_url', - image_url: { url: image } - } as MessageContentImageUrl) - return { - type: 'image_url', - image_url: { - url, - detail: 'low' - } - } as const - } catch { - return null - } - }) + if (rawMessage.additional_kwargs.images != null) { + logger.warn( + 'Deprecated: `additional_kwargs.images` is no longer supported. Use `image_url` content parts instead.' 
) + } - msg.content.push( - ...imageContents.filter((content) => content != null) - ) - } else if (Array.isArray(msg.content) && msg.content.length > 0) { + if (Array.isArray(msg.content) && msg.content.length > 0) { const mappedContent = await Promise.all( msg.content.map(async (content) => { if (!isMessageContentImageUrl(content)) return content diff --git a/packages/service-multimodal/README.md b/packages/service-multimodal/README.md index a630b5799..84e92d7bf 100644 --- a/packages/service-multimodal/README.md +++ b/packages/service-multimodal/README.md @@ -1,29 +1,7 @@ ## koishi-plugin-chatluna-multimodal-service -ChatLuna 多模态支持服务,提供上下文图像描述、GIF 帧处理、`read_files` 文件读取,以及语音消息转码注入能力。 +## [![npm](https://img.shields.io/npm/v/koishi-plugin-chatluna-multimodal-service)](https://www.npmjs.com/package/koishi-plugin-chatluna-multimodal-service) [![npm](https://img.shields.io/npm/dm/koishi-plugin-chatluna-multimodal-service)](https://www.npmjs.com/package/koishi-plugin-chatluna-multimodal-service) -### MiMo 音频理解 +> ChatLuna 的多模态服务插件,提供上下文图像/语音描述、GIF 处理与 `read_files` 文件读取工具。 -MiMo 官方 OpenAI 兼容接口中,`mimo-v2.5` 与 `mimo-v2-omni` 支持音频理解。服务会将 QQ/OneBot 语音先下载到本地内存,必要时通过 `ffmpeg` 转成 MP3,再以 Base64 data URL 注入 `input_audio`: - -- 避免 QQ CDN 直链过期导致模型侧晚读失败。 -- 规避 AMR、Silk 等上游模型不稳定支持的格式。 -- 遵循 MiMo Base64 单音频 50 MB 上限;URL 输入的官方上限是单文件 100 MB。 - -MiMo 官方列出的音频格式为 MP3、WAV、FLAC、M4A、OGG。实际变体较多,服务默认把语音消息转成 MP3 以提高稳定性。 - -`read_files` 也会沿用这条路线:工具调用层如果把 `files` 传成 JSON 字符串,会先容错解析;音频 URL 即使被缓存服务误标为 MP3,也会按文件头识别 AMR/Silk 等实际格式,并在模型注入前通过 `ffmpeg` 转成 MP3。 - -### MiMo 图片理解 - -`mimo-v2.5` 与 `mimo-v2-omni` 也支持图片理解。即使 OpenAI 兼容适配器暂未在模型元数据中声明 `ImageInput`,服务也会把这两个 MiMo 模型视为原生图片输入模型,并使用标准 OpenAI 兼容 `image_url` 内容块注入 Base64 data URL。 - -- 支持 JPEG、PNG、GIF、WebP、BMP。 -- MiMo Base64 单图片上限为 50 MB;URL 单图片官方上限同样为 50 MB。 -- 多图输入受模型上下文和 token 长度限制。 - -音频消息转码需要启用: - -- `enableAudioFfmpegConversion` -- `koishi-plugin-ffmpeg-path` -- 官方 Bot/QQ Silk 语音还需要 `koishi-plugin-ffmpeg-path` 提供的 `silk` 服务 
+[多模态插件文档](https://chatluna.chat/ecosystem/plugin/multimodal-service.html) diff --git a/packages/service-multimodal/package.json b/packages/service-multimodal/package.json index f79bab824..98586c1f9 100644 --- a/packages/service-multimodal/package.json +++ b/packages/service-multimodal/package.json @@ -34,8 +34,7 @@ }, "homepage": "https://github.com/ChatLunaLab/chatluna/tree/v1-dev/packages/service-image#readme", "scripts": { - "build": "atsc -b", - "test": "tsx --test tests/*.test.ts" + "build": "atsc -b" }, "keywords": [ "chatbot", @@ -58,7 +57,6 @@ "@types/omggif": "^1.0.5", "atsc": "^2.1.0", "koishi": "^4.18.9", - "koishi-plugin-adapter-onebot": "^6.9.3", "koishi-plugin-ffmpeg-path": "^2.0.0" }, "peerDependencies": { diff --git a/packages/service-multimodal/src/audio.ts b/packages/service-multimodal/src/audio.ts deleted file mode 100644 index 71aa25f00..000000000 --- a/packages/service-multimodal/src/audio.ts +++ /dev/null @@ -1,111 +0,0 @@ -import { MessageContentComplex } from '@langchain/core/messages' -import { ModelCapabilities } from 'koishi-plugin-chatluna/llm-core/platform/types' - -export const MIMO_BASE64_AUDIO_BYTES = 50 * 1024 * 1024 -export const MIMO_BASE64_IMAGE_BYTES = 50 * 1024 * 1024 - -const mimoModels = new Set(['mimo-v2.5', 'mimo-v2-omni']) - -const mimoAudioMimes = new Set([ - 'audio/mpeg', - 'audio/mp3', - 'audio/wav', - 'audio/flac', - 'audio/mp4', - 'audio/ogg' -]) - -const mimoImageMimes = new Set([ - 'image/jpeg', - 'image/png', - 'image/gif', - 'image/webp', - 'image/bmp' -]) - -export function isMimoAudioModel(model?: string): boolean { - if (!model) return false - return mimoModels.has(model.split('/').pop()?.toLowerCase() ?? '') -} - -export function isMimoImageModel(model?: string): boolean { - if (!model) return false - return mimoModels.has(model.split('/').pop()?.toLowerCase() ?? 
'') -} - -export function isMimoAudioMime(mime: string): boolean { - return mimoAudioMimes.has(mime.toLowerCase()) -} - -export function isMimoImageMime(mime: string): boolean { - return mimoImageMimes.has(mime.toLowerCase()) -} - -export function modelCanReadAudio( - info: - | { - value?: { - capabilities?: ModelCapabilities[] - } - } - | undefined, - model?: string -): boolean { - return ( - isMimoAudioModel(model) || - info?.value?.capabilities?.includes(ModelCapabilities.AudioInput) === - true - ) -} - -export function modelCanReadImage( - info: - | { - value?: { - capabilities?: ModelCapabilities[] - } - } - | undefined, - model?: string -): boolean { - return ( - isMimoImageModel(model) || - info?.value?.capabilities?.includes(ModelCapabilities.ImageInput) === - true - ) -} - -export function buildAudioContent( - model: string | undefined, - data: string, - mime: string -): MessageContentComplex { - if (isMimoAudioModel(model)) { - return { - type: 'input_audio', - input_audio: { - data: `data:${mime};base64,${data}` - } - } as unknown as MessageContentComplex - } - - return { - type: 'audio_url', - audio_url: { - url: `data:${mime};base64,${data}`, - mimeType: mime - } - } as unknown as MessageContentComplex -} - -export function buildImageContent( - data: string, - mime: string -): MessageContentComplex { - return { - type: 'image_url', - image_url: { - url: `data:${mime};base64,${data}` - } - } as unknown as MessageContentComplex -} diff --git a/packages/service-multimodal/src/index.ts b/packages/service-multimodal/src/index.ts index b7b402a42..62ff61556 100644 --- a/packages/service-multimodal/src/index.ts +++ b/packages/service-multimodal/src/index.ts @@ -83,7 +83,7 @@ export const Config: Schema = Schema.intersect([ export const inject = { required: ['chatluna'], - optional: ['chatluna_storage', 'ffmpeg', 'silk'] + optional: ['ffmpeg', 'silk'] } export const name = 'chatluna-multimodal-service' @@ -99,10 +99,4 @@ export const usage = ` ### 注意 建议搭配 
\`chatluna-storage-service\` 使用。请求中的图像、文件大小限制遵循模型平台配置(如 Gemini:PDF 单文件 50MB、其他单文件 100MB、单轮总计 100MB,以文件被编码为 Base64 后的大小为准)。 - -### MiMo 音频理解 -\`mimo-v2.5\` 与 \`mimo-v2-omni\` 的音频理解走 OpenAI 兼容 \`input_audio\`。启用音频转换后,服务会先读取语音 URL,必要时用 ffmpeg/Silk 转为 MP3,再以 Base64 data URL 注入模型,避免 QQ CDN 过期和 AMR/Silk 等格式兼容问题。MiMo Base64 单音频上限为 50MB。 - -### MiMo 图片理解 -\`mimo-v2.5\` 与 \`mimo-v2-omni\` 的图片理解走 OpenAI 兼容 \`image_url\`。即使适配器暂未声明 \`ImageInput\`,服务也会按 MiMo 官方能力接入 JPEG、PNG、GIF、WebP、BMP,Base64 与 URL 单图片上限均为 50MB,多图受模型上下文限制。 ` diff --git a/packages/service-multimodal/src/media.ts b/packages/service-multimodal/src/media.ts deleted file mode 100644 index 0e458913c..000000000 --- a/packages/service-multimodal/src/media.ts +++ /dev/null @@ -1,30 +0,0 @@ -export function detectAudioMimeType( - buffer: Buffer, - declaredMimeType?: string | null -): string | null { - const header = buffer.subarray(0, 16).toString('latin1') - - if (header.startsWith('#!AMR')) return 'audio/amr' - if ( - header.startsWith('#!SILK_V3') || - buffer.subarray(1, 10).toString('latin1') === '#!SILK_V3' - ) { - return 'audio/silk' - } - if ( - header.startsWith('ID3') || - (buffer[0] === 0xff && (buffer[1] & 0xe0) === 0xe0) - ) { - return 'audio/mpeg' - } - if ( - header.startsWith('RIFF') && - buffer.subarray(8, 12).toString('latin1') === 'WAVE' - ) { - return 'audio/wav' - } - if (header.startsWith('fLaC')) return 'audio/flac' - if (header.startsWith('OggS')) return 'audio/ogg' - - return declaredMimeType ?? 
null -} diff --git a/packages/service-multimodal/src/plugins/audio.ts b/packages/service-multimodal/src/plugins/audio.ts index 57ccf5f64..6afad7fab 100644 --- a/packages/service-multimodal/src/plugins/audio.ts +++ b/packages/service-multimodal/src/plugins/audio.ts @@ -1,109 +1,93 @@ import { MessageContentComplex } from '@langchain/core/messages' import { Context, h, Session } from 'koishi' import type { OneBotBot } from 'koishi-plugin-adapter-onebot' -import { Message } from 'koishi-plugin-chatluna' -import type {} from 'koishi-plugin-chatluna-storage-service' +import { ModelCapabilities } from 'koishi-plugin-chatluna/llm-core/platform/types' import type {} from 'koishi-plugin-ffmpeg-path' import { Config, logger } from '..' import { - buildAudioContent, - isMimoAudioModel, - MIMO_BASE64_AUDIO_BYTES, - modelCanReadAudio -} from '../audio' - -const CHATLUNA_DOWNLOAD_USER_AGENT = - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36' -const MAX_AUDIO_BYTES = 25 * 1024 * 1024 + BROWSER_UA, + convertAudioToMp3, + detectAudioMimeType, + ensureContentArray +} from '../utils' + +// MIMEs commonly accepted by OpenAI / Gemini / MiMo audio inputs. Anything +// else (Silk, AMR, ...) is transcoded to MP3. +const NATIVE_AUDIO_MIMES = new Set([ + 'audio/mpeg', + 'audio/mp3', + 'audio/wav', + 'audio/flac', + 'audio/ogg', + 'audio/mp4', + 'audio/aac', + 'audio/webm' +]) + +const MIME_TO_EXT: Record = { + 'audio/mpeg': 'mp3', + 'audio/mp3': 'mp3', + 'audio/wav': 'wav', + 'audio/flac': 'flac', + 'audio/ogg': 'ogg', + 'audio/mp4': 'm4a' +} +/** + * Intercept voice/audio elements: download, transcode unfriendly formats + * (Silk/AMR/...) to MP3, then inject as a Base64 `audio_url` content part. + * OpenAI-compatible adapters convert the result to `input_audio` downstream. 
+ */ export function apply(ctx: Context, config: Config) { - if (!config.enableAudioFfmpegConversion) { - return - } + if (!config.enableAudioFfmpegConversion) return ctx.effect(() => ctx.chatluna.messageTransformer.intercept( 'audio', async (session, element, message, model) => { - const modelInfo = model - ? ctx.chatluna.platform.findModel(model) - : undefined + if (!modelAcceptsAudio(ctx, model)) return false - if (!modelCanReadAudio(modelInfo, model)) { - return false - } + const sourceUrl = await resolveAudioSourceUrl(session, element) + if (!sourceUrl) return false - const sourceUrl = await resolveAudioSourceUrl( - ctx, - session, - element - ) - if (!sourceUrl) { - return false - } - - const fileName = - element.attrs['file'] ?? - element.attrs['name'] ?? - element.attrs['filename'] - - const fileData = await readFile(ctx, sourceUrl) - if (!fileData.buffer) { - return false - } + const buffer = await downloadAudio(ctx, sourceUrl) + if (!buffer) return false - const converted = await tryConvertAudioToMp3( - ctx, - fileData.buffer, - fileName + const detected = detectAudioMimeType( + buffer, + element.attrs['mime'] as string | null ) - if (!converted) { - logger.warn(`Failed to convert audio to MP3: ${sourceUrl}`) - return false - } - - const { fileName: displayFileName, buffer } = converted - element.attrs['file'] = displayFileName - element.attrs['filename'] = displayFileName - - const base64 = buffer.toString('base64') - if ( - isMimoAudioModel(model) && - Buffer.byteLength(base64) > MIMO_BASE64_AUDIO_BYTES - ) { - logger.warn( - `Skip oversized MiMo audio after base64 encoding: ${Buffer.byteLength(base64)} bytes > ${MIMO_BASE64_AUDIO_BYTES} bytes` - ) - return false + let outBuffer = buffer + let outMime = detected ?? 'audio/mpeg' + + if (!detected || !NATIVE_AUDIO_MIMES.has(detected)) { + const converted = await convertAudioToMp3(ctx, buffer) + if (!converted) { + logger.warn( + `Skip audio: format ${detected ?? 
'unknown'} not natively supported and ffmpeg conversion failed.` + ) + return false + } + outBuffer = converted + outMime = 'audio/mpeg' } - const audioUrl = - !isMimoAudioModel(model) && ctx.chatluna_storage - ? (element.attrs['chatluna_file_url'] = ( - await ctx.chatluna_storage.createTempFile( - buffer, - displayFileName - ) - ).url) - : ((element.attrs['chatluna_file_url'] = sourceUrl), - `data:audio/mpeg;base64,${base64}`) + const dataUrl = `data:${outMime};base64,${outBuffer.toString('base64')}` + const ext = MIME_TO_EXT[outMime] ?? 'mp3' + const fileName = `${stripExtension(audioName(element))}.${ext}` + element.attrs['file'] = fileName + element.attrs['filename'] = fileName + element.attrs['chatluna_file_url'] = sourceUrl - ensureContentArray(message, `[voice:${displayFileName}]`) - ;(message.content as MessageContentComplex[]).push( - isMimoAudioModel(model) - ? buildAudioContent(model, base64, 'audio/mpeg') - : ({ - type: 'audio_url', - audio_url: { - url: audioUrl, - mimeType: 'audio/mpeg' - } - } as unknown as MessageContentComplex) - ) + ensureContentArray(message, `[voice:${fileName}]`) + ;(message.content as MessageContentComplex[]).push({ + type: 'audio_url', + audio_url: { url: dataUrl, mimeType: outMime } + } as unknown as MessageContentComplex) logger.debug( - `Transcoded unsupported audio to mp3 for multimodal input: ${displayFileName}` + `Injected audio for ${model}: ${fileName} (${outMime}, ${outBuffer.byteLength} bytes)` ) return true }, @@ -112,22 +96,28 @@ export function apply(ctx: Context, config: Config) { ) } +function modelAcceptsAudio(ctx: Context, model: string | undefined): boolean { + if (!model) return false + return ( + ctx.chatluna.platform + .findModel(model) + ?.value?.capabilities?.includes(ModelCapabilities.AudioInput) === + true + ) +} + async function resolveAudioSourceUrl( - ctx: Context, session: Session, element: h ): Promise { - const srcAttr = (element.attrs['src'] ?? 
element.attrs['url']) as + const src = (element.attrs['src'] ?? element.attrs['url']) as | string | undefined - if (srcAttr?.startsWith('http')) { - return srcAttr - } - - if (session.platform !== 'onebot') return srcAttr ?? null + if (src?.startsWith('http')) return src + if (session.platform !== 'onebot') return src ?? null const fileId = element.attrs['fileId'] ?? element.attrs['fileid'] - if (!fileId) return srcAttr ?? null + if (!fileId) return src ?? null try { const bot = session.bot as OneBotBot @@ -136,239 +126,37 @@ async function resolveAudioSourceUrl( ? await bot.internal.getPrivateFileUrl(session.userId, fileId) : await bot.internal.getGroupFileUrl(session.guildId, fileId, busId) } catch { - return srcAttr ?? null + return src ?? null } } -async function readFile( +async function downloadAudio( ctx: Context, url: string -): Promise<{ buffer: Buffer | null; mimeType: string | null }> { - const headers = { 'User-Agent': CHATLUNA_DOWNLOAD_USER_AGENT } - - let sanitizedUrl: string +): Promise { try { - const parsed = new URL(url) - sanitizedUrl = parsed.origin + parsed.pathname - } catch { - sanitizedUrl = url - } - - let mimeTypeFromHead: string | null = null - - // Try HEAD request for size check - try { - const headResponse = await ctx.http(url, { method: 'head', headers }) - const headHeaders: Headers = headResponse?.headers - mimeTypeFromHead = - headHeaders - ?.get('content-type') - ?.split(';')[0] - ?.trim() - ?.toLowerCase() ?? null - - const headContentLength = headHeaders?.get('content-length') - ? Number(headHeaders.get('content-length')) - : null - - if ( - headContentLength != null && - Number.isFinite(headContentLength) && - headContentLength > MAX_AUDIO_BYTES - ) { - logger.warn( - `Skip reading oversized audio from ${sanitizedUrl}: ${headContentLength} bytes > ${MAX_AUDIO_BYTES} bytes` - ) - return { buffer: null, mimeType: mimeTypeFromHead } - } - } catch { - // Some endpoints do not support HEAD; continue with GET safeguards. 
- } - - try { - const response = await fetch(url, { method: 'GET', headers }) - - if (!response.ok) { - throw new Error(`HTTP ${response.status}`) - } - - const mimeType = - response.headers - .get('content-type') - ?.split(';')[0] - ?.trim() - ?.toLowerCase() ?? mimeTypeFromHead - const responseContentLength = response.headers.get('content-length') - ? Number(response.headers.get('content-length')) - : null - - if ( - responseContentLength != null && - Number.isFinite(responseContentLength) && - responseContentLength > MAX_AUDIO_BYTES - ) { - logger.warn( - `Skip reading oversized audio from ${sanitizedUrl}: ${responseContentLength} bytes > ${MAX_AUDIO_BYTES} bytes` - ) - return { buffer: null, mimeType } - } - - if (response.body == null) { - const arrayBuffer = await response.arrayBuffer() - if (arrayBuffer.byteLength > MAX_AUDIO_BYTES) { - logger.warn( - `Skip reading oversized audio from ${sanitizedUrl}: ${arrayBuffer.byteLength} bytes > ${MAX_AUDIO_BYTES} bytes` - ) - return { buffer: null, mimeType } - } - return { buffer: Buffer.from(arrayBuffer), mimeType } - } - - const reader = response.body.getReader() - const chunks: Buffer[] = [] - let totalBytes = 0 - - while (true) { - const { done, value } = await reader.read() - if (done) break - - if (!value?.byteLength) continue - - totalBytes += value.byteLength - if (totalBytes > MAX_AUDIO_BYTES) { - await reader.cancel('audio exceeds max size') - logger.warn( - `Skip reading oversized audio from ${sanitizedUrl}: streamed bytes exceed ${MAX_AUDIO_BYTES} bytes` - ) - return { buffer: null, mimeType } - } - - chunks.push(Buffer.from(value)) - } - - return { buffer: Buffer.concat(chunks, totalBytes), mimeType } - } catch (error) { - logger.warn(`Failed to read audio from ${sanitizedUrl}:`, error) - return { buffer: null, mimeType: null } - } -} - -function toMp3FileName(fileName?: string): string { - const baseName = (fileName ?? 
'voice').trim() - const dotIndex = baseName.lastIndexOf('.') - return `${dotIndex <= 0 ? baseName : baseName.slice(0, dotIndex)}.mp3` -} - -async function tryConvertAudioToMp3( - ctx: Context, - inputBuffer: Buffer, - fileName?: string -): Promise<{ buffer: Buffer; fileName: string } | null> { - try { - let sourceBuffer = inputBuffer - let decodedPcmSampleRate: number | null = null - - if (isSilkAudio(inputBuffer)) { - const decoded = await decodeSilkAudio(ctx, inputBuffer) - sourceBuffer = decoded.buffer - decodedPcmSampleRate = decoded.sampleRate - logger.debug('Decoded silk audio before mp3 transcoding.') - } - - const ffmpeg = ctx.ffmpeg - if (!ffmpeg) { - throw new Error( - 'FFmpeg service is unavailable. Please enable koishi-plugin-ffmpeg-path.' - ) - } - - const builder = ffmpeg.builder().input(sourceBuffer) - if (decodedPcmSampleRate != null) { - builder.inputOption( - '-f', - 's16le', - '-ar', - String(decodedPcmSampleRate), - '-ac', - '1' - ) - } - - const outputBuffer = await builder - .outputOption( - '-vn', - '-acodec', - 'libmp3lame', - '-q:a', - '4', - '-f', - 'mp3' - ) - .run('buffer') - - return { - buffer: outputBuffer, - fileName: toMp3FileName(fileName) - } + const { data } = await ctx.http(url, { + responseType: 'arraybuffer', + method: 'get', + headers: { 'User-Agent': BROWSER_UA } + }) + return Buffer.from(data) } catch (error) { - logger.warn( - `Audio transcoding to mp3 failed, fallback to original audio: ${error instanceof Error ? error.message : String(error)}` - ) + logger.warn(`Failed to fetch audio from ${url}:`, error) return null } } -function isSilkAudio(inputBuffer: Buffer): boolean { - if (inputBuffer.length < 9) return false - const sig = inputBuffer.subarray(0, 9).toString('latin1') +function audioName(element: h): string { return ( - sig === '#!SILK_V3' || - inputBuffer.subarray(1, 10).toString('latin1') === '#!SILK_V3' + (element.attrs['file'] as string | undefined) ?? + (element.attrs['name'] as string | undefined) ?? 
+ (element.attrs['filename'] as string | undefined) ?? + 'voice' ) } -async function decodeSilkAudio( - ctx: Context, - inputBuffer: Buffer -): Promise<{ buffer: Buffer; sampleRate: number }> { - const silk = ctx.silk - if (!silk) { - throw new Error( - 'Detected silk audio, but no silk service is available for decoding' - ) - } - for (const sampleRate of [24000, 16000, 12000, 8000]) { - try { - const result = (await silk.decode( - inputBuffer, - sampleRate - )) as DecodeResult - - if (result?.data != null) { - return { buffer: Buffer.from(result.data), sampleRate } - } - } catch { - continue - } - } - - throw new Error('silk decode returned empty output') -} - -function ensureContentArray(message: Message, fallbackText: string) { - if (typeof message.content === 'string') { - message.content = [ - { - type: 'text', - text: message.content.trim().length - ? message.content - : fallbackText - } - ] - } -} - -interface DecodeResult { - data: Uint8Array - duration: number +function stripExtension(name: string): string { + const dot = name.lastIndexOf('.') + return dot > 0 ? name.slice(0, dot) : name } diff --git a/packages/service-multimodal/src/plugins/image.ts b/packages/service-multimodal/src/plugins/image.ts index 560eb959a..f1b64e51a 100644 --- a/packages/service-multimodal/src/plugins/image.ts +++ b/packages/service-multimodal/src/plugins/image.ts @@ -1,6 +1,6 @@ -/* eslint-disable max-len */ import { Context } from 'koishi' import { Message } from 'koishi-plugin-chatluna' +import { ChatLunaChatModel } from 'koishi-plugin-chatluna/llm-core/platform/model' import { ModelCapabilities } from 'koishi-plugin-chatluna/llm-core/platform/types' import { ChatLunaPlugin } from 'koishi-plugin-chatluna/services/chat' import { Config, logger } from '..' @@ -11,8 +11,12 @@ import { processImageWithModel, readImage } from '../utils' -import { modelCanReadImage } from '../audio' +/** + * Intercept image elements. 
Native-capable models receive the data URL + * directly (GIFs are split into frames). Otherwise fall back to describing + * the image via the configured vision model and inject the description. + */ export async function apply( ctx: Context, config: Config, @@ -22,133 +26,106 @@ export async function apply( config.imageModel ) - const disposable = ctx.chatluna.messageTransformer.intercept( - 'img', - async (_session, element, message, model) => { - const parsedModelInfo = - model != null - ? ctx.chatluna.platform.findModel(model) - : undefined - const modelSupportsImageInput = - modelCanReadImage(parsedModelInfo, model) + ctx.effect(() => + ctx.chatluna.messageTransformer.intercept( + 'img', + async (_session, element, message, model) => { + const url = (element.attrs.url ?? element.attrs.src) as string + if (!url) return false - let imageData: Awaited> - const url = (element.attrs.url ?? element.attrs.src) as string - - if (modelSupportsImageInput) { - imageData = await readImage(ctx, url) - - if (imageData.ext == null) { + const native = modelAcceptsImage(ctx, model) + if (!native && !config.enableContextImageDescription) { return false } - if (imageData.ext === 'image/gif') { - if (!config.enableContextGifHandling) { - return false - } + const imageData = await readImage(ctx, url) + if (imageData.ext == null) return false - logger.debug(`image url: ${url.substring(0, 50)}...`) - const frames = await parseGifToFrames(imageData.buffer, { - strategy: config.gifStrategy, - frameCount: config.gifFrameCount - }) + const isGif = imageData.ext === 'image/gif' + if (isGif && !config.enableContextGifHandling) return false - logger.debug(`Extracted ${frames.length} frames from GIF`) - - for (const frame of frames) { - addImageToContent(message, frame) + if (native) { + if (isGif) { + await injectGifFrames(message, imageData.buffer, config) + addTextToContent(message, '[image: GIF]') + } else if (imageData.base64Source) { + addImageToContent(message, imageData.base64Source) 
} - - addTextToContent(message, '[image: GIF]') - return true } - if (imageData.base64Source != null) { - addImageToContent(message, imageData.base64Source) - return true - } - } - - if (!config.enableContextImageDescription) { - return false - } - - if (imageUnderstandModel.value == null) { - logger.warn( - `The model ${config.imageModel} is not loaded, please check your chat adapter` - ) - return false - } - - if ( - !imageUnderstandModel.value.modelInfo.capabilities.includes( - ModelCapabilities.ImageInput - ) - ) { - logger.warn( - `The model ${config.imageModel} in image-service does not support image input, please check your chat adapter` - ) - return false - } - - try { - const fakeMessage: Message = { - content: [] - } - - logger.debug(`image url: ${url}`) - - imageData = imageData ?? (await readImage(ctx, url)) - - if (imageData.ext == null) { - return false - } - - if (imageData.ext === 'image/gif') { - if (!config.enableContextGifHandling) { - return false - } - - const frames = await parseGifToFrames(imageData.buffer, { - strategy: config.gifStrategy, - frameCount: config.gifFrameCount - }) - - logger.debug( - `Extracted ${frames.length} frames from GIF for model processing` - ) - - addTextToContent( - fakeMessage, - 'This is a GIF image. 
See the frames below:' - ) - for (const frame of frames) { - addImageToContent(fakeMessage, frame) - } - } else { - addImageToContent(fakeMessage, imageData.base64Source) - } - - const result = await processImageWithModel( - imageUnderstandModel.value, + return describeAndInject( + message, + imageData, + isGif, config, - fakeMessage + imageUnderstandModel.value, + url ) + }, + 100 + ) + ) +} - if (result) { - addTextToContent(message, '\n\n' + result) - return true - } - } catch (error) { - logger.warn( - `Read image ${url} error, check your chat adapter`, - error - ) - } - }, - 100 +function modelAcceptsImage(ctx: Context, model: string | undefined): boolean { + if (!model) return false + return ( + ctx.chatluna.platform + .findModel(model) + ?.value?.capabilities?.includes(ModelCapabilities.ImageInput) === + true ) +} - ctx.effect(() => disposable) +async function injectGifFrames( + message: Message, + buffer: Buffer, + config: Config +): Promise { + const frames = await parseGifToFrames(buffer, { + strategy: config.gifStrategy, + frameCount: config.gifFrameCount + }) + logger.debug(`Extracted ${frames.length} frames from GIF`) + for (const frame of frames) addImageToContent(message, frame) +} + +async function describeAndInject( + message: Message, + imageData: Awaited>, + isGif: boolean, + config: Config, + imageModel: ChatLunaChatModel | undefined, + url: string +): Promise { + if ( + imageModel == null || + !imageModel.modelInfo.capabilities.includes( + ModelCapabilities.ImageInput + ) + ) { + logger.warn( + `Image-description model "${config.imageModel}" is missing or lacks image input — skip.` + ) + return false + } + + try { + const fake: Message = { content: [] } + if (isGif) { + addTextToContent(fake, 'This is a GIF image. 
See the frames below:') + await injectGifFrames(fake, imageData.buffer, config) + } else if (imageData.base64Source) { + addImageToContent(fake, imageData.base64Source) + } + const result = await processImageWithModel(imageModel, config, fake) + if (result) { + addTextToContent(message, '\n\n' + result) + return true + } + } catch (error) { + logger.warn(`Image describe failed for ${url}:`, error) + } + return false } diff --git a/packages/service-multimodal/src/plugins/read_files.ts b/packages/service-multimodal/src/plugins/read_files.ts index 896261e50..06fe42d33 100644 --- a/packages/service-multimodal/src/plugins/read_files.ts +++ b/packages/service-multimodal/src/plugins/read_files.ts @@ -3,330 +3,45 @@ import { StructuredTool } from '@langchain/core/tools' import { HumanMessage, MessageContentComplex } from '@langchain/core/messages' import { Context } from 'koishi' import { ComputedRef, Message } from 'koishi-plugin-chatluna' -import type {} from 'koishi-plugin-ffmpeg-path' import { ChatLunaChatModel } from 'koishi-plugin-chatluna/llm-core/platform/model' +import type { FileHandlingConfig } from 'koishi-plugin-chatluna/llm-core/platform/client' import { ChatLunaToolRunnable, ModelCapabilities } from 'koishi-plugin-chatluna/llm-core/platform/types' import { ChatLunaPlugin } from 'koishi-plugin-chatluna/services/chat' -import { - isMessageContentAudio, - isMessageContentVideo, - type MessageContentVideo -} from 'koishi-plugin-chatluna/utils/langchain' import { getBase64EncodedSize } from 'koishi-plugin-chatluna/utils/base64' import { Config, logger } from '..' 
import { addImageToContent, addTextToContent, + BROWSER_UA, + convertAudioToMp3, + detectAudioMimeType, + IMAGE_MIME_TYPES, + inferMimeTypeFromUrl, + normalizeMimeType, parseGifToFrames, processImageWithModel } from '../utils' -import { - buildAudioContent, - buildImageContent, - isMimoAudioMime, - isMimoImageMime, - MIMO_BASE64_AUDIO_BYTES, - MIMO_BASE64_IMAGE_BYTES, - modelCanReadAudio, - modelCanReadImage -} from '../audio' -import { detectAudioMimeType } from '../media' -import { readFilesInputSchema } from '../read_files_schema' import z from 'zod' -// --------------------------------------------------------------------------- -// Constants -// --------------------------------------------------------------------------- - -const IMAGE_MIME_TYPES = new Set([ - 'image/png', - 'image/jpeg', - 'image/bmp', - 'image/webp', - 'image/gif' -]) - const DEFAULT_MAX_FILE_SIZE_BYTES = 100 * 1024 * 1024 const DEFAULT_MAX_TOTAL_SIZE_BYTES = 100 * 1024 * 1024 -const FILE_EXTENSION_TO_MIME_TYPE = new Map([ - ['.png', 'image/png'], - ['.jpg', 'image/jpeg'], - ['.jpeg', 'image/jpeg'], - ['.bmp', 'image/bmp'], - ['.webp', 'image/webp'], - ['.gif', 'image/gif'], - ['.pdf', 'application/pdf'], - ['.txt', 'text/plain'], - ['.md', 'text/markdown'], - ['.html', 'text/html'], - ['.htm', 'text/html'], - ['.css', 'text/css'], - ['.xml', 'text/xml'], - ['.csv', 'text/csv'], - ['.rtf', 'text/rtf'], - ['.js', 'text/javascript'], - ['.mjs', 'text/javascript'], - ['.json', 'application/json'], - ['.mp4', 'video/mp4'], - ['.mpeg', 'video/mpeg'], - ['.mov', 'video/mov'], - ['.avi', 'video/avi'], - ['.flv', 'video/x-flv'], - ['.mpg', 'video/mpg'], - ['.webm', 'video/webm'], - ['.wmv', 'video/wmv'], - ['.3gp', 'video/3gpp'], - ['.3gpp', 'video/3gpp'], - ['.mp3', 'audio/mpeg'], - ['.aiff', 'audio/aiff'], - ['.aac', 'audio/aac'], - ['.flac', 'audio/flac'], - ['.wav', 'audio/wav'], - ['.ogg', 'audio/ogg'], - ['.m4a', 'audio/mp4'] -]) - -// 
--------------------------------------------------------------------------- -// Helpers -// --------------------------------------------------------------------------- - -function isHttpOrHttpsUrl(url: string): boolean { - try { - const parsed = new URL(url) - return parsed.protocol === 'http:' || parsed.protocol === 'https:' - } catch { - return false - } -} - -function normalizeMimeType(raw: string | null): string | null { - if (raw == null) return null - const mimeType = raw.split(';')[0]?.trim()?.toLowerCase() - return mimeType || null -} - -function getHeaderValue(headers: unknown, name: string): string | null { - if (headers == null) return null - - if ( - typeof (headers as { get?: unknown }).get === 'function' - ) { - const value = (headers as { get(name: string): string | null }).get( - name +const fileSchema = z.object({ url: z.string().url() }) +const readFilesSchema = z.object({ + files: z + .union([fileSchema, z.array(fileSchema).min(1).max(10)]) + .describe( + 'One file or a list of files to read (max 10). File format: { url: string }. MIME type is inferred from response headers, then URL extension.' ) - return typeof value === 'string' ? value : null - } - - const record = headers as Record - const value = record[name] ?? record[name.toLowerCase()] - if (typeof value === 'string') return value - if (Array.isArray(value) && typeof value[0] === 'string') { - return value[0] - } - - return null -} - -function inferMimeTypeFromPath(path: string): string | null { - const sanitizedPath = path.toLowerCase().split(/[?#]/, 1)[0] - const fileName = sanitizedPath.split(/[/\\]/).pop() ?? sanitizedPath - const extensionIndex = fileName.lastIndexOf('.') +}) - if (extensionIndex < 0) { - return null - } - - const extension = fileName.slice(extensionIndex) - return FILE_EXTENSION_TO_MIME_TYPE.get(extension) ?? 
null -} - -function inferMimeTypeFromUrl(url: string): string | null { - try { - const pathname = new URL(url).pathname - return inferMimeTypeFromPath(pathname) - } catch { - // ignore - } - - return null -} - -async function convertAudioBufferToMp3( - ctx: Context, - buffer: Buffer -): Promise { - const ffmpeg = ctx.ffmpeg - if (!ffmpeg) { - return null - } - - try { - return await ffmpeg - .builder() - .input(buffer) - .outputOption( - '-vn', - '-acodec', - 'libmp3lame', - '-q:a', - '4', - '-f', - 'mp3' - ) - .run('buffer') - } catch (error) { - logger.warn( - `read_files audio transcoding to mp3 failed: ${error instanceof Error ? error.message : String(error)}` - ) - return null - } -} - -/** - * Check whether the model natively supports a given MIME type based on its - * capabilities and `FileHandlingConfig`. - */ -function modelSupportsNativeMimeType( - model: ChatLunaChatModel, +interface NativePart { mimeType: string -): boolean { - const caps = model.modelInfo.capabilities - - let capabilitySupportsMime = false - if (IMAGE_MIME_TYPES.has(mimeType)) { - capabilitySupportsMime = modelCanReadImage( - { value: model.modelInfo }, - model.modelInfo.name - ) - } else if (mimeType.startsWith('audio/')) { - capabilitySupportsMime = modelCanReadAudio( - { value: model.modelInfo }, - model.modelInfo.name - ) - } else if (mimeType.startsWith('video/')) { - capabilitySupportsMime = caps.includes(ModelCapabilities.VideoInput) - } else if ( - mimeType.startsWith('text/') || - mimeType === 'application/json' || - mimeType === 'application/pdf' - ) { - capabilitySupportsMime = caps.includes(ModelCapabilities.FileInput) - } - - if (!capabilitySupportsMime) { - return false - } - - const fileConfig = model.fileHandlingConfig - if (fileConfig != null) { - return fileConfig.supportedMimeTypes.has(mimeType) - } - - return true -} - -function isMimeTypeEnabled(config: Config, mimeType: string): boolean { - if (mimeType === 'image/gif') { - return config.enableGifReadTool - } - - 
if (IMAGE_MIME_TYPES.has(mimeType)) { - return config.enableImageReadTool - } - - return config.enableFileReadTool -} - -function buildReadFilesDescription(config: Config): string { - const sections: string[] = [] - - if (config.enableImageReadTool) { - sections.push( - '- Image read/describe (non-GIF): image/bmp, image/jpeg, image/png, image/webp. If the model lacks native image input, fallback image description will be used.' - ) - } - - if (config.enableGifReadTool) { - sections.push( - '- GIF read/describe: image/gif. Native-capable models receive extracted frames; otherwise fallback image description is used.' - ) - } - - if (config.enableFileReadTool) { - sections.push( - '- File read: text/html, text/css, text/plain, text/markdown, text/xml, text/csv, text/rtf, text/javascript, application/json, application/pdf, audio/*, video/* (effective MIME set still depends on model capabilities and FileHandlingConfig).' - ) - } - - return `Read files from URL(s) and return their content. -Enabled read_files capabilities: -${sections.join('\n')} -Use this tool when you need to read files from URL(s) as context.` -} - -/** - * Build a multimodal `HumanMessage` containing the file(s) as content parts, - * suitable for injecting into the conversation context. 
- */ -function buildMultimodalMessage( - parts: { - mimeType: string - base64Data: string - sourceUrl: string - }[], - insertPrompt: string, - model?: string -): HumanMessage { - const content: MessageContentComplex[] = [] - - for (const part of parts) { - const { mimeType, base64Data } = part - - if (IMAGE_MIME_TYPES.has(mimeType)) { - content.push(buildImageContent(base64Data, mimeType)) - } else if (mimeType.startsWith('audio/')) { - const audioContent = buildAudioContent(model, base64Data, mimeType) - - if (isMessageContentAudio(audioContent as MessageContentComplex)) { - content.push(audioContent as MessageContentComplex) - } else if (audioContent.type === 'input_audio') { - content.push(audioContent as MessageContentComplex) - } - } else if (mimeType.startsWith('video/')) { - const videoContent: MessageContentVideo = { - type: 'video_url', - video_url: { - url: `data:${mimeType};base64,${base64Data}`, - mimeType - } - } - - if (isMessageContentVideo(videoContent as MessageContentComplex)) { - content.push(videoContent as MessageContentComplex) - } - } else { - // Inline data for text/pdf/etc. 
(Gemini-style) - content.push({ - inline_data: { - mime_type: mimeType, - data: base64Data - } - } as unknown as MessageContentComplex) - } - } - - if (content.length > 0) { - content.unshift({ - type: 'text', - text: insertPrompt - }) - } - - return new HumanMessage({ content }) + base64Data: string + sourceUrl: string } // --------------------------------------------------------------------------- @@ -335,10 +50,9 @@ function buildMultimodalMessage( export class ReadFilesTool extends StructuredTool { name = 'read_files' + schema = readFilesSchema description: string - schema = readFilesInputSchema - constructor( private readonly ctx: Context, private readonly config: Config, @@ -347,7 +61,7 @@ export class ReadFilesTool extends StructuredTool { > ) { super({}) - this.description = buildReadFilesDescription(config) + this.description = describeTool(config) } async _call( @@ -359,407 +73,301 @@ export class ReadFilesTool extends StructuredTool { const model = runConfig?.configurable?.model const conversationId = runConfig?.configurable?.conversationId const fileConfig = model?.fileHandlingConfig - - let totalBase64Bytes = 0 - const maxTotalSize = + const maxTotal = fileConfig?.maxTotalSizeBytes ?? 
DEFAULT_MAX_TOTAL_SIZE_BYTES - const nativeParts: { - mimeType: string - base64Data: string - sourceUrl: string - }[] = [] - - const response: { - files: { - sourceUrl: string - mimeType?: string - status: 'ok' | 'described' | 'error' - description?: string - error?: string - }[] - successCount: number - failureCount: number - } = { + const native: NativePart[] = [] + const report: ToolReport = { files: [], successCount: 0, failureCount: 0 } + let totalBytes = 0 let describedCount = 0 - for (const file of files) { - const sourceUrl = file.url - - const pushError = (errorMessage: string, mimeType?: string) => { - response.files.push({ + for (const { url: sourceUrl } of files) { + if (!isHttp(sourceUrl)) { + pushError( + report, sourceUrl, - mimeType, - status: 'error', - error: errorMessage - }) - response.failureCount++ + 'Only http/https URLs are supported.' + ) + continue } try { - if (!isHttpOrHttpsUrl(sourceUrl)) { - pushError( - 'Only http/https URLs are supported for read_files.' - ) + const fetched = await this._fetch(sourceUrl) + if (!fetched) { + pushError(report, sourceUrl, 'Failed to fetch URL.') continue } - // Determine MIME type first by fetching with headers - const controller = new AbortController() - const timeout = setTimeout(() => controller.abort(), 60_000) - const httpResponse = await this.ctx - .http(sourceUrl, { - responseType: 'arraybuffer', - method: 'get', - headers: { - 'User-Agent': - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36' - }, - signal: controller.signal - }) - .finally(() => { - clearTimeout(timeout) - }) - - const buffer = Buffer.from(httpResponse.data) - - // Resolve MIME type from response headers or URL - const responseMimeType = normalizeMimeType( - getHeaderValue(httpResponse.headers, 'content-type') - ) - - const declaredMimeType = - responseMimeType ?? 
inferMimeTypeFromUrl(sourceUrl) - const detectedAudioMimeType = detectAudioMimeType( - buffer, - declaredMimeType + const declared = + normalizeMimeType(fetched.contentType) ?? + inferMimeTypeFromUrl(sourceUrl) + const detectedAudio = detectAudioMimeType( + fetched.buffer, + declared ) - const mimeType = - declaredMimeType?.startsWith('audio/') || - detectedAudioMimeType?.startsWith('audio/') - ? detectedAudioMimeType - : declaredMimeType + const mime = + declared?.startsWith('audio/') || detectedAudio + ? detectedAudio + : declared - if (!mimeType) { + if (!mime) { pushError( - `Could not determine MIME type for ${sourceUrl}. Please ensure the URL returns a valid content type.` + report, + sourceUrl, + `Could not determine MIME type for ${sourceUrl}.` ) continue } - - if (!isMimeTypeEnabled(this.config, mimeType)) { + if (!mimeEnabled(this.config, mime)) { pushError( - `Feature disabled for MIME type "${mimeType}". Please enable the corresponding read_files switch.`, - mimeType + report, + sourceUrl, + `Feature disabled for MIME type "${mime}".`, + mime ) continue } - // Check if the model supports this MIME type natively - const isImage = IMAGE_MIME_TYPES.has(mimeType) - const isAudio = mimeType.startsWith('audio/') - const modelSupports = - model != null && - (isAudio - ? 
modelCanReadAudio( - { value: model.modelInfo }, - model.modelInfo.name - ) - : modelSupportsNativeMimeType(model, mimeType)) - - if (modelSupports && !isImage) { - // Non-image file that the model supports natively -> inline inject - let nativeBuffer: Buffer = buffer - let nativeMimeType = mimeType - - if (isAudio && !isMimoAudioMime(mimeType)) { - const converted = await convertAudioBufferToMp3( + const isImage = IMAGE_MIME_TYPES.has(mime) + const isAudio = mime.startsWith('audio/') + const supportsNative = + model != null && modelSupportsMime(model, mime) + + // ----- Non-image native: maybe transcode audio, then inline --- + if (!isImage && supportsNative) { + let bytes = fetched.buffer + let outMime = mime + if ( + isAudio && + fileConfig?.supportedMimeTypes && + !fileConfig.supportedMimeTypes.has(mime) + ) { + const converted = await convertAudioToMp3( this.ctx, - buffer + bytes ) - if (!converted) { pushError( - `Unsupported audio MIME type "${mimeType}" and ffmpeg conversion to MP3 failed.`, - mimeType + report, + sourceUrl, + `Unsupported audio MIME "${mime}" and ffmpeg conversion failed.`, + mime ) continue } - - nativeBuffer = converted - nativeMimeType = 'audio/mpeg' - logger.debug( - `Transcoded read_files audio from ${mimeType} to audio/mpeg for multimodal input` - ) + bytes = converted + outMime = 'audio/mpeg' } - const maxFileSize = - isMimoAudioMime(nativeMimeType) && - modelCanReadAudio(undefined, model?.modelInfo.name) - ? MIMO_BASE64_AUDIO_BYTES - : (fileConfig?.maxFileSizeBytesOverrides?.[ - nativeMimeType - ] ?? - fileConfig?.maxFileSizeBytes ?? 
- DEFAULT_MAX_FILE_SIZE_BYTES) - - const encodedSize = getBase64EncodedSize( - nativeBuffer.byteLength + const sizeError = checkSize( + bytes, + outMime, + fileConfig, + totalBytes, + maxTotal ) - - if (encodedSize > maxFileSize) { - pushError( - `File too large (${encodedSize} bytes after base64), max ${maxFileSize} bytes for ${nativeMimeType}`, - nativeMimeType - ) + if (sizeError) { + pushError(report, sourceUrl, sizeError, outMime) continue } - - if (totalBase64Bytes + encodedSize > maxTotalSize) { - pushError( - `Total inline upload size too large (${totalBase64Bytes + encodedSize} bytes), max ${maxTotalSize} bytes per request`, - nativeMimeType - ) - continue - } - - totalBase64Bytes += encodedSize - nativeParts.push({ - mimeType: nativeMimeType, - base64Data: nativeBuffer.toString('base64'), - sourceUrl - }) - - response.files.push({ + totalBytes += getBase64EncodedSize(bytes.byteLength) + pushNative( + report, + native, sourceUrl, - mimeType: nativeMimeType, - status: 'ok' - }) - response.successCount++ - } else if (isImage && modelSupports) { - // Image that the model supports natively -> inject directly - // Unified per-file size check before any branching - const maxFileSize = - isMimoImageMime(mimeType) && - modelCanReadImage(undefined, model?.modelInfo.name) - ? MIMO_BASE64_IMAGE_BYTES - : (fileConfig?.maxFileSizeBytesOverrides?.[ - mimeType - ] ?? - fileConfig?.maxFileSizeBytes ?? 
- DEFAULT_MAX_FILE_SIZE_BYTES) - - const encodedSize = getBase64EncodedSize(buffer.byteLength) - - if (encodedSize > maxFileSize) { - pushError( - `File too large (${encodedSize} bytes after base64, raw ${buffer.byteLength} bytes), max ${maxFileSize} bytes for ${mimeType}`, - mimeType - ) + outMime, + bytes.toString('base64') + ) + continue + } + + // ----- Image native: inject directly (GIF splits to frames) --- + if (isImage && supportsNative) { + const sizeError = checkSize( + fetched.buffer, + mime, + fileConfig, + totalBytes, + maxTotal + ) + if (sizeError) { + pushError(report, sourceUrl, sizeError, mime) continue } - // For GIF: split into frames - if (mimeType === 'image/gif') { - const frames = await parseGifToFrames(buffer, { + if (mime === 'image/gif') { + const frames = await parseGifToFrames(fetched.buffer, { strategy: this.config.gifStrategy, frameCount: this.config.gifFrameCount }) - - logger.debug( - `Extracted ${frames.length} frames from GIF for native model injection` - ) - for (const frame of frames) { - // Frames are data:image/png;base64,... 
strings const frameBase64 = frame.split(',')[1] const frameSize = getBase64EncodedSize( Buffer.from(frameBase64, 'base64').byteLength ) - - if (totalBase64Bytes + frameSize > maxTotalSize) { + if (totalBytes + frameSize > maxTotal) { logger.warn( 'Skipping remaining GIF frames due to total size limit' ) break } - - totalBase64Bytes += frameSize - nativeParts.push({ - mimeType: 'image/png', - base64Data: frameBase64, - sourceUrl - }) - } - } else { - if (totalBase64Bytes + encodedSize > maxTotalSize) { - pushError( - `Total inline upload size too large (${totalBase64Bytes + encodedSize} bytes), max ${maxTotalSize} bytes per request`, - mimeType + totalBytes += frameSize + pushNative( + report, + native, + sourceUrl, + 'image/png', + frameBase64 ) - continue } - - totalBase64Bytes += encodedSize - nativeParts.push({ - mimeType, - base64Data: buffer.toString('base64'), - sourceUrl - }) - } - - response.files.push({ - sourceUrl, - mimeType, - status: 'ok' - }) - response.successCount++ - } else if (isImage) { - // Image but model doesn't support it natively -> describe using image model - const maxFileSize = - fileConfig?.maxFileSizeBytesOverrides?.[mimeType] ?? - fileConfig?.maxFileSizeBytes ?? 
- DEFAULT_MAX_FILE_SIZE_BYTES - - const encodedSize = getBase64EncodedSize(buffer.byteLength) - - if (encodedSize > maxFileSize) { - pushError( - `File too large (${encodedSize} bytes after base64, raw ${buffer.byteLength} bytes), max ${maxFileSize} bytes for ${mimeType}`, - mimeType + } else { + totalBytes += getBase64EncodedSize( + fetched.buffer.byteLength + ) + pushNative( + report, + native, + sourceUrl, + mime, + fetched.buffer.toString('base64') ) - continue } + continue + } - const describeResult = await this._describeImageWithModel( + // ----- Image without native support: describe via vision model - + if (isImage) { + const described = await this._describeImage( sourceUrl, - buffer, - mimeType + fetched.buffer, + mime ) - - if (describeResult) { - response.files.push({ + if (described) { + report.files.push({ sourceUrl, - mimeType, + mimeType: mime, status: 'described', - description: describeResult + description: described }) - response.successCount++ + report.successCount++ describedCount++ } else { pushError( - `Failed to describe image from ${sourceUrl}`, - mimeType + report, + sourceUrl, + 'Failed to describe image.', + mime ) - continue } - } else { - // Non-image, model doesn't support it natively - pushError( - `Unsupported MIME type "${mimeType}" for the current model. The model does not natively support this file type.`, - mimeType - ) continue } + + pushError( + report, + sourceUrl, + `Unsupported MIME "${mime}" for the current model.`, + mime + ) } catch (error) { logger.warn(`read_files error for ${sourceUrl}:`, error) - const errorMessage = + pushError( + report, + sourceUrl, error instanceof Error ? 
error.message : String(error) - pushError(errorMessage) + ) } } - // Inject native parts into next-round context via contextManager - if (nativeParts.length > 0 && conversationId) { - const message = buildMultimodalMessage( - nativeParts, - this.config.fileInsertPrompt, - model?.modelInfo.name - ) - + if (native.length > 0 && conversationId) { this.ctx.chatluna.contextManager.inject({ conversationId, name: 'read_files_context', - value: message, + value: buildMultimodalMessage( + native, + this.config.fileInsertPrompt + ), once: true, stage: 'after_scratchpad' }) - logger.debug( - `Injected ${nativeParts.length} file part(s) into context for conversation ${conversationId}` + `Injected ${native.length} file part(s) into context for conversation ${conversationId}` ) } return JSON.stringify({ - response, + response: report, note: - nativeParts.length > 0 - ? `Successfully read ${nativeParts.length} file(s). The file content has been added to the conversation context and will be available in the next turn.` + native.length > 0 + ? `Successfully read ${native.length} file(s). The file content has been added to the conversation context and will be available in the next turn.` : describedCount > 0 ? `Described ${describedCount} image file(s) using the vision model.` - : response.failureCount > 0 - ? `Failed to read ${response.failureCount} file(s).` + : report.failureCount > 0 + ? `Failed to read ${report.failureCount} file(s).` : 'No files were processed.' }) } - /** - * Describe an image using the configured image model (fallback when the - * main model doesn't support image input). 
- */ - private async _describeImageWithModel( + private async _fetch( + url: string + ): Promise<{ buffer: Buffer; contentType: string | null } | null> { + try { + const response = await this.ctx.http(url, { + responseType: 'arraybuffer', + method: 'get', + headers: { 'User-Agent': BROWSER_UA }, + timeout: 60_000 + }) + return { + buffer: Buffer.from(response.data), + contentType: + (response.headers as Headers)?.get?.('content-type') ?? null + } + } catch { + return null + } + } + + private async _describeImage( url: string, buffer: Buffer, mimeType: string ): Promise { const imageModel = this.imageModelRef().value - if (imageModel == null) { - logger.warn( - 'Image model is not loaded, cannot describe image. Please check your chat adapter.' - ) - return null - } - if ( + !imageModel || !imageModel.modelInfo.capabilities.includes( ModelCapabilities.ImageInput ) ) { - logger.warn('Image model does not support image input.') + logger.warn( + 'Image model not loaded or lacks image input; cannot describe.' + ) return null } try { - const fakeMessage: Message = { content: [] } - + const fake: Message = { content: [] } if (mimeType === 'image/gif') { const frames = await parseGifToFrames(buffer, { strategy: this.config.gifStrategy, frameCount: this.config.gifFrameCount }) - addTextToContent( - fakeMessage, + fake, 'This is a GIF image. 
See the frames below:' ) - for (const frame of frames) { - addImageToContent(fakeMessage, frame) - } + for (const frame of frames) addImageToContent(fake, frame) } else { - const base64 = buffer.toString('base64') - const base64Source = `data:${mimeType};base64,${base64}` - addImageToContent(fakeMessage, base64Source) + addImageToContent( + fake, + `data:${mimeType};base64,${buffer.toString('base64')}` + ) } - - return await processImageWithModel( - imageModel, - this.config, - fakeMessage - ) + return await processImageWithModel(imageModel, this.config, fake) } catch (error) { logger.warn(`Describe image ${url} error:`, error) return null @@ -767,6 +375,148 @@ export class ReadFilesTool extends StructuredTool { } } +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +interface ToolReport { + files: { + sourceUrl: string + mimeType?: string + status: 'ok' | 'described' | 'error' + description?: string + error?: string + }[] + successCount: number + failureCount: number +} + +function pushError( + report: ToolReport, + sourceUrl: string, + error: string, + mimeType?: string +) { + report.files.push({ sourceUrl, mimeType, status: 'error', error }) + report.failureCount++ +} + +function pushNative( + report: ToolReport, + native: NativePart[], + sourceUrl: string, + mimeType: string, + base64Data: string +) { + native.push({ sourceUrl, mimeType, base64Data }) + report.files.push({ sourceUrl, mimeType, status: 'ok' }) + report.successCount++ +} + +function isHttp(url: string): boolean { + try { + const { protocol } = new URL(url) + return protocol === 'http:' || protocol === 'https:' + } catch { + return false + } +} + +function modelSupportsMime(model: ChatLunaChatModel, mime: string): boolean { + const caps = model.modelInfo.capabilities + const isImage = IMAGE_MIME_TYPES.has(mime) + const capOk = isImage + ? 
caps.includes(ModelCapabilities.ImageInput) + : mime.startsWith('audio/') + ? caps.includes(ModelCapabilities.AudioInput) + : mime.startsWith('video/') + ? caps.includes(ModelCapabilities.VideoInput) + : caps.includes(ModelCapabilities.FileInput) + if (!capOk) return false + const file = model.fileHandlingConfig + return file == null || file.supportedMimeTypes.has(mime) +} + +function mimeEnabled(config: Config, mime: string): boolean { + if (mime === 'image/gif') return config.enableGifReadTool + if (IMAGE_MIME_TYPES.has(mime)) return config.enableImageReadTool + return config.enableFileReadTool +} + +function checkSize( + buffer: Buffer, + mime: string, + fileConfig: FileHandlingConfig | undefined, + totalBytes: number, + maxTotal: number +): string | null { + const max = + fileConfig?.maxFileSizeBytesOverrides?.[mime] ?? + fileConfig?.maxFileSizeBytes ?? + DEFAULT_MAX_FILE_SIZE_BYTES + const encoded = getBase64EncodedSize(buffer.byteLength) + if (encoded > max) { + return `File too large (${encoded} bytes after base64, raw ${buffer.byteLength} bytes), max ${max} bytes for ${mime}.` + } + if (totalBytes + encoded > maxTotal) { + return `Total inline upload size too large (${totalBytes + encoded} bytes), max ${maxTotal} bytes per request.` + } + return null +} + +function buildMultimodalMessage( + parts: NativePart[], + prompt: string +): HumanMessage { + const content: MessageContentComplex[] = [] + for (const { mimeType, base64Data } of parts) { + const dataUrl = `data:${mimeType};base64,${base64Data}` + if (IMAGE_MIME_TYPES.has(mimeType)) { + content.push({ type: 'image_url', image_url: { url: dataUrl } }) + } else if (mimeType.startsWith('audio/')) { + content.push({ + type: 'audio_url', + audio_url: { url: dataUrl, mimeType } + } as unknown as MessageContentComplex) + } else if (mimeType.startsWith('video/')) { + content.push({ + type: 'video_url', + video_url: { url: dataUrl, mimeType } + } as unknown as MessageContentComplex) + } else { + // Inline data for 
text/pdf/etc. (Gemini-style) + content.push({ + inline_data: { mime_type: mimeType, data: base64Data } + } as unknown as MessageContentComplex) + } + } + if (content.length > 0) content.unshift({ type: 'text', text: prompt }) + return new HumanMessage({ content }) +} + +function describeTool(config: Config): string { + const sections: string[] = [] + if (config.enableImageReadTool) { + sections.push( + '- Image read/describe (non-GIF): image/bmp, image/jpeg, image/png, image/webp. If the model lacks native image input, fallback image description will be used.' + ) + } + if (config.enableGifReadTool) { + sections.push( + '- GIF read/describe: image/gif. Native-capable models receive extracted frames; otherwise fallback image description is used.' + ) + } + if (config.enableFileReadTool) { + sections.push( + '- File read: text/html, text/css, text/plain, text/markdown, text/xml, text/csv, text/rtf, text/javascript, application/json, application/pdf, audio/*, video/* (effective MIME set still depends on model capabilities and FileHandlingConfig).' + ) + } + return `Read files from URL(s) and return their content. 
+Enabled read_files capabilities: +${sections.join('\n')} +Use this tool when you need to read files from URL(s) as context.` +} + // --------------------------------------------------------------------------- // Plugin registration // --------------------------------------------------------------------------- diff --git a/packages/service-multimodal/src/read_files_schema.ts b/packages/service-multimodal/src/read_files_schema.ts deleted file mode 100644 index 8368f395d..000000000 --- a/packages/service-multimodal/src/read_files_schema.ts +++ /dev/null @@ -1,31 +0,0 @@ -import z from 'zod' - -const READ_FILE_SCHEMA = z.object({ - url: z.string().url() -}) - -function parseJsonStringInput(value: unknown): unknown { - if (typeof value !== 'string') { - return value - } - - try { - return JSON.parse(value) - } catch { - return value - } -} - -export const readFilesInputSchema = z.object({ - files: z - .preprocess( - parseJsonStringInput, - z.union([ - READ_FILE_SCHEMA, - z.array(READ_FILE_SCHEMA).min(1).max(10) - ]) - ) - .describe( - 'One file or a list of files to read (max 10). File format: { url: string }. MIME type is inferred from response headers, then URL extension.' - ) -}) diff --git a/packages/service-multimodal/src/utils.ts b/packages/service-multimodal/src/utils.ts index bfb0532d0..564ccb9f0 100644 --- a/packages/service-multimodal/src/utils.ts +++ b/packages/service-multimodal/src/utils.ts @@ -1,6 +1,5 @@ import { HumanMessage, - MessageContent, MessageContentComplex, MessageContentText } from '@langchain/core/messages' @@ -12,195 +11,310 @@ import { isMessageContentImageUrl } from 'koishi-plugin-chatluna/utils/string' import { Context } from 'koishi' +import type {} from 'koishi-plugin-ffmpeg-path' import { Config, logger } from '.' 
import { GifReader } from 'omggif' import { Jimp } from 'jimp' -export interface GifExtractionConfig { - strategy: 'first' | 'head' | 'average' - frameCount: number +// --------------------------------------------------------------------------- +// MIME helpers +// --------------------------------------------------------------------------- + +export const IMAGE_MIME_TYPES = new Set([ + 'image/png', + 'image/jpeg', + 'image/bmp', + 'image/webp', + 'image/gif' +]) + +const FILE_EXTENSION_TO_MIME_TYPE: Record = { + '.png': 'image/png', + '.jpg': 'image/jpeg', + '.jpeg': 'image/jpeg', + '.bmp': 'image/bmp', + '.webp': 'image/webp', + '.gif': 'image/gif', + '.pdf': 'application/pdf', + '.txt': 'text/plain', + '.md': 'text/markdown', + '.html': 'text/html', + '.htm': 'text/html', + '.css': 'text/css', + '.xml': 'text/xml', + '.csv': 'text/csv', + '.rtf': 'text/rtf', + '.js': 'text/javascript', + '.mjs': 'text/javascript', + '.json': 'application/json', + '.mp4': 'video/mp4', + '.mpeg': 'video/mpeg', + '.mov': 'video/mov', + '.avi': 'video/avi', + '.flv': 'video/x-flv', + '.webm': 'video/webm', + '.wmv': 'video/wmv', + '.3gp': 'video/3gpp', + '.3gpp': 'video/3gpp', + '.mp3': 'audio/mpeg', + '.aiff': 'audio/aiff', + '.aac': 'audio/aac', + '.flac': 'audio/flac', + '.wav': 'audio/wav', + '.ogg': 'audio/ogg', + '.m4a': 'audio/mp4' } -/** - * Check if any frame in the range [start, end) has complex disposal methods - * that require resetting the canvas (disposal method 2 or 3) - */ -function hasComplexDisposal( - reader: GifReader, - start: number, - end: number -): boolean { - for (let i = start; i < end; i++) { - const disposal = reader.frameInfo(i).disposal - // disposal 2: restore to background color - // disposal 3: restore to previous (before current frame was drawn) - if (disposal === 2 || disposal === 3) { - return true - } +export function inferMimeTypeFromUrl(url: string): string | null { + try { + const path = new URL(url).pathname.toLowerCase() + const dot = 
path.lastIndexOf('.') + return dot < 0 + ? null + : (FILE_EXTENSION_TO_MIME_TYPE[path.slice(dot)] ?? null) + } catch { + return null } - return false } -export async function extractGifFrames( +export function normalizeMimeType( + raw: string | null | undefined +): string | null { + return raw?.split(';')[0]?.trim()?.toLowerCase() || null +} + +/** + * Detect audio MIME from buffer header. Recognises QQ Silk + AMR + common + * audio container magic bytes. Falls back to the declared MIME otherwise. + */ +export function detectAudioMimeType( buffer: Buffer, - config: GifExtractionConfig -): Promise { - try { - const reader = new GifReader(buffer) - const totalFrames = reader.numFrames() + declared?: string | null +): string | null { + const head = buffer.subarray(0, 16).toString('latin1') + + if (head.startsWith('#!AMR')) return 'audio/amr' + if ( + head.startsWith('#!SILK_V3') || + buffer.subarray(1, 10).toString('latin1') === '#!SILK_V3' + ) { + return 'audio/silk' + } + // MP3 frame sync: 0xFFEx. Reject JPEG (0xFFD8) by checking the full sync word. + if ( + head.startsWith('ID3') || + (buffer[0] === 0xff && (buffer[1] & 0xe0) === 0xe0) + ) { + return 'audio/mpeg' + } + if ( + head.startsWith('RIFF') && + buffer.subarray(8, 12).toString('latin1') === 'WAVE' + ) { + return 'audio/wav' + } + if (head.startsWith('fLaC')) return 'audio/flac' + if (head.startsWith('OggS')) return 'audio/ogg' - if (totalFrames === 0) { - throw new Error('No frames found in GIF') - } + return declared ?? null +} - const width = reader.width - const height = reader.height +// --------------------------------------------------------------------------- +// FFmpeg / Silk +// --------------------------------------------------------------------------- + +export async function convertAudioToMp3( + ctx: Context, + buffer: Buffer +): Promise { + if (!ctx.ffmpeg) { + logger.warn( + 'FFmpeg service unavailable; install koishi-plugin-ffmpeg-path to enable audio transcoding.' 
+ ) + return null + } - let frameIndices: number[] = [] + try { + const isSilk = + buffer.subarray(0, 9).toString('latin1') === '#!SILK_V3' || + buffer.subarray(1, 10).toString('latin1') === '#!SILK_V3' + + let source = buffer + let silkSampleRate: number | null = null + if (isSilk) { + const decoded = await decodeSilkToPcm(ctx, buffer) + if (!decoded) return null + source = decoded.buffer + silkSampleRate = decoded.sampleRate + } - switch (config.strategy) { - case 'first': - frameIndices = [0] - break + const builder = ctx.ffmpeg.builder().input(source) + if (silkSampleRate != null) { + builder.inputOption( + '-f', + 's16le', + '-ar', + String(silkSampleRate), + '-ac', + '1' + ) + } + return await builder + .outputOption( + '-vn', + '-acodec', + 'libmp3lame', + '-q:a', + '4', + '-f', + 'mp3' + ) + .run('buffer') + } catch (error) { + logger.warn(`Audio transcoding to mp3 failed:`, error) + return null + } +} - case 'head': { - const count = Math.min(config.frameCount, totalFrames) - frameIndices = Array.from({ length: count }, (_, i) => i) - break +async function decodeSilkToPcm( + ctx: Context, + buffer: Buffer +): Promise<{ buffer: Buffer; sampleRate: number } | null> { + if (!ctx.silk) { + logger.warn( + 'Silk service unavailable; install koishi-plugin-ffmpeg-path 2.0+ for silk decoding.' 
+ ) + return null + } + for (const sampleRate of [24000, 16000, 12000, 8000]) { + try { + const result = (await ctx.silk.decode(buffer, sampleRate)) as { + data?: Uint8Array } - - case 'average': { - const count = Math.min(config.frameCount, totalFrames) - if (count >= totalFrames) { - frameIndices = Array.from( - { length: totalFrames }, - (_, i) => i - ) - } else if (count === 1) { - // Special case: single frame, pick the first one - frameIndices = [0] - } else { - // Use span (totalFrames - 1) to ensure first and last frames are included - const step = (totalFrames - 1) / (count - 1) - frameIndices = Array.from({ length: count }, (_, i) => - Math.floor(i * step) - ) - } - break + if (result?.data != null) { + return { buffer: Buffer.from(result.data), sampleRate } } + } catch { + // try next sample rate } + } + return null +} - const frameBuffers: Buffer[] = [] - - // Build canvas incrementally, only decoding frames we need - const canvas = new Uint8ClampedArray(width * height * 4) - let lastDecodedFrame = -1 +// --------------------------------------------------------------------------- +// GIF +// --------------------------------------------------------------------------- - for (const frameIndex of frameIndices) { - // Check if we need to restart decoding from frame 0 - // This happens when: - // 1. Jumping backwards in frame sequence - // 2. 
Any frames between lastDecodedFrame and current have complex disposal methods - // (disposal 2 or 3) which affect how the canvas should be prepared - const needsFullDecode = - frameIndex < lastDecodedFrame || - (lastDecodedFrame >= 0 && - hasComplexDisposal(reader, lastDecodedFrame, frameIndex)) +export interface GifExtractionConfig { + strategy: 'first' | 'head' | 'average' + frameCount: number +} - if (needsFullDecode) { - canvas.fill(0) // Clear canvas - // Decode from frame 0 to current frame - for (let i = 0; i <= frameIndex; i++) { - reader.decodeAndBlitFrameRGBA(i, canvas) - } - } else { - // Disposal method 0 (no disposal) or 1 (do not dispose) - // Just decode from last position to current frame - for (let i = lastDecodedFrame + 1; i <= frameIndex; i++) { - reader.decodeAndBlitFrameRGBA(i, canvas) - } +export async function parseGifToFrames( + buffer: Buffer, + config: GifExtractionConfig +): Promise { + const reader = new GifReader(buffer) + const total = reader.numFrames() + if (total === 0) throw new Error('No frames found in GIF') + + const indices = pickGifFrameIndices(total, config) + const { width, height } = reader + const canvas = new Uint8ClampedArray(width * height * 4) + let lastDecoded = -1 + const frames: string[] = [] + + for (const idx of indices) { + const needsFullDecode = + idx < lastDecoded || + (lastDecoded >= 0 && hasComplexDisposal(reader, lastDecoded, idx)) + if (needsFullDecode) { + canvas.fill(0) + for (let i = 0; i <= idx; i++) + reader.decodeAndBlitFrameRGBA(i, canvas) + } else { + for (let i = lastDecoded + 1; i <= idx; i++) { + reader.decodeAndBlitFrameRGBA(i, canvas) } - - lastDecodedFrame = frameIndex - - // Copy canvas to avoid reference issues - const frameData = new Uint8ClampedArray(canvas) - const image = new Jimp({ - data: Buffer.from(frameData), - width, - height - }) - - const pngBuffer = await image.getBuffer('image/png') - frameBuffers.push(pngBuffer) } - - return frameBuffers - } catch (error) { - 
logger.error('Failed to extract GIF frames:', error) - throw error + lastDecoded = idx + + const png = await new Jimp({ + data: Buffer.from(new Uint8ClampedArray(canvas)), + width, + height + }).getBuffer('image/png') + frames.push(`data:image/png;base64,${png.toString('base64')}`) } + return frames } -export async function parseGifToFrames( - buffer: Buffer, +function pickGifFrameIndices( + total: number, config: GifExtractionConfig -): Promise { - const frameBuffers = await extractGifFrames(buffer, config) - return frameBuffers.map((frameBuffer) => { - const base64 = frameBuffer.toString('base64') - return `data:image/png;base64,${base64}` - }) +): number[] { + if (config.strategy === 'first') return [0] + const count = Math.min(config.frameCount, total) + if (config.strategy === 'head') { + return Array.from({ length: count }, (_, i) => i) + } + // average + if (count >= total) return Array.from({ length: total }, (_, i) => i) + if (count === 1) return [0] + const step = (total - 1) / (count - 1) + return Array.from({ length: count }, (_, i) => Math.floor(i * step)) +} + +function hasComplexDisposal( + reader: GifReader, + start: number, + end: number +): boolean { + for (let i = start; i < end; i++) { + const d = reader.frameInfo(i).disposal + if (d === 2 || d === 3) return true + } + return false } +// --------------------------------------------------------------------------- +// Image +// --------------------------------------------------------------------------- + export async function readImage(ctx: Context, url: string) { if (url.startsWith('data:image') && url.includes('base64')) { const buffer = Buffer.from(url.split(',')[1], 'base64') - const ext = getImageType(buffer) - - return { - base64Source: url, - buffer, - ext - } + return { base64Source: url, buffer, ext: getImageType(buffer) } } - try { - const response = await ctx.http(url, { + const { data } = await ctx.http(url, { responseType: 'arraybuffer', method: 'get', - headers: { - 'User-Agent': - 
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36' - } + headers: { 'User-Agent': BROWSER_UA } }) - - const buffer = Buffer.from(response.data) - - const base64 = buffer.toString('base64') - + const buffer = Buffer.from(data) const ext = getImageType(buffer) - return { - base64Source: `data:${ext};base64,${base64}`, + base64Source: `data:${ext};base64,${buffer.toString('base64')}`, buffer, ext } } catch (error) { logger.error(`Failed to read image from ${url}:`, error) - return { - base64Source: null, - buffer: null, - ext: null - } + return { base64Source: null, buffer: null, ext: null } } } + export async function processImageWithModel( model: ChatLunaChatModel, config: Config, message: Message -) { - const images = extractImages(message.content) +): Promise { + const images = Array.isArray(message.content) + ? message.content.filter((item: MessageContentComplex) => + isMessageContentImageUrl(item) + ) + : [] if (images.length === 0) return null try { @@ -208,9 +322,7 @@ export async function processImageWithModel( { type: 'text', text: config.imagePrompt } as MessageContentText, ...images ] - const result = await model.invoke([new HumanMessage({ content })]) - return config.imageInsertPrompt.replace( '{img}', getMessageContent(result.content) @@ -221,45 +333,36 @@ export async function processImageWithModel( } } -export const addImageToContent = (message: Message, imageUrl: string) => { - if (typeof message.content === 'string') { - message.content = [ - { - type: 'text', - text: message.content - } - ] - } +export function addImageToContent(message: Message, imageUrl: string) { + ensureContentArray(message) ;(message.content as MessageContentComplex[]).push({ type: 'image_url', - image_url: { - url: imageUrl - } + image_url: { url: imageUrl } }) } -export const addTextToContent = (message: Message, text: string) => { +export function addTextToContent(message: Message, text: string) { if (typeof 
message.content === 'string') { message.content += text return } - const content = message.content as MessageContentComplex[] - const lastItem = content[content.length - 1] - - if (lastItem && lastItem.type === 'text') { - lastItem.text += text + const last = content[content.length - 1] + if (last && last.type === 'text') { + last.text += text } else { - content.push({ - type: 'text', - text - }) + content.push({ type: 'text', text }) } } -export const extractImages = (content: MessageContent) => - Array.isArray(content) - ? content.filter((item: MessageContentComplex) => - isMessageContentImageUrl(item) - ) - : [] +export function ensureContentArray(message: Message, fallbackText = '') { + if (typeof message.content !== 'string') return + message.content = message.content.length + ? [{ type: 'text', text: message.content }] + : fallbackText.length + ? [{ type: 'text', text: fallbackText }] + : [] +} + +export const BROWSER_UA = + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36' diff --git a/packages/service-multimodal/tests/audio-mimo.test.ts b/packages/service-multimodal/tests/audio-mimo.test.ts deleted file mode 100644 index 240e67c3d..000000000 --- a/packages/service-multimodal/tests/audio-mimo.test.ts +++ /dev/null @@ -1,169 +0,0 @@ -import assert from 'node:assert/strict' -import { test } from 'node:test' -import { ModelCapabilities } from 'koishi-plugin-chatluna/llm-core/platform/types' -import { - MIMO_BASE64_AUDIO_BYTES, - MIMO_BASE64_IMAGE_BYTES, - buildAudioContent, - buildImageContent, - isMimoAudioMime, - isMimoImageMime, - modelCanReadAudio, - modelCanReadImage -} from '../src/audio' -import { detectAudioMimeType } from '../src/media' -import { readFilesInputSchema } from '../src/read_files_schema' - -test('recognizes MiMo audio models even when adapter metadata lacks AudioInput', () => { - assert.equal( - modelCanReadAudio( - { value: { capabilities: [ModelCapabilities.ToolCall] } }, - 
'unifyllm/mimo-v2.5' - ), - true - ) - assert.equal( - modelCanReadAudio( - { value: { capabilities: [ModelCapabilities.ToolCall] } }, - 'mimo-v2-omni' - ), - true - ) - assert.equal( - modelCanReadAudio( - { value: { capabilities: [ModelCapabilities.ToolCall] } }, - 'unifyllm/deepseek-v4-flash' - ), - false - ) -}) - -test('uses MiMo input_audio data URL instead of ChatLuna audio_url', () => { - assert.deepEqual(buildAudioContent('mimo-v2.5', 'abc', 'audio/mpeg'), { - type: 'input_audio', - input_audio: { - data: 'data:audio/mpeg;base64,abc' - } - }) - assert.deepEqual(buildAudioContent('gpt-4o-audio', 'abc', 'audio/mpeg'), { - type: 'audio_url', - audio_url: { - url: 'data:audio/mpeg;base64,abc', - mimeType: 'audio/mpeg' - } - }) -}) - -test('keeps MiMo base64 audio within the documented 50 MB limit', () => { - assert.equal(MIMO_BASE64_AUDIO_BYTES, 50 * 1024 * 1024) - assert.equal(isMimoAudioMime('audio/mpeg'), true) - assert.equal(isMimoAudioMime('audio/wav'), true) - assert.equal(isMimoAudioMime('audio/flac'), true) - assert.equal(isMimoAudioMime('audio/mp4'), true) - assert.equal(isMimoAudioMime('audio/ogg'), true) - assert.equal(isMimoAudioMime('audio/aac'), false) -}) - -test('recognizes MiMo image models even when adapter metadata lacks ImageInput', () => { - assert.equal( - modelCanReadImage( - { value: { capabilities: [ModelCapabilities.ToolCall] } }, - 'unifyllm/mimo-v2.5' - ), - true - ) - assert.equal( - modelCanReadImage( - { value: { capabilities: [ModelCapabilities.ToolCall] } }, - 'mimo-v2-omni' - ), - true - ) - assert.equal( - modelCanReadImage( - { value: { capabilities: [ModelCapabilities.ToolCall] } }, - 'unifyllm/deepseek-v4-flash' - ), - false - ) -}) - -test('uses OpenAI image_url content for MiMo images', () => { - assert.deepEqual(buildImageContent('abc', 'image/png'), { - type: 'image_url', - image_url: { - url: 'data:image/png;base64,abc' - } - }) -}) - -test('keeps MiMo base64 images within the documented 50 MB limit', () => { - 
assert.equal(MIMO_BASE64_IMAGE_BYTES, 50 * 1024 * 1024) - assert.equal(isMimoImageMime('image/jpeg'), true) - assert.equal(isMimoImageMime('image/png'), true) - assert.equal(isMimoImageMime('image/gif'), true) - assert.equal(isMimoImageMime('image/webp'), true) - assert.equal(isMimoImageMime('image/bmp'), true) - assert.equal(isMimoImageMime('image/svg+xml'), false) -}) - -test('accepts JSON-stringified read_files input from tool calls', () => { - assert.deepEqual( - readFilesInputSchema.parse({ - files: '{"url":"http://127.0.0.1:5140/image.png"}' - }), - { - files: { - url: 'http://127.0.0.1:5140/image.png' - } - } - ) - - assert.deepEqual( - readFilesInputSchema.parse({ - files: '[{"url":"http://127.0.0.1:5140/image.png"}]' - }), - { - files: [ - { - url: 'http://127.0.0.1:5140/image.png' - } - ] - } - ) -}) - -test('detects AMR audio even when storage declares it as MP3', () => { - assert.equal( - detectAudioMimeType(Buffer.from('#!AMR\nabc'), 'audio/mp3'), - 'audio/amr' - ) - assert.equal( - detectAudioMimeType(Buffer.from('#!AMR\nabc'), null), - 'audio/amr' - ) - assert.equal( - detectAudioMimeType(Buffer.from('ID3abc'), 'audio/mp3'), - 'audio/mpeg' - ) -}) - -test('does not misidentify JPEG as audio/mpeg', () => { - // JPEG starts with FF D8 FF E0 (JFIF) — 0xD8 & 0xE0 = 0xC0, not an MP3 sync - const jpegHeader = Buffer.from([0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46]) - assert.equal( - detectAudioMimeType(jpegHeader, 'image/jpeg'), - 'image/jpeg' - ) - assert.equal(detectAudioMimeType(jpegHeader, null), null) -}) - -test('still detects valid MP3 frame sync', () => { - // MP3: FF FB (MPEG1 Layer3) — 0xFB & 0xE0 = 0xE0, valid sync - const mp3Header = Buffer.from([0xff, 0xfb, 0x90, 0x00]) - assert.equal(detectAudioMimeType(mp3Header, null), 'audio/mpeg') - - // MP3: FF F3 (MPEG2 Layer3) - const mp3v2Header = Buffer.from([0xff, 0xf3, 0x90, 0x00]) - assert.equal(detectAudioMimeType(mp3v2Header, null), 'audio/mpeg') -}) diff --git 
a/packages/shared-adapter/src/client.ts b/packages/shared-adapter/src/client.ts index 36af06aa3..1015a4350 100644 --- a/packages/shared-adapter/src/client.ts +++ b/packages/shared-adapter/src/client.ts @@ -1,6 +1,10 @@ +import { FileHandlingConfig } from 'koishi-plugin-chatluna/llm-core/platform/client' import { ModelInfo } from 'koishi-plugin-chatluna/llm-core/platform/types' import { getModelContextSize } from 'koishi-plugin-chatluna/llm-core/utils/count_tokens' +export const DEFAULT_AUDIO_MAX_BASE64_BYTES = 50 * 1024 * 1024 +export const DEFAULT_IMAGE_MAX_BASE64_BYTES = 50 * 1024 * 1024 + export type OpenAIReasoningEffort = | 'none' | 'minimal' @@ -153,7 +157,11 @@ function createGlobMatcher(pattern: string): (text: string) => boolean { return (text: string) => regex.test(text) } -const imageModelMatchers = [ +function createRegexMatcher(regex: RegExp): (text: string) => boolean { + return (text: string) => regex.test(text) +} + +const imageModelMatchers: ((text: string) => boolean)[] = [ 'vision', 'vl', 'gpt-4o', @@ -176,11 +184,76 @@ const imageModelMatchers = [ 'glm-*v', 'kimi-k2.5', 'step3', - 'grok-4', - 'mimo-v2.5*' -].map((pattern) => createGlobMatcher(pattern)) + 'grok-4' +].map(createGlobMatcher) + +// mimo-v2.5 supports image/audio; mimo-v2.5-pro does NOT (text only). 
+imageModelMatchers.push(createRegexMatcher(/mimo-v2\.5(?!-pro)/)) export function supportImageInput(modelName: string) { const lowerModel = normalizeOpenAIModelName(modelName).toLowerCase() return imageModelMatchers.some((matcher) => matcher(lowerModel)) } + +const audioModelMatchers: ((text: string) => boolean)[] = [ + 'gpt-4o-audio', + 'gpt-4o-mini-audio', + 'gpt-audio', + 'mimo-v2-omni' +].map(createGlobMatcher) + +audioModelMatchers.push(createRegexMatcher(/mimo-v2\.5(?!-pro)/)) + +export function supportAudioInput(modelName: string) { + const lowerModel = normalizeOpenAIModelName(modelName).toLowerCase() + return audioModelMatchers.some((matcher) => matcher(lowerModel)) +} + +const openAIImageMimeTypes = [ + 'image/png', + 'image/jpeg', + 'image/gif', + 'image/webp', + 'image/bmp' +] + +const openAIAudioMimeTypes = [ + 'audio/mpeg', + 'audio/mp3', + 'audio/wav', + 'audio/flac', + 'audio/mp4', + 'audio/ogg' +] + +export function getOpenAIFileHandlingConfig( + modelName: string +): FileHandlingConfig | undefined { + const image = supportImageInput(modelName) + const audio = supportAudioInput(modelName) + if (!image && !audio) return undefined + + const supportedMimeTypes = new Set() + const overrides: Record = {} + + if (image) { + for (const mime of openAIImageMimeTypes) { + supportedMimeTypes.add(mime) + overrides[mime] = DEFAULT_IMAGE_MAX_BASE64_BYTES + } + } + + if (audio) { + for (const mime of openAIAudioMimeTypes) { + supportedMimeTypes.add(mime) + overrides[mime] = DEFAULT_AUDIO_MAX_BASE64_BYTES + } + } + + return { + supportedMimeTypes, + maxTotalSizeBytes: 100 * 1024 * 1024, + maxFileSizeBytes: 100 * 1024 * 1024, + maxFileSizeBytesOverrides: overrides + } +} diff --git a/packages/shared-adapter/src/utils.ts b/packages/shared-adapter/src/utils.ts index a171e25f6..f88b5fc4a 100644 --- a/packages/shared-adapter/src/utils.ts +++ b/packages/shared-adapter/src/utils.ts @@ -30,15 +30,24 @@ import { ResponseUsage } from './types' import { ChatLunaPlugin } 
from 'koishi-plugin-chatluna/services/chat' +import { logger } from 'koishi-plugin-chatluna' import { getImageMimeType, getMimeTypeFromSource, isMessageContentImageUrl } from 'koishi-plugin-chatluna/utils/string' -import { isChatLunaUserMessage } from 'koishi-plugin-chatluna/utils/langchain' +import { + isChatLunaUserMessage, + isMessageContentAudio +} from 'koishi-plugin-chatluna/utils/langchain' import { ToolCallChunk } from '@langchain/core/messages/tool' import { isZodSchemaV3 } from '@langchain/core/utils/types' -import { normalizeOpenAIModelName, supportImageInput } from './client' +import { + DEFAULT_AUDIO_MAX_BASE64_BYTES, + normalizeOpenAIModelName, + supportAudioInput, + supportImageInput +} from './client' export function createUsageMetadata(data: { inputTokens: number @@ -222,6 +231,7 @@ export function responseInputContent( } satisfies ResponseInputContent } + // OpenAI Response API does not accept `input_audio` yet — drop it. return undefined }) .filter((part) => part != null) @@ -343,64 +353,58 @@ export async function langchainMessageToOpenAIMessage( } } - const images = rawMessage.additional_kwargs.images as string[] | null - - const lowerModel = normalizedModel?.toLowerCase() ?? '' - if ( - images != null && - (supportImageInput(lowerModel) || supportImageInputType) - ) { - msg.content = [ - { - type: 'text', - text: rawMessage.content as string - } - ] - - const imageContents = await Promise.all( - images.map(async (image) => { - try { - const url = await fetchImageUrl(plugin, { - type: 'image_url', - image_url: { url: image } - } as MessageContentImageUrl) - return { - type: 'image_url', - image_url: { - url, - detail: 'low' - } - } as const - } catch { - return null - } - }) + if (rawMessage.additional_kwargs.images != null) { + logger.warn( + 'Deprecated: `additional_kwargs.images` is no longer supported. Use `image_url` content parts instead.' 
) + } - msg.content.push( - ...imageContents.filter((content) => content != null) - ) - } else if (Array.isArray(msg.content) && msg.content.length > 0) { + if (Array.isArray(msg.content) && msg.content.length > 0) { + const supportsAudio = supportAudioInput(normalizedModel ?? '') + const supportsImage = + supportImageInput(normalizedModel ?? '') || + supportImageInputType === true const mappedContent = await Promise.all( msg.content.map(async (content) => { - if (!isMessageContentImageUrl(content)) return content - - try { - const url = await fetchImageUrl(plugin, content) - return { - type: 'image_url', - image_url: { - url, - detail: 'low' + if (isMessageContentImageUrl(content)) { + if (!supportsImage) { + logger.warn( + `Model ${normalizedModel} does not accept image input; dropping image content.` + ) + return null + } + try { + const url = await fetchImageUrl(plugin, content) + return { + type: 'image_url', + image_url: { url, detail: 'low' } } + } catch { + return null + } + } + + if (isMessageContentAudio(content)) { + if (!supportsAudio) { + logger.warn( + `Model ${normalizedModel} does not accept audio input; dropping audio content.` + ) + return null + } + try { + return await fetchAudioContentPart(plugin, content) + } catch { + return null } - } catch { - return null } + + return content }) ) - msg.content = mappedContent.filter((content) => content != null) + msg.content = mappedContent.filter( + (content) => content != null + ) as ChatCompletionResponseMessage['content'] } result.push(msg) @@ -676,6 +680,48 @@ export async function fetchFileLikeUrl( } } +const AUDIO_MIME_TO_FORMAT: Record = { + 'audio/mpeg': 'mp3', + 'audio/mp3': 'mp3', + 'audio/wav': 'wav', + 'audio/x-wav': 'wav', + 'audio/flac': 'flac', + 'audio/x-flac': 'flac', + 'audio/ogg': 'ogg', + 'audio/mp4': 'mp4', + 'audio/aac': 'aac', + 'audio/webm': 'webm' +} + +function audioMimeToFormat(mime: string): string { + return AUDIO_MIME_TO_FORMAT[mime.toLowerCase()] ?? 
'mp3' +} + +/** + * Fetch an `audio_url` content part and convert it to the OpenAI-compatible + * `input_audio` shape used by gpt-4o-audio / MiMo. Returns `null` when the + * encoded payload exceeds {@link DEFAULT_AUDIO_MAX_BASE64_BYTES}. + */ +async function fetchAudioContentPart( + plugin: ChatLunaPlugin, + content: MessageContentFileLike & { type: 'audio_url' } +): Promise { + const { buffer, mimeType } = await fetchFileLikeUrl(plugin, content) + const base64 = buffer.toString('base64') + + if (base64.length > DEFAULT_AUDIO_MAX_BASE64_BYTES) { + return null + } + + return { + type: 'input_audio', + input_audio: { + data: base64, + format: audioMimeToFormat(mimeType) + } + } as unknown as MessageContentComplex +} + export function messageTypeToOpenAIRole( type: MessageType ): ChatCompletionResponseMessageRoleEnum { From e00ea561222642e22980ddced5f5b6018c9eb471 Mon Sep 17 00:00:00 2001 From: dingyi Date: Mon, 18 May 2026 16:12:49 +0800 Subject: [PATCH 4/7] [Fix] harden multimodal audio handling --- packages/adapter-openai/src/client.ts | 12 ++++---- .../service-multimodal/src/plugins/audio.ts | 4 ++- .../src/plugins/read_files.ts | 29 +++++++++++++++---- packages/service-multimodal/src/utils.ts | 4 +++ packages/shared-adapter/src/utils.ts | 27 ++++++++++++++--- 5 files changed, 58 insertions(+), 18 deletions(-) diff --git a/packages/adapter-openai/src/client.ts b/packages/adapter-openai/src/client.ts index 0e0f0ffc1..9251a5fdc 100644 --- a/packages/adapter-openai/src/client.ts +++ b/packages/adapter-openai/src/client.ts @@ -67,13 +67,11 @@ export class OpenAIClient extends PlatformModelAndEmbeddingsClient (model) => !( model.includes('instruct') || - [ - 'whisper', - 'tts', - 'dall-e', - 'audio', - 'realtime' - ].some((keyword) => model.includes(keyword)) + ['whisper', 'tts', 'dall-e', 'realtime'].some( + (keyword) => model.includes(keyword) + ) || + (model.includes('audio') && + !supportAudioInput(model)) ) ) .map((model) => { diff --git 
a/packages/service-multimodal/src/plugins/audio.ts b/packages/service-multimodal/src/plugins/audio.ts index 6afad7fab..8d0a6ea15 100644 --- a/packages/service-multimodal/src/plugins/audio.ts +++ b/packages/service-multimodal/src/plugins/audio.ts @@ -30,7 +30,9 @@ const MIME_TO_EXT: Record = { 'audio/wav': 'wav', 'audio/flac': 'flac', 'audio/ogg': 'ogg', - 'audio/mp4': 'm4a' + 'audio/mp4': 'm4a', + 'audio/aac': 'aac', + 'audio/webm': 'webm' } /** diff --git a/packages/service-multimodal/src/plugins/read_files.ts b/packages/service-multimodal/src/plugins/read_files.ts index 06fe42d33..84280670a 100644 --- a/packages/service-multimodal/src/plugins/read_files.ts +++ b/packages/service-multimodal/src/plugins/read_files.ts @@ -109,10 +109,7 @@ export class ReadFilesTool extends StructuredTool { fetched.buffer, declared ) - const mime = - declared?.startsWith('audio/') || detectedAudio - ? detectedAudio - : declared + const mime = detectedAudio ?? declared if (!mime) { pushError( @@ -323,8 +320,7 @@ export class ReadFilesTool extends StructuredTool { }) return { buffer: Buffer.from(response.data), - contentType: - (response.headers as Headers)?.get?.('content-type') ?? null + contentType: getHeaderValue(response.headers, 'content-type') } } catch { return null @@ -413,6 +409,27 @@ function pushNative( report.successCount++ } +function getHeaderValue(headers: unknown, name: string): string | null { + if (headers == null) return null + + if (typeof (headers as { get?: unknown }).get === 'function') { + const value = (headers as { get(name: string): string | null }).get( + name + ) + return typeof value === 'string' ? value : null + } + + const record = headers as Record + const lower = name.toLowerCase() + for (const key of Object.keys(record)) { + if (key.toLowerCase() === lower) { + const value = record[key] + return typeof value === 'string' ? 
value : null + } + } + return null +} + function isHttp(url: string): boolean { try { const { protocol } = new URL(url) diff --git a/packages/service-multimodal/src/utils.ts b/packages/service-multimodal/src/utils.ts index 564ccb9f0..55ecfdbb4 100644 --- a/packages/service-multimodal/src/utils.ts +++ b/packages/service-multimodal/src/utils.ts @@ -94,6 +94,8 @@ export function detectAudioMimeType( const head = buffer.subarray(0, 16).toString('latin1') if (head.startsWith('#!AMR')) return 'audio/amr' + // QQ/OneBot ships SILK voice files with a leading flag byte before the + // standard `#!SILK_V3` magic, so we also check offset 1 for that variant. if ( head.startsWith('#!SILK_V3') || buffer.subarray(1, 10).toString('latin1') === '#!SILK_V3' @@ -135,6 +137,8 @@ export async function convertAudioToMp3( } try { + // Match both the standard SILK magic and the QQ/OneBot variant that + // prepends a flag byte before `#!SILK_V3`. const isSilk = buffer.subarray(0, 9).toString('latin1') === '#!SILK_V3' || buffer.subarray(1, 10).toString('latin1') === '#!SILK_V3' diff --git a/packages/shared-adapter/src/utils.ts b/packages/shared-adapter/src/utils.ts index f88b5fc4a..fe130f7fc 100644 --- a/packages/shared-adapter/src/utils.ts +++ b/packages/shared-adapter/src/utils.ts @@ -392,9 +392,22 @@ export async function langchainMessageToOpenAIMessage( return null } try { - return await fetchAudioContentPart(plugin, content) - } catch { - return null + const part = await fetchAudioContentPart( + plugin, + content + ) + if (part == null) { + logger.warn( + `Audio content for model ${normalizedModel} was dropped (exceeded size limits or no data).` + ) + } + return part + } catch (err) { + logger.error( + `Failed to fetch audio part for model ${normalizedModel}`, + err + ) + throw err } } @@ -694,7 +707,13 @@ const AUDIO_MIME_TO_FORMAT: Record = { } function audioMimeToFormat(mime: string): string { - return AUDIO_MIME_TO_FORMAT[mime.toLowerCase()] ?? 
'mp3' + const format = AUDIO_MIME_TO_FORMAT[mime.toLowerCase()] + if (!format) { + throw new Error( + `Unsupported audio MIME for OpenAI input_audio: ${mime}` + ) + } + return format } /** From a66e7090273efcfc48de5774739e0f25534213b8 Mon Sep 17 00:00:00 2001 From: dingyi Date: Tue, 19 May 2026 03:58:30 +0800 Subject: [PATCH 5/7] fix(servuce-multimodal): inline read_files schema in tool definition --- .../src/plugins/read_files.ts | 30 ++++++++++++------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/packages/service-multimodal/src/plugins/read_files.ts b/packages/service-multimodal/src/plugins/read_files.ts index 84280670a..f987defdc 100644 --- a/packages/service-multimodal/src/plugins/read_files.ts +++ b/packages/service-multimodal/src/plugins/read_files.ts @@ -29,15 +29,6 @@ import z from 'zod' const DEFAULT_MAX_FILE_SIZE_BYTES = 100 * 1024 * 1024 const DEFAULT_MAX_TOTAL_SIZE_BYTES = 100 * 1024 * 1024 -const fileSchema = z.object({ url: z.string().url() }) -const readFilesSchema = z.object({ - files: z - .union([fileSchema, z.array(fileSchema).min(1).max(10)]) - .describe( - 'One file or a list of files to read (max 10). File format: { url: string }. MIME type is inferred from response headers, then URL extension.' - ) -}) - interface NativePart { mimeType: string base64Data: string @@ -50,7 +41,26 @@ interface NativePart { export class ReadFilesTool extends StructuredTool { name = 'read_files' - schema = readFilesSchema + schema = z.object({ + files: z + .union([ + z.object({ + url: z.string().url() + }), + z + .array( + z.object({ + url: z.string().url() + }) + ) + .min(1) + .max(10) + ]) + .describe( + 'One file or a list of files to read (max 10). File format: { url: string }. MIME type is inferred from response headers, then URL extension.' 
+ ) + }) + description: string constructor( From 58b0247422d4b4331f21e508c8e2a8b5db9efbf5 Mon Sep 17 00:00:00 2001 From: dingyi Date: Tue, 19 May 2026 04:03:04 +0800 Subject: [PATCH 6/7] fix(service-multimodal): handle stringified file lists in read_files --- .../src/plugins/read_files.ts | 45 +++++++++++++------ 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/packages/service-multimodal/src/plugins/read_files.ts b/packages/service-multimodal/src/plugins/read_files.ts index f987defdc..43585655e 100644 --- a/packages/service-multimodal/src/plugins/read_files.ts +++ b/packages/service-multimodal/src/plugins/read_files.ts @@ -43,19 +43,38 @@ export class ReadFilesTool extends StructuredTool { name = 'read_files' schema = z.object({ files: z - .union([ - z.object({ - url: z.string().url() - }), - z - .array( - z.object({ - url: z.string().url() - }) - ) - .min(1) - .max(10) - ]) + .preprocess( + (arg: unknown) => { + if (typeof arg === 'string') { + const base = JSON.parse(arg) + if ( + typeof base === 'object' && + typeof base['files'] === 'string' + ) { + try { + base['files'] = JSON.parse(base['files']) + return base + } catch { + return base + } + } + } + return arg + }, + z.union([ + z.object({ + url: z.string().url() + }), + z + .array( + z.object({ + url: z.string().url() + }) + ) + .min(1) + .max(10) + ]) + ) .describe( 'One file or a list of files to read (max 10). File format: { url: string }. MIME type is inferred from response headers, then URL extension.' 
) From 64d3eef080018ba04bf7d21808b7c165060a8f07 Mon Sep 17 00:00:00 2001 From: dingyi Date: Tue, 19 May 2026 05:58:17 +0800 Subject: [PATCH 7/7] [Fix] handle read_files GIF injection limits --- .../src/plugins/read_files.ts | 41 +++++++++++++------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/packages/service-multimodal/src/plugins/read_files.ts b/packages/service-multimodal/src/plugins/read_files.ts index 43585655e..1d40cd135 100644 --- a/packages/service-multimodal/src/plugins/read_files.ts +++ b/packages/service-multimodal/src/plugins/read_files.ts @@ -226,22 +226,36 @@ export class ReadFilesTool extends StructuredTool { } if (mime === 'image/gif') { + let pushed = 0 const frames = await parseGifToFrames(fetched.buffer, { strategy: this.config.gifStrategy, frameCount: this.config.gifFrameCount }) for (const frame of frames) { const frameBase64 = frame.split(',')[1] - const frameSize = getBase64EncodedSize( - Buffer.from(frameBase64, 'base64').byteLength + const buf = Buffer.from(frameBase64, 'base64') + const sizeError = checkSize( + buf, + 'image/png', + fileConfig, + totalBytes, + maxTotal ) - if (totalBytes + frameSize > maxTotal) { + if (sizeError) { + if (pushed < 1) { + pushError( + report, + sourceUrl, + sizeError, + 'image/png' + ) + } logger.warn( 'Skipping remaining GIF frames due to total size limit' ) break } - totalBytes += frameSize + totalBytes += getBase64EncodedSize(buf.byteLength) pushNative( report, native, @@ -249,6 +263,7 @@ export class ReadFilesTool extends StructuredTool { 'image/png', frameBase64 ) + pushed++ } } else { totalBytes += getBase64EncodedSize( @@ -308,6 +323,7 @@ export class ReadFilesTool extends StructuredTool { } } + const injected = native.length > 0 && !!conversationId if (native.length > 0 && conversationId) { this.ctx.chatluna.contextManager.inject({ conversationId, @@ -326,14 +342,15 @@ export class ReadFilesTool extends StructuredTool { return JSON.stringify({ response: report, - note: - 
native.length > 0 - ? `Successfully read ${native.length} file(s). The file content has been added to the conversation context and will be available in the next turn.` - : describedCount > 0 - ? `Described ${describedCount} image file(s) using the vision model.` - : report.failureCount > 0 - ? `Failed to read ${report.failureCount} file(s).` - : 'No files were processed.' + note: injected + ? `Successfully read ${native.length} file(s). The file content has been added to the conversation context and will be available in the next turn.` + : native.length > 0 + ? `Successfully read ${native.length} file(s), but no conversation id was available, so the file content was not added to the conversation context.` + : describedCount > 0 + ? `Described ${describedCount} image file(s) using the vision model.` + : report.failureCount > 0 + ? `Failed to read ${report.failureCount} file(s).` + : 'No files were processed.' }) }