diff --git a/packages/adapter-claude/src/utils.ts b/packages/adapter-claude/src/utils.ts index 06c8f8e9e..b9d38a422 100644 --- a/packages/adapter-claude/src/utils.ts +++ b/packages/adapter-claude/src/utils.ts @@ -56,41 +56,15 @@ export async function langchainMessageToClaudeMessage( const mappedMessages = await Promise.all( messages.map(async (rawMessage) => { - let content: string | ClaudeInputContentBlockParam[] | undefined = + const content: string | ClaudeInputContentBlockParam[] | undefined = typeof rawMessage.content === 'string' ? rawMessage.content : await processMessageContent(plugin, rawMessage.content) - const images = rawMessage.additional_kwargs.images as - | string[] - | null - - if ( - (model?.includes('claude-3') || model?.includes('claude-4')) && - images != null - ) { - const mappedImages = await Promise.all( - images.map(async (image) => - processImageContent(plugin, { - type: 'image_url', - image_url: { url: image } - } as MessageContentImageUrl) - ) + if (rawMessage.additional_kwargs.images != null) { + logger.warn( + 'Deprecated: `additional_kwargs.images` is no longer supported. Use `image_url` content parts instead.' ) - - const nextContent: ClaudeInputContentBlockParam[] = - mappedImages.filter((item) => item != null) - - if (Array.isArray(content)) { - nextContent.push(...content) - } else if ((content?.length ?? 0) > 0) { - nextContent.push({ - type: 'text', - text: content - }) - } - - content = nextContent } const result: ClaudeMessage = { diff --git a/packages/adapter-gemini/src/utils.ts b/packages/adapter-gemini/src/utils.ts index 3875a1b95..9f5a0c5ba 100644 --- a/packages/adapter-gemini/src/utils.ts +++ b/packages/adapter-gemini/src/utils.ts @@ -14,7 +14,6 @@ import { ChatCompletionResponseMessageRoleEnum, ChatFunctionCallingPart, ChatFunctionResponsePart, - ChatMessagePart, ChatPart, ChatResponse, GeminiUsageMetadata @@ -78,9 +77,10 @@ export async function langchainMessageToGeminiMessage( thoughtData ) - const images = message.additional_kwargs.images as string[] | null - if (images) { - processImageParts(result, images, model) + if (message.additional_kwargs.images != null) { + logger.warn( + 'Deprecated: `additional_kwargs.images` is no longer supported. Use `image_url` content parts instead.' + ) } return result @@ -203,39 +203,6 @@ async function processFunctionMessage( ] } } -function processImageParts( - result: ChatCompletionResponseMessage, - images: string[], - model: string -) { - if ( - !( - (model.includes('vision') || - model.includes('gemini') || - model.includes('gemma2')) && - !model.includes('gemini-1.0') - ) - ) { - return - } - - for (const image of images) { - const mineType = image.split(';')?.[0]?.split(':')?.[1] ?? 'image/jpeg' - const data = image.replace(/^data:image\/\w+;base64,/, '') - - result.parts.push({ - inline_data: { data, mime_type: mineType } - }) - } - - result.parts = result.parts.filter((uncheckedPart) => { - const part = partAsTypeCheck( - uncheckedPart, - (part) => part['text'] != null - ) - return part == null || part.text.length > 0 - }) -} async function processGeminiImageContent( plugin: ChatLunaPlugin, diff --git a/packages/adapter-ollama/src/utils.ts b/packages/adapter-ollama/src/utils.ts index 30f671cc6..946e2bab1 100644 --- a/packages/adapter-ollama/src/utils.ts +++ b/packages/adapter-ollama/src/utils.ts @@ -21,25 +21,24 @@ export async function langchainMessageToOllamaMessage( const mappedMessage = await Promise.all( messages.map(async (rawMessage) => { - let images: string[] = [] - - if (rawMessage.additional_kwargs.images != null && supportImage) { - images = rawMessage.additional_kwargs.images as string[] - } else { - images = - typeof rawMessage.content === 'string' - ? undefined - : await Promise.all( - rawMessage.content - .filter((part) => - isMessageContentImageUrl(part) - ) - .map((part) => - processOllamaImageContent(plugin, part) - ) - ) + if (rawMessage.additional_kwargs.images != null) { + logger.warn( + 'Deprecated: `additional_kwargs.images` is no longer supported. Use `image_url` content parts instead.' + ) } + const images: string[] | undefined = supportImage + ? typeof rawMessage.content === 'string' + ? undefined + : await Promise.all( + rawMessage.content + .filter((part) => isMessageContentImageUrl(part)) + .map((part) => + processOllamaImageContent(plugin, part) + ) + ) + : undefined + const result = { role: messageTypeToOllamaRole(rawMessage.getType()), content: getMessageContent(rawMessage.content), diff --git a/packages/adapter-openai-like/src/client.ts b/packages/adapter-openai-like/src/client.ts index 20403fdf3..3435f9df2 100644 --- a/packages/adapter-openai-like/src/client.ts +++ b/packages/adapter-openai-like/src/client.ts @@ -20,10 +20,12 @@ import { OpenAIRequester } from './requester' import { ChatLunaPlugin } from 'koishi-plugin-chatluna/services/chat' import { getModelMaxContextSize, + getOpenAIFileHandlingConfig, isEmbeddingModel, isImageGenerationModel, isNonLLMModel, isRerankerModel, + supportAudioInput, supportImageInput } from '@chatluna/v1-shared-adapter' import { RunnableConfig } from '@langchain/core/runnables' @@ -92,6 +94,9 @@ export class OpenAIClient extends PlatformModelEmbeddingsAndRerankerClient { ModelCapabilities.ToolCall, supportImageInput(model) ? ModelCapabilities.ImageInput + : null, + supportAudioInput(model) + ? ModelCapabilities.AudioInput : null ].filter(Boolean) } @@ -167,6 +172,7 @@ export class OpenAIClient extends PlatformModelEmbeddingsAndRerankerClient { temperature: this._config.temperature, maxRetries: this._config.maxRetries, llmType: 'openai', + fileHandlingConfig: getOpenAIFileHandlingConfig(model), isThinkModel: model.includes('reasoner') || model.includes('r1') || diff --git a/packages/adapter-openai/src/client.ts b/packages/adapter-openai/src/client.ts index 6a4857855..9251a5fdc 100644 --- a/packages/adapter-openai/src/client.ts +++ b/packages/adapter-openai/src/client.ts @@ -20,6 +20,8 @@ import { OpenAIRequester } from './requester' import { ChatLunaPlugin } from 'koishi-plugin-chatluna/services/chat' import { getModelMaxContextSize, + getOpenAIFileHandlingConfig, + supportAudioInput, supportImageInput } from '@chatluna/v1-shared-adapter' import { RunnableConfig } from '@langchain/core/runnables' @@ -65,13 +67,11 @@ export class OpenAIClient extends PlatformModelAndEmbeddingsClient (model) => !( model.includes('instruct') || - [ - 'whisper', - 'tts', - 'dall-e', - 'audio', - 'realtime' - ].some((keyword) => model.includes(keyword)) + ['whisper', 'tts', 'dall-e', 'realtime'].some( + (keyword) => model.includes(keyword) + ) || + (model.includes('audio') && + !supportAudioInput(model)) ) ) .map((model) => { @@ -84,6 +84,9 @@ export class OpenAIClient extends PlatformModelAndEmbeddingsClient ModelCapabilities.ToolCall, supportImageInput(model) ? ModelCapabilities.ImageInput + : undefined, + supportAudioInput(model) + ? ModelCapabilities.AudioInput : undefined ].filter(Boolean) } as ModelInfo @@ -125,6 +128,7 @@ export class OpenAIClient extends PlatformModelAndEmbeddingsClient timeout: this._config.timeout, temperature: this._config.temperature, maxRetries: this._config.maxRetries, + fileHandlingConfig: getOpenAIFileHandlingConfig(model), llmType: 'openai' }) } diff --git a/packages/adapter-qwen/src/utils.ts b/packages/adapter-qwen/src/utils.ts index 09201bb36..9491127cf 100644 --- a/packages/adapter-qwen/src/utils.ts +++ b/packages/adapter-qwen/src/utils.ts @@ -5,7 +5,6 @@ import { ChatMessageChunk, FunctionMessageChunk, HumanMessageChunk, - MessageContentImageUrl, MessageType, SystemMessageChunk, ToolMessage, @@ -21,11 +20,11 @@ import { } from './types' import { fetchImageUrl, - removeAdditionalProperties, - supportImageInput + removeAdditionalProperties } from '@chatluna/v1-shared-adapter' import { ChatLunaPlugin } from 'koishi-plugin-chatluna/services/chat' import { isZodSchemaV3 } from '@langchain/core/utils/types' +import { logger } from '.' export function formatToolsToQWenTools( tools: StructuredTool[] @@ -113,50 +112,13 @@ export async function langchainMessageToQWenMessage( } } - const images = rawMessage.additional_kwargs.images as string[] | null - - if ( - (model?.includes('qwen-vl') || - model?.includes('omni') || - model?.includes('qwen2.5-vl') || - model?.includes('qwen2.5-omni') || - model?.includes('qwen-omni') || - model?.includes('qwen2-vl') || - model?.includes('qvq') || - supportImageInput(model)) && - images != null - ) { - msg.content = [ - { - type: 'text', - text: rawMessage.content as string - } - ] - - const imageContents = await Promise.all( - images.map(async (image) => { - try { - const url = await fetchImageUrl(plugin, { - type: 'image_url', - image_url: { url: image } - } as MessageContentImageUrl) - return { - type: 'image_url', - image_url: { - url, - detail: 'low' - } - } as const - } catch { - return null - } - }) + if (rawMessage.additional_kwargs.images != null) { + logger.warn( + 'Deprecated: `additional_kwargs.images` is no longer supported. Use `image_url` content parts instead.' ) + } - msg.content.push( - ...imageContents.filter((content) => content != null) - ) - } else if (Array.isArray(msg.content) && msg.content.length > 0) { + if (Array.isArray(msg.content) && msg.content.length > 0) { const mappedContent = await Promise.all( msg.content.map(async (content) => { if (!isMessageContentImageUrl(content)) return content diff --git a/packages/service-multimodal/README.md b/packages/service-multimodal/README.md index e96b4badc..84e92d7bf 100644 --- a/packages/service-multimodal/README.md +++ b/packages/service-multimodal/README.md @@ -1,7 +1,7 @@ -## koishi-plugin-chatluna-long-memory +## koishi-plugin-chatluna-multimodal-service -## [![npm](https://img.shields.io/npm/v/koishi-plugin-chatluna-long-memory)](https://www.npmjs.com/package/koishi-plugin-chatluna-long-memory) [![npm](https://img.shields.io/npm/dm/koishi-plugin-chatluna-long-memory)](https://www.npmjs.com/package//koishi-plugin-chatluna-long-memory) +## [![npm](https://img.shields.io/npm/v/koishi-plugin-chatluna-multimodal-service)](https://www.npmjs.com/package/koishi-plugin-chatluna-multimodal-service) [![npm](https://img.shields.io/npm/dm/koishi-plugin-chatluna-multimodal-service)](https://www.npmjs.com/package/koishi-plugin-chatluna-multimodal-service) -> 提供长期记忆支持的插件 +> ChatLuna 的多模态服务插件,提供上下文图像/语音描述、GIF 处理与 `read_files` 文件读取工具。 -[长期记忆文档](https://chatluna.chat/ecosystem/renderer/image.html) +[多模态插件文档](https://chatluna.chat/ecosystem/plugin/multimodal-service.html) diff --git a/packages/service-multimodal/src/index.ts b/packages/service-multimodal/src/index.ts index f1e7588a6..62ff61556 100644 --- a/packages/service-multimodal/src/index.ts +++ b/packages/service-multimodal/src/index.ts @@ -83,7 +83,7 @@ export const Config: Schema = Schema.intersect([ export const inject = { required: ['chatluna'], - optional: ['chatluna_storage', 'ffmpeg', 'silk'] + optional: ['ffmpeg', 'silk'] } export const name = 'chatluna-multimodal-service' diff --git a/packages/service-multimodal/src/plugins/audio.ts b/packages/service-multimodal/src/plugins/audio.ts index f8ce7fbdb..8d0a6ea15 100644 --- a/packages/service-multimodal/src/plugins/audio.ts +++ b/packages/service-multimodal/src/plugins/audio.ts @@ -1,92 +1,95 @@ import { MessageContentComplex } from '@langchain/core/messages' import { Context, h, Session } from 'koishi' import type { OneBotBot } from 'koishi-plugin-adapter-onebot' -import { Message } from 'koishi-plugin-chatluna' import { ModelCapabilities } from 'koishi-plugin-chatluna/llm-core/platform/types' -import type {} from 'koishi-plugin-chatluna-storage-service' import type {} from 'koishi-plugin-ffmpeg-path' import { Config, logger } from '..' +import { + BROWSER_UA, + convertAudioToMp3, + detectAudioMimeType, + ensureContentArray +} from '../utils' + +// MIMEs commonly accepted by OpenAI / Gemini / MiMo audio inputs. Anything +// else (Silk, AMR, ...) is transcoded to MP3. +const NATIVE_AUDIO_MIMES = new Set([ + 'audio/mpeg', + 'audio/mp3', + 'audio/wav', + 'audio/flac', + 'audio/ogg', + 'audio/mp4', + 'audio/aac', + 'audio/webm' +]) + +const MIME_TO_EXT: Record = { + 'audio/mpeg': 'mp3', + 'audio/mp3': 'mp3', + 'audio/wav': 'wav', + 'audio/flac': 'flac', + 'audio/ogg': 'ogg', + 'audio/mp4': 'm4a', + 'audio/aac': 'aac', + 'audio/webm': 'webm' +} -const CHATLUNA_DOWNLOAD_USER_AGENT = - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36' -const MAX_AUDIO_BYTES = 25 * 1024 * 1024 - +/** + * Intercept voice/audio elements: download, transcode unfriendly formats + * (Silk/AMR/...) to MP3, then inject as a Base64 `audio_url` content part. + * OpenAI-compatible adapters convert the result to `input_audio` downstream. + */ export function apply(ctx: Context, config: Config) { - if (!config.enableAudioFfmpegConversion) { - return - } + if (!config.enableAudioFfmpegConversion) return ctx.effect(() => ctx.chatluna.messageTransformer.intercept( 'audio', async (session, element, message, model) => { - const modelInfo = model - ? ctx.chatluna.platform.findModel(model) - : undefined + if (!modelAcceptsAudio(ctx, model)) return false - // If the model doesn't accept audio input, keep fallback path unchanged. - if ( - modelInfo?.value?.capabilities?.includes( - ModelCapabilities.AudioInput - ) === false - ) { - return false - } + const sourceUrl = await resolveAudioSourceUrl(session, element) + if (!sourceUrl) return false - const sourceUrl = await resolveAudioSourceUrl( - ctx, - session, - element - ) - if (!sourceUrl) { - return false - } + const buffer = await downloadAudio(ctx, sourceUrl) + if (!buffer) return false - const fileName = - element.attrs['file'] ?? - element.attrs['name'] ?? - element.attrs['filename'] + const detected = detectAudioMimeType( + buffer, + element.attrs['mime'] as string | null + ) - const fileData = await readFile(ctx, sourceUrl) - if (!fileData.buffer) { - return false - } + let outBuffer = buffer + let outMime = detected ?? 'audio/mpeg' - const converted = await tryConvertAudioToMp3( - ctx, - fileData.buffer, - fileName - ) - if (!converted) { - logger.warn(`Failed to convert audio to MP3: ${sourceUrl}`) - return false + if (!detected || !NATIVE_AUDIO_MIMES.has(detected)) { + const converted = await convertAudioToMp3(ctx, buffer) + if (!converted) { + logger.warn( + `Skip audio: format ${detected ?? 'unknown'} not natively supported and ffmpeg conversion failed.` + ) + return false + } + outBuffer = converted + outMime = 'audio/mpeg' } - const { fileName: displayFileName, buffer } = converted - element.attrs['file'] = displayFileName - element.attrs['filename'] = displayFileName - - const audioUrl = ctx.chatluna_storage - ? (element.attrs['chatluna_file_url'] = ( - await ctx.chatluna_storage.createTempFile( - buffer, - displayFileName - ) - ).url) - : ((element.attrs['chatluna_file_url'] = sourceUrl), - `data:audio/mpeg;base64,${buffer.toString('base64')}`) + const dataUrl = `data:${outMime};base64,${outBuffer.toString('base64')}` + const ext = MIME_TO_EXT[outMime] ?? 'mp3' + const fileName = `${stripExtension(audioName(element))}.${ext}` + element.attrs['file'] = fileName + element.attrs['filename'] = fileName + element.attrs['chatluna_file_url'] = sourceUrl - ensureContentArray(message, `[voice:${displayFileName}]`) + ensureContentArray(message, `[voice:${fileName}]`) ;(message.content as MessageContentComplex[]).push({ type: 'audio_url', - audio_url: { - url: audioUrl, - mimeType: 'audio/mpeg' - } + audio_url: { url: dataUrl, mimeType: outMime } } as unknown as MessageContentComplex) logger.debug( - `Transcoded unsupported audio to mp3 for multimodal input: ${displayFileName}` + `Injected audio for ${model}: ${fileName} (${outMime}, ${outBuffer.byteLength} bytes)` ) return true }, @@ -95,22 +98,28 @@ export function apply(ctx: Context, config: Config) { ) } +function modelAcceptsAudio(ctx: Context, model: string | undefined): boolean { + if (!model) return false + return ( + ctx.chatluna.platform + .findModel(model) + ?.value?.capabilities?.includes(ModelCapabilities.AudioInput) === + true + ) +} + async function resolveAudioSourceUrl( - ctx: Context, session: Session, element: h ): Promise { - const srcAttr = (element.attrs['src'] ?? element.attrs['url']) as + const src = (element.attrs['src'] ?? element.attrs['url']) as | string | undefined - if (srcAttr?.startsWith('http')) { - return srcAttr - } - - if (session.platform !== 'onebot') return srcAttr ?? null + if (src?.startsWith('http')) return src + if (session.platform !== 'onebot') return src ?? null const fileId = element.attrs['fileId'] ?? element.attrs['fileid'] - if (!fileId) return srcAttr ?? null + if (!fileId) return src ?? null try { const bot = session.bot as OneBotBot @@ -119,239 +128,37 @@ async function resolveAudioSourceUrl( ? await bot.internal.getPrivateFileUrl(session.userId, fileId) : await bot.internal.getGroupFileUrl(session.guildId, fileId, busId) } catch { - return srcAttr ?? null + return src ?? null } } -async function readFile( +async function downloadAudio( ctx: Context, url: string -): Promise<{ buffer: Buffer | null; mimeType: string | null }> { - const headers = { 'User-Agent': CHATLUNA_DOWNLOAD_USER_AGENT } - - let sanitizedUrl: string - try { - const parsed = new URL(url) - sanitizedUrl = parsed.origin + parsed.pathname - } catch { - sanitizedUrl = url - } - - let mimeTypeFromHead: string | null = null - - // Try HEAD request for size check - try { - const headResponse = await ctx.http(url, { method: 'head', headers }) - const headHeaders: Headers = headResponse?.headers - mimeTypeFromHead = - headHeaders - ?.get('content-type') - ?.split(';')[0] - ?.trim() - ?.toLowerCase() ?? null - - const headContentLength = headHeaders?.get('content-length') - ? Number(headHeaders.get('content-length')) - : null - - if ( - headContentLength != null && - Number.isFinite(headContentLength) && - headContentLength > MAX_AUDIO_BYTES - ) { - logger.warn( - `Skip reading oversized audio from ${sanitizedUrl}: ${headContentLength} bytes > ${MAX_AUDIO_BYTES} bytes` - ) - return { buffer: null, mimeType: mimeTypeFromHead } - } - } catch { - // Some endpoints do not support HEAD; continue with GET safeguards. - } - - try { - const response = await fetch(url, { method: 'GET', headers }) - - if (!response.ok) { - throw new Error(`HTTP ${response.status}`) - } - - const mimeType = - response.headers - .get('content-type') - ?.split(';')[0] - ?.trim() - ?.toLowerCase() ?? mimeTypeFromHead - const responseContentLength = response.headers.get('content-length') - ? Number(response.headers.get('content-length')) - : null - - if ( - responseContentLength != null && - Number.isFinite(responseContentLength) && - responseContentLength > MAX_AUDIO_BYTES - ) { - logger.warn( - `Skip reading oversized audio from ${sanitizedUrl}: ${responseContentLength} bytes > ${MAX_AUDIO_BYTES} bytes` - ) - return { buffer: null, mimeType } - } - - if (response.body == null) { - const arrayBuffer = await response.arrayBuffer() - if (arrayBuffer.byteLength > MAX_AUDIO_BYTES) { - logger.warn( - `Skip reading oversized audio from ${sanitizedUrl}: ${arrayBuffer.byteLength} bytes > ${MAX_AUDIO_BYTES} bytes` - ) - return { buffer: null, mimeType } - } - return { buffer: Buffer.from(arrayBuffer), mimeType } - } - - const reader = response.body.getReader() - const chunks: Buffer[] = [] - let totalBytes = 0 - - while (true) { - const { done, value } = await reader.read() - if (done) break - - if (!value?.byteLength) continue - - totalBytes += value.byteLength - if (totalBytes > MAX_AUDIO_BYTES) { - await reader.cancel('audio exceeds max size') - logger.warn( - `Skip reading oversized audio from ${sanitizedUrl}: streamed bytes exceed ${MAX_AUDIO_BYTES} bytes` - ) - return { buffer: null, mimeType } - } - - chunks.push(Buffer.from(value)) - } - - return { buffer: Buffer.concat(chunks, totalBytes), mimeType } - } catch (error) { - logger.warn(`Failed to read audio from ${sanitizedUrl}:`, error) - return { buffer: null, mimeType: null } - } -} - -function toMp3FileName(fileName?: string): string { - const baseName = (fileName ?? 'voice').trim() - const dotIndex = baseName.lastIndexOf('.') - return `${dotIndex <= 0 ? baseName : baseName.slice(0, dotIndex)}.mp3` -} - -async function tryConvertAudioToMp3( - ctx: Context, - inputBuffer: Buffer, - fileName?: string -): Promise<{ buffer: Buffer; fileName: string } | null> { +): Promise { try { - let sourceBuffer = inputBuffer - let decodedPcmSampleRate: number | null = null - - if (isSilkAudio(inputBuffer)) { - const decoded = await decodeSilkAudio(ctx, inputBuffer) - sourceBuffer = decoded.buffer - decodedPcmSampleRate = decoded.sampleRate - logger.debug('Decoded silk audio before mp3 transcoding.') - } - - const ffmpeg = ctx.ffmpeg - if (!ffmpeg) { - throw new Error( - 'FFmpeg service is unavailable. Please enable koishi-plugin-ffmpeg-path.' - ) - } - - const builder = ffmpeg.builder().input(sourceBuffer) - if (decodedPcmSampleRate != null) { - builder.inputOption( - '-f', - 's16le', - '-ar', - String(decodedPcmSampleRate), - '-ac', - '1' - ) - } - - const outputBuffer = await builder - .outputOption( - '-vn', - '-acodec', - 'libmp3lame', - '-q:a', - '4', - '-f', - 'mp3' - ) - .run('buffer') - - return { - buffer: outputBuffer, - fileName: toMp3FileName(fileName) - } + const { data } = await ctx.http(url, { + responseType: 'arraybuffer', + method: 'get', + headers: { 'User-Agent': BROWSER_UA } + }) + return Buffer.from(data) } catch (error) { - logger.warn( - `Audio transcoding to mp3 failed, fallback to original audio: ${error instanceof Error ? error.message : String(error)}` - ) + logger.warn(`Failed to fetch audio from ${url}:`, error) return null } } -function isSilkAudio(inputBuffer: Buffer): boolean { - if (inputBuffer.length < 9) return false - const sig = inputBuffer.subarray(0, 9).toString('latin1') +function audioName(element: h): string { return ( - sig === '#!SILK_V3' || - inputBuffer.subarray(1, 10).toString('latin1') === '#!SILK_V3' + (element.attrs['file'] as string | undefined) ?? + (element.attrs['name'] as string | undefined) ?? + (element.attrs['filename'] as string | undefined) ?? + 'voice' ) } -async function decodeSilkAudio( - ctx: Context, - inputBuffer: Buffer -): Promise<{ buffer: Buffer; sampleRate: number }> { - const silk = ctx.silk - if (!silk) { - throw new Error( - 'Detected silk audio, but no silk service is available for decoding' - ) - } - for (const sampleRate of [24000, 16000, 12000, 8000]) { - try { - const result = (await silk.decode( - inputBuffer, - sampleRate - )) as DecodeResult - - if (result?.data != null) { - return { buffer: Buffer.from(result.data), sampleRate } - } - } catch { - continue - } - } - - throw new Error('silk decode returned empty output') -} - -function ensureContentArray(message: Message, fallbackText: string) { - if (typeof message.content === 'string') { - message.content = [ - { - type: 'text', - text: message.content.trim().length - ? message.content - : fallbackText - } - ] - } -} - -interface DecodeResult { - data: Uint8Array - duration: number +function stripExtension(name: string): string { + const dot = name.lastIndexOf('.') + return dot > 0 ? name.slice(0, dot) : name } diff --git a/packages/service-multimodal/src/plugins/image.ts b/packages/service-multimodal/src/plugins/image.ts index e29db3c56..f1b64e51a 100644 --- a/packages/service-multimodal/src/plugins/image.ts +++ b/packages/service-multimodal/src/plugins/image.ts @@ -1,6 +1,6 @@ -/* eslint-disable max-len */ import { Context } from 'koishi' import { Message } from 'koishi-plugin-chatluna' +import { ChatLunaChatModel } from 'koishi-plugin-chatluna/llm-core/platform/model' import { ModelCapabilities } from 'koishi-plugin-chatluna/llm-core/platform/types' import { ChatLunaPlugin } from 'koishi-plugin-chatluna/services/chat' import { Config, logger } from '..' @@ -12,6 +12,11 @@ import { readImage } from '../utils' +/** + * Intercept image elements. Native-capable models receive the data URL + * directly (GIFs are split into frames). Otherwise fall back to describing + * the image via the configured vision model and inject the description. + */ export async function apply( ctx: Context, config: Config, @@ -21,136 +26,106 @@ export async function apply( config.imageModel ) - const disposable = ctx.chatluna.messageTransformer.intercept( - 'img', - async (_session, element, message, model) => { - const parsedModelInfo = - model != null - ? ctx.chatluna.platform.findModel(model) - : undefined - const modelSupportsImageInput = - parsedModelInfo?.value != null && - parsedModelInfo.value.capabilities.includes( - ModelCapabilities.ImageInput - ) - - let imageData: Awaited> - const url = (element.attrs.url ?? element.attrs.src) as string - - if (modelSupportsImageInput) { - imageData = await readImage(ctx, url) + ctx.effect(() => + ctx.chatluna.messageTransformer.intercept( + 'img', + async (_session, element, message, model) => { + const url = (element.attrs.url ?? element.attrs.src) as string + if (!url) return false - if (imageData.ext == null) { + const native = modelAcceptsImage(ctx, model) + if (!native && !config.enableContextImageDescription) { return false } - if (imageData.ext === 'image/gif') { - if (!config.enableContextGifHandling) { - return false - } - - logger.debug(`image url: ${url.substring(0, 50)}...`) - const frames = await parseGifToFrames(imageData.buffer, { - strategy: config.gifStrategy, - frameCount: config.gifFrameCount - }) + const imageData = await readImage(ctx, url) + if (imageData.ext == null) return false - logger.debug(`Extracted ${frames.length} frames from GIF`) + const isGif = imageData.ext === 'image/gif' + if (isGif && !config.enableContextGifHandling) return false - for (const frame of frames) { - addImageToContent(message, frame) + if (native) { + if (isGif) { + await injectGifFrames(message, imageData.buffer, config) + addTextToContent(message, '[image: GIF]') + } else if (imageData.base64Source) { + addImageToContent(message, imageData.base64Source) } - - addTextToContent(message, '[image: GIF]') - return true } - if (imageData.base64Source != null) { - addImageToContent(message, imageData.base64Source) - return true - } - } - - if (!config.enableContextImageDescription) { - return false - } - - if (imageUnderstandModel.value == null) { - logger.warn( - `The model ${config.imageModel} is not loaded, please check your chat adapter` - ) - return false - } - - if ( - !imageUnderstandModel.value.modelInfo.capabilities.includes( - ModelCapabilities.ImageInput - ) - ) { - logger.warn( - `The model ${config.imageModel} in image-service does not support image input, please check your chat adapter` - ) - return false - } - - try { - const fakeMessage: Message = { - content: [] - } - - logger.debug(`image url: ${url}`) - - imageData = imageData ?? (await readImage(ctx, url)) - - if (imageData.ext == null) { - return false - } - - if (imageData.ext === 'image/gif') { - if (!config.enableContextGifHandling) { - return false - } - - const frames = await parseGifToFrames(imageData.buffer, { - strategy: config.gifStrategy, - frameCount: config.gifFrameCount - }) - - logger.debug( - `Extracted ${frames.length} frames from GIF for model processing` - ) - - addTextToContent( - fakeMessage, - 'This is a GIF image. See the frames below:' - ) - for (const frame of frames) { - addImageToContent(fakeMessage, frame) - } - } else { - addImageToContent(fakeMessage, imageData.base64Source) - } - - const result = await processImageWithModel( - imageUnderstandModel.value, + return describeAndInject( + message, + imageData, + isGif, config, - fakeMessage + imageUnderstandModel.value, + url ) + }, + 100 + ) + ) +} - if (result) { - addTextToContent(message, '\n\n' + result) - return true - } - } catch (error) { - logger.warn( - `Read image ${url} error, check your chat adapter`, - error - ) - } - }, - 100 +function modelAcceptsImage(ctx: Context, model: string | undefined): boolean { + if (!model) return false + return ( + ctx.chatluna.platform + .findModel(model) + ?.value?.capabilities?.includes(ModelCapabilities.ImageInput) === + true ) +} - ctx.effect(() => disposable) +async function injectGifFrames( + message: Message, + buffer: Buffer, + config: Config +): Promise { + const frames = await parseGifToFrames(buffer, { + strategy: config.gifStrategy, + frameCount: config.gifFrameCount + }) + logger.debug(`Extracted ${frames.length} frames from GIF`) + for (const frame of frames) addImageToContent(message, frame) +} + +async function describeAndInject( + message: Message, + imageData: Awaited>, + isGif: boolean, + config: Config, + imageModel: ChatLunaChatModel | undefined, + url: string +): Promise { + if ( + imageModel == null || + !imageModel.modelInfo.capabilities.includes( + ModelCapabilities.ImageInput + ) + ) { + logger.warn( + `Image-description model "${config.imageModel}" is missing or lacks image input — skip.` + ) + return false + } + + try { + const fake: Message = { content: [] } + if (isGif) { + addTextToContent(fake, 'This is a GIF image. See the frames below:') + await injectGifFrames(fake, imageData.buffer, config) + } else if (imageData.base64Source) { + addImageToContent(fake, imageData.base64Source) + } + const result = await processImageWithModel(imageModel, config, fake) + if (result) { + addTextToContent(message, '\n\n' + result) + return true + } + } catch (error) { + logger.warn(`Image describe failed for ${url}:`, error) + } + return false } diff --git a/packages/service-multimodal/src/plugins/read_files.ts b/packages/service-multimodal/src/plugins/read_files.ts index ea2e578df..1d40cd135 100644 --- a/packages/service-multimodal/src/plugins/read_files.ts +++ b/packages/service-multimodal/src/plugins/read_files.ts @@ -4,266 +4,35 @@ import { HumanMessage, MessageContentComplex } from '@langchain/core/messages' import { Context } from 'koishi' import { ComputedRef, Message } from 'koishi-plugin-chatluna' import { ChatLunaChatModel } from 'koishi-plugin-chatluna/llm-core/platform/model' +import type { FileHandlingConfig } from 'koishi-plugin-chatluna/llm-core/platform/client' import { ChatLunaToolRunnable, ModelCapabilities } from 'koishi-plugin-chatluna/llm-core/platform/types' import { ChatLunaPlugin } from 'koishi-plugin-chatluna/services/chat' -import { - isMessageContentAudio, - isMessageContentVideo, - type MessageContentAudio, - type MessageContentVideo -} from 'koishi-plugin-chatluna/utils/langchain' import { getBase64EncodedSize } from 'koishi-plugin-chatluna/utils/base64' import { Config, logger } from '..' import { addImageToContent, addTextToContent, + BROWSER_UA, + convertAudioToMp3, + detectAudioMimeType, + IMAGE_MIME_TYPES, + inferMimeTypeFromUrl, + normalizeMimeType, parseGifToFrames, processImageWithModel } from '../utils' import z from 'zod' -// --------------------------------------------------------------------------- -// Constants -// --------------------------------------------------------------------------- - -const IMAGE_MIME_TYPES = new Set([ - 'image/png', - 'image/jpeg', - 'image/bmp', - 'image/webp', - 'image/gif' -]) - const DEFAULT_MAX_FILE_SIZE_BYTES = 100 * 1024 * 1024 const DEFAULT_MAX_TOTAL_SIZE_BYTES = 100 * 1024 * 1024 -const FILE_EXTENSION_TO_MIME_TYPE = new Map([ - ['.png', 'image/png'], - ['.jpg', 'image/jpeg'], - ['.jpeg', 'image/jpeg'], - ['.bmp', 'image/bmp'], - ['.webp', 'image/webp'], - ['.gif', 'image/gif'], - ['.pdf', 'application/pdf'], - ['.txt', 'text/plain'], - ['.md', 'text/markdown'], - ['.html', 'text/html'], - ['.htm', 'text/html'], - ['.css', 'text/css'], - ['.xml', 'text/xml'], - ['.csv', 'text/csv'], - ['.rtf', 'text/rtf'], - ['.js', 'text/javascript'], - ['.mjs', 'text/javascript'], - ['.json', 'application/json'], - ['.mp4', 'video/mp4'], - ['.mpeg', 'video/mpeg'], - ['.mov', 'video/mov'], - ['.avi', 'video/avi'], - ['.flv', 'video/x-flv'], - ['.mpg', 'video/mpg'], - ['.webm', 'video/webm'], - ['.wmv', 'video/wmv'], - ['.3gp', 'video/3gpp'], - ['.3gpp', 'video/3gpp'], - ['.mp3', 'audio/mpeg'], - ['.aiff', 'audio/aiff'], - ['.aac', 'audio/aac'], - ['.flac', 'audio/flac'], - ['.wav', 'audio/wav'], - ['.ogg', 'audio/ogg'], - ['.m4a', 'audio/mp4'] -]) - -// --------------------------------------------------------------------------- -// Helpers -// --------------------------------------------------------------------------- - -function isHttpOrHttpsUrl(url: string): boolean { - try { - const parsed = new URL(url) - return parsed.protocol === 'http:' || parsed.protocol === 'https:' - } catch { - return false - } -} - -function normalizeMimeType(raw: string | null): string | null { - if (raw == null) return null - const mimeType = raw.split(';')[0]?.trim()?.toLowerCase() - return mimeType || null -} - -function inferMimeTypeFromPath(path: string): string | null { - const sanitizedPath = path.toLowerCase().split(/[?#]/, 1)[0] - const fileName = sanitizedPath.split(/[/\\]/).pop() ?? sanitizedPath - const extensionIndex = fileName.lastIndexOf('.') - - if (extensionIndex < 0) { - return null - } - - const extension = fileName.slice(extensionIndex) - return FILE_EXTENSION_TO_MIME_TYPE.get(extension) ?? null -} - -function inferMimeTypeFromUrl(url: string): string | null { - try { - const pathname = new URL(url).pathname - return inferMimeTypeFromPath(pathname) - } catch { - // ignore - } - - return null -} - -/** - * Check whether the model natively supports a given MIME type based on its - * capabilities and `FileHandlingConfig`. - */ -function modelSupportsNativeMimeType( - model: ChatLunaChatModel, +interface NativePart { mimeType: string -): boolean { - const caps = model.modelInfo.capabilities - - let capabilitySupportsMime = false - if (IMAGE_MIME_TYPES.has(mimeType)) { - capabilitySupportsMime = caps.includes(ModelCapabilities.ImageInput) - } else if (mimeType.startsWith('audio/')) { - capabilitySupportsMime = caps.includes(ModelCapabilities.AudioInput) - } else if (mimeType.startsWith('video/')) { - capabilitySupportsMime = caps.includes(ModelCapabilities.VideoInput) - } else if ( - mimeType.startsWith('text/') || - mimeType === 'application/json' || - mimeType === 'application/pdf' - ) { - capabilitySupportsMime = caps.includes(ModelCapabilities.FileInput) - } - - if (!capabilitySupportsMime) { - return false - } - - const fileConfig = model.fileHandlingConfig - if (fileConfig != null) { - return fileConfig.supportedMimeTypes.has(mimeType) - } - - return true -} - -function isMimeTypeEnabled(config: Config, mimeType: string): boolean { - if (mimeType === 'image/gif') { - return config.enableGifReadTool - } - - if (IMAGE_MIME_TYPES.has(mimeType)) { - return config.enableImageReadTool - } - - return config.enableFileReadTool -} - -function buildReadFilesDescription(config: Config): string { - const sections: string[] = [] - - if (config.enableImageReadTool) { - sections.push( - '- Image read/describe (non-GIF): image/bmp, image/jpeg, image/png, image/webp. If the model lacks native image input, fallback image description will be used.' - ) - } - - if (config.enableGifReadTool) { - sections.push( - '- GIF read/describe: image/gif. Native-capable models receive extracted frames; otherwise fallback image description is used.' - ) - } - - if (config.enableFileReadTool) { - sections.push( - '- File read: text/html, text/css, text/plain, text/markdown, text/xml, text/csv, text/rtf, text/javascript, application/json, application/pdf, audio/*, video/* (effective MIME set still depends on model capabilities and FileHandlingConfig).' - ) - } - - return `Read files from URL(s) and return their content. -Enabled read_files capabilities: -${sections.join('\n')} -Use this tool when you need to read files from URL(s) as context.` -} - -/** - * Build a multimodal `HumanMessage` containing the file(s) as content parts, - * suitable for injecting into the conversation context. - */ -function buildMultimodalMessage( - parts: { - mimeType: string - base64Data: string - sourceUrl: string - }[], - insertPrompt: string -): HumanMessage { - const content: MessageContentComplex[] = [] - - for (const part of parts) { - const { mimeType, base64Data } = part - - if (IMAGE_MIME_TYPES.has(mimeType)) { - content.push({ - type: 'image_url', - image_url: { - url: `data:${mimeType};base64,${base64Data}` - } - }) - } else if (mimeType.startsWith('audio/')) { - const audioContent: MessageContentAudio = { - type: 'audio_url', - audio_url: { - url: `data:${mimeType};base64,${base64Data}`, - mimeType - } - } - - if (isMessageContentAudio(audioContent as MessageContentComplex)) { - content.push(audioContent as MessageContentComplex) - } - } else if (mimeType.startsWith('video/')) { - const videoContent: MessageContentVideo = { - type: 'video_url', - video_url: { - url: `data:${mimeType};base64,${base64Data}`, - mimeType - } - } - - if (isMessageContentVideo(videoContent as MessageContentComplex)) { - content.push(videoContent as MessageContentComplex) - } - } else { - // Inline data for text/pdf/etc. (Gemini-style) - content.push({ - inline_data: { - mime_type: mimeType, - data: base64Data - } - } as unknown as MessageContentComplex) - } - } - - if (content.length > 0) { - content.unshift({ - type: 'text', - text: insertPrompt - }) - } - - return new HumanMessage({ content }) + base64Data: string + sourceUrl: string } // --------------------------------------------------------------------------- @@ -272,28 +41,47 @@ function buildMultimodalMessage( export class ReadFilesTool extends StructuredTool { name = 'read_files' - description: string - schema = z.object({ files: z - .union([ - z.object({ - url: z.string().url() - }), - z - .array( - z.object({ - url: z.string().url() - }) - ) - .min(1) - .max(10) - ]) + .preprocess( + (arg: unknown) => { + if (typeof arg === 'string') { + const base = JSON.parse(arg) + if ( + typeof base === 'object' && + typeof base['files'] === 'string' + ) { + try { + base['files'] = JSON.parse(base['files']) + return base + } catch { + return base + } + } + } + return arg + }, + z.union([ + z.object({ + url: z.string().url() + }), + z + .array( + z.object({ + url: z.string().url() + }) + ) + .min(1) + .max(10) + ]) + ) .describe( 'One file or a list of files to read (max 10). File format: { url: string }. MIME type is inferred from response headers, then URL extension.' ) }) + description: string + constructor( private readonly ctx: Context, private readonly config: Config, @@ -302,7 +90,7 @@ export class ReadFilesTool extends StructuredTool { > ) { super({}) - this.description = buildReadFilesDescription(config) + this.description = describeTool(config) } async _call( @@ -314,366 +102,314 @@ export class ReadFilesTool extends StructuredTool { const model = runConfig?.configurable?.model const conversationId = runConfig?.configurable?.conversationId const fileConfig = model?.fileHandlingConfig - - let totalBase64Bytes = 0 - const maxTotalSize = + const maxTotal = fileConfig?.maxTotalSizeBytes ?? DEFAULT_MAX_TOTAL_SIZE_BYTES - const nativeParts: { - mimeType: string - base64Data: string - sourceUrl: string - }[] = [] - - const response: { - files: { - sourceUrl: string - mimeType?: string - status: 'ok' | 'described' | 'error' - description?: string - error?: string - }[] - successCount: number - failureCount: number - } = { + const native: NativePart[] = [] + const report: ToolReport = { files: [], successCount: 0, failureCount: 0 } + let totalBytes = 0 let describedCount = 0 - for (const file of files) { - const sourceUrl = file.url - - const pushError = (errorMessage: string, mimeType?: string) => { - response.files.push({ + for (const { url: sourceUrl } of files) { + if (!isHttp(sourceUrl)) { + pushError( + report, sourceUrl, - mimeType, - status: 'error', - error: errorMessage - }) - response.failureCount++ + 'Only http/https URLs are supported.' + ) + continue } try { - if (!isHttpOrHttpsUrl(sourceUrl)) { - pushError( - 'Only http/https URLs are supported for read_files.' - ) + const fetched = await this._fetch(sourceUrl) + if (!fetched) { + pushError(report, sourceUrl, 'Failed to fetch URL.') continue } - // Determine MIME type first by fetching with headers - const controller = new AbortController() - const timeout = setTimeout(() => controller.abort(), 60_000) - const httpResponse = await this.ctx - .http(sourceUrl, { - responseType: 'arraybuffer', - method: 'get', - headers: { - 'User-Agent': - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36' - }, - signal: controller.signal - }) - .finally(() => { - clearTimeout(timeout) - }) - - const buffer = Buffer.from(httpResponse.data) - - // Resolve MIME type from response headers or URL - const headers = httpResponse.headers as unknown as - | Record - | undefined - const rawCt = - headers?.['content-type'] ?? headers?.['Content-Type'] - let responseMimeType: string | null = null - if (typeof rawCt === 'string') { - responseMimeType = normalizeMimeType(rawCt) - } else if ( - Array.isArray(rawCt) && - typeof rawCt[0] === 'string' - ) { - responseMimeType = normalizeMimeType(rawCt[0]) - } - - const mimeType = - responseMimeType ?? inferMimeTypeFromUrl(sourceUrl) + const declared = + normalizeMimeType(fetched.contentType) ?? + inferMimeTypeFromUrl(sourceUrl) + const detectedAudio = detectAudioMimeType( + fetched.buffer, + declared + ) + const mime = detectedAudio ?? declared - if (!mimeType) { + if (!mime) { pushError( - `Could not determine MIME type for ${sourceUrl}. Please ensure the URL returns a valid content type.` + report, + sourceUrl, + `Could not determine MIME type for ${sourceUrl}.` ) continue } - - if (!isMimeTypeEnabled(this.config, mimeType)) { + if (!mimeEnabled(this.config, mime)) { pushError( - `Feature disabled for MIME type "${mimeType}". Please enable the corresponding read_files switch.`, - mimeType + report, + sourceUrl, + `Feature disabled for MIME type "${mime}".`, + mime ) continue } - // Check if the model supports this MIME type natively - const isImage = IMAGE_MIME_TYPES.has(mimeType) - const modelSupports = - model != null && - modelSupportsNativeMimeType(model, mimeType) - - if (modelSupports && !isImage) { - // Non-image file that the model supports natively -> inline inject - const maxFileSize = - fileConfig?.maxFileSizeBytesOverrides?.[mimeType] ?? - fileConfig?.maxFileSizeBytes ?? - DEFAULT_MAX_FILE_SIZE_BYTES - - const encodedSize = getBase64EncodedSize(buffer.byteLength) - - if (encodedSize > maxFileSize) { - pushError( - `File too large (${encodedSize} bytes after base64), max ${maxFileSize} bytes for ${mimeType}`, - mimeType + const isImage = IMAGE_MIME_TYPES.has(mime) + const isAudio = mime.startsWith('audio/') + const supportsNative = + model != null && modelSupportsMime(model, mime) + + // ----- Non-image native: maybe transcode audio, then inline --- + if (!isImage && supportsNative) { + let bytes = fetched.buffer + let outMime = mime + if ( + isAudio && + fileConfig?.supportedMimeTypes && + !fileConfig.supportedMimeTypes.has(mime) + ) { + const converted = await convertAudioToMp3( + this.ctx, + bytes ) - continue + if (!converted) { + pushError( + report, + sourceUrl, + `Unsupported audio MIME "${mime}" and ffmpeg conversion failed.`, + mime + ) + continue + } + bytes = converted + outMime = 'audio/mpeg' } - if (totalBase64Bytes + encodedSize > maxTotalSize) { - pushError( - `Total inline upload size too large (${totalBase64Bytes + encodedSize} bytes), max ${maxTotalSize} bytes per request`, - mimeType - ) + const sizeError = checkSize( + bytes, + outMime, + fileConfig, + totalBytes, + maxTotal + ) + if (sizeError) { + pushError(report, sourceUrl, sizeError, outMime) continue } - - totalBase64Bytes += encodedSize - nativeParts.push({ - mimeType, - base64Data: buffer.toString('base64'), - sourceUrl - }) - - response.files.push({ + totalBytes += getBase64EncodedSize(bytes.byteLength) + pushNative( + report, + native, sourceUrl, - mimeType, - status: 'ok' - }) - response.successCount++ - } else if (isImage && modelSupports) { - // Image that the model supports natively -> inject directly - // Unified per-file size check before any branching - const maxFileSize = - fileConfig?.maxFileSizeBytesOverrides?.[mimeType] ?? - fileConfig?.maxFileSizeBytes ?? - DEFAULT_MAX_FILE_SIZE_BYTES - - const encodedSize = getBase64EncodedSize(buffer.byteLength) - - if (encodedSize > maxFileSize) { - pushError( - `File too large (${encodedSize} bytes after base64, raw ${buffer.byteLength} bytes), max ${maxFileSize} bytes for ${mimeType}`, - mimeType - ) + outMime, + bytes.toString('base64') + ) + continue + } + + // ----- Image native: inject directly (GIF splits to frames) --- + if (isImage && supportsNative) { + const sizeError = checkSize( + fetched.buffer, + mime, + fileConfig, + totalBytes, + maxTotal + ) + if (sizeError) { + pushError(report, sourceUrl, sizeError, mime) continue } - // For GIF: split into frames - if (mimeType === 'image/gif') { - const frames = await parseGifToFrames(buffer, { + if (mime === 'image/gif') { + let pushed = 0 + const frames = await parseGifToFrames(fetched.buffer, { strategy: this.config.gifStrategy, frameCount: this.config.gifFrameCount }) - - logger.debug( - `Extracted ${frames.length} frames from GIF for native model injection` - ) - for (const frame of frames) { - // Frames are data:image/png;base64,... strings const frameBase64 = frame.split(',')[1] - const frameSize = getBase64EncodedSize( - Buffer.from(frameBase64, 'base64').byteLength + const buf = Buffer.from(frameBase64, 'base64') + const sizeError = checkSize( + buf, + 'image/png', + fileConfig, + totalBytes, + maxTotal ) - - if (totalBase64Bytes + frameSize > maxTotalSize) { + if (sizeError) { + if (pushed < 1) { + pushError( + report, + sourceUrl, + sizeError, + 'image/png' + ) + } logger.warn( 'Skipping remaining GIF frames due to total size limit' ) break } - - totalBase64Bytes += frameSize - nativeParts.push({ - mimeType: 'image/png', - base64Data: frameBase64, - sourceUrl - }) - } - } else { - if (totalBase64Bytes + encodedSize > maxTotalSize) { - pushError( - `Total inline upload size too large (${totalBase64Bytes + encodedSize} bytes), max ${maxTotalSize} bytes per request`, - mimeType + totalBytes += getBase64EncodedSize(buf.byteLength) + pushNative( + report, + native, + sourceUrl, + 'image/png', + frameBase64 ) - continue + pushed++ } - - totalBase64Bytes += encodedSize - nativeParts.push({ - mimeType, - base64Data: buffer.toString('base64'), - sourceUrl - }) - } - - response.files.push({ - sourceUrl, - mimeType, - status: 'ok' - }) - response.successCount++ - } else if (isImage) { - // Image but model doesn't support it natively -> describe using image model - const maxFileSize = - fileConfig?.maxFileSizeBytesOverrides?.[mimeType] ?? - fileConfig?.maxFileSizeBytes ?? - DEFAULT_MAX_FILE_SIZE_BYTES - - const encodedSize = getBase64EncodedSize(buffer.byteLength) - - if (encodedSize > maxFileSize) { - pushError( - `File too large (${encodedSize} bytes after base64, raw ${buffer.byteLength} bytes), max ${maxFileSize} bytes for ${mimeType}`, - mimeType + } else { + totalBytes += getBase64EncodedSize( + fetched.buffer.byteLength + ) + pushNative( + report, + native, + sourceUrl, + mime, + fetched.buffer.toString('base64') ) - continue } + continue + } - const describeResult = await this._describeImageWithModel( + // ----- Image without native support: describe via vision model - + if (isImage) { + const described = await this._describeImage( sourceUrl, - buffer, - mimeType + fetched.buffer, + mime ) - - if (describeResult) { - response.files.push({ + if (described) { + report.files.push({ sourceUrl, - mimeType, + mimeType: mime, status: 'described', - description: describeResult + description: described }) - response.successCount++ + report.successCount++ describedCount++ } else { pushError( - `Failed to describe image from ${sourceUrl}`, - mimeType + report, + sourceUrl, + 'Failed to describe image.', + mime ) - continue } - } else { - // Non-image, model doesn't support it natively - pushError( - `Unsupported MIME type "${mimeType}" for the current model. The model does not natively support this file type.`, - mimeType - ) continue } + + pushError( + report, + sourceUrl, + `Unsupported MIME "${mime}" for the current model.`, + mime + ) } catch (error) { logger.warn(`read_files error for ${sourceUrl}:`, error) - const errorMessage = + pushError( + report, + sourceUrl, error instanceof Error ? error.message : String(error) - pushError(errorMessage) + ) } } - // Inject native parts into next-round context via contextManager - if (nativeParts.length > 0 && conversationId) { - const message = buildMultimodalMessage( - nativeParts, - this.config.fileInsertPrompt - ) - + const injected = native.length > 0 && !!conversationId + if (native.length > 0 && conversationId) { this.ctx.chatluna.contextManager.inject({ conversationId, name: 'read_files_context', - value: message, + value: buildMultimodalMessage( + native, + this.config.fileInsertPrompt + ), once: true, stage: 'after_scratchpad' }) - logger.debug( - `Injected ${nativeParts.length} file part(s) into context for conversation ${conversationId}` + `Injected ${native.length} file part(s) into context for conversation ${conversationId}` ) } return JSON.stringify({ - response, - note: - nativeParts.length > 0 - ? `Successfully read ${nativeParts.length} file(s). The file content has been added to the conversation context and will be available in the next turn.` - : describedCount > 0 - ? `Described ${describedCount} image file(s) using the vision model.` - : response.failureCount > 0 - ? `Failed to read ${response.failureCount} file(s).` - : 'No files were processed.' + response: report, + note: injected + ? `Successfully read ${native.length} file(s). The file content has been added to the conversation context and will be available in the next turn.` + : native.length > 0 + ? `Successfully read ${native.length} file(s), but no conversation id was available, so the file content was not added to the conversation context.` + : describedCount > 0 + ? `Described ${describedCount} image file(s) using the vision model.` + : report.failureCount > 0 + ? `Failed to read ${report.failureCount} file(s).` + : 'No files were processed.' }) } - /** - * Describe an image using the configured image model (fallback when the - * main model doesn't support image input). - */ - private async _describeImageWithModel( + private async _fetch( + url: string + ): Promise<{ buffer: Buffer; contentType: string | null } | null> { + try { + const response = await this.ctx.http(url, { + responseType: 'arraybuffer', + method: 'get', + headers: { 'User-Agent': BROWSER_UA }, + timeout: 60_000 + }) + return { + buffer: Buffer.from(response.data), + contentType: getHeaderValue(response.headers, 'content-type') + } + } catch { + return null + } + } + + private async _describeImage( url: string, buffer: Buffer, mimeType: string ): Promise { const imageModel = this.imageModelRef().value - if (imageModel == null) { - logger.warn( - 'Image model is not loaded, cannot describe image. Please check your chat adapter.' - ) - return null - } - if ( + !imageModel || !imageModel.modelInfo.capabilities.includes( ModelCapabilities.ImageInput ) ) { - logger.warn('Image model does not support image input.') + logger.warn( + 'Image model not loaded or lacks image input; cannot describe.' + ) return null } try { - const fakeMessage: Message = { content: [] } - + const fake: Message = { content: [] } if (mimeType === 'image/gif') { const frames = await parseGifToFrames(buffer, { strategy: this.config.gifStrategy, frameCount: this.config.gifFrameCount }) - addTextToContent( - fakeMessage, + fake, 'This is a GIF image. See the frames below:' ) - for (const frame of frames) { - addImageToContent(fakeMessage, frame) - } + for (const frame of frames) addImageToContent(fake, frame) } else { - const base64 = buffer.toString('base64') - const base64Source = `data:${mimeType};base64,${base64}` - addImageToContent(fakeMessage, base64Source) + addImageToContent( + fake, + `data:${mimeType};base64,${buffer.toString('base64')}` + ) } - - return await processImageWithModel( - imageModel, - this.config, - fakeMessage - ) + return await processImageWithModel(imageModel, this.config, fake) } catch (error) { logger.warn(`Describe image ${url} error:`, error) return null @@ -681,6 +417,169 @@ export class ReadFilesTool extends StructuredTool { } } +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +interface ToolReport { + files: { + sourceUrl: string + mimeType?: string + status: 'ok' | 'described' | 'error' + description?: string + error?: string + }[] + successCount: number + failureCount: number +} + +function pushError( + report: ToolReport, + sourceUrl: string, + error: string, + mimeType?: string +) { + report.files.push({ sourceUrl, mimeType, status: 'error', error }) + report.failureCount++ +} + +function pushNative( + report: ToolReport, + native: NativePart[], + sourceUrl: string, + mimeType: string, + base64Data: string +) { + native.push({ sourceUrl, mimeType, base64Data }) + report.files.push({ sourceUrl, mimeType, status: 'ok' }) + report.successCount++ +} + +function getHeaderValue(headers: unknown, name: string): string | null { + if (headers == null) return null + + if (typeof (headers as { get?: unknown }).get === 'function') { + const value = (headers as { get(name: string): string | null }).get( + name + ) + return typeof value === 'string' ? value : null + } + + const record = headers as Record + const lower = name.toLowerCase() + for (const key of Object.keys(record)) { + if (key.toLowerCase() === lower) { + const value = record[key] + return typeof value === 'string' ? value : null + } + } + return null +} + +function isHttp(url: string): boolean { + try { + const { protocol } = new URL(url) + return protocol === 'http:' || protocol === 'https:' + } catch { + return false + } +} + +function modelSupportsMime(model: ChatLunaChatModel, mime: string): boolean { + const caps = model.modelInfo.capabilities + const isImage = IMAGE_MIME_TYPES.has(mime) + const capOk = isImage + ? caps.includes(ModelCapabilities.ImageInput) + : mime.startsWith('audio/') + ? caps.includes(ModelCapabilities.AudioInput) + : mime.startsWith('video/') + ? caps.includes(ModelCapabilities.VideoInput) + : caps.includes(ModelCapabilities.FileInput) + if (!capOk) return false + const file = model.fileHandlingConfig + return file == null || file.supportedMimeTypes.has(mime) +} + +function mimeEnabled(config: Config, mime: string): boolean { + if (mime === 'image/gif') return config.enableGifReadTool + if (IMAGE_MIME_TYPES.has(mime)) return config.enableImageReadTool + return config.enableFileReadTool +} + +function checkSize( + buffer: Buffer, + mime: string, + fileConfig: FileHandlingConfig | undefined, + totalBytes: number, + maxTotal: number +): string | null { + const max = + fileConfig?.maxFileSizeBytesOverrides?.[mime] ?? + fileConfig?.maxFileSizeBytes ?? + DEFAULT_MAX_FILE_SIZE_BYTES + const encoded = getBase64EncodedSize(buffer.byteLength) + if (encoded > max) { + return `File too large (${encoded} bytes after base64, raw ${buffer.byteLength} bytes), max ${max} bytes for ${mime}.` + } + if (totalBytes + encoded > maxTotal) { + return `Total inline upload size too large (${totalBytes + encoded} bytes), max ${maxTotal} bytes per request.` + } + return null +} + +function buildMultimodalMessage( + parts: NativePart[], + prompt: string +): HumanMessage { + const content: MessageContentComplex[] = [] + for (const { mimeType, base64Data } of parts) { + const dataUrl = `data:${mimeType};base64,${base64Data}` + if (IMAGE_MIME_TYPES.has(mimeType)) { + content.push({ type: 'image_url', image_url: { url: dataUrl } }) + } else if (mimeType.startsWith('audio/')) { + content.push({ + type: 'audio_url', + audio_url: { url: dataUrl, mimeType } + } as unknown as MessageContentComplex) + } else if (mimeType.startsWith('video/')) { + content.push({ + type: 'video_url', + video_url: { url: dataUrl, mimeType } + } as unknown as MessageContentComplex) + } else { + // Inline data for text/pdf/etc. (Gemini-style) + content.push({ + inline_data: { mime_type: mimeType, data: base64Data } + } as unknown as MessageContentComplex) + } + } + if (content.length > 0) content.unshift({ type: 'text', text: prompt }) + return new HumanMessage({ content }) +} + +function describeTool(config: Config): string { + const sections: string[] = [] + if (config.enableImageReadTool) { + sections.push( + '- Image read/describe (non-GIF): image/bmp, image/jpeg, image/png, image/webp. If the model lacks native image input, fallback image description will be used.' + ) + } + if (config.enableGifReadTool) { + sections.push( + '- GIF read/describe: image/gif. Native-capable models receive extracted frames; otherwise fallback image description is used.' + ) + } + if (config.enableFileReadTool) { + sections.push( + '- File read: text/html, text/css, text/plain, text/markdown, text/xml, text/csv, text/rtf, text/javascript, application/json, application/pdf, audio/*, video/* (effective MIME set still depends on model capabilities and FileHandlingConfig).' + ) + } + return `Read files from URL(s) and return their content. +Enabled read_files capabilities: +${sections.join('\n')} +Use this tool when you need to read files from URL(s) as context.` +} + // --------------------------------------------------------------------------- // Plugin registration // --------------------------------------------------------------------------- diff --git a/packages/service-multimodal/src/utils.ts b/packages/service-multimodal/src/utils.ts index bfb0532d0..55ecfdbb4 100644 --- a/packages/service-multimodal/src/utils.ts +++ b/packages/service-multimodal/src/utils.ts @@ -1,6 +1,5 @@ import { HumanMessage, - MessageContent, MessageContentComplex, MessageContentText } from '@langchain/core/messages' @@ -12,195 +11,314 @@ import { isMessageContentImageUrl } from 'koishi-plugin-chatluna/utils/string' import { Context } from 'koishi' +import type {} from 'koishi-plugin-ffmpeg-path' import { Config, logger } from '.' import { GifReader } from 'omggif' import { Jimp } from 'jimp' -export interface GifExtractionConfig { - strategy: 'first' | 'head' | 'average' - frameCount: number +// --------------------------------------------------------------------------- +// MIME helpers +// --------------------------------------------------------------------------- + +export const IMAGE_MIME_TYPES = new Set([ + 'image/png', + 'image/jpeg', + 'image/bmp', + 'image/webp', + 'image/gif' +]) + +const FILE_EXTENSION_TO_MIME_TYPE: Record = { + '.png': 'image/png', + '.jpg': 'image/jpeg', + '.jpeg': 'image/jpeg', + '.bmp': 'image/bmp', + '.webp': 'image/webp', + '.gif': 'image/gif', + '.pdf': 'application/pdf', + '.txt': 'text/plain', + '.md': 'text/markdown', + '.html': 'text/html', + '.htm': 'text/html', + '.css': 'text/css', + '.xml': 'text/xml', + '.csv': 'text/csv', + '.rtf': 'text/rtf', + '.js': 'text/javascript', + '.mjs': 'text/javascript', + '.json': 'application/json', + '.mp4': 'video/mp4', + '.mpeg': 'video/mpeg', + '.mov': 'video/mov', + '.avi': 'video/avi', + '.flv': 'video/x-flv', + '.webm': 'video/webm', + '.wmv': 'video/wmv', + '.3gp': 'video/3gpp', + '.3gpp': 'video/3gpp', + '.mp3': 'audio/mpeg', + '.aiff': 'audio/aiff', + '.aac': 'audio/aac', + '.flac': 'audio/flac', + '.wav': 'audio/wav', + '.ogg': 'audio/ogg', + '.m4a': 'audio/mp4' } -/** - * Check if any frame in the range [start, end) has complex disposal methods - * that require resetting the canvas (disposal method 2 or 3) - */ -function hasComplexDisposal( - reader: GifReader, - start: number, - end: number -): boolean { - for (let i = start; i < end; i++) { - const disposal = reader.frameInfo(i).disposal - // disposal 2: restore to background color - // disposal 3: restore to previous (before current frame was drawn) - if (disposal === 2 || disposal === 3) { - return true - } +export function inferMimeTypeFromUrl(url: string): string | null { + try { + const path = new URL(url).pathname.toLowerCase() + const dot = path.lastIndexOf('.') + return dot < 0 + ? null + : (FILE_EXTENSION_TO_MIME_TYPE[path.slice(dot)] ?? null) + } catch { + return null } - return false } -export async function extractGifFrames( +export function normalizeMimeType( + raw: string | null | undefined +): string | null { + return raw?.split(';')[0]?.trim()?.toLowerCase() || null +} + +/** + * Detect audio MIME from buffer header. Recognises QQ Silk + AMR + common + * audio container magic bytes. Falls back to the declared MIME otherwise. + */ +export function detectAudioMimeType( buffer: Buffer, - config: GifExtractionConfig -): Promise { - try { - const reader = new GifReader(buffer) - const totalFrames = reader.numFrames() + declared?: string | null +): string | null { + const head = buffer.subarray(0, 16).toString('latin1') + + if (head.startsWith('#!AMR')) return 'audio/amr' + // QQ/OneBot ships SILK voice files with a leading flag byte before the + // standard `#!SILK_V3` magic, so we also check offset 1 for that variant. + if ( + head.startsWith('#!SILK_V3') || + buffer.subarray(1, 10).toString('latin1') === '#!SILK_V3' + ) { + return 'audio/silk' + } + // MP3 frame sync: 0xFFEx. Reject JPEG (0xFFD8) by checking the full sync word. + if ( + head.startsWith('ID3') || + (buffer[0] === 0xff && (buffer[1] & 0xe0) === 0xe0) + ) { + return 'audio/mpeg' + } + if ( + head.startsWith('RIFF') && + buffer.subarray(8, 12).toString('latin1') === 'WAVE' + ) { + return 'audio/wav' + } + if (head.startsWith('fLaC')) return 'audio/flac' + if (head.startsWith('OggS')) return 'audio/ogg' - if (totalFrames === 0) { - throw new Error('No frames found in GIF') - } + return declared ?? null +} - const width = reader.width - const height = reader.height +// --------------------------------------------------------------------------- +// FFmpeg / Silk +// --------------------------------------------------------------------------- + +export async function convertAudioToMp3( + ctx: Context, + buffer: Buffer +): Promise { + if (!ctx.ffmpeg) { + logger.warn( + 'FFmpeg service unavailable; install koishi-plugin-ffmpeg-path to enable audio transcoding.' + ) + return null + } - let frameIndices: number[] = [] + try { + // Match both the standard SILK magic and the QQ/OneBot variant that + // prepends a flag byte before `#!SILK_V3`. + const isSilk = + buffer.subarray(0, 9).toString('latin1') === '#!SILK_V3' || + buffer.subarray(1, 10).toString('latin1') === '#!SILK_V3' + + let source = buffer + let silkSampleRate: number | null = null + if (isSilk) { + const decoded = await decodeSilkToPcm(ctx, buffer) + if (!decoded) return null + source = decoded.buffer + silkSampleRate = decoded.sampleRate + } - switch (config.strategy) { - case 'first': - frameIndices = [0] - break + const builder = ctx.ffmpeg.builder().input(source) + if (silkSampleRate != null) { + builder.inputOption( + '-f', + 's16le', + '-ar', + String(silkSampleRate), + '-ac', + '1' + ) + } + return await builder + .outputOption( + '-vn', + '-acodec', + 'libmp3lame', + '-q:a', + '4', + '-f', + 'mp3' + ) + .run('buffer') + } catch (error) { + logger.warn(`Audio transcoding to mp3 failed:`, error) + return null + } +} - case 'head': { - const count = Math.min(config.frameCount, totalFrames) - frameIndices = Array.from({ length: count }, (_, i) => i) - break +async function decodeSilkToPcm( + ctx: Context, + buffer: Buffer +): Promise<{ buffer: Buffer; sampleRate: number } | null> { + if (!ctx.silk) { + logger.warn( + 'Silk service unavailable; install koishi-plugin-ffmpeg-path 2.0+ for silk decoding.' + ) + return null + } + for (const sampleRate of [24000, 16000, 12000, 8000]) { + try { + const result = (await ctx.silk.decode(buffer, sampleRate)) as { + data?: Uint8Array } - - case 'average': { - const count = Math.min(config.frameCount, totalFrames) - if (count >= totalFrames) { - frameIndices = Array.from( - { length: totalFrames }, - (_, i) => i - ) - } else if (count === 1) { - // Special case: single frame, pick the first one - frameIndices = [0] - } else { - // Use span (totalFrames - 1) to ensure first and last frames are included - const step = (totalFrames - 1) / (count - 1) - frameIndices = Array.from({ length: count }, (_, i) => - Math.floor(i * step) - ) - } - break + if (result?.data != null) { + return { buffer: Buffer.from(result.data), sampleRate } } + } catch { + // try next sample rate } + } + return null +} - const frameBuffers: Buffer[] = [] - - // Build canvas incrementally, only decoding frames we need - const canvas = new Uint8ClampedArray(width * height * 4) - let lastDecodedFrame = -1 +// --------------------------------------------------------------------------- +// GIF +// --------------------------------------------------------------------------- - for (const frameIndex of frameIndices) { - // Check if we need to restart decoding from frame 0 - // This happens when: - // 1. Jumping backwards in frame sequence - // 2. Any frames between lastDecodedFrame and current have complex disposal methods - // (disposal 2 or 3) which affect how the canvas should be prepared - const needsFullDecode = - frameIndex < lastDecodedFrame || - (lastDecodedFrame >= 0 && - hasComplexDisposal(reader, lastDecodedFrame, frameIndex)) +export interface GifExtractionConfig { + strategy: 'first' | 'head' | 'average' + frameCount: number +} - if (needsFullDecode) { - canvas.fill(0) // Clear canvas - // Decode from frame 0 to current frame - for (let i = 0; i <= frameIndex; i++) { - reader.decodeAndBlitFrameRGBA(i, canvas) - } - } else { - // Disposal method 0 (no disposal) or 1 (do not dispose) - // Just decode from last position to current frame - for (let i = lastDecodedFrame + 1; i <= frameIndex; i++) { - reader.decodeAndBlitFrameRGBA(i, canvas) - } +export async function parseGifToFrames( + buffer: Buffer, + config: GifExtractionConfig +): Promise { + const reader = new GifReader(buffer) + const total = reader.numFrames() + if (total === 0) throw new Error('No frames found in GIF') + + const indices = pickGifFrameIndices(total, config) + const { width, height } = reader + const canvas = new Uint8ClampedArray(width * height * 4) + let lastDecoded = -1 + const frames: string[] = [] + + for (const idx of indices) { + const needsFullDecode = + idx < lastDecoded || + (lastDecoded >= 0 && hasComplexDisposal(reader, lastDecoded, idx)) + if (needsFullDecode) { + canvas.fill(0) + for (let i = 0; i <= idx; i++) + reader.decodeAndBlitFrameRGBA(i, canvas) + } else { + for (let i = lastDecoded + 1; i <= idx; i++) { + reader.decodeAndBlitFrameRGBA(i, canvas) } - - lastDecodedFrame = frameIndex - - // Copy canvas to avoid reference issues - const frameData = new Uint8ClampedArray(canvas) - const image = new Jimp({ - data: Buffer.from(frameData), - width, - height - }) - - const pngBuffer = await image.getBuffer('image/png') - frameBuffers.push(pngBuffer) } - - return frameBuffers - } catch (error) { - logger.error('Failed to extract GIF frames:', error) - throw error + lastDecoded = idx + + const png = await new Jimp({ + data: Buffer.from(new Uint8ClampedArray(canvas)), + width, + height + }).getBuffer('image/png') + frames.push(`data:image/png;base64,${png.toString('base64')}`) } + return frames } -export async function parseGifToFrames( - buffer: Buffer, +function pickGifFrameIndices( + total: number, config: GifExtractionConfig -): Promise { - const frameBuffers = await extractGifFrames(buffer, config) - return frameBuffers.map((frameBuffer) => { - const base64 = frameBuffer.toString('base64') - return `data:image/png;base64,${base64}` - }) +): number[] { + if (config.strategy === 'first') return [0] + const count = Math.min(config.frameCount, total) + if (config.strategy === 'head') { + return Array.from({ length: count }, (_, i) => i) + } + // average + if (count >= total) return Array.from({ length: total }, (_, i) => i) + if (count === 1) return [0] + const step = (total - 1) / (count - 1) + return Array.from({ length: count }, (_, i) => Math.floor(i * step)) +} + +function hasComplexDisposal( + reader: GifReader, + start: number, + end: number +): boolean { + for (let i = start; i < end; i++) { + const d = reader.frameInfo(i).disposal + if (d === 2 || d === 3) return true + } + return false } +// --------------------------------------------------------------------------- +// Image +// --------------------------------------------------------------------------- + export async function readImage(ctx: Context, url: string) { if (url.startsWith('data:image') && url.includes('base64')) { const buffer = Buffer.from(url.split(',')[1], 'base64') - const ext = getImageType(buffer) - - return { - base64Source: url, - buffer, - ext - } + return { base64Source: url, buffer, ext: getImageType(buffer) } } - try { - const response = await ctx.http(url, { + const { data } = await ctx.http(url, { responseType: 'arraybuffer', method: 'get', - headers: { - 'User-Agent': - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36' - } + headers: { 'User-Agent': BROWSER_UA } }) - - const buffer = Buffer.from(response.data) - - const base64 = buffer.toString('base64') - + const buffer = Buffer.from(data) const ext = getImageType(buffer) - return { - base64Source: `data:${ext};base64,${base64}`, + base64Source: `data:${ext};base64,${buffer.toString('base64')}`, buffer, ext } } catch (error) { logger.error(`Failed to read image from ${url}:`, error) - return { - base64Source: null, - buffer: null, - ext: null - } + return { base64Source: null, buffer: null, ext: null } } } + export async function processImageWithModel( model: ChatLunaChatModel, config: Config, message: Message -) { - const images = extractImages(message.content) +): Promise { + const images = Array.isArray(message.content) + ? message.content.filter((item: MessageContentComplex) => + isMessageContentImageUrl(item) + ) + : [] if (images.length === 0) return null try { @@ -208,9 +326,7 @@ export async function processImageWithModel( { type: 'text', text: config.imagePrompt } as MessageContentText, ...images ] - const result = await model.invoke([new HumanMessage({ content })]) - return config.imageInsertPrompt.replace( '{img}', getMessageContent(result.content) @@ -221,45 +337,36 @@ export async function processImageWithModel( } } -export const addImageToContent = (message: Message, imageUrl: string) => { - if (typeof message.content === 'string') { - message.content = [ - { - type: 'text', - text: message.content - } - ] - } +export function addImageToContent(message: Message, imageUrl: string) { + ensureContentArray(message) ;(message.content as MessageContentComplex[]).push({ type: 'image_url', - image_url: { - url: imageUrl - } + image_url: { url: imageUrl } }) } -export const addTextToContent = (message: Message, text: string) => { +export function addTextToContent(message: Message, text: string) { if (typeof message.content === 'string') { message.content += text return } - const content = message.content as MessageContentComplex[] - const lastItem = content[content.length - 1] - - if (lastItem && lastItem.type === 'text') { - lastItem.text += text + const last = content[content.length - 1] + if (last && last.type === 'text') { + last.text += text } else { - content.push({ - type: 'text', - text - }) + content.push({ type: 'text', text }) } } -export const extractImages = (content: MessageContent) => - Array.isArray(content) - ? content.filter((item: MessageContentComplex) => - isMessageContentImageUrl(item) - ) - : [] +export function ensureContentArray(message: Message, fallbackText = '') { + if (typeof message.content !== 'string') return + message.content = message.content.length + ? [{ type: 'text', text: message.content }] + : fallbackText.length + ? [{ type: 'text', text: fallbackText }] + : [] +} + +export const BROWSER_UA = + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36' diff --git a/packages/shared-adapter/src/client.ts b/packages/shared-adapter/src/client.ts index 36af06aa3..1015a4350 100644 --- a/packages/shared-adapter/src/client.ts +++ b/packages/shared-adapter/src/client.ts @@ -1,6 +1,10 @@ +import { FileHandlingConfig } from 'koishi-plugin-chatluna/llm-core/platform/client' import { ModelInfo } from 'koishi-plugin-chatluna/llm-core/platform/types' import { getModelContextSize } from 'koishi-plugin-chatluna/llm-core/utils/count_tokens' +export const DEFAULT_AUDIO_MAX_BASE64_BYTES = 50 * 1024 * 1024 +export const DEFAULT_IMAGE_MAX_BASE64_BYTES = 50 * 1024 * 1024 + export type OpenAIReasoningEffort = | 'none' | 'minimal' @@ -153,7 +157,11 @@ function createGlobMatcher(pattern: string): (text: string) => boolean { return (text: string) => regex.test(text) } -const imageModelMatchers = [ +function createRegexMatcher(regex: RegExp): (text: string) => boolean { + return (text: string) => regex.test(text) +} + +const imageModelMatchers: ((text: string) => boolean)[] = [ 'vision', 'vl', 'gpt-4o', @@ -176,11 +184,76 @@ const imageModelMatchers = [ 'glm-*v', 'kimi-k2.5', 'step3', - 'grok-4', - 'mimo-v2.5*' -].map((pattern) => createGlobMatcher(pattern)) + 'grok-4' +].map(createGlobMatcher) + +// mimo-v2.5 supports image/audio; mimo-v2.5-pro does NOT (text only). +imageModelMatchers.push(createRegexMatcher(/mimo-v2\.5(?!-pro)/)) export function supportImageInput(modelName: string) { const lowerModel = normalizeOpenAIModelName(modelName).toLowerCase() return imageModelMatchers.some((matcher) => matcher(lowerModel)) } + +const audioModelMatchers: ((text: string) => boolean)[] = [ + 'gpt-4o-audio', + 'gpt-4o-mini-audio', + 'gpt-audio', + 'mimo-v2-omni' +].map(createGlobMatcher) + +audioModelMatchers.push(createRegexMatcher(/mimo-v2\.5(?!-pro)/)) + +export function supportAudioInput(modelName: string) { + const lowerModel = normalizeOpenAIModelName(modelName).toLowerCase() + return audioModelMatchers.some((matcher) => matcher(lowerModel)) +} + +const openAIImageMimeTypes = [ + 'image/png', + 'image/jpeg', + 'image/gif', + 'image/webp', + 'image/bmp' +] + +const openAIAudioMimeTypes = [ + 'audio/mpeg', + 'audio/mp3', + 'audio/wav', + 'audio/flac', + 'audio/mp4', + 'audio/ogg' +] + +export function getOpenAIFileHandlingConfig( + modelName: string +): FileHandlingConfig | undefined { + const image = supportImageInput(modelName) + const audio = supportAudioInput(modelName) + if (!image && !audio) return undefined + + const supportedMimeTypes = new Set() + const overrides: Record = {} + + if (image) { + for (const mime of openAIImageMimeTypes) { + supportedMimeTypes.add(mime) + overrides[mime] = DEFAULT_IMAGE_MAX_BASE64_BYTES + } + } + + if (audio) { + for (const mime of openAIAudioMimeTypes) { + supportedMimeTypes.add(mime) + overrides[mime] = DEFAULT_AUDIO_MAX_BASE64_BYTES + } + } + + return { + supportedMimeTypes, + maxTotalSizeBytes: 100 * 1024 * 1024, + maxFileSizeBytes: 100 * 1024 * 1024, + maxFileSizeBytesOverrides: overrides + } +} diff --git a/packages/shared-adapter/src/utils.ts b/packages/shared-adapter/src/utils.ts index a171e25f6..fe130f7fc 100644 --- a/packages/shared-adapter/src/utils.ts +++ b/packages/shared-adapter/src/utils.ts @@ -30,15 +30,24 @@ import { ResponseUsage } from './types' import { ChatLunaPlugin } from 'koishi-plugin-chatluna/services/chat' +import { logger } from 'koishi-plugin-chatluna' import { getImageMimeType, getMimeTypeFromSource, isMessageContentImageUrl } from 'koishi-plugin-chatluna/utils/string' -import { isChatLunaUserMessage } from 'koishi-plugin-chatluna/utils/langchain' +import { + isChatLunaUserMessage, + isMessageContentAudio +} from 'koishi-plugin-chatluna/utils/langchain' import { ToolCallChunk } from '@langchain/core/messages/tool' import { isZodSchemaV3 } from '@langchain/core/utils/types' -import { normalizeOpenAIModelName, supportImageInput } from './client' +import { + DEFAULT_AUDIO_MAX_BASE64_BYTES, + normalizeOpenAIModelName, + supportAudioInput, + supportImageInput +} from './client' export function createUsageMetadata(data: { inputTokens: number @@ -222,6 +231,7 @@ export function responseInputContent( } satisfies ResponseInputContent } + // OpenAI Response API does not accept `input_audio` yet — drop it. return undefined }) .filter((part) => part != null) @@ -343,64 +353,71 @@ export async function langchainMessageToOpenAIMessage( } } - const images = rawMessage.additional_kwargs.images as string[] | null - - const lowerModel = normalizedModel?.toLowerCase() ?? '' - if ( - images != null && - (supportImageInput(lowerModel) || supportImageInputType) - ) { - msg.content = [ - { - type: 'text', - text: rawMessage.content as string - } - ] - - const imageContents = await Promise.all( - images.map(async (image) => { - try { - const url = await fetchImageUrl(plugin, { - type: 'image_url', - image_url: { url: image } - } as MessageContentImageUrl) - return { - type: 'image_url', - image_url: { - url, - detail: 'low' - } - } as const - } catch { - return null - } - }) + if (rawMessage.additional_kwargs.images != null) { + logger.warn( + 'Deprecated: `additional_kwargs.images` is no longer supported. Use `image_url` content parts instead.' ) + } - msg.content.push( - ...imageContents.filter((content) => content != null) - ) - } else if (Array.isArray(msg.content) && msg.content.length > 0) { + if (Array.isArray(msg.content) && msg.content.length > 0) { + const supportsAudio = supportAudioInput(normalizedModel ?? '') + const supportsImage = + supportImageInput(normalizedModel ?? '') || + supportImageInputType === true const mappedContent = await Promise.all( msg.content.map(async (content) => { - if (!isMessageContentImageUrl(content)) return content - - try { - const url = await fetchImageUrl(plugin, content) - return { - type: 'image_url', - image_url: { - url, - detail: 'low' + if (isMessageContentImageUrl(content)) { + if (!supportsImage) { + logger.warn( + `Model ${normalizedModel} does not accept image input; dropping image content.` + ) + return null + } + try { + const url = await fetchImageUrl(plugin, content) + return { + type: 'image_url', + image_url: { url, detail: 'low' } + } + } catch { + return null + } + } + + if (isMessageContentAudio(content)) { + if (!supportsAudio) { + logger.warn( + `Model ${normalizedModel} does not accept audio input; dropping audio content.` + ) + return null + } + try { + const part = await fetchAudioContentPart( + plugin, + content + ) + if (part == null) { + logger.warn( + `Audio content for model ${normalizedModel} was dropped (exceeded size limits or no data).` + ) } + return part + } catch (err) { + logger.error( + `Failed to fetch audio part for model ${normalizedModel}`, + err + ) + throw err } - } catch { - return null } + + return content }) ) - msg.content = mappedContent.filter((content) => content != null) + msg.content = mappedContent.filter( + (content) => content != null + ) as ChatCompletionResponseMessage['content'] } result.push(msg) @@ -676,6 +693,54 @@ export async function fetchFileLikeUrl( } } +const AUDIO_MIME_TO_FORMAT: Record = { + 'audio/mpeg': 'mp3', + 'audio/mp3': 'mp3', + 'audio/wav': 'wav', + 'audio/x-wav': 'wav', + 'audio/flac': 'flac', + 'audio/x-flac': 'flac', + 'audio/ogg': 'ogg', + 'audio/mp4': 'mp4', + 'audio/aac': 'aac', + 'audio/webm': 'webm' +} + +function audioMimeToFormat(mime: string): string { + const format = AUDIO_MIME_TO_FORMAT[mime.toLowerCase()] + if (!format) { + throw new Error( + `Unsupported audio MIME for OpenAI input_audio: ${mime}` + ) + } + return format +} + +/** + * Fetch an `audio_url` content part and convert it to the OpenAI-compatible + * `input_audio` shape used by gpt-4o-audio / MiMo. Returns `null` when the + * encoded payload exceeds {@link DEFAULT_AUDIO_MAX_BASE64_BYTES}. + */ +async function fetchAudioContentPart( + plugin: ChatLunaPlugin, + content: MessageContentFileLike & { type: 'audio_url' } +): Promise { + const { buffer, mimeType } = await fetchFileLikeUrl(plugin, content) + const base64 = buffer.toString('base64') + + if (base64.length > DEFAULT_AUDIO_MAX_BASE64_BYTES) { + return null + } + + return { + type: 'input_audio', + input_audio: { + data: base64, + format: audioMimeToFormat(mimeType) + } + } as unknown as MessageContentComplex +} + export function messageTypeToOpenAIRole( type: MessageType ): ChatCompletionResponseMessageRoleEnum {