diff --git a/duui-open-webui/README.md b/duui-open-webui/README.md new file mode 100644 index 00000000..b27146b7 --- /dev/null +++ b/duui-open-webui/README.md @@ -0,0 +1,166 @@ + +[![Version](https://img.shields.io/static/v1?label=duui-multimodal\&message=0.1.0\&color=blue)](https://docker.texttechnologylab.org/v2/duui-multimodal/tags/list) +[![Python](https://img.shields.io/static/v1?label=Python\&message=3.12\&color=green)]() +[![Transformers](https://img.shields.io/static/v1?label=Transformers\&message=4.48.2\&color=yellow)]() +[![Torch](https://img.shields.io/static/v1?label=Torch\&message=2.6.0\&color=red)]() + +# DUUI Open-WebUI + +DUUI implementation for **multimodal Hugging Face models** that support combinations of: + +* Text +* Image + + + +--- + + +## Supported Models services + +| Model Name | Source | Dockerimage | Mode | Lang | Version | +|------------|--------------------------------------------------------------------------------------------------|-------------|------------|---------------------------------|---------| +| ollama | https://docs.ollama.com/api/openai-compatibility | NA | image/text | multi | 0.0.1 | + +--- + +## Supported Modes + +| Mode | Description | +|---------|---------------------------------------------------------------------| +| `text` | Process raw text prompts | +| `image` | Process images and prompt combinations | + +--- + +## How To Use + +Requires the [Docker Unified UIMA Interface (DUUI)](https://github.com/texttechnologylab/DockerUnifiedUIMAInterface). + +### Start Docker Container + +```bash +docker run -p 9714:9714 docker.texttechnologylab.org/duui-open-webui +``` + +Find available image tags: [Docker Registry](https://docker.texttechnologylab.org/v2/duui-mutlimodality-transformer/tags/list) + +--- + +## Use within DUUI + +### ollama setup +```java + composer.add( + new DUUIRemoteDriver.Component(url) + .withParameter("model_name", "qwen2.5vl:3b") + .withParameter("mode", "image") + .withParameter("language", "en") + .withParameter("ollama_host", "localhost") // https:/llm.example +// .withParameter("ollama_port", "8080") + .withParameter("ollama_auth_token", "") + .withParameter("system_prompt", "") + .build().withTimeout(1000) + ); +``` +### Transformer Models + +```java +// Code before as it is.. + +List prompts = Arrays.asList( + "Who is the current president of the USA?", + "Is Frankfurt the capital of EU finance?" 
+); + +createCas("en", prompts); + composer.run(cas); + +verifyNoImages(); + +// Print results + for (Result result : JCasUtil.select(cas, Result.class)) { + System.out.println(result.getMeta()); + } +// +// Helper method to create CAS with prompts +public void createCas(String language, List prompts) throws UIMAException { + cas.setDocumentLanguage(language); + StringBuilder sb = new StringBuilder(); + + for (String messageText : prompts) { + Prompt prompt = new Prompt(cas); + prompt.setArgs("{}"); + + Message message = new Message(cas); + message.setRole("user"); + message.setContent(messageText); + message.addToIndexes(); + + FSArray messages = new FSArray(cas, 1); + messages.set(0, message); + prompt.setMessages(messages); + prompt.addToIndexes(); + + sb.append(messageText).append(" "); + } + + inputView.setDocumentText(sb.toString().trim()); +// cas.setDocumentText(sb.toString().trim()); +} +``` + + + +--- + +## Parameters + +| Name | Description | +| ------------ |--------------------------------------------------------| +| `model_name` | Name of the multimodal model to use (inside ollama) | +| `mode` | Processing mode: text, image | +| `ollama_host` | ollama host url | +| `ollama_port` | ollama port, default 8080 | +| `ollama_auth_token`| ollama auth token if exists, default empty | +| `system_prompt` | System prompt for all prompts if needed, default empty | + +--- + +## Cite + +If you want to use the DUUI image, please cite the following: + +**Leonhardt et al. (2023)** +*"Unlocking the Heterogeneous Landscape of Big Data NLP with DUUI."* +Findings of the Association for Computational Linguistics: EMNLP 2023, 385–399. +\[[LINK](https://aclanthology.org/2023.findings-emnlp.29)] \[[PDF](https://aclanthology.org/2023.findings-emnlp.29.pdf)] + +**Abusaleh (2026)** +*"OpenWebUI wrapper as {DUUI} Component"* +\[[LINK](https://github.com/texttechnologylab/duui-uima/tree/main/duui-open-webui])] +--- + +## BibTeX + +```bibtex +@inproceedings{Leonhardt:et:al:2023, + title = {Unlocking the Heterogeneous Landscape of Big Data {NLP} with {DUUI}}, + author = {Leonhardt, Alexander and Abrami, Giuseppe and Baumartz, Daniel and Mehler, Alexander}, + booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2023}, + year = {2023}, + address = {Singapore}, + publisher = {Association for Computational Linguistics}, + url = {https://aclanthology.org/2023.findings-emnlp.29}, + pages = {385--399}, + pdf = {https://aclanthology.org/2023.findings-emnlp.29.pdf} +} + +@misc{abusaleh:duui:openwebui:2026, + author = {Abusaleh, Ali}, + title = {OpenWebUI Ollama wrapper as {DUUI} Component}, + year = {2026}, + howpublished = {https://github.com/texttechnologylab/duui-uima/tree/main/duui-open-webui} +} + + diff --git a/duui-open-webui/bash_dockers/docker_build.sh b/duui-open-webui/bash_dockers/docker_build.sh new file mode 100644 index 00000000..233a6d32 --- /dev/null +++ b/duui-open-webui/bash_dockers/docker_build.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash +set -euo pipefail + + +export MM_ANNOTATOR_CUDA=transformer +#export DUUI_MM_CUDA="-cuda" + +export MM_ANNOTATOR_NAME=duui-openwebui +export MM_ANNOTATOR_VERSION=0.2.0 +export MM_LOG_LEVEL=DEBUG +export MM_MODEL_CACHE_SIZE=3 +export DOCKER_REGISTRY="docker.texttechnologylab.org/" + +cd .. 
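+# MM_ANNOTATOR_CUDA doubles as the Dockerfile suffix: this script builds from
+# src/main/docker/Dockerfiletransformer, while the companion docker_build_vlm.sh
+# (MM_ANNOTATOR_CUDA=vllm) expects a src/main/docker/Dockerfilevllm.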
+ +docker build \ + --build-arg MM_ANNOTATOR_NAME \ + --build-arg MM_ANNOTATOR_VERSION \ + --build-arg MM_LOG_LEVEL \ + -t ${DOCKER_REGISTRY}${MM_ANNOTATOR_NAME}-${MM_ANNOTATOR_CUDA}:${MM_ANNOTATOR_VERSION} \ + -f src/main/docker/Dockerfile${MM_ANNOTATOR_CUDA} \ + . + +docker tag \ + ${DOCKER_REGISTRY}${MM_ANNOTATOR_NAME}-${MM_ANNOTATOR_CUDA}:${MM_ANNOTATOR_VERSION} \ + ${DOCKER_REGISTRY}${MM_ANNOTATOR_NAME}-${MM_ANNOTATOR_CUDA}:latest diff --git a/duui-open-webui/bash_dockers/docker_build_vlm.sh b/duui-open-webui/bash_dockers/docker_build_vlm.sh new file mode 100644 index 00000000..3faf6720 --- /dev/null +++ b/duui-open-webui/bash_dockers/docker_build_vlm.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash +set -euo pipefail + + +export MM_ANNOTATOR_CUDA=vllm +#export DUUI_MM_CUDA="-cuda" + +export MM_ANNOTATOR_NAME=duui-openwebui +export MM_ANNOTATOR_VERSION=0.1.0 +export MM_LOG_LEVEL=DEBUG +export MM_MODEL_CACHE_SIZE=3 +export DOCKER_REGISTRY="docker.texttechnologylab.org/" + +cd .. + +docker build \ + --build-arg MM_ANNOTATOR_NAME \ + --build-arg MM_ANNOTATOR_VERSION \ + --build-arg MM_LOG_LEVEL \ + -t ${DOCKER_REGISTRY}${MM_ANNOTATOR_NAME}-${MM_ANNOTATOR_CUDA}:${MM_ANNOTATOR_VERSION} \ + -f src/main/docker/Dockerfile${MM_ANNOTATOR_CUDA} \ + . + +docker tag \ + ${DOCKER_REGISTRY}${MM_ANNOTATOR_NAME}-${MM_ANNOTATOR_CUDA}:${MM_ANNOTATOR_VERSION} \ + ${DOCKER_REGISTRY}${MM_ANNOTATOR_NAME}-${MM_ANNOTATOR_CUDA}:latest diff --git a/duui-open-webui/pom.xml b/duui-open-webui/pom.xml new file mode 100644 index 00000000..c81ee5da --- /dev/null +++ b/duui-open-webui/pom.xml @@ -0,0 +1,162 @@ + + + 4.0.0 + + org.texttechnology + duui-WebUI-Wrapper + 1.0-SNAPSHOT + + + AGPL-3.0-or-later + https://www.gnu.org/licenses/agpl.txt + repo + GNU Affero General Public License v3.0 or later + + + + + Texttechnology Lab + https://www.texttechnologylab.org + + + + mehler + Prof. Dr. 
Alexander Mehler + mehler@em.uni-frankfurt.de + https://www.texttechnologylab.org/team/alexander-abrami/ + Goethe University Frankfurt / Texttechnology Lab + https://www.texttechnologylab.org + + head of department + + + + aabusale + Ali Abusaleh + a.abusaleh@em.uni-frankfurt.de + https://www.texttechnologylab.org/team/ali-abusaleh/ + Goethe University Frankfurt / Texttechnology Lab + https://www.texttechnologylab.org + + Research assistant + + Europe/Berlin + + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.22.0 + + + --illegal-access=permit + --add-opens java.base/java.util=ALL-UNNAMED + + + + + + + + + 21 + 21 + UTF-8 + 2.4.0 + + + + + jitpack.io + https://jitpack.io + + + + + + + org.dkpro.core + dkpro-core-asl + ${dkpro.core.version} + pom + import + + + + + + + + + com.github.texttechnologylab + UIMATypeSystem + 3.0.10 + + + com.github.texttechnologylab + DockerUnifiedUIMAInterface + 1.4.6 + + + + + + + + + + + + + + + + + + + + + + + org.junit.jupiter + junit-jupiter + 5.9.0 + test + + + + org.dkpro.core + dkpro-core-api-segmentation-asl + test + + + + org.dkpro.core + dkpro-core-io-xmi-asl + test + + + + org.dkpro.core + dkpro-core-io-json-asl + 2.0.0 + test + + + + org.dkpro.core + dkpro-core-api-resources-asl + test + + + + \ No newline at end of file diff --git a/duui-open-webui/requirements.txt b/duui-open-webui/requirements.txt new file mode 100644 index 00000000..7fa967a3 --- /dev/null +++ b/duui-open-webui/requirements.txt @@ -0,0 +1,29 @@ +opencv-python +torch==2.8.0 +torchaudio==2.8.0 +setfit==1.0.3 +sentencepiece==0.2.0 +protobuf +germansentiment==1.1.0 +pytorch-lightning==2.4.0 +numpy==1.26.3 +fastapi==0.115.12 +dkpro-cassis==0.9.1 +uvicorn[standard]==0.27.1 +pydantic-settings==2.0.2 +torchmetrics==1.2.0 +six==1.16.0 +pandas==1.4.3 +pysentimiento==0.7.3 +starlette==0.40.0 +transformers +accelerate==1.3.0 +soundfile==0.13.1 +pillow==11.1.0 +scipy==1.15.2 +backoff==2.2.1 +peft==0.13.2 +qwen-omni-utils[decord] +qwen-vl-utils[decord]==0.0.8 +huggingface_hub[hf_xet] + python-multipart \ No newline at end of file diff --git a/duui-open-webui/src/main/docker/Dockerfiletransformer b/duui-open-webui/src/main/docker/Dockerfiletransformer new file mode 100644 index 00000000..8c9379b2 --- /dev/null +++ b/duui-open-webui/src/main/docker/Dockerfiletransformer @@ -0,0 +1,69 @@ +FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 + + +RUN apt update && \ + DEBIAN_FRONTEND=noninteractive \ + apt install --no-install-recommends -y build-essential software-properties-common && \ + add-apt-repository -y ppa:deadsnakes/ppa && \ + apt install --no-install-recommends -y python3.10 python3-pip python3-setuptools python3-distutils python3-dev && \ + apt clean && rm -rf /var/lib/apt/lists/* + +RUN ln -s /usr/bin/python3 /usr/bin/python +RUN python -m pip install --upgrade pip + + +WORKDIR /usr/src/app + +EXPOSE 9714 6659 6658 + + +RUN DEBIAN_FRONTEND=noninteractive apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install ffmpeg -y + + +# dependencies +RUN pip install -U pip +RUN pip install setuptools wheel psutil packaging torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 +RUN pip install --upgrade setuptools +COPY ./requirements.txt ./requirements.txt +RUN pip install -r requirements.txt + + +RUN python -c "from transformers import Qwen2_5_VLForConditionalGeneration; Qwen2_5_VLForConditionalGeneration.from_pretrained(pretrained_model_name_or_path='Qwen/Qwen2.5-VL-3B-Instruct', revision='66285546d2b821cf421d4f5eb2576359d3770cd3')" + +# copy scripts +COPY 
./src/main/resources/TypeSystemMM.xml ./TypeSystemMM.xml +COPY ./src/main/python/duui-mm.py ./duui-mm.py +COPY ./src/main/python/start.sh ./start.sh + +COPY ./src/main/python/duui-mm.lua ./duui-mm.lua +COPY ./src/main/python/models/ ./models/ + + + +RUN chmod +x ./start.sh + + +RUN echo nvidia-smi + +# log level +ARG MM_LOG_LEVEL="DEBUG" +ENV MM_LOG_LEVEL=$MM_LOG_LEVEL + +# config +ARG MM_MODEL_CACHE_SIZE=3 +ENV MM_MODEL_CACHE_SIZE=$MM_MODEL_CACHE_SIZE + +# meta data +ARG MM_ANNOTATOR_NAME="duui-mutlimodality" +ENV MM_ANNOTATOR_NAME=$MM_ANNOTATOR_NAME +ARG MM_ANNOTATOR_VERSION="unset" +ENV MM_ANNOTATOR_VERSION=$MM_ANNOTATOR_VERSION + +# Model Info +ARG MM_MODEL_VERSION=0.1 +ENV MM_MODEL_VERSION=$MM_MODEL_VERSION + + + +ENTRYPOINT ["uvicorn", "duui-mm:app", "--host", "0.0.0.0", "--port" ,"9714"] +CMD ["--workers", "1"] \ No newline at end of file diff --git a/duui-open-webui/src/main/python/__init__.py b/duui-open-webui/src/main/python/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/duui-open-webui/src/main/python/duui_webUIWrapper.lua b/duui-open-webui/src/main/python/duui_webUIWrapper.lua new file mode 100644 index 00000000..a5abd046 --- /dev/null +++ b/duui-open-webui/src/main/python/duui_webUIWrapper.lua @@ -0,0 +1,267 @@ +StandardCharsets = luajava.bindClass("java.nio.charset.StandardCharsets") +Class = luajava.bindClass("java.lang.Class") +JCasUtil = luajava.bindClass("org.apache.uima.fit.util.JCasUtil") +TopicUtils = luajava.bindClass("org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaUtils") +Prompt = luajava.bindClass("org.texttechnologylab.type.llm.prompt.Prompt") +Image = luajava.bindClass("org.texttechnologylab.annotation.type.Image") +Audio = luajava.bindClass("org.texttechnologylab.annotation.type.Audio") + +function serialize(inputCas, outputStream, parameters) + print("start serilize") + + local doc_lang = inputCas:getDocumentLanguage() + + -- Get parameters or use defaults + local model_name = parameters["model_name"] or "llama3" + local individual = parameters["individual"] or "false" + local mode = parameters["mode"] or "text" + local ollama_host = parameters["ollama_host"] or "http://localhost" + local ollama_port = parameters["ollama_port"] or "" + local ollama_auth_token = parameters["ollama_auth_token"] or "" + + local system_prompt = parameters["system_prompt"] or "" + + -- Prompts handler + local prompts = {} + local prompts_it = luajava.newInstance("java.util.ArrayList", JCasUtil:select(inputCas, Prompt)):listIterator() + local prompt_count = 1 + while prompts_it:hasNext() do + local prompt = prompts_it:next() + local messages = {} + local messages_it = prompt:getMessages():iterator() + local messages_count = 1 + while messages_it:hasNext() do + local message = messages_it:next() + messages[messages_count] = { + role = message:getRole(), + content = message:getContent(), + ref = message:getAddress() + } + messages_count = messages_count + 1 + end + prompts[prompt_count] = { + args = prompt:getArgs(), + messages = messages, + ref = prompt:getAddress() + } + prompt_count = prompt_count + 1 + end + print("start Image loader") + + -- Images handler + local images = {} + local number_of_images = 1 + local image_it = JCasUtil:select(inputCas, Image):iterator() + while image_it:hasNext() do + local image = image_it:next() + images[number_of_images] = { + src = image:getSrc(), + height = image:getHeight(), + width = image:getWidth(), + begin = image:getBegin(), + ['end'] = image:getEnd() + } + number_of_images = number_of_images + 1 + end + 
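+    -- Images, audios and videos are forwarded as base64 "src" strings together with their
+    -- CAS offsets (begin/end); the Python side maps them onto the ImageType/AudioType/VideoTypes
+    -- models in models/duui_models.py.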
+ -- Audios handler + local audios = {} + local number_of_audios = 1 + local audio_it = JCasUtil:select(inputCas, Audio):iterator() + while audio_it:hasNext() do + local audio = audio_it:next() + audios[number_of_audios] = { + src = audio:getSrc(), + begin = audio:getBegin(), + ['end'] = audio:getEnd() + } + number_of_audios = number_of_audios + 1 + end + + -- Videos handler + local videos = {} + local number_of_videos = 1 + local class = Class:forName("org.texttechnologylab.annotation.type.Video") + local video_it = JCasUtil:select(inputCas, class):iterator() + while video_it:hasNext() do + local video = video_it:next() + videos[number_of_videos] = { + src = video:getSrc(), + length = video:getLength(), + fps = video:getFps(), + begin = video:getBegin(), + ['end'] = video:getEnd() + } + number_of_videos = number_of_videos + 1 + end + + -- Serialize to JSON + outputStream:write(json.encode({ + images = images, + audios = audios, + videos = videos, + prompts = prompts, + doc_lang = doc_lang, + model_name = model_name, + individual = individual, + mode = mode, + ollama_host = ollama_host, + ollama_port = ollama_port, + ollama_auth_token = ollama_auth_token, + system_prompt = system_prompt + })) +end + +-- function deserialize(inputCas, inputStream) +-- local inputString = luajava.newInstance("java.lang.String", inputStream:readAllBytes(), StandardCharsets.UTF_8) +-- local results = json.decode(inputString) +-- +-- -- Handle errors +-- if results['errors'] ~= nil then +-- for _, error in ipairs(results['errors']) do +-- local warning = luajava.newInstance("org.texttechnologylab.annotation.AnnotationComment", inputCas) +-- warning:setKey("error") +-- warning:setValue(error['meta'] or error) +-- warning:addToIndexes() +-- end +-- end +-- +-- -- Handle model metadata +-- if results['model_source'] ~= nil and results['model_version'] ~= nil and results['model_name'] ~= nil and results['model_lang'] ~= nil then +-- local model_meta = luajava.newInstance("org.texttechnologylab.annotation.model.MetaData", inputCas) +-- model_meta:setModelVersion(results["model_version"]) +-- model_meta:setModelName(results["model_name"]) +-- model_meta:setSource(results["model_source"]) +-- model_meta:setLang(results["model_lang"]) +-- model_meta:addToIndexes() +-- end +-- +-- -- Handle prompts +-- if results['prompts'] ~= nil then +-- for _, prompt in ipairs(results["prompts"]) do +-- for _, message in pairs(prompt["messages"]) do +-- if message["fillable"] == true then +-- local msg_anno = inputCas:getLowLevelCas():ll_getFSForRef(message["ref"]) +-- msg_anno:setContent(message["content"]) +-- end +-- end +-- end +-- end +-- +-- -- Handle processed text +-- if results['processed_text'] ~= nil then +-- for _, llm_result in ipairs(results["processed_text"]) do +-- local llm_anno = luajava.newInstance("org.texttechnologylab.type.llm.prompt.Result", inputCas) +-- llm_anno:setMeta(llm_result["meta"]) +-- local prompt_anno = inputCas:getLowLevelCas():ll_getFSForRef(llm_result["prompt_ref"]) +-- llm_anno:setPrompt(prompt_anno) +-- llm_anno:addToIndexes() +-- end +-- end +-- end + +function deserialize(inputCas, inputStream) + -- 1. Parse Input + local StandardCharsets = luajava.bindClass("java.nio.charset.StandardCharsets") + local inputString = luajava.newInstance("java.lang.String", inputStream:readAllBytes(), StandardCharsets.UTF_8) + local results = json.decode(inputString) + + -- 2. 
Handle Errors (Existing Logic) + if results['errors'] ~= nil then + for _, error in ipairs(results['errors']) do + local warning = luajava.newInstance("org.texttechnologylab.annotation.AnnotationComment", inputCas) + warning:setKey("error") + warning:setValue(error['meta'] or error) + warning:addToIndexes() + end + end + + -- 3. Handle Model Metadata (Existing Logic) + if results['model_source'] ~= nil then + local model_meta = luajava.newInstance("org.texttechnologylab.annotation.model.MetaData", inputCas) + model_meta:setModelVersion(results["model_version"]) + model_meta:setModelName(results["model_name"]) + model_meta:setSource(results["model_source"]) + model_meta:setLang(results["model_lang"]) + model_meta:addToIndexes() + end + + -- 4. Handle Prompts (Existing Logic) + if results['prompts'] ~= nil then + for _, prompt in ipairs(results["prompts"]) do + for _, message in pairs(prompt["messages"]) do + if message["fillable"] == true then + local msg_anno = inputCas:getLowLevelCas():ll_getFSForRef(message["ref"]) + msg_anno:setContent(message["content"]) + end + end + end + end + + -- 5. NEW: Build Sofa String & Calculate Offsets + -- We use Java's StringBuilder to ensure offsets match UIMA's Java character counting + local sb = luajava.newInstance("java.lang.StringBuilder") + local pending_results = {} + local pending_prompts = {} + + -- 1. Process Prompts into StringBuilder + if results['prompts'] ~= nil then + for _, prompt_data in ipairs(results["prompts"]) do + for _, message in pairs(prompt_data["messages"]) do + local text = message["content"] or "" + + local b = sb:length() + sb:append("User: "):append(text):append("\n\n") + local e = sb:length() + + -- Update the existing Prompt/Message if it was passed by reference + if message["fillable"] == true and message["ref"] ~= nil then + local msg_anno = inputCas:getLowLevelCas():ll_getFSForRef(message["ref"]) + msg_anno:setBegin(b) + msg_anno:setEnd(e) + msg_anno:setContent(text) + end + end + end + end + + -- 2. Process Results into StringBuilder + if results['processed_text'] ~= nil then + for _, llm_result in ipairs(results["processed_text"]) do + local content = llm_result["meta"] or "" + + local b = sb:length() + sb:append("Assistant: "):append(content):append("\n\n") + local e = sb:length() + + table.insert(pending_results, { + begin_idx = b, + end_idx = e, + meta = content, + prompt_ref = llm_result["prompt_ref"] + }) + end + end + + -- 3. Safely set Document Text + local currentText = inputCas:getDocumentText() + if currentText == nil then + inputCas:setDocumentText(sb:toString()) + + -- 4. Only add Result annotations if we successfully set the text + for _, item in ipairs(pending_results) do + local llm_anno = luajava.newInstance("org.texttechnologylab.type.llm.prompt.Result", inputCas) + llm_anno:setBegin(item.begin_idx) + llm_anno:setEnd(item.end_idx) + llm_anno:setMeta(item.meta) + + if item.prompt_ref then + local prompt_anno = inputCas:getLowLevelCas():ll_getFSForRef(item.prompt_ref) + llm_anno:setPrompt(prompt_anno) + end + llm_anno:addToIndexes() + end + else + print("Sofa already set. 
Skipping annotation creation to prevent offset mismatch.") + end +end diff --git a/duui-open-webui/src/main/python/duui_webUIWrapper.py b/duui-open-webui/src/main/python/duui_webUIWrapper.py new file mode 100644 index 00000000..16652fe8 --- /dev/null +++ b/duui-open-webui/src/main/python/duui_webUIWrapper.py @@ -0,0 +1,210 @@ +import logging +import io +import gc +import torch +import uvicorn +from fastapi import FastAPI, File, Form, UploadFile +from cassis import load_typesystem +from fastapi import FastAPI, Response +from starlette.responses import PlainTextResponse +from fastapi.responses import JSONResponse +from typing import List, Optional +from models.duui_models import ( + DUUIMMRequest, + DUUIMMResponse, + MultiModelModes, + ImageType, + AudioType, + VideoTypes, + LLMResult, + Settings +) + +from models.ollama_models import OllamaConfig, OllamaRequest, OllamaResponse +from services.ollama_client import OllamaClient +from services.utils import encode_file_to_base64, map_duui_to_ollama, convert_base64_to_image, convert_base64_to_audio + +from fastapi.encoders import jsonable_encoder + +import os + +os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com' +os.environ['CURL_CA_BUNDLE'] = '' +os.environ['REQUESTS_CA_BUNDLE'] = '' + + +# Global cache +_loaded_models = {} +_loaded_processors = {} + + +# Load settings from env vars +settings = Settings() + +lua_communication_script, logger, type_system, device = None, None, None, None + +def init(): + global lua_communication_script, logger, type_system, device + + + logging.basicConfig(level=settings.log_level) + logger = logging.getLogger(__name__) + + device = "cuda" if torch.cuda.is_available() else "cpu" + # device = "cpu" + logger.info(f'USING {device}') + # Load the predefined typesystem that is needed for this annotator to work + # typesystem_filename = './TypeSystemMM.xml' + typesystem_filename = '../resources/TypeSystemMM.xml' + # logger.debug("Loading typesystem from \"%s\"", typesystem_filename) + + + logger.debug("*"*20 + "Lua communication script" + "*"*20) + # Load the Lua communication script + lua_communication_script_filename = "duui_webUIWrapper.lua" + + + with open(lua_communication_script_filename, 'rb') as f: + lua_communication_script = f.read().decode("utf-8") + logger.debug("Lua communication script:") + logger.debug(lua_communication_script_filename) + + with open(typesystem_filename, 'rb') as f: + type_system = load_typesystem(f) + + +init() + +app = FastAPI( + openapi_url="/openapi.json", + docs_url="/api", + redoc_url=None, + title=settings.annotator_name, + description="Wrapper for Ollama/OpenWebUI API with DUUI compatibility", + version=settings.annotator_version, + terms_of_service="https://www.texttechnologylab.org/legal_notice/", + contact={ + "name": "Ali Abusaleh, TTLab Team", + "url": "https://texttechnologylab.org", + "email": "a.abusaleh@em.uni-frankfurt.de", + }, + license_info={ + "name": "AGPL", + "url": "http://www.gnu.org/licenses/agpl-3.0.en.html", + }, +) + + +# Get typesystem of this annotator +@app.get("/v1/typesystem") +def get_typesystem() -> Response: + # TODO rimgve cassis dependency, as only needed for typesystem at the moment? 
+ xml = type_system.to_xml() + xml_content = xml.encode("utf-8") + + return Response( + content=xml_content, + media_type="application/xml" + ) + + +# Return Lua communication script +@app.get("/v1/communication_layer", response_class=PlainTextResponse) +def get_communication_layer() -> str: + return lua_communication_script + + +# Get input / output of the annotator +@app.get("/v1/details/input_output") +def get_input_output() -> JSONResponse: + json_item = { + "inputs": ["string", "org.texttechnologylab.annotation.type.Image"], + "outputs": ["string", "org.texttechnologylab.annotation.type.Image"] + } + + json_compatible_item_data = jsonable_encoder(json_item) + return JSONResponse(content=json_compatible_item_data) + +# Return documentation info +@app.get("/v1/documentation") +def get_documentation(): + return JSONResponse({"documents": "testing"}) + # return DUUIMMDocumentation( + # annotator_name=settings.image_to_text_annotator_name, + # version=settings.image_to_text_model_version, + # implementation_lang="Python", + # meta={ + # "log_level": settings.log_level, + # "model_version": settings.model_version, + # "model_cache_size": settings.model_cache_size, + # # "models": sources, + # # "languages": languages, + # # "versions": versions, + # }, + # parameters={ + # "prompt": "Prompt", + # "doc_lang": "Document language", + # "model_name": "Model name", + # "individual": "A flag for processing the images as one (set of frames) or indivisual. Note: it only works in a complex-mode", + # "mode": "a mode of operation" + # + # } + # ) + +# --- API Endpoints --- +@app.post("/v1/process", response_model=DUUIMMResponse) +async def process_ollama(duui_request: DUUIMMRequest): + # Parse the JSON request body + # request_data = request.json() + # duui_request = request.json().load() + + # Initialize Ollama client + config = OllamaConfig() + config.host = duui_request.ollama_host + config.port = duui_request.ollama_port + config.auth_token = duui_request.ollama_auth_token + client = OllamaClient(config) + + # Encode files (if present in the request) + # encoded_images = [encode_file_to_base64(convert_base64_to_image(img.src)) for img in duui_request.images] if duui_request.images else None + encoded_images = [img.src for img in duui_request.images] if duui_request.images else None + encoded_audios = [encode_file_to_base64(convert_base64_to_audio(aud)) for aud in duui_request.audios] if duui_request.audios else None + encoded_videos = [encode_file_to_base64(vid) for vid in duui_request.videos] if duui_request.videos else None + + # TODO I need to support this. + system_prompt = duui_request.system_prompt + + Responses = [] + Errors = [] + # iterate over duui_request.prompts and make a Request per it. 
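+    # Each prompt below is sent as its own chat completion; all encoded images/audios/videos
+    # from the request are attached to every prompt, and prompt_ref/message_ref are currently
+    # hard-coded to 0 rather than pointing back to the originating CAS annotations.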
+ for prompt in duui_request.prompts: + # Map DUUI request to Ollama request + ollama_request = map_duui_to_ollama(duui_request.model_name, system_prompt, prompt, encoded_images, encoded_audios, encoded_videos) + # Call Ollama + ollama_response = client.generate(ollama_request) + + if ollama_response.response: + Responses.append(LLMResult(meta=ollama_response.response, prompt_ref=0, message_ref="0")) + + if ollama_response.error: + Errors.append(LLMResult(meta=ollama_response.error, prompt_ref=0, message_ref="0")) + + # Map Ollama response to DUUIMMResponse + return DUUIMMResponse( + processed_text=Responses if Responses else None, + model_name=duui_request.model_name, + model_source="Ollama/OpenWebUI", + model_lang=duui_request.doc_lang, + model_version="1.0.0", + errors=Errors if Errors else None, + prompts=duui_request.prompts, + ) + +# --- Health Check --- +@app.get("/health") +def health_check(): + return {"status": "healthy"} + +if __name__ == "__main__": + uvicorn.run("duui_webUIWrapper:app", host="0.0.0.0", port=9714, workers=1) + diff --git a/duui-open-webui/src/main/python/env.sh b/duui-open-webui/src/main/python/env.sh new file mode 100644 index 00000000..ae5ec4ad --- /dev/null +++ b/duui-open-webui/src/main/python/env.sh @@ -0,0 +1,2 @@ +export VISION_LORA_PATH='/home/staff_homes/aabusale/.cache/huggingface/hub/models--microsoft--Phi-4-multimodal-instruct/snapshots/0af439b3adb8c23fda473c4f86001dbf9a226021/vision-lora' +export SPEECH_LORA_PATH='/home/staff_homes/aabusale/.cache/huggingface/hub/models--microsoft--Phi-4-multimodal-instruct/snapshots/0af439b3adb8c23fda473c4f86001dbf9a226021/speech-lora' diff --git a/duui-open-webui/src/main/python/models/__init__.py b/duui-open-webui/src/main/python/models/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/duui-open-webui/src/main/python/models/duui_models.py b/duui-open-webui/src/main/python/models/duui_models.py new file mode 100644 index 00000000..9b6c2a1e --- /dev/null +++ b/duui-open-webui/src/main/python/models/duui_models.py @@ -0,0 +1,172 @@ +from enum import Enum +from pydantic import BaseModel +from pydantic_settings import BaseSettings +from typing import List, Optional + + +class MultiModelModes(str, Enum): + TEXT = "text" + IMAGE = "image" + AUDIO = "audio" + FRAMES = "frames" + VIDEO = "video" + FRAMES_AND_AUDIO = "frames_and_audio" + + + +class Settings(BaseSettings): + # Name of this annotator + annotator_name: str + # Version of this annotator + # TODO add these to the settings + annotator_version: str + # Log level + log_level: str + # # # model_name + # Name of this annotator + model_version: str + #cach_size + model_cache_size: str + + + +# Documentation response +class DUUIMMDocumentation(BaseModel): + # Name of this annotator + annotator_name: str + + # Version of this annotator + version: str + + # Annotator implementation language (Python, Java, ...) 
+ implementation_lang: Optional[str] + + # Optional map of additional meta data + meta: Optional[dict] + + # Docker container id, if any + docker_container_id: Optional[str] + + # Optional map of supported parameters + parameters: Optional[dict] + + +class ImageType(BaseModel): + """ + org.texttechnologylab.annotation.type.Image + """ + src: str + width: int + height: int + begin: int + end: int + +class Entity(BaseModel): + """ + Named bounding box entity + name: entity name + begin: start position + end: end position + bounding_box: list of bounding box coordinates + """ + name: str + begin: int + end: int + bounding_box: List[tuple[float, float, float, float]] + +class LLMMessage(BaseModel): + role: str = None + content: str + class_module: str = None + class_name: str = None + fillable: bool = False + context_name: str = None + ref: int # internal cas annotation id + + +class LLMPrompt(BaseModel): + messages: List[LLMMessage] + args: Optional[str] # json string + ref: Optional[int] # internal cas annotation id + +class LLMResult(BaseModel): + meta: str # json string + prompt_ref: int # internal cas annotation id + message_ref: str # internal cas annotation id + +class VideoTypes(BaseModel): + """ + org.texttechnologylab.annotation.type.Video + """ + src: str + length: int = -1 + fps: int = -1 + begin: int + end: int + + +class AudioType(BaseModel): + """ + org.texttechnologylab.annotation.type.Audio + """ + src: str + begin: int + end: int + +# Request sent by DUUI +# Note, this is transformed by the Lua script +class DUUIMMRequest(BaseModel): + + # list of images + images: Optional[List[ImageType]] + # audio + audios: Optional[List[AudioType]] + + # videos + videos :Optional[List[VideoTypes]] + + # List of prompt + prompts: List[LLMPrompt] + + system_prompt: str = "" + + # doc info + doc_lang: str + + # model name + model_name: str + + # individual or multiple image processing + individual: bool = False + + # mode for complex + mode: MultiModelModes = MultiModelModes.TEXT + + # ollama + ollama_host: str + ollama_port: str + ollama_auth_token: str + + + + + +# Response sent by DUUI +# Note, this is transformed by the Lua script +class DUUIMMResponse(BaseModel): + # list of processed text + processed_text: Optional[List[LLMResult]] + + # model source + model_source: str + # model language + model_lang: str + # model version + model_version: str + # model name + model_name: str + # list of errors + errors: Optional[List[LLMResult]] + + # original prompt + prompts: List[Optional[LLMPrompt]] = [] \ No newline at end of file diff --git a/duui-open-webui/src/main/python/models/ollama_models.py b/duui-open-webui/src/main/python/models/ollama_models.py new file mode 100644 index 00000000..e3e65e92 --- /dev/null +++ b/duui-open-webui/src/main/python/models/ollama_models.py @@ -0,0 +1,22 @@ +from pydantic import BaseModel +from typing import Optional, List + +class OllamaConfig(BaseModel): + host: str = "http://localhost" + port: int = 11434 + auth_token: Optional[str] = None + + +class OllamaRequest(BaseModel): + model: str + prompt: str + system_prompt: Optional[str] = None + images: Optional[List[str]] = None # Base64-encoded + audio: Optional[str] = None # Base64-encoded + video: Optional[str] = None # Base64-encoded + +class OllamaResponse(BaseModel): + response: str + model: str + status: str + error: Optional[str] = None diff --git a/duui-open-webui/src/main/python/services/__init__.py b/duui-open-webui/src/main/python/services/__init__.py new file mode 100644 index 00000000..e69de29b 
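For reference while reviewing `duui_webUIWrapper.py`: a minimal smoke-test sketch for the wrapper's `/v1/process` endpoint, built from the `DUUIMMRequest`/`LLMPrompt`/`LLMMessage` models in `models/duui_models.py`. Host, port, and model name are placeholder assumptions; it presumes the container runs locally on port 9714 with a reachable Ollama/OpenWebUI backend. In normal operation the Lua communication script produces this payload, not the caller.

```python
import requests

# Field names follow models/duui_models.py; host/port/model are placeholder assumptions.
payload = {
    "images": None,
    "audios": None,
    "videos": None,
    "prompts": [{
        "ref": 0,
        "args": "{}",
        "messages": [{"role": "user", "content": "Who is the current president of the USA?", "ref": 0}],
    }],
    "system_prompt": "",
    "doc_lang": "en",
    "model_name": "llama3.2:latest",
    "individual": False,
    "mode": "text",
    "ollama_host": "http://localhost",
    "ollama_port": "11434",
    "ollama_auth_token": "",
}

resp = requests.post("http://localhost:9714/v1/process", json=payload, timeout=120)
resp.raise_for_status()
for result in resp.json().get("processed_text") or []:
    print(result["meta"])
```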
diff --git a/duui-open-webui/src/main/python/services/ollama_client.py b/duui-open-webui/src/main/python/services/ollama_client.py new file mode 100644 index 00000000..645dd3f8 --- /dev/null +++ b/duui-open-webui/src/main/python/services/ollama_client.py @@ -0,0 +1,205 @@ +import base64 +import logging +import requests +from typing import Optional, List +from models.ollama_models import OllamaConfig, OllamaRequest, OllamaResponse +from services.utils import convert_base64_to_image + +import base64 +import tempfile +import os +from io import BytesIO +from PIL import Image + + + + +logger = logging.getLogger(__name__) + +class OllamaClient: + def __init__(self, config: OllamaConfig): + self.base_url = f"{config.host}:{config.port}/api" + self.auth_token = config.auth_token + self.headers = ( + {"Authorization": f"Bearer {self.auth_token}"} + if self.auth_token + else {} + ) + # def generate(self, request: OllamaRequest) -> OllamaResponse: + # try: + # # Initialize the content array with the text prompt + # content = [{"type": "text", "text": request.prompt}] + # + # # 1. Handle Images + # if request.images: + # for base64_image in request.images: + # # Strip potential header if the input already contains "data:image/..." + # img_data = base64_image.split(",")[-1] + # content.append({ + # "type": "image_url", + # "image_url": { + # "url": f"data:image/jpeg;base64,{img_data}" + # } + # }) + # + # # 2. Handle Audio + # if request.audio: + # audio_data = request.audio.split(",")[-1] + # content.append({ + # "type": "input_audio", + # "input_audio": { + # "data": audio_data, + # "format": "wav" + # } + # }) + # + # # 3. Handle Video + # if request.video: + # video_data = request.video.split(",")[-1] + # content.append({ + # "type": "video_url", # Note: Check your specific provider's key for video + # "video_url": { + # "url": f"data:video/mp4;base64,{video_data}" + # } + # }) + # + # # Construct the messages array + # messages = [{"role": "user", "content": content}] + # + # # Add system prompt as the first message if provided + # if request.system_prompt: + # messages.insert(0, {"role": "system", "content": request.system_prompt}) + # + # # Construct the payload + # payload = { + # "model": request.model, + # "messages": messages, + # "stream": False + # } + # + # response = requests.post( + # f"{self.base_url}/chat/completions", + # json=payload, + # headers=self.headers, + # ) + # response.raise_for_status() + # + # # Extract the response text + # response_json = response.json() + # response_text = "" + # if "choices" in response_json and len(response_json["choices"]) > 0: + # response_text = response_json["choices"][0]["message"]["content"] + # + # return OllamaResponse( + # response=response_text, + # model=request.model, + # status="success", + # ) + # + # except Exception as e: + # logger.error(f"Ollama API Error: {e}") + # return OllamaResponse( + # response="", + # model=request.model, + # status="error", + # error=str(e), + # ) + + def generate(self, request: OllamaRequest) -> OllamaResponse: + try: + # 1. Prepare the content list with the text prompt + content = [{"type": "text", "text": request.prompt}] + + # 2. 
Add Images (The primary reason for 400 is often the URI format) + if request.images: + for base64_image in request.images: + # Ensure we have a clean base64 string without any existing headers + clean_base64 = base64_image.split(",")[-1] + content.append({ + "type": "image_url", + "image_url": { + "url": f"data:image/JPG;base64,{clean_base64}" + # "url": f"# data:image/jpeg;base64,{clean_base64}" + } + }) + + # 3. Handle Audio/Video with Caution + # NOTE: Most Ollama OpenAI-compatible endpoints will throw a 400 if they + # see 'input_audio' or 'video_url'. We only add them if present. + if request.audio: + # If your server strictly follows OpenAI's newer spec: + content.append({ + "type": "input_audio", + "input_audio": {"data": request.audio.split(",")[-1], "format": "wav"} + }) + + # Construct the messages array + messages = [{"role": "user", "content": content}] + + # Insert system prompt at the beginning + if request.system_prompt: + messages.insert(0, {"role": "system", "content": request.system_prompt}) + + payload = { + "model": request.model, + "messages": messages, + # "stream": False + } + + # Make the request + response = requests.post( + f"{self.base_url}/chat/completions", # Confirm this is the correct endpoint + json=payload, + headers=self.headers, + ) + + if response.status_code != 200: + logger.error(f"Ollama Error Response: {response.text}") + response.raise_for_status() + + response_json = response.json() + response_text = response_json["choices"][0]["message"]["content"] + + return OllamaResponse( + response=response_text, + model=request.model, + status="success", + ) + + except Exception as e: + logger.error(f"Ollama API Error: {e}") + return OllamaResponse( + response="", + model=request.model, + status="error", + error=str(e), + ) + def ping(self, request: OllamaRequest): + try: + chat_payload = { + "model": request.model, + "messages": [ + {"role": "user", "content": [{"type": "text", "text": "ping"}]} + ] + } + chat_resp = requests.post( + f"{self.base_url}/chat/completions", + json=chat_payload, + headers=self.headers + ) + + chat_resp.raise_for_status() + chat_data = chat_resp.json() + if not ("choices" in chat_data and len(chat_data["choices"]) > 0): + print("Chat completion endpoint responded but returned no choices.") + return False + print("Chat completion endpoint reachable.") + + except Exception as e: + logger.error(f"Ollama API Error: {e}") + return OllamaResponse( + response="", + model=request.model, + status="error", + error=str(e), + ) diff --git a/duui-open-webui/src/main/python/services/utils.py b/duui-open-webui/src/main/python/services/utils.py new file mode 100644 index 00000000..e0217e77 --- /dev/null +++ b/duui-open-webui/src/main/python/services/utils.py @@ -0,0 +1,151 @@ +import base64 +from typing import List, Union +from fastapi import UploadFile +from models.duui_models import ImageType, AudioType, VideoTypes, LLMPrompt +from models.ollama_models import OllamaConfig, OllamaRequest, OllamaResponse + +from PIL import Image + +from io import BytesIO +import base64 +import soundfile as sf +import io +import functools +import traceback +import tempfile +import subprocess +import os +from typing import Tuple, List +import json +import cv2 +from uuid import uuid4 + +async def encode_file_to_base64(file: UploadFile) -> str: + content = await file.read() + return base64.b64encode(content).decode("utf-8") + +def map_duui_to_ollama( + model_name, + system_pormpt: LLMPrompt, + prompt: LLMPrompt, + encoded_images: List[str] = None, + encoded_audios: 
List[str] = None, + encoded_videos: List[str] = None, +) -> OllamaRequest: + system_prompt = system_pormpt if system_pormpt else None + prompt = prompt.messages[-1].content if prompt else "" + + return OllamaRequest( + model=model_name, + prompt=prompt, + system_prompt=system_prompt, + images=encoded_images, + audio=encoded_audios[0] if encoded_audios else None, + video=encoded_videos[0] if encoded_videos else None, + ) + + +def convert_base64_to_image(base64_string): + return Image.open(BytesIO(base64.b64decode(base64_string))) + +def convert_image_to_base64(image): + buffered = BytesIO() + image.save(buffered, format="PNG") + return base64.b64encode(buffered.getvalue()).decode("utf-8") + +def convert_base64_to_audio(base64_audio): + """ + Converts a base64-encoded audio string to a NumPy waveform and sample rate. + """ + audio_bytes = base64.b64decode(base64_audio) + audio_buffer = io.BytesIO(audio_bytes) + waveform, sample_rate = sf.read(audio_buffer) + return waveform, sample_rate + +def convert_audio_to_base64(waveform, sample_rate, format="WAV"): + """ + Converts a NumPy waveform and sample rate to a base64-encoded audio string. + """ + audio_buffer = io.BytesIO() + sf.write(audio_buffer, waveform, sample_rate, format=format) + audio_bytes = audio_buffer.getvalue() + base64_audio = base64.b64encode(audio_bytes).decode("utf-8") + return base64_audio + + +def find_label_positions(text, label): + start = text.find(label) + end = start + len(label) if start != -1 else -1 + return start, end + + + +def save_base64_to_temp_file(base64_str, suffix=""): + data = base64.b64decode(base64_str) + with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp: + tmp.write(data) + return tmp.name + + +def decouple_video(videobase64: str): + # Decode the video base64 and save to temp file + video_bytes = base64.b64decode(videobase64) + + with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as f_video: + f_video.write(video_bytes) + video_path = f_video.name + + audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name + + # Extract audio + try: + subprocess.run([ + "ffmpeg", "-y", "-i", video_path, + "-vn", "-acodec", "pcm_s16le", "-ar", "44100", "-ac", "2", + audio_path + ], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + + with open(audio_path, "rb") as f: + audio_base64 = base64.b64encode(f.read()).decode("utf-8") + except subprocess.CalledProcessError: + print("⚠️ No audio stream found or ffmpeg failed. 
Continuing without audio.") + audio_base64 = None + + # Extract frames + frame_dir = tempfile.mkdtemp() + frame_pattern = os.path.join(frame_dir, "frame_%03d.jpg") + try: + subprocess.run([ + "ffmpeg", "-i", video_path, "-q:v", "2", frame_pattern + ], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + except subprocess.CalledProcessError as e: + raise RuntimeError(f"ffmpeg frame extraction failed: {e}") + + # Collect every 5th frame + first and last + frame_files = sorted(os.listdir(frame_dir)) + selected_indices = set([0, len(frame_files) - 1] + list(range(5, len(frame_files), 5))) + selected_frames = [ + os.path.join(frame_dir, f) for i, f in enumerate(frame_files) if i in selected_indices + ] + + frames_b64 = [] + for frame_file in selected_frames: + with open(frame_file, "rb") as f: + frames_b64.append(base64.b64encode(f.read()).decode("utf-8")) + + return audio_base64, frames_b64 + + +def convert_base64_to_video(b64): + return BytesIO(base64.b64decode(b64)) + + + +def video_has_audio(video_path: str) -> bool: + """Returns True if the video file has an audio stream.""" + cmd = [ + "ffprobe", "-i", video_path, + "-show_streams", "-select_streams", "a", "-loglevel", "error" + ] + result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + return bool(result.stdout.strip()) \ No newline at end of file diff --git a/duui-open-webui/src/main/python/setup_env.py b/duui-open-webui/src/main/python/setup_env.py new file mode 100644 index 00000000..a1ddc685 --- /dev/null +++ b/duui-open-webui/src/main/python/setup_env.py @@ -0,0 +1,22 @@ +import os +from huggingface_hub import snapshot_download +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument("--model_path", default="microsoft/Phi-4-multimodal-instruct") +args = parser.parse_args() + +model_path = snapshot_download(repo_id=args.model_path, revision='0af439b3adb8c23fda473c4f86001dbf9a226021') +vision_lora_path = os.path.join(model_path, "vision-lora") +speech_lora_path = os.path.join(model_path, "speech-lora") + +# Write them to a shell export file +with open("env.sh", "w") as f: + f.write(f"export VISION_LORA_PATH='{vision_lora_path}'\n") + f.write(f"export SPEECH_LORA_PATH='{speech_lora_path}'\n") + + +# # Load model directly +# from transformers import AutoModelForImageTextToText +# +# model = AutoModelForImageTextToText.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", code_revision="cc594898137f460bfe9f0759e9844b3ce807cfb5") \ No newline at end of file diff --git a/duui-open-webui/src/main/python/start.sh b/duui-open-webui/src/main/python/start.sh new file mode 100644 index 00000000..c0eb2caf --- /dev/null +++ b/duui-open-webui/src/main/python/start.sh @@ -0,0 +1,67 @@ +#!/bin/bash +set -e + +source ./env.sh + +export VLLM_SERVER_DEV_MODE=1 + +# Start Qwen2.5 VL-7B-Instruct using vLLM on a different port +echo "Launching Qwen2.5-VL-7B-Instruct..." +python -m vllm.entrypoints.openai.api_server \ + --model 'Qwen/Qwen2.5-VL-7B-Instruct' \ + --revision "cc594898137f460bfe9f0759e9844b3ce807cfb5" \ + --tensor-parallel-size 1 \ + --pipeline-parallel-size 1 \ + --distributed-executor-backend mp \ + --dtype auto \ + --trust-remote-code --chat-template-content-format openai \ + --enable-sleep-mode \ + --port 6659 & +QWEN_PID=$! + +# Wait for Qwen to be ready +until python -c "import requests; requests.get('http://localhost:6659/v1/models')" > /dev/null 2>&1; do + echo "Waiting for Qwen on port 6659..." 
+ sleep 10 +done + +# Sleep Qwen initially using Python requests +python -c "import requests; requests.post('http://localhost:6659/sleep')" || true + +# --max-model-len 131072 \ + +# Start Microsoft Phi-4 using vLLM +echo "Launching vLLM server for Phi-4..." +python -m vllm.entrypoints.openai.api_server \ + --model 'microsoft/Phi-4-multimodal-instruct' \ + --revision '0af439b3adb8c23fda473c4f86001dbf9a226021' \ + --tensor-parallel-size 1 \ + --pipeline-parallel-size 1 \ + --distributed-executor-backend mp \ + --dtype auto \ + --enable-sleep-mode\ + --trust-remote-code \ + --max-model-len 131072 \ + --enable-lora \ + --max-lora-rank 320 \ + --lora-extra-vocab-size 256 \ + --limit-mm-per-prompt audio=10,image=10 \ + --max-loras 2 \ + --lora-modules speech=$SPEECH_LORA_PATH vision=$VISION_LORA_PATH \ + --port 6658 & +VLLM_PID=$! + +# Wait for vLLM to be ready +until python -c "import requests; requests.get('http://localhost:6658/v1/models')" > /dev/null 2>&1; do + echo "Waiting for vLLM on port 6658..." + sleep 10 +done + + +# Start DUUI FastAPI app +echo "Launching FastAPI server..." +uvicorn duui-mm:app --host 0.0.0.0 --port 9714 --workers 1 + +# Wait for background processes +wait $VLLM_PID +wait $QWEN_PID diff --git a/duui-open-webui/src/main/resources/TypeSystemMM.xml b/duui-open-webui/src/main/resources/TypeSystemMM.xml new file mode 100644 index 00000000..4a887b30 --- /dev/null +++ b/duui-open-webui/src/main/resources/TypeSystemMM.xml @@ -0,0 +1,268 @@ + + + + + org.texttechnologylab.annotation.type.Image + + uima.tcas.Annotation + + + src + + uima.cas.String + + + width + + uima.cas.Integer + + + height + + uima.cas.Integer + + + mimetype + + uima.cas.String + + + + + + org.texttechnologylab.annotation.type.Video + + + + uima.tcas.Annotation + + + + + + src + + path to the video or base64 value + + uima.cas.String + + + + + + length + + Length of the video in seconds + + uima.cas.Double + + + + + + fps + + Video fps + + uima.cas.Double + + + + + + + + + org.texttechnologylab.annotation.type.SubImage + + uima.tcas.Annotation + + + coordinates + + uima.cas.FSArray + org.texttechnologylab.annotation.type.Coordinate + + + parent + + org.texttechnologylab.annotation.type.Image + + + + + org.texttechnologylab.annotation.type.Coordinate + + uima.cas.AnnotationBase + + + x + + uima.cas.Integer + + + y + + uima.cas.Integer + + + + + + org.texttechnologylab.annotation.ModelAnnotation + + uima.tcas.Annotation + + + ModelReference + Reference to the Model + org.texttechnologylab.annotation.MetaData + + + + + + org.texttechnologylab.annotation.MetaData + + + + uima.tcas.Annotation + + + + + + Lang + + Language of the method or the Model + + uima.cas.String + + + + + + Source + + Link of the used resource + + uima.cas.String + + + + + + + + org.texttechnologylab.annotation.AnnotationComment + + uima.cas.AnnotationBase + + + reference + + uima.cas.TOP + + + value + + uima.cas.String + + + key + + uima.cas.String + + + + + org.texttechnologylab.type.llm.prompt.Prompt + Prompt for a LLM containing a list of "messages". + uima.tcas.Annotation + + + messages + The list of messages for this prompt. + uima.cas.FSArray + org.texttechnologylab.type.llm.prompt.Message + + + args + Prompt arguments + uima.cas.String + + + version + + uima.cas.String + + + reference + Reference to another annotation + uima.cas.TOP + + + + + org.texttechnologylab.type.llm.prompt.Message + List of messages. + uima.tcas.Annotation + + + role + Role, e.g. "system", "user", ... + uima.cas.String + + + content + Message content. 
+ uima.cas.String + + + classModule + Name of the module of the class to use for constructing the message + uima.cas.String + + + className + Name of the class to use for constructing the message + uima.cas.String + + + + + org.texttechnologylab.type.llm.prompt.Result + LLM result, this allows the explicit selection of content generated by the LLM + uima.tcas.Annotation + + + meta + Additional metadata + uima.cas.String + + + prompt + Reference to the corresponding prompt + org.texttechnologylab.type.llm.prompt.Prompt + + + message + Reference to the corresponding message inside a prompt + org.texttechnologylab.type.llm.prompt.Message + + + + + org.texttechnologylab.type.llm.prompt.FillableMessage + This message is supposed to be filled by an LLM by using the previous messages as input/history. + org.texttechnologylab.type.llm.prompt.Message + + + contextName + If a contextName is given, the "content" of this message will be added to the model invocation to be used in a template placeholder + uima.cas.String + + + + + diff --git a/duui-open-webui/src/test/java/OllamaDUUITests.java b/duui-open-webui/src/test/java/OllamaDUUITests.java new file mode 100644 index 00000000..09d765d8 --- /dev/null +++ b/duui-open-webui/src/test/java/OllamaDUUITests.java @@ -0,0 +1,273 @@ +import org.apache.commons.compress.compressors.CompressorException; +import org.apache.uima.UIMAException; +import org.apache.uima.fit.factory.JCasFactory; +import org.apache.uima.fit.util.JCasUtil; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.FSArray; +import org.junit.jupiter.api.*; +import org.texttechnologylab.DockerUnifiedUIMAInterface.DUUIComposer; +import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIRemoteDriver; +import org.texttechnologylab.DockerUnifiedUIMAInterface.lua.DUUILuaContext; +import org.texttechnologylab.annotation.type.Image; +import org.texttechnologylab.annotation.type.Video; +import org.texttechnologylab.annotation.type.Audio; +import org.texttechnologylab.type.llm.prompt.Prompt; +import org.texttechnologylab.type.llm.prompt.Message; +import org.texttechnologylab.type.llm.prompt.Result; +import org.texttechnologylab.DockerUnifiedUIMAInterface.driver.DUUIUIMADriver; +import org.dkpro.core.io.xmi.XmiWriter; +import org.xml.sax.SAXException; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.net.URISyntaxException; +import java.net.UnknownHostException; +import java.util.*; +import java.util.Base64; + +import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; +import static org.junit.jupiter.api.Assertions.*; + +public class OllamaDUUITests { + + static DUUIComposer composer; + static JCas cas; + static JCas inputView; + + // Update this URL to your Ollama/OpenWebUI wrapper endpoint + static String url = "http://127.0.0.1:9977"; + static String sOutputPath = "src/test/results"; + + @BeforeAll + static void beforeAll() throws URISyntaxException, IOException, UIMAException, SAXException, CompressorException { + composer = new DUUIComposer() + .withSkipVerification(true) + .withLuaContext(new DUUILuaContext().withJsonLibrary()); + + DUUIUIMADriver uima_driver = new DUUIUIMADriver(); + DUUIRemoteDriver remoteDriver = new DUUIRemoteDriver(); + composer.addDriver(remoteDriver, uima_driver); + + cas = JCasFactory.createJCas(); + inputView = cas.createView("UserRequestView"); + } + + @AfterAll + static void afterAll() throws UnknownHostException { + composer.shutdown(); + } + + @AfterEach + public 
void afterEach() throws IOException, SAXException { + composer.resetPipeline(); + cas.reset(); + } + + // Helper method to create CAS with prompts + public void createCas(String language, List prompts) throws UIMAException { + cas.setDocumentLanguage(language); + StringBuilder sb = new StringBuilder(); + + for (String messageText : prompts) { + Prompt prompt = new Prompt(cas); + prompt.setArgs("{}"); + + Message message = new Message(cas); + message.setRole("user"); + message.setContent(messageText); + message.addToIndexes(); + + FSArray messages = new FSArray(cas, 1); + messages.set(0, message); + prompt.setMessages(messages); + prompt.addToIndexes(); + + sb.append(messageText).append(" "); + } + + inputView.setDocumentText(sb.toString().trim()); +// cas.setDocumentText(sb.toString().trim()); + } + + // Helper method to create CAS with images + private void createCasWithImages(String language, List prompts, List imagePaths) throws UIMAException { + createCas(language, prompts); + + for (String path : imagePaths) { + Image img = new Image(cas); + img.setSrc(convertFileToBase64(path)); + img.addToIndexes(); + } + } + + // Helper method to create CAS with audio + private void createCasWithAudio(String language, List prompts, List audioPaths) throws UIMAException { + createCas(language, prompts); + for (String path : audioPaths) { + Audio audio = new Audio(cas); + audio.setSrc(readFileAsBase64(path)); + audio.setMimetype("audio/wav"); + audio.addToIndexes(); + } + } + + // Helper method to create CAS with video + public void createCasWithVideo(String language, List prompts, String videoBase64) throws UIMAException { + cas.setDocumentLanguage(language); + + for (String messageText : prompts) { + Prompt prompt = new Prompt(cas); + prompt.setArgs("{}"); + + Message message = new Message(cas); + message.setRole("user"); + message.setContent(messageText); + message.addToIndexes(); + + FSArray messages = new FSArray(cas, 1); + messages.set(0, message); + prompt.setMessages(messages); + prompt.addToIndexes(); + } + + Video videoWrapper = new Video(cas); + videoWrapper.setMimetype("video/mp4"); + videoWrapper.setSrc(videoBase64); + videoWrapper.addToIndexes(); + } + + // Helper method to convert file to Base64 + private static String convertFileToBase64(String filePath) { + try { + File file = new File(filePath); + FileInputStream fis = new FileInputStream(file); + byte[] bytes = fis.readAllBytes(); + fis.close(); + return Base64.getEncoder().encodeToString(bytes); + } catch (IOException e) { + e.printStackTrace(); + return null; + } + } + + // Helper method to read file as Base64 + private String readFileAsBase64(String filePath) { + try { + byte[] fileBytes = java.nio.file.Files.readAllBytes(new File(filePath).toPath()); + return Base64.getEncoder().encodeToString(fileBytes); + } catch (IOException e) { + e.printStackTrace(); + return null; + } + } + + // Helper method to save Base64 to image + private static void saveBase64ToImage(String base64String, String outputPath) { + try { + byte[] decodedBytes = Base64.getDecoder().decode(base64String); + java.nio.file.Files.write(new File(outputPath).toPath(), decodedBytes); + System.out.println("Image saved as: " + outputPath); + } catch (IOException e) { + e.printStackTrace(); + } + } + + // Helper method to save Base64 to video + private static void saveBase64ToVideo(String base64String, String outputPath) { + try { + byte[] decodedBytes = Base64.getDecoder().decode(base64String); + java.nio.file.Files.write(new File(outputPath).toPath(), 
decodedBytes); + System.out.println("Video saved as: " + outputPath); + } catch (IOException e) { + e.printStackTrace(); + } + } + + // Helper method to verify no images + private void verifyNoImages() { + Collection allImages = JCasUtil.select(cas, Image.class); + assertTrue(allImages.isEmpty(), "No images should be generated for text-only mode."); + } + + // Test for text-only mode + @Test + public void testTextOnly() throws Exception { + composer.add( + new DUUIRemoteDriver.Component(url) + .withParameter("model_name", "llama3.2:latest") + .withParameter("mode", "text") + .withParameter("language", "en") + .withParameter("ollama_host", "https://llm.texttechnologylab.org") +// .withParameter("ollama_port", "8080") + .withParameter("ollama_auth_token", "") + .withParameter("system_prompt", "") + .build().withTimeout(1000) + ); + + composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class, + XmiWriter.PARAM_TARGET_LOCATION, sOutputPath, + XmiWriter.PARAM_PRETTY_PRINT, true, + XmiWriter.PARAM_OVERWRITE, true, + XmiWriter.PARAM_VERSION, "1.1" + )).build()); + + List prompts = Arrays.asList( + "Who is the current president of the USA?", + "Is Frankfurt the capital of EU finance?" + ); + + createCas("en", prompts); + composer.run(cas); + + verifyNoImages(); + + // Print results + for (Result result : JCasUtil.select(cas, Result.class)) { + System.out.println(result.getMeta()); + } + } + + // Test for image-only mode + @Test + public void testImageOnly() throws Exception { + composer.add( + new DUUIRemoteDriver.Component(url) + .withParameter("model_name", "anduin.qwen2.5vl:3b") + .withParameter("mode", "image") + .withParameter("language", "en") + .withParameter("ollama_host", "localhost") // https:/llm.example +// .withParameter("ollama_port", "8080") + .withParameter("ollama_auth_token", "") + .withParameter("system_prompt", "") + .build().withTimeout(1000) + ); + + composer.add(new DUUIUIMADriver.Component(createEngineDescription(XmiWriter.class, + XmiWriter.PARAM_TARGET_LOCATION, sOutputPath, + XmiWriter.PARAM_PRETTY_PRINT, true, + XmiWriter.PARAM_OVERWRITE, true, + XmiWriter.PARAM_VERSION, "1.1" + )).build()); + + List prompts = Collections.singletonList("What is shown in this image?"); + List imagePaths = Arrays.asList( +// "src/test/resources/images/fridge.jpg", + "src/test/resources/images/cars.jpg" + ); + + createCasWithImages("en", prompts, imagePaths); + composer.run(cas); + + int idx = 0; + for (Image img : JCasUtil.select(cas, Image.class)) { + saveBase64ToImage(img.getSrc(), "src/test/results/images/output_image_" + idx++ + ".png"); + } + + // Print results + for (Result result : JCasUtil.select(cas, Result.class)) { + System.out.println(result.getMeta()); + } + } + + } diff --git a/duui-open-webui/src/test/results/UIMA-Document.xmi b/duui-open-webui/src/test/results/UIMA-Document.xmi new file mode 100644 index 00000000..3d6e6237 --- /dev/null +++ b/duui-open-webui/src/test/results/UIMA-Document.xmi @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + + diff --git a/duui-open-webui/src/test/results/audio/UIMA-Document.xmi b/duui-open-webui/src/test/results/audio/UIMA-Document.xmi new file mode 100644 index 00000000..d773ac08 --- /dev/null +++ b/duui-open-webui/src/test/results/audio/UIMA-Document.xmi @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/duui-open-webui/src/test/results/audio/new/UIMA-Document.xmi b/duui-open-webui/src/test/results/audio/new/UIMA-Document.xmi new file mode 100644 index 00000000..34dc8e0e --- /dev/null +++ 
b/duui-open-webui/src/test/results/audio/new/UIMA-Document.xmi @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/duui-open-webui/src/test/results/frames/output_frame_0.png b/duui-open-webui/src/test/results/frames/output_frame_0.png new file mode 100644 index 00000000..81c2e4ae Binary files /dev/null and b/duui-open-webui/src/test/results/frames/output_frame_0.png differ diff --git a/duui-open-webui/src/test/results/frames/output_frame_1.png b/duui-open-webui/src/test/results/frames/output_frame_1.png new file mode 100644 index 00000000..e3f7ea2b Binary files /dev/null and b/duui-open-webui/src/test/results/frames/output_frame_1.png differ diff --git a/duui-open-webui/src/test/results/frames/output_frame_2.png b/duui-open-webui/src/test/results/frames/output_frame_2.png new file mode 100644 index 00000000..b34d70ba Binary files /dev/null and b/duui-open-webui/src/test/results/frames/output_frame_2.png differ diff --git a/duui-open-webui/src/test/results/frames/output_frame_3.png b/duui-open-webui/src/test/results/frames/output_frame_3.png new file mode 100644 index 00000000..852d10f4 Binary files /dev/null and b/duui-open-webui/src/test/results/frames/output_frame_3.png differ diff --git a/duui-open-webui/src/test/results/images/output_image_0.png b/duui-open-webui/src/test/results/images/output_image_0.png new file mode 100644 index 00000000..1840de1a Binary files /dev/null and b/duui-open-webui/src/test/results/images/output_image_0.png differ diff --git a/duui-open-webui/src/test/results/images/output_image_1.png b/duui-open-webui/src/test/results/images/output_image_1.png new file mode 100644 index 00000000..1840de1a Binary files /dev/null and b/duui-open-webui/src/test/results/images/output_image_1.png differ
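The Java tests above encode media by base64-encoding the raw file bytes (`convertFileToBase64`). A small sketch of the equivalent preparation on the Python side, shaped after the `ImageType` model in `models/duui_models.py`; the file path is a placeholder, and only `src` is consumed by `process_ollama` at the moment (width/height/begin/end are carried through the typesystem but not used there).

```python
import base64
from PIL import Image  # Pillow is listed in requirements.txt

def image_to_duui_payload(path: str, begin: int = 0, end: int = 0) -> dict:
    """Build an ImageType-shaped dict (see models/duui_models.py) from an image file."""
    with open(path, "rb") as f:
        src = base64.b64encode(f.read()).decode("utf-8")  # raw file bytes, like the Java test helper
    with Image.open(path) as img:
        width, height = img.size
    return {"src": src, "width": width, "height": height, "begin": begin, "end": end}

# Example (hypothetical path): attach to the "images" list of a /v1/process request
# payload["images"] = [image_to_duui_payload("src/test/resources/images/cars.jpg")]
```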