From a167a22002d697a392df339d147a4b7adaeb5f58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Pek=C3=A1r?= <525077@mail.muni.cz> Date: Sat, 17 Jan 2026 11:47:17 +0100 Subject: [PATCH 01/39] feat: tensorrt support --- Dockerfile | 16 ++++++++------ models/semantic_segmentation.py | 39 +++++++++++++++++++++++---------- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/Dockerfile b/Dockerfile index ea4e4dc..fd31e28 100644 --- a/Dockerfile +++ b/Dockerfile @@ -31,14 +31,10 @@ RUN cd /tmp && \ ninja -C builddir install -FROM rayproject/ray:2.53.0-py312 +FROM rayproject/ray:2.53.0-py312-gpu COPY --from=builder /usr/local/ /usr/local/ -# Make sure the dynamic linker can find libraries built into /usr/local/lib, LD_LIBRARY_PATH starts with a colon -ENV LD_LIBRARY_PATH="/usr/local/lib${LD_LIBRARY_PATH}" -RUN sudo sh -c 'echo "/usr/local/lib" > /etc/ld.so.conf.d/custom-libs.conf' && sudo ldconfig - # Update & Package installation RUN sudo apt-get update && sudo apt-get -y upgrade && \ sudo apt-get install -y --no-install-recommends \ @@ -46,9 +42,15 @@ RUN sudo apt-get update && sudo apt-get -y upgrade && \ # Vips & Openslide packages zlib1g-dev libzstd-dev libpng-dev libjpeg-turbo8-dev libtiff-dev \ libopenjp2-7-dev libgdk-pixbuf2.0-dev libxml2-dev sqlite3 libsqlite3-dev \ - libcairo2-dev libglib2.0-dev libdcmtk-dev libjxr-dev python3-dev + libcairo2-dev libglib2.0-dev libdcmtk-dev libjxr-dev python3-dev # Cleanup RUN sudo apt-get remove -y --purge systemd systemd-sysv && sudo apt-get autoremove --purge -y && sudo apt-get clean && sudo rm -rf /var/lib/apt/lists/* -RUN pip install --no-cache-dir onnxruntime lz4 ratiopath "mlflow<3.0" +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib:/home/ray/anaconda3/lib/python3.12/site-packages/tensorrt_libs:/home/ray/anaconda3/lib/python3.12/site-packages/nvidia/cudnn/lib" +RUN sudo sh -c 'echo "/usr/local/lib" > /etc/ld.so.conf.d/custom-libs.conf' && \ + sudo sh -c 'echo 
"/home/ray/anaconda3/lib/python3.12/site-packages/tensorrt_libs" > /etc/ld.so.conf.d/trt-libs.conf' && \ + sudo sh -c 'echo "/home/ray/anaconda3/lib/python3.12/site-packages/nvidia/cudnn/lib" > /etc/ld.so.conf.d/nvidia-libs.conf' && \ + sudo ldconfig + +RUN pip install --no-cache-dir onnxruntime-gpu tensorrt lz4 ratiopath "mlflow<3.0" \ No newline at end of file diff --git a/models/semantic_segmentation.py b/models/semantic_segmentation.py index ab2847a..59fa7fa 100644 --- a/models/semantic_segmentation.py +++ b/models/semantic_segmentation.py @@ -12,7 +12,6 @@ class Config(TypedDict): model: dict[str, Any] max_batch_size: int batch_wait_timeout_s: float - intra_op_num_threads: int fastapi = FastAPI() @@ -36,16 +35,35 @@ async def reconfigure(self, config: Config) -> None: self.tile_size = config["tile_size"] self.mpp = config["mpp"] - sess_options = ort.SessionOptions() - sess_options.intra_op_num_threads = config["intra_op_num_threads"] - sess_options.inter_op_num_threads = 1 - module_path, attr_name = config["model"].pop("_target_").split(":") provider = getattr(importlib.import_module(module_path), attr_name) + + min_shape = f"input:1x3x{self.tile_size}x{self.tile_size}" + opt_shape = ( + f"input:{config['max_batch_size']}x3x{self.tile_size}x{self.tile_size}" + ) + max_shape = ( + f"input:{config['max_batch_size']}x3x{self.tile_size}x{self.tile_size}" + ) + providers = [ + ( + "TensorrtExecutionProvider", + { + "device_id": 0, + "trt_fp16_enable": True, + "trt_engine_cache_enable": True, + "trt_engine_cache_path": "./trt_cache", + "trt_profile_min_shapes": min_shape, + "trt_profile_max_shapes": max_shape, + "trt_profile_opt_shapes": opt_shape, + }, + ), + "CUDAExecutionProvider", + "CPUExecutionProvider", + ] + self.session = ort.InferenceSession( - provider(**config["model"]), - providers=["CPUExecutionProvider", "CUDAExecutionProvider"], - session_options=sess_options, + provider(**config["model"]), providers=providers ) self.input_name = 
self.session.get_inputs()[0].name self.output_name = self.session.get_outputs()[0].name @@ -54,10 +72,7 @@ async def reconfigure(self, config: Config) -> None: self.predict.set_batch_wait_timeout_s(config["batch_wait_timeout_s"]) # type: ignore[attr-defined] def get_config(self) -> dict[str, Any]: - return { - "tile_size": self.tile_size, - "mpp": self.mpp, - } + return {"tile_size": self.tile_size, "mpp": self.mpp} @serve.batch async def predict( From 1d3310f67130d14187eec04ea661f8a30d172fa5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Pek=C3=A1r?= <525077@mail.muni.cz> Date: Sat, 17 Jan 2026 12:04:39 +0100 Subject: [PATCH 02/39] fix: remove flush --- builders/heatmap_builder.py | 1 + misc/tile_heatmap_builder.py | 1 + ray-service.yaml | 93 +++++++++++++++++++++++++++++------- 3 files changed, 79 insertions(+), 16 deletions(-) diff --git a/builders/heatmap_builder.py b/builders/heatmap_builder.py index 4fbb833..5b40636 100644 --- a/builders/heatmap_builder.py +++ b/builders/heatmap_builder.py @@ -100,6 +100,7 @@ async def process_tile(x: int, y: int) -> None: await asyncio.wait(tasks) + mask_builder.flush() mask_builder.save( output_path, tile_height=output_bigtiff_tile_height, diff --git a/misc/tile_heatmap_builder.py b/misc/tile_heatmap_builder.py index 0b8e7b6..843536c 100644 --- a/misc/tile_heatmap_builder.py +++ b/misc/tile_heatmap_builder.py @@ -38,6 +38,7 @@ def update(self, tile: np.ndarray, x: int, y: int) -> None: self.image[y : y + mm_y, x : x + mm_x] += tile[:mm_y, :mm_x] self.count[y : y + mm_y, x : x + mm_x] += 1 + def flush(self) -> None: self.image.flush() self.count.flush() diff --git a/ray-service.yaml b/ray-service.yaml index 35265f6..2a6a0a2 100644 --- a/ray-service.yaml +++ b/ray-service.yaml @@ -39,27 +39,27 @@ spec: import_path: models.semantic_segmentation:app route_prefix: /episeg-1 runtime_env: - working_dir: https://gitlab.ics.muni.cz/rationai/infrastructure/model-service/-/archive/master/model-service-master.zip + working_dir: 
https://gitlab.ics.muni.cz/rationai/infrastructure/model-service/-/archive/feature/gpu/model-service-feature-gpu.zip deployments: - name: SemanticSegmentation max_ongoing_requests: 16 - max_queued_requests: 64 + max_queued_requests: 32 autoscaling_config: min_replicas: 0 - max_replicas: 4 - target_ongoing_requests: 4 + max_replicas: 2 + target_ongoing_requests: 8 ray_actor_options: - num_cpus: 12 + num_cpus: 4 memory: 12884901888 # 12 GiB + num_gpus: 1 runtime_env: env_vars: MLFLOW_TRACKING_URI: http://mlflow.rationai-mlflow:5000 user_config: tile_size: 1024 mpp: 0.468 - max_batch_size: 2 - batch_wait_timeout_s: 0.5 - intra_op_num_threads: 11 + max_batch_size: 8 + batch_wait_timeout_s: 0.1 model: _target_: providers.model_provider:mlflow artifact_uri: mlflow-artifacts:/10/39f821ed5b964c71a603cc6db196f9fd/artifacts/checkpoints/epoch=19-step=32020/model.onnx/model.onnx @@ -76,14 +76,14 @@ spec: max_queued_requests: 64 autoscaling_config: min_replicas: 0 - max_replicas: 2 + max_replicas: 4 target_ongoing_requests: 2 ray_actor_options: - num_cpus: 4 + num_cpus: 8 memory: 12884901888 # 12 GiB user_config: - num_threads: 4 - max_concurrent_tasks: 8 + num_threads: 8 + max_concurrent_tasks: 24 rayClusterConfig: rayVersion: 2.53.0 @@ -151,11 +151,72 @@ spec: imagePullPolicy: Always resources: limits: - cpu: 32 - memory: 32Gi + cpu: 16 + memory: 24Gi + requests: + cpu: 16 + memory: 24Gi + env: + - name: HTTPS_PROXY + value: http://proxy.ics.muni.cz:3128 + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + runAsUser: 1000 + lifecycle: + preStop: + exec: + command: ["/bin/sh", "-c", "ray stop"] + volumeMounts: + - name: data + mountPath: /mnt/data + - name: public-data + mountPath: /mnt/data/Public + - name: projects + mountPath: /mnt/projects + - name: bioptic-tree + mountPath: /mnt/bioptic_tree + + volumes: + - name: data + persistentVolumeClaim: + claimName: data-ro + - name: public-data + persistentVolumeClaim: + claimName: 
rationai-data-ro-pvc-jobs + - name: projects + persistentVolumeClaim: + claimName: projects-rw + - name: bioptic-tree + persistentVolumeClaim: + claimName: bioptictree-ro + + - groupName: gpu-workers + replicas: 0 + minReplicas: 0 + maxReplicas: 2 + template: + spec: + securityContext: + fsGroupChangePolicy: OnRootMismatch + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A40 + containers: + - name: ray-worker + image: cerit.io/rationai/model-service:2.53.0-gpu + imagePullPolicy: Always + resources: + limits: + cpu: 4 + memory: 12Gi + nvidia.com/gpu: 1 requests: - cpu: 32 - memory: 32Gi + cpu: 4 + memory: 12Gi env: - name: HTTPS_PROXY value: http://proxy.ics.muni.cz:3128 From 4e27a4801db9985c8284923dcd882c0fc956e649 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Sun, 8 Feb 2026 19:27:35 +0100 Subject: [PATCH 03/39] feat: add docker files for cpu/gpu --- docker/Dockerfile.cpu | 54 +++++++++++++++++++++++++++++ Dockerfile => docker/Dockerfile.gpu | 2 +- 2 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 docker/Dockerfile.cpu rename Dockerfile => docker/Dockerfile.gpu (99%) diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu new file mode 100644 index 0000000..ea4e4dc --- /dev/null +++ b/docker/Dockerfile.cpu @@ -0,0 +1,54 @@ +# We need to build libvips and openslide from source to get the required features. +# The base image needs to extend rayproject/ray image which is based on ubuntu:22.04. +# Our images are based on ubuntu:24.04, therefore, we can't reuse them. +FROM ubuntu:22.04 AS builder + +ARG VIPS_VERSION=8.17.2 + +# Install all build-time dependencies in a single layer. 
+RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + build-essential ca-certificates git wget meson ninja-build \ + zlib1g-dev libzstd-dev libpng-dev libjpeg-turbo8-dev libtiff-dev \ + libopenjp2-7-dev libgdk-pixbuf2.0-dev libxml2-dev sqlite3 libsqlite3-dev \ + libcairo2-dev libglib2.0-dev libdcmtk-dev libjxr-dev && \ + rm -rf /var/lib/apt/lists/* + +# Build OpenSlide from the specified GitHub fork. +RUN git clone https://github.com/iewchen/openslide.git /tmp/openslide-lib && \ + cd /tmp/openslide-lib && \ + meson setup builddir --prefix=/usr/local && \ + meson compile -C builddir && \ + meson install -C builddir + +# Download and build libvips from source. +RUN cd /tmp && \ + wget https://github.com/libvips/libvips/releases/download/v${VIPS_VERSION}/vips-${VIPS_VERSION}.tar.xz && \ + tar xf vips-${VIPS_VERSION}.tar.xz && \ + cd vips-${VIPS_VERSION} && \ + meson setup builddir --prefix=/usr/local && \ + ninja -C builddir && \ + ninja -C builddir install + + +FROM rayproject/ray:2.53.0-py312 + +COPY --from=builder /usr/local/ /usr/local/ + +# Make sure the dynamic linker can find libraries built into /usr/local/lib, LD_LIBRARY_PATH starts with a colon +ENV LD_LIBRARY_PATH="/usr/local/lib${LD_LIBRARY_PATH}" +RUN sudo sh -c 'echo "/usr/local/lib" > /etc/ld.so.conf.d/custom-libs.conf' && sudo ldconfig + +# Update & Package installation +RUN sudo apt-get update && sudo apt-get -y upgrade && \ + sudo apt-get install -y --no-install-recommends \ + gcc \ + # Vips & Openslide packages + zlib1g-dev libzstd-dev libpng-dev libjpeg-turbo8-dev libtiff-dev \ + libopenjp2-7-dev libgdk-pixbuf2.0-dev libxml2-dev sqlite3 libsqlite3-dev \ + libcairo2-dev libglib2.0-dev libdcmtk-dev libjxr-dev python3-dev + +# Cleanup +RUN sudo apt-get remove -y --purge systemd systemd-sysv && sudo apt-get autoremove --purge -y && sudo apt-get clean && sudo rm -rf /var/lib/apt/lists/* + +RUN pip install --no-cache-dir onnxruntime lz4 ratiopath "mlflow<3.0" diff --git a/Dockerfile 
b/docker/Dockerfile.gpu similarity index 99% rename from Dockerfile rename to docker/Dockerfile.gpu index fd31e28..f5bf633 100644 --- a/Dockerfile +++ b/docker/Dockerfile.gpu @@ -53,4 +53,4 @@ RUN sudo sh -c 'echo "/usr/local/lib" > /etc/ld.so.conf.d/custom-libs.conf' && \ sudo sh -c 'echo "/home/ray/anaconda3/lib/python3.12/site-packages/nvidia/cudnn/lib" > /etc/ld.so.conf.d/nvidia-libs.conf' && \ sudo ldconfig -RUN pip install --no-cache-dir onnxruntime-gpu tensorrt lz4 ratiopath "mlflow<3.0" \ No newline at end of file +RUN pip install --no-cache-dir onnxruntime-gpu tensorrt lz4 ratiopath "mlflow<3.0" From fd3154d2628a6eb380c2d33789a41e652a08419f Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Sun, 8 Feb 2026 19:27:48 +0100 Subject: [PATCH 04/39] feat: add PVC for TensorRT --- pvc/tensorrt-cache-pvc.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 pvc/tensorrt-cache-pvc.yaml diff --git a/pvc/tensorrt-cache-pvc.yaml b/pvc/tensorrt-cache-pvc.yaml new file mode 100644 index 0000000..01aef8f --- /dev/null +++ b/pvc/tensorrt-cache-pvc.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: tensorrt-cache-pvc +spec: + accessModes: + - ReadWriteMany + storageClassName: nfs-csi + resources: + requests: + storage: 20Gi From eaac807a60846de706068030901b176ceca50076 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Sun, 8 Feb 2026 19:28:06 +0100 Subject: [PATCH 05/39] feat: add support of TensorRT for models --- models/binary_classifier.py | 96 ++++++++++++++++++++++++++------- models/semantic_segmentation.py | 69 +++++++++++++++++------- 2 files changed, 126 insertions(+), 39 deletions(-) diff --git a/models/binary_classifier.py b/models/binary_classifier.py index 087a73e..efbe6a7 100644 --- a/models/binary_classifier.py +++ b/models/binary_classifier.py @@ -1,3 +1,5 @@ +import asyncio +import os from typing import Any, TypedDict import numpy as np @@ -7,9 +9,9 @@ class 
Config(TypedDict): + """Configuration for BinaryClassifier deployment.""" + tile_size: int - mean: list[float] - std: list[float] model: dict[str, Any] max_batch_size: int batch_wait_timeout_s: float @@ -19,63 +21,119 @@ class Config(TypedDict): fastapi = FastAPI() -@serve.deployment(num_replicas="auto") +@serve.deployment( + num_replicas="auto", + ray_actor_options={"num_gpus": 1}, +) @serve.ingress(fastapi) class BinaryClassifier: + """Binary classifier for tissue tiles using ONNX Runtime with GPU support.""" + tile_size: int def __init__(self) -> None: import lz4.frame - self.decompress = lz4.frame.decompress + self.lz4 = lz4.frame - async def reconfigure(self, config: Config) -> None: + def reconfigure(self, config: Config) -> None: + """Load the ONNX model and configure inference settings.""" import importlib import onnxruntime as ort self.tile_size = config["tile_size"] - self.mean = np.array(config["mean"], dtype=np.float32).reshape(1, 3, 1, 1) - self.inv_std = 1 / np.array(config["std"], dtype=np.float32).reshape(1, 3, 1, 1) + cache_path = "/mnt/cache/trt_cache" + os.makedirs(cache_path, exist_ok=True) + min_shape = f"input:1x3x{self.tile_size}x{self.tile_size}" + opt_shape = ( + f"input:{config['max_batch_size']}x3x{self.tile_size}x{self.tile_size}" + ) + max_shape = ( + f"input:{config['max_batch_size']}x3x{self.tile_size}x{self.tile_size}" + ) + + trt_options = { + "device_id": 0, + "trt_fp16_enable": True, + "trt_engine_cache_enable": True, + "trt_engine_cache_path": cache_path, + "trt_max_workspace_size": 4 * 1024 * 1024 * 1024, # 4GB + "trt_builder_optimization_level": 5, + "trt_timing_cache_enable": True, + "trt_profile_min_shapes": min_shape, + "trt_profile_max_shapes": max_shape, + "trt_profile_opt_shapes": opt_shape, + } + + # Configure ONNX Runtime session sess_options = ort.SessionOptions() sess_options.intra_op_num_threads = config["intra_op_num_threads"] sess_options.inter_op_num_threads = 1 + # Enable graph optimizations + 
sess_options.graph_optimization_level = ( + ort.GraphOptimizationLevel.ORT_ENABLE_ALL + ) + sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL + + # Load model from provider (e.g., MLflow) module_path, attr_name = config["model"].pop("_target_").split(":") provider = getattr(importlib.import_module(module_path), attr_name) self.session = ort.InferenceSession( provider(**config["model"]), - providers=["CPUExecutionProvider"], + providers=[ + ( + "TensorrtExecutionProvider", + trt_options, + ), + "CUDAExecutionProvider", + "CPUExecutionProvider", + ], session_options=sess_options, ) + self.input_name = self.session.get_inputs()[0].name self.output_name = self.session.get_outputs()[0].name + # Configure batching self.predict.set_max_batch_size(config["max_batch_size"]) # type: ignore[attr-defined] self.predict.set_batch_wait_timeout_s(config["batch_wait_timeout_s"]) # type: ignore[attr-defined] + dummy_shape = (config["max_batch_size"], 3, self.tile_size, self.tile_size) + dummy_input = np.random.randint(0, 256, dummy_shape, dtype=np.uint8) + + self.session.run([self.output_name], {self.input_name: dummy_input}) + @serve.batch async def predict(self, images: list[NDArray[np.uint8]]) -> list[float]: - batch = np.stack(images, axis=0).astype(np.float32) - - # Normalization - batch -= self.mean - batch *= self.inv_std + """Run inference on a batch of images.""" + batch = np.ascontiguousarray(np.stack(images, axis=0).astype(np.uint8)) - outputs = self.session.run([self.output_name], {self.input_name: batch}) + outputs = self.session.run( + [self.output_name], + {self.input_name: batch}, + ) - return outputs[0].squeeze(1).tolist() + return outputs[0].flatten().tolist() @fastapi.post("/") async def root(self, request: Request) -> float: - data = self.decompress(await request.body()) - image = np.frombuffer(data, dtype=np.uint8).reshape( - self.tile_size, self.tile_size, 3 + """Handle inference request with LZ4-compressed image.""" + data = await 
asyncio.to_thread(self.lz4.decompress, await request.body()) + + image = ( + np.frombuffer(data, dtype=np.uint8) + .reshape(self.tile_size, self.tile_size, 3) + .transpose(2, 0, 1) ) - return await self.predict(image.transpose(2, 0, 1)) + image = np.ascontiguousarray(image) + + result = await self.predict(image) + return result app = BinaryClassifier.bind() # type: ignore[attr-defined] diff --git a/models/semantic_segmentation.py b/models/semantic_segmentation.py index 59fa7fa..650e739 100644 --- a/models/semantic_segmentation.py +++ b/models/semantic_segmentation.py @@ -1,3 +1,4 @@ +import os from typing import Any, TypedDict import numpy as np @@ -12,6 +13,7 @@ class Config(TypedDict): model: dict[str, Any] max_batch_size: int batch_wait_timeout_s: float + intra_op_num_threads: int fastapi = FastAPI() @@ -27,7 +29,7 @@ def __init__(self) -> None: self.lz4 = lz4.frame - async def reconfigure(self, config: Config) -> None: + def reconfigure(self, config: Config) -> None: import importlib import onnxruntime as ort @@ -35,8 +37,8 @@ async def reconfigure(self, config: Config) -> None: self.tile_size = config["tile_size"] self.mpp = config["mpp"] - module_path, attr_name = config["model"].pop("_target_").split(":") - provider = getattr(importlib.import_module(module_path), attr_name) + cache_path = "/mnt/cache/trt_cache" + os.makedirs(cache_path, exist_ok=True) min_shape = f"input:1x3x{self.tile_size}x{self.tile_size}" opt_shape = ( @@ -45,32 +47,59 @@ async def reconfigure(self, config: Config) -> None: max_shape = ( f"input:{config['max_batch_size']}x3x{self.tile_size}x{self.tile_size}" ) - providers = [ - ( - "TensorrtExecutionProvider", - { - "device_id": 0, - "trt_fp16_enable": True, - "trt_engine_cache_enable": True, - "trt_engine_cache_path": "./trt_cache", - "trt_profile_min_shapes": min_shape, - "trt_profile_max_shapes": max_shape, - "trt_profile_opt_shapes": opt_shape, - }, - ), - "CUDAExecutionProvider", - "CPUExecutionProvider", - ] + + trt_options = { + 
"device_id": 0, + "trt_fp16_enable": True, + "trt_engine_cache_enable": True, + "trt_engine_cache_path": cache_path, + "trt_max_workspace_size": 4 * 1024 * 1024 * 1024, # 4GB + "trt_builder_optimization_level": 5, + "trt_timing_cache_enable": True, + "trt_profile_min_shapes": min_shape, + "trt_profile_max_shapes": max_shape, + "trt_profile_opt_shapes": opt_shape, + } + + # Configure ONNX Runtime session + sess_options = ort.SessionOptions() + sess_options.intra_op_num_threads = config["intra_op_num_threads"] + sess_options.inter_op_num_threads = 1 + + # Enable graph optimizations + sess_options.graph_optimization_level = ( + ort.GraphOptimizationLevel.ORT_ENABLE_ALL + ) + sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL + + module_path, attr_name = config["model"].pop("_target_").split(":") + provider = getattr(importlib.import_module(module_path), attr_name) self.session = ort.InferenceSession( - provider(**config["model"]), providers=providers + provider(**config["model"]), + providers=[ + ( + "TensorrtExecutionProvider", + trt_options, + ), + "CUDAExecutionProvider", + "CPUExecutionProvider", + ], + session_options=sess_options, ) + self.input_name = self.session.get_inputs()[0].name self.output_name = self.session.get_outputs()[0].name self.predict.set_max_batch_size(config["max_batch_size"]) # type: ignore[attr-defined] self.predict.set_batch_wait_timeout_s(config["batch_wait_timeout_s"]) # type: ignore[attr-defined] + # Warmup + dummy_shape = (config["max_batch_size"], 3, self.tile_size, self.tile_size) + dummy_input = np.random.randint(0, 256, dummy_shape, dtype=np.uint8) + + self.session.run([self.output_name], {self.input_name: dummy_input}) + def get_config(self) -> dict[str, Any]: return {"tile_size": self.tile_size, "mpp": self.mpp} From 46fe8b1b244f339adc204d8d6b5256ac244bb3ab Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Sun, 8 Feb 2026 19:28:57 +0100 Subject: [PATCH 06/39] feat: add TensorRT cache to workers --- 
ray-service.yaml | 48 +++++++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/ray-service.yaml b/ray-service.yaml index 2a6a0a2..4361544 100644 --- a/ray-service.yaml +++ b/ray-service.yaml @@ -19,27 +19,26 @@ spec: max_replicas: 4 target_ongoing_requests: 32 ray_actor_options: - num_cpus: 6 - memory: 6442450944 # 6 GiB + num_cpus: 4 + num_gpus: 1 + memory: 4294967296 # 4 GiB runtime_env: env_vars: MLFLOW_TRACKING_URI: http://mlflow.rationai-mlflow:5000 user_config: tile_size: 512 - max_batch_size: 32 - batch_wait_timeout_s: 0.5 - mean: [228.5544, 178.8584, 219.8793] - std: [27.8285, 51.4639, 26.4458] - intra_op_num_threads: 5 + max_batch_size: 16 + batch_wait_timeout_s: 0.01 + intra_op_num_threads: 4 model: _target_: providers.model_provider:mlflow - artifact_uri: mlflow-artifacts:/65/aebc892f526047249b972f200bef4381/artifacts/checkpoints/epoch=0-step=6972/model.onnx + artifact_uri: mlflow-artifacts:/65/aebc892f526047249b972f200bef4381/artifacts/checkpoints/epoch=0-step=6972/prostate_model_norm.onnx - name: episeg-1 import_path: models.semantic_segmentation:app route_prefix: /episeg-1 runtime_env: - working_dir: https://gitlab.ics.muni.cz/rationai/infrastructure/model-service/-/archive/feature/gpu/model-service-feature-gpu.zip + working_dir: https://gitlab.ics.muni.cz/rationai/infrastructure/model-service/-/archive/master/model-service-master.zip deployments: - name: SemanticSegmentation max_ongoing_requests: 16 @@ -118,6 +117,9 @@ spec: requests: cpu: 0 memory: 4Gi + env: + - name: HTTPS_PROXY + value: http://proxy.ics.muni.cz:3128 ports: - containerPort: 6379 name: gcs-server @@ -151,11 +153,11 @@ spec: imagePullPolicy: Always resources: limits: - cpu: 16 - memory: 24Gi + cpu: 8 + memory: 16Gi requests: - cpu: 16 - memory: 24Gi + cpu: 8 + memory: 16Gi env: - name: HTTPS_PROXY value: http://proxy.ics.muni.cz:3128 @@ -177,6 +179,8 @@ spec: mountPath: /mnt/projects - name: bioptic-tree mountPath: 
/mnt/bioptic_tree + - name: trt-cache-volume + mountPath: /mnt/cache volumes: - name: data @@ -191,6 +195,9 @@ spec: - name: bioptic-tree persistentVolumeClaim: claimName: bioptictree-ro + - name: trt-cache-volume + persistentVolumeClaim: + claimName: tensorrt-cache-pvc - groupName: gpu-workers replicas: 0 @@ -199,8 +206,10 @@ spec: template: spec: securityContext: + fsGroup: 1000 fsGroupChangePolicy: OnRootMismatch runAsNonRoot: true + runAsUser: 1000 seccompProfile: type: RuntimeDefault nodeSelector: @@ -211,12 +220,12 @@ spec: imagePullPolicy: Always resources: limits: - cpu: 4 - memory: 12Gi + cpu: 8 + memory: 24Gi nvidia.com/gpu: 1 requests: - cpu: 4 - memory: 12Gi + cpu: 8 + memory: 24Gi env: - name: HTTPS_PROXY value: http://proxy.ics.muni.cz:3128 @@ -238,6 +247,8 @@ spec: mountPath: /mnt/projects - name: bioptic-tree mountPath: /mnt/bioptic_tree + - name: trt-cache-volume + mountPath: /mnt/cache volumes: - name: data @@ -252,3 +263,6 @@ spec: - name: bioptic-tree persistentVolumeClaim: claimName: bioptictree-ro + - name: trt-cache-volume + persistentVolumeClaim: + claimName: tensorrt-cache-pvc From f07723e982d684e0e4f2df144d8df45f9566a822 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Sun, 8 Feb 2026 19:29:07 +0100 Subject: [PATCH 07/39] add Jiri as coauthor --- pyproject.toml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index acbe907..fbb3a9f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,10 @@ [project] name = "model-service" version = "0.1.0" -authors = [{ name = "Matěj Pekár", email = "matejpekar@mail.muni.cz" }] +authors = [ + { name = "Matěj Pekár", email = "matejpekar@mail.muni.cz" }, + { name = "Jiří Štípek", email = "567776@mail.muni.cz" }, +] readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.12" From 9d6e26503f22fa610dbcf17727ca99bec7c07d26 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Sun, 8 Feb 2026 20:17:29 +0100 
Subject: [PATCH 08/39] fix: remove gpu number from serve.deployment in code --- models/binary_classifier.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/models/binary_classifier.py b/models/binary_classifier.py index efbe6a7..3b607ba 100644 --- a/models/binary_classifier.py +++ b/models/binary_classifier.py @@ -21,10 +21,7 @@ class Config(TypedDict): fastapi = FastAPI() -@serve.deployment( - num_replicas="auto", - ray_actor_options={"num_gpus": 1}, -) +@serve.deployment(num_replicas="auto") @serve.ingress(fastapi) class BinaryClassifier: """Binary classifier for tissue tiles using ONNX Runtime with GPU support.""" From e7612f97cba08d96c7a61cdaa109eef53095830e Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Mon, 9 Feb 2026 19:46:10 +0100 Subject: [PATCH 09/39] fix: warning suppress --- models/binary_classifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/binary_classifier.py b/models/binary_classifier.py index 3b607ba..2b807ab 100644 --- a/models/binary_classifier.py +++ b/models/binary_classifier.py @@ -114,7 +114,7 @@ async def predict(self, images: list[NDArray[np.uint8]]) -> list[float]: {self.input_name: batch}, ) - return outputs[0].flatten().tolist() + return outputs[0].flatten().tolist() # pyright: ignore[reportAttributeAccessIssue] @fastapi.post("/") async def root(self, request: Request) -> float: From 5945f1024e8da6f3b89b5205fb4fa8f3be29430f Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Tue, 10 Feb 2026 20:00:58 +0100 Subject: [PATCH 10/39] feat: add jobs to download virchow2 --- misc/virchow2_downloader/download_virchow2.py | 48 ++++++++++++++ .../virchow2_downloader_job.yaml | 64 +++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100644 misc/virchow2_downloader/download_virchow2.py create mode 100644 misc/virchow2_downloader/virchow2_downloader_job.yaml diff --git a/misc/virchow2_downloader/download_virchow2.py 
b/misc/virchow2_downloader/download_virchow2.py new file mode 100644 index 0000000..218790b --- /dev/null +++ b/misc/virchow2_downloader/download_virchow2.py @@ -0,0 +1,48 @@ +import os + +from huggingface_hub import login, snapshot_download + + +HF_TOKEN = os.environ.get("HF_TOKEN") +CACHE_DIR = "/mnt/huggingface_cache" +MODEL_ID = "paige-ai/Virchow2" + +os.environ["HF_HOME"] = CACHE_DIR +os.makedirs(CACHE_DIR, exist_ok=True) + +print(f"Starting download for {MODEL_ID} to {CACHE_DIR}") + +if HF_TOKEN: + print("Logging in to Hugging Face...") + login(token=HF_TOKEN) +else: + print("No HF_TOKEN provided! Download might fail for gated models.") + +print("Downloading model snapshot...") +try: + path = snapshot_download( + repo_id=MODEL_ID, + resume_download=True, + local_files_only=False, + ) + print(f"Model downloaded to: {path}") + + print("Verifying model files exist...") + import timm + + try: + model = timm.create_model( + f"hf-hub:{MODEL_ID}", + pretrained=True, + num_classes=0, + ) + print(f"Model successfully loaded! Type: {type(model).__name__}") + del model # Free memory + except Exception as e: + print(f"Verification warning: {e}") + +except Exception as e: + print(f"Download failed: {e}") + exit(1) + +print("DONE. 
Model is cached and ready for offline use.") diff --git a/misc/virchow2_downloader/virchow2_downloader_job.yaml b/misc/virchow2_downloader/virchow2_downloader_job.yaml new file mode 100644 index 0000000..293401a --- /dev/null +++ b/misc/virchow2_downloader/virchow2_downloader_job.yaml @@ -0,0 +1,64 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: virchow-downloader + namespace: rationai-jobs-ns +spec: + template: + spec: + securityContext: + runAsNonRoot: true + runAsUser: 1000 + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault + containers: + - name: downloader + image: python:3.10 + resources: + requests: + memory: "4Gi" + cpu: "1" + limits: + memory: "4Gi" + cpu: "2" + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + command: ["/bin/bash", "-c"] + args: + - | + pip install --user --no-cache-dir huggingface_hub transformers torch timm + python3 /mnt/scripts/download_virchow2.py + env: + - name: HOME + value: /tmp + - name: HF_TOKEN + value: "REDACTED-ROTATE-THIS-TOKEN" # NOTE(review): a real HF token was committed here — revoke/rotate it and inject via a Kubernetes Secret (valueFrom.secretKeyRef); never commit credentials + - name: HTTPS_PROXY + value: "http://proxy.ics.muni.cz:3128" + - name: HTTP_PROXY + value: "http://proxy.ics.muni.cz:3128" + - name: TORCH_HOME + value: /tmp/torch + - name: TORCHINDUCTOR_CACHE_DIR + value: /tmp/torch/inductor_cache + volumeMounts: + - name: huggingface-cache + mountPath: /mnt/huggingface_cache + - name: scripts + mountPath: /mnt/scripts + - name: temp + mountPath: /tmp + restartPolicy: Never + volumes: + - name: huggingface-cache + persistentVolumeClaim: + claimName: huggingface-cache-pvc + - name: scripts + configMap: + name: downloader-script + defaultMode: 0777 + - name: temp + emptyDir: {} From 8ef4cc52e75e9f76ce41314db69af9ee4d4c65ff Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Tue, 10 Feb 2026 20:01:12 +0100 Subject: [PATCH 11/39] feat: add model provider for hf --- providers/model_provider.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/providers/model_provider.py 
b/providers/model_provider.py index faf6e5c..68daafe 100644 --- a/providers/model_provider.py +++ b/providers/model_provider.py @@ -2,3 +2,27 @@ def mlflow(artifact_uri: str) -> str: import mlflow.artifacts return mlflow.artifacts.download_artifacts(artifact_uri=artifact_uri) + + +def huggingface(repo_id: str, filename: str | None = None) -> str: + import os + + from huggingface_hub import hf_hub_download, snapshot_download + + cache_dir = os.environ.get("HF_HOME", "/mnt/huggingface_cache") + os.makedirs(cache_dir, exist_ok=True) + os.environ["HF_HOME"] = cache_dir + + if filename: + return hf_hub_download( + repo_id=repo_id, + filename=filename, + cache_dir=cache_dir, + local_files_only=True, + ) + else: + return snapshot_download( + repo_id=repo_id, + cache_dir=cache_dir, + local_files_only=True, + ) From a6c427eb99a7e7fa1015dc3f405c4f7568ca0650 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Tue, 10 Feb 2026 20:01:29 +0100 Subject: [PATCH 12/39] feat: add pvc for huggingface --- pvc/huggingface-pvc.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 pvc/huggingface-pvc.yaml diff --git a/pvc/huggingface-pvc.yaml b/pvc/huggingface-pvc.yaml new file mode 100644 index 0000000..8cf3047 --- /dev/null +++ b/pvc/huggingface-pvc.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: huggingface-cache-pvc + namespace: rationai-jobs-ns +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 15Gi + storageClassName: nfs-csi From 27c78018bfe4fb92c451b8db06df7c4646cfa730 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Tue, 10 Feb 2026 20:56:02 +0100 Subject: [PATCH 13/39] feat: add virchow2 model --- models/virchow2.py | 135 +++++++++++++++++++++++++++++++++++++++++++++ ray-service.yaml | 47 ++++++++++++++++ 2 files changed, 182 insertions(+) create mode 100644 models/virchow2.py diff --git a/models/virchow2.py b/models/virchow2.py new file mode 100644 index 
0000000..fb8ecc0 --- /dev/null +++ b/models/virchow2.py @@ -0,0 +1,135 @@ +import asyncio +from typing import Any, TypedDict + +import numpy as np +import timm +import torch +from fastapi import FastAPI, Request +from numpy.typing import NDArray +from ray import serve +from timm.data.config import resolve_data_config +from timm.data.transforms_factory import create_transform +from timm.layers.mlp import SwiGLUPacked + + +class Config(TypedDict): + tile_size: int + model: dict[str, Any] + max_batch_size: int + batch_wait_timeout_s: float + + +fastapi = FastAPI() + + +@serve.deployment( + num_replicas="auto", + ray_actor_options={"num_gpus": 1}, +) +@serve.ingress(fastapi) +class Virchow2: + """Virchow2 foundation model for pathology.""" + + def __init__(self) -> None: + import os + + import lz4.frame + + # Enforce offline mode for timm/huggingface_hub + os.environ["HF_HUB_OFFLINE"] = "1" + + self.lz4 = lz4.frame + self.model: torch.nn.Module | None = None + self.transforms: Any = None + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + def reconfigure(self, config: Config) -> None: + import importlib + import logging + + logger = logging.getLogger("ray.serve") + self.tile_size = config["tile_size"] + + # Load model using the provider + module_path, attr_name = config["model"].pop("_target_").split(":") + provider = getattr(importlib.import_module(module_path), attr_name) + repo_id = config["model"]["repo_id"] + + logger.info(f"Loading Virchow2 model from {repo_id}...") + provider(**config["model"]) + + self.model = timm.create_model( + f"hf-hub:{repo_id}", + pretrained=True, + num_classes=0, + mlp_layer=SwiGLUPacked, + act_layer=torch.nn.SiLU, + ) + self.model = self.model.to(self.device).eval() + logger.info("Virchow2 model loaded and moved to GPU.") + + self.transforms = create_transform( + **resolve_data_config(self.model.pretrained_cfg, model=self.model) + ) + + # Configure batching + 
self.predict.set_max_batch_size(config["max_batch_size"]) # type: ignore[attr-defined] + self.predict.set_batch_wait_timeout_s(config["batch_wait_timeout_s"]) # type: ignore[attr-defined] + + # Warmup + logger.info("Starting warmup...") + with ( + torch.inference_mode(), + torch.autocast(device_type="cuda", dtype=torch.float16), + ): + # Create a dummy input tensor + dummy_input = torch.randn( + 1, 3, self.tile_size, self.tile_size, device=self.device + ) + self.model(dummy_input) + logger.info("Warmup complete.") + + @serve.batch + async def predict(self, images: list[NDArray[np.uint8]]) -> list[list[float]]: + from PIL import Image + + if self.model is None or self.transforms is None: + raise RuntimeError("Model or transforms not initialized") + + # Convert numpy arrays to PIL Images and apply transforms + pil_images = [Image.fromarray(img) for img in images] + tensors = torch.stack([self.transforms(img) for img in pil_images]).to( + self.device + ) + + with ( + torch.inference_mode(), + torch.autocast(device_type="cuda", dtype=torch.float16), + ): + output = self.model(tensors) + + class_token = output[:, 0] # size: batch x 1280 + patch_tokens = output[ + :, 5: + ] # size: batch x 256 x 1280 (skip register tokens 1-4) + embedding = torch.cat( + [class_token, patch_tokens.mean(1)], dim=-1 + ) # size: batch x 2560 + embedding = embedding.to(torch.float16) + + return embedding.cpu().tolist() + + @fastapi.post("/") + async def root(self, request: Request) -> list[float]: + data = await asyncio.to_thread(self.lz4.decompress, await request.body()) + + # Reshape to (height, width, channels) - RGB image + image = np.frombuffer(data, dtype=np.uint8).reshape( + self.tile_size, self.tile_size, 3 + ) + + results = await self.predict(image) + return results[0] + + +app = Virchow2.bind() diff --git a/ray-service.yaml b/ray-service.yaml index 4361544..4eeecd4 100644 --- a/ray-service.yaml +++ b/ray-service.yaml @@ -84,6 +84,43 @@ spec: num_threads: 8 max_concurrent_tasks: 24 + 
- name: virchow2 + import_path: models.virchow2:app + route_prefix: /virchow2 + runtime_env: + config: + setup_timeout_seconds: 1800 + working_dir: https://gitlab.ics.muni.cz/rationai/infrastructure/model-service/-/archive/master/model-service-master.zip + pip: + - https://download.pytorch.org/whl/cu118/torch-2.4.0%2Bcu118-cp312-cp312-linux_x86_64.whl + - https://download.pytorch.org/whl/cu118/torchvision-0.19.0%2Bcu118-cp312-cp312-linux_x86_64.whl + - timm>=1.0.0 + - huggingface-hub>=0.23.0 + env_vars: + HF_TOKEN: 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 + deployments: + - name: Virchow2 + max_ongoing_requests: 32 + max_queued_requests: 64 + autoscaling_config: + min_replicas: 0 + max_replicas: 1 + target_ongoing_requests: 16 + ray_actor_options: + num_cpus: 4 + num_gpus: 1 + memory: 8589934592 # 8 GiB + runtime_env: + env_vars: + HF_HOME: "/mnt/huggingface_cache" + user_config: + tile_size: 224 + max_batch_size: 16 + batch_wait_timeout_s: 0.1 + model: + _target_: providers.model_provider:huggingface + repo_id: paige-ai/Virchow2 + rayClusterConfig: rayVersion: 2.53.0 enableInTreeAutoscaling: true @@ -181,6 +218,8 @@ spec: mountPath: /mnt/bioptic_tree - name: trt-cache-volume mountPath: /mnt/cache + - name: huggingface-cache + mountPath: /mnt/huggingface_cache volumes: - name: data @@ -198,6 +237,9 @@ spec: - name: trt-cache-volume persistentVolumeClaim: claimName: tensorrt-cache-pvc + - name: huggingface-cache + persistentVolumeClaim: + claimName: huggingface-cache-pvc - groupName: gpu-workers replicas: 0 @@ -249,6 +291,8 @@ spec: mountPath: /mnt/bioptic_tree - name: trt-cache-volume mountPath: /mnt/cache + - name: huggingface-cache + mountPath: /mnt/huggingface_cache volumes: - name: data @@ -266,3 +310,6 @@ spec: - name: trt-cache-volume persistentVolumeClaim: claimName: tensorrt-cache-pvc + - name: huggingface-cache + persistentVolumeClaim: + claimName: huggingface-cache-pvc From e5d84cbdd645b6659fe98d014cd50739fad5e649 Mon Sep 17 00:00:00 2001 From: JiriStipek 
<567776@muni.cz> Date: Tue, 10 Feb 2026 21:16:01 +0100 Subject: [PATCH 14/39] fix --- models/virchow2.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/models/virchow2.py b/models/virchow2.py index fb8ecc0..95afe29 100644 --- a/models/virchow2.py +++ b/models/virchow2.py @@ -22,10 +22,7 @@ class Config(TypedDict): fastapi = FastAPI() -@serve.deployment( - num_replicas="auto", - ray_actor_options={"num_gpus": 1}, -) +@serve.deployment(num_replicas="auto") @serve.ingress(fastapi) class Virchow2: """Virchow2 foundation model for pathology.""" @@ -129,7 +126,7 @@ async def root(self, request: Request) -> list[float]: ) results = await self.predict(image) - return results[0] + return results # type: ignore[attr-defined] -app = Virchow2.bind() +app = Virchow2.bind() # type: ignore[attr-defined] From e1fcb6ca9c88fd77ea52aea7c78aa9365ecf1086 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Tue, 10 Feb 2026 21:26:37 +0100 Subject: [PATCH 15/39] fix: fine tune --- models/virchow2.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/models/virchow2.py b/models/virchow2.py index 95afe29..781968b 100644 --- a/models/virchow2.py +++ b/models/virchow2.py @@ -39,6 +39,7 @@ def __init__(self) -> None: self.model: torch.nn.Module | None = None self.transforms: Any = None self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.tile_size: int = 0 def reconfigure(self, config: Config) -> None: import importlib @@ -55,6 +56,7 @@ def reconfigure(self, config: Config) -> None: logger.info(f"Loading Virchow2 model from {repo_id}...") provider(**config["model"]) + # Load model with official architecture self.model = timm.create_model( f"hf-hub:{repo_id}", pretrained=True, @@ -63,27 +65,28 @@ def reconfigure(self, config: Config) -> None: act_layer=torch.nn.SiLU, ) self.model = self.model.to(self.device).eval() - logger.info("Virchow2 model loaded and moved to GPU.") + # Get 
transforms from model config self.transforms = create_transform( **resolve_data_config(self.model.pretrained_cfg, model=self.model) ) + logger.info("Virchow2 model loaded and moved to GPU.") + # Configure batching self.predict.set_max_batch_size(config["max_batch_size"]) # type: ignore[attr-defined] self.predict.set_batch_wait_timeout_s(config["batch_wait_timeout_s"]) # type: ignore[attr-defined] # Warmup logger.info("Starting warmup...") + dummy_batch = torch.randn( + 1, 3, self.tile_size, self.tile_size, device=self.device + ) with ( torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.float16), ): - # Create a dummy input tensor - dummy_input = torch.randn( - 1, 3, self.tile_size, self.tile_size, device=self.device - ) - self.model(dummy_input) + self.model(dummy_batch) logger.info("Warmup complete.") @serve.batch @@ -105,16 +108,18 @@ async def predict(self, images: list[NDArray[np.uint8]]) -> list[list[float]]: ): output = self.model(tensors) - class_token = output[:, 0] # size: batch x 1280 + # Extract embeddings as per official model card + class_token = output[:, 0] # CLS token: batch x 1280 patch_tokens = output[ :, 5: - ] # size: batch x 256 x 1280 (skip register tokens 1-4) + ] # Skip register tokens (1-4): batch x 256 x 1280 + + # Concatenate CLS token with mean of patch tokens embedding = torch.cat( - [class_token, patch_tokens.mean(1)], dim=-1 - ) # size: batch x 2560 - embedding = embedding.to(torch.float16) + [class_token, patch_tokens.mean(dim=1)], dim=-1 + ) # batch x 2560 - return embedding.cpu().tolist() + return embedding.cpu().float().tolist() @fastapi.post("/") async def root(self, request: Request) -> list[float]: From e7ac073d87c0e0173a1f0c23fbd97126a537fc2e Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Sat, 14 Feb 2026 11:30:19 +0100 Subject: [PATCH 16/39] feat: add into dockerfile --- docker/Dockerfile.gpu | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile.gpu 
b/docker/Dockerfile.gpu index f5bf633..f0f29b6 100644 --- a/docker/Dockerfile.gpu +++ b/docker/Dockerfile.gpu @@ -53,4 +53,9 @@ RUN sudo sh -c 'echo "/usr/local/lib" > /etc/ld.so.conf.d/custom-libs.conf' && \ sudo sh -c 'echo "/home/ray/anaconda3/lib/python3.12/site-packages/nvidia/cudnn/lib" > /etc/ld.so.conf.d/nvidia-libs.conf' && \ sudo ldconfig -RUN pip install --no-cache-dir onnxruntime-gpu tensorrt lz4 ratiopath "mlflow<3.0" +RUN pip install --no-cache-dir \ + onnxruntime-gpu tensorrt lz4 ratiopath "mlflow<3.0" \ + https://download.pytorch.org/whl/cu118/torch-2.4.0%2Bcu118-cp312-cp312-linux_x86_64.whl \ + https://download.pytorch.org/whl/cu118/torchvision-0.19.0%2Bcu118-cp312-cp312-linux_x86_64.whl \ + "timm>=1.0.0" \ + "huggingface-hub>=0.23.0" From 51f07a4befaddb62d5b1a50ca4deba77e206d5bc Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Sat, 14 Feb 2026 11:31:28 +0100 Subject: [PATCH 17/39] fix: remove installs from model --- ray-service.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/ray-service.yaml b/ray-service.yaml index 4eeecd4..e795857 100644 --- a/ray-service.yaml +++ b/ray-service.yaml @@ -63,7 +63,6 @@ spec: _target_: providers.model_provider:mlflow artifact_uri: mlflow-artifacts:/10/39f821ed5b964c71a603cc6db196f9fd/artifacts/checkpoints/epoch=19-step=32020/model.onnx/model.onnx - - name: heatmap-builder import_path: builders.heatmap_builder:app route_prefix: /heatmap-builder @@ -91,11 +90,6 @@ spec: config: setup_timeout_seconds: 1800 working_dir: https://gitlab.ics.muni.cz/rationai/infrastructure/model-service/-/archive/master/model-service-master.zip - pip: - - https://download.pytorch.org/whl/cu118/torch-2.4.0%2Bcu118-cp312-cp312-linux_x86_64.whl - - https://download.pytorch.org/whl/cu118/torchvision-0.19.0%2Bcu118-cp312-cp312-linux_x86_64.whl - - timm>=1.0.0 - - huggingface-hub>=0.23.0 env_vars: HF_TOKEN: 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 deployments: From 178f226ce73223911b6bc84f1b04cd622c6f6d88 Mon Sep 17 
00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Sat, 14 Feb 2026 11:31:55 +0100 Subject: [PATCH 18/39] fix: based on official docs --- models/virchow2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/models/virchow2.py b/models/virchow2.py index 781968b..48fd549 100644 --- a/models/virchow2.py +++ b/models/virchow2.py @@ -119,7 +119,8 @@ async def predict(self, images: list[NDArray[np.uint8]]) -> list[list[float]]: [class_token, patch_tokens.mean(dim=1)], dim=-1 ) # batch x 2560 - return embedding.cpu().float().tolist() + # Convert to fp16 for efficiency as recommended in official docs + return embedding.half().cpu().tolist() @fastapi.post("/") async def root(self, request: Request) -> list[float]: From 5cc123f1096ed7c559a5a9fd6ed3358e597bc4ec Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Sat, 14 Feb 2026 11:32:20 +0100 Subject: [PATCH 19/39] fix --- models/virchow2.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/models/virchow2.py b/models/virchow2.py index 48fd549..50717af 100644 --- a/models/virchow2.py +++ b/models/virchow2.py @@ -73,7 +73,6 @@ def reconfigure(self, config: Config) -> None: logger.info("Virchow2 model loaded and moved to GPU.") - # Configure batching self.predict.set_max_batch_size(config["max_batch_size"]) # type: ignore[attr-defined] self.predict.set_batch_wait_timeout_s(config["batch_wait_timeout_s"]) # type: ignore[attr-defined] @@ -96,7 +95,6 @@ async def predict(self, images: list[NDArray[np.uint8]]) -> list[list[float]]: if self.model is None or self.transforms is None: raise RuntimeError("Model or transforms not initialized") - # Convert numpy arrays to PIL Images and apply transforms pil_images = [Image.fromarray(img) for img in images] tensors = torch.stack([self.transforms(img) for img in pil_images]).to( self.device @@ -119,7 +117,6 @@ async def predict(self, images: list[NDArray[np.uint8]]) -> list[list[float]]: [class_token, patch_tokens.mean(dim=1)], dim=-1 ) # batch x 2560 - 
# Convert to fp16 for efficiency as recommended in official docs return embedding.half().cpu().tolist() @fastapi.post("/") From 964114e73e43ac12160b144b6628eece5ea550e0 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Sat, 14 Feb 2026 11:45:18 +0100 Subject: [PATCH 20/39] fix: remove comment --- models/virchow2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/virchow2.py b/models/virchow2.py index 50717af..bcf165e 100644 --- a/models/virchow2.py +++ b/models/virchow2.py @@ -129,7 +129,7 @@ async def root(self, request: Request) -> list[float]: ) results = await self.predict(image) - return results # type: ignore[attr-defined] + return results app = Virchow2.bind() # type: ignore[attr-defined] From 181f79e7ed4cf6c42f20ac38fc9665443cff0098 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Fri, 13 Mar 2026 20:35:16 +0100 Subject: [PATCH 21/39] chore: update docker gpu file --- docker/Dockerfile.cpu | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index ea4e4dc..9538fda 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -51,4 +51,8 @@ RUN sudo apt-get update && sudo apt-get -y upgrade && \ # Cleanup RUN sudo apt-get remove -y --purge systemd systemd-sysv && sudo apt-get autoremove --purge -y && sudo apt-get clean && sudo rm -rf /var/lib/apt/lists/* -RUN pip install --no-cache-dir onnxruntime lz4 ratiopath "mlflow<3.0" +RUN pip install --no-cache-dir \ + onnxruntime lz4 ratiopath "mlflow<3.0" \ + torch==2.4.0 torchvision==0.19.0 \ + "timm>=1.0.0" \ + "huggingface-hub>=0.23.0" From 156a7d47ef59cef03402acf050ec8e6373ac7bb1 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Fri, 13 Mar 2026 20:36:15 +0100 Subject: [PATCH 22/39] feat: optimalize virchow2 deployment --- ray-service.yaml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/ray-service.yaml b/ray-service.yaml 
index e795857..7f7386f 100644 --- a/ray-service.yaml +++ b/ray-service.yaml @@ -94,14 +94,14 @@ spec: HF_TOKEN: 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 deployments: - name: Virchow2 - max_ongoing_requests: 32 - max_queued_requests: 64 + max_ongoing_requests: 160 + max_queued_requests: 256 autoscaling_config: min_replicas: 0 max_replicas: 1 - target_ongoing_requests: 16 + target_ongoing_requests: 128 ray_actor_options: - num_cpus: 4 + num_cpus: 8 num_gpus: 1 memory: 8589934592 # 8 GiB runtime_env: @@ -109,8 +109,8 @@ spec: HF_HOME: "/mnt/huggingface_cache" user_config: tile_size: 224 - max_batch_size: 16 - batch_wait_timeout_s: 0.1 + max_batch_size: 128 + batch_wait_timeout_s: 0.05 model: _target_: providers.model_provider:huggingface repo_id: paige-ai/Virchow2 @@ -139,15 +139,15 @@ spec: type: RuntimeDefault containers: - name: ray-head - image: rayproject/ray:2.53.0-py312 + image: cerit.io/rationai/model-service:2.53.0 imagePullPolicy: Always resources: limits: cpu: 0 - memory: 4Gi + memory: 8Gi requests: cpu: 0 - memory: 4Gi + memory: 8Gi env: - name: HTTPS_PROXY value: http://proxy.ics.muni.cz:3128 @@ -239,6 +239,8 @@ spec: replicas: 0 minReplicas: 0 maxReplicas: 2 + rayStartParams: + num-gpus: "1" template: spec: securityContext: @@ -248,8 +250,6 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A40 containers: - name: ray-worker image: cerit.io/rationai/model-service:2.53.0-gpu @@ -258,7 +258,7 @@ spec: limits: cpu: 8 memory: 24Gi - nvidia.com/gpu: 1 + nvidia.com/mig-2g.20gb: 1 requests: cpu: 8 memory: 24Gi From 57176d64fb5dcf32ca475a1c304a0f8b3b632f45 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Sat, 14 Mar 2026 11:38:19 +0100 Subject: [PATCH 23/39] fix: remove hf token, create new secret --- misc/virchow2_downloader/virchow2_downloader_job.yaml | 5 ++++- ray-service.yaml | 7 +++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git 
a/misc/virchow2_downloader/virchow2_downloader_job.yaml b/misc/virchow2_downloader/virchow2_downloader_job.yaml index 293401a..2fe2647 100644 --- a/misc/virchow2_downloader/virchow2_downloader_job.yaml +++ b/misc/virchow2_downloader/virchow2_downloader_job.yaml @@ -35,7 +35,10 @@ spec: - name: HOME value: /tmp - name: HF_TOKEN - value: "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" + valueFrom: + secretKeyRef: + name: huggingface-secret + key: token - name: HTTPS_PROXY value: "http://proxy.ics.muni.cz:3128" - name: HTTP_PROXY diff --git a/ray-service.yaml b/ray-service.yaml index 7f7386f..303381c 100644 --- a/ray-service.yaml +++ b/ray-service.yaml @@ -90,8 +90,6 @@ spec: config: setup_timeout_seconds: 1800 working_dir: https://gitlab.ics.muni.cz/rationai/infrastructure/model-service/-/archive/master/model-service-master.zip - env_vars: - HF_TOKEN: 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 deployments: - name: Virchow2 max_ongoing_requests: 160 @@ -265,6 +263,11 @@ spec: env: - name: HTTPS_PROXY value: http://proxy.ics.muni.cz:3128 + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: huggingface-secret + key: token securityContext: allowPrivilegeEscalation: false capabilities: From fe51ee24cefed04bcf8d2dc2b2006415092110d1 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Sat, 14 Mar 2026 11:43:27 +0100 Subject: [PATCH 24/39] fix --- misc/virchow2_downloader/virchow2_downloader_job.yaml | 4 ++-- models/binary_classifier.py | 2 -- models/semantic_segmentation.py | 2 -- ray-service.yaml | 1 - 4 files changed, 2 insertions(+), 7 deletions(-) diff --git a/misc/virchow2_downloader/virchow2_downloader_job.yaml b/misc/virchow2_downloader/virchow2_downloader_job.yaml index 2fe2647..e6c77af 100644 --- a/misc/virchow2_downloader/virchow2_downloader_job.yaml +++ b/misc/virchow2_downloader/virchow2_downloader_job.yaml @@ -14,7 +14,7 @@ spec: type: RuntimeDefault containers: - name: downloader - image: python:3.10 + image: python:3.12 resources: requests: memory: 
"4Gi" @@ -62,6 +62,6 @@ spec: - name: scripts configMap: name: downloader-script - defaultMode: 0777 + defaultMode: 0755 - name: temp emptyDir: {} diff --git a/models/binary_classifier.py b/models/binary_classifier.py index 2b807ab..e774713 100644 --- a/models/binary_classifier.py +++ b/models/binary_classifier.py @@ -15,7 +15,6 @@ class Config(TypedDict): model: dict[str, Any] max_batch_size: int batch_wait_timeout_s: float - intra_op_num_threads: int fastapi = FastAPI() @@ -67,7 +66,6 @@ def reconfigure(self, config: Config) -> None: # Configure ONNX Runtime session sess_options = ort.SessionOptions() - sess_options.intra_op_num_threads = config["intra_op_num_threads"] sess_options.inter_op_num_threads = 1 # Enable graph optimizations diff --git a/models/semantic_segmentation.py b/models/semantic_segmentation.py index 650e739..9cfebd0 100644 --- a/models/semantic_segmentation.py +++ b/models/semantic_segmentation.py @@ -13,7 +13,6 @@ class Config(TypedDict): model: dict[str, Any] max_batch_size: int batch_wait_timeout_s: float - intra_op_num_threads: int fastapi = FastAPI() @@ -63,7 +62,6 @@ def reconfigure(self, config: Config) -> None: # Configure ONNX Runtime session sess_options = ort.SessionOptions() - sess_options.intra_op_num_threads = config["intra_op_num_threads"] sess_options.inter_op_num_threads = 1 # Enable graph optimizations diff --git a/ray-service.yaml b/ray-service.yaml index 303381c..ef2561d 100644 --- a/ray-service.yaml +++ b/ray-service.yaml @@ -29,7 +29,6 @@ spec: tile_size: 512 max_batch_size: 16 batch_wait_timeout_s: 0.01 - intra_op_num_threads: 4 model: _target_: providers.model_provider:mlflow artifact_uri: mlflow-artifacts:/65/aebc892f526047249b972f200bef4381/artifacts/checkpoints/epoch=0-step=6972/prostate_model_norm.onnx From 210c7e6af680ce6e4ac3eb9872fc9a6441d9c6d2 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Sat, 14 Mar 2026 11:53:43 +0100 Subject: [PATCH 25/39] fix: remove intra threads --- 
models/semantic_segmentation.py | 1 - ray-service.yaml | 1 - 2 files changed, 2 deletions(-) diff --git a/models/semantic_segmentation.py b/models/semantic_segmentation.py index 7e7845d..21a0674 100644 --- a/models/semantic_segmentation.py +++ b/models/semantic_segmentation.py @@ -13,7 +13,6 @@ class Config(TypedDict): model: dict[str, Any] max_batch_size: int batch_wait_timeout_s: float - intra_op_num_threads: int trt_cache_path: str diff --git a/ray-service.yaml b/ray-service.yaml index 99596d7..dbcd32a 100644 --- a/ray-service.yaml +++ b/ray-service.yaml @@ -32,7 +32,6 @@ spec: tile_size: 512 max_batch_size: 16 batch_wait_timeout_s: 0.01 - intra_op_num_threads: 4 trt_max_workspace_size: 8589934592 # 8 GiB trt_cache_path: /mnt/cache/trt_cache model: From bf7cff13f234f29b2bff5c08248ff57d44b9c566 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Sat, 14 Mar 2026 12:00:27 +0100 Subject: [PATCH 26/39] fix: lint --- models/binary_classifier.py | 2 +- models/semantic_segmentation.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/models/binary_classifier.py b/models/binary_classifier.py index 6edd3c1..4a844a7 100644 --- a/models/binary_classifier.py +++ b/models/binary_classifier.py @@ -68,7 +68,7 @@ def reconfigure(self, config: Config) -> None: "trt_engine_cache_path": cache_path, "trt_max_workspace_size": config.get( "trt_max_workspace_size", 8 * 1024 * 1024 * 1024 - ), # type: ignore[typeddict-item] + ), "trt_builder_optimization_level": 5, "trt_timing_cache_enable": True, "trt_profile_min_shapes": min_shape, diff --git a/models/semantic_segmentation.py b/models/semantic_segmentation.py index 21a0674..cbc916f 100644 --- a/models/semantic_segmentation.py +++ b/models/semantic_segmentation.py @@ -64,7 +64,7 @@ def reconfigure(self, config: Config) -> None: "trt_engine_cache_path": cache_path, "trt_max_workspace_size": config.get( "trt_max_workspace_size", 8 * 1024 * 1024 * 1024 - ), # type: ignore[typeddict-item] + ), 
"trt_builder_optimization_level": 5, "trt_timing_cache_enable": True, "trt_profile_min_shapes": min_shape, From 6813264567a08f2ca2496a111ba6a68c297d095a Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Sat, 14 Mar 2026 12:06:35 +0100 Subject: [PATCH 27/39] fix: remove duplicity --- ray-service.yaml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/ray-service.yaml b/ray-service.yaml index dbcd32a..31b37de 100644 --- a/ray-service.yaml +++ b/ray-service.yaml @@ -19,9 +19,6 @@ spec: max_replicas: 4 target_ongoing_requests: 32 ray_actor_options: - num_cpus: 4 - num_gpus: 1 - memory: 4294967296 # 4 GiB num_cpus: 4 num_gpus: 1 memory: 4294967296 # 4 GiB @@ -37,7 +34,6 @@ spec: model: _target_: providers.model_provider:mlflow artifact_uri: mlflow-artifacts:/65/aebc892f526047249b972f200bef4381/artifacts/checkpoints/epoch=0-step=6972/prostate_model_norm.onnx - artifact_uri: mlflow-artifacts:/65/aebc892f526047249b972f200bef4381/artifacts/checkpoints/epoch=0-step=6972/prostate_model_norm.onnx - name: episeg-1 import_path: models.semantic_segmentation:app @@ -48,19 +44,14 @@ spec: - name: SemanticSegmentation max_ongoing_requests: 16 max_queued_requests: 32 - max_queued_requests: 32 autoscaling_config: min_replicas: 0 max_replicas: 2 target_ongoing_requests: 8 - max_replicas: 2 - target_ongoing_requests: 8 ray_actor_options: - num_cpus: 4 num_cpus: 4 memory: 12884901888 # 12 GiB num_gpus: 1 - num_gpus: 1 runtime_env: env_vars: MLFLOW_TRACKING_URI: http://mlflow.rationai-mlflow:5000 @@ -87,10 +78,8 @@ spec: autoscaling_config: min_replicas: 0 max_replicas: 4 - max_replicas: 4 target_ongoing_requests: 2 ray_actor_options: - num_cpus: 8 num_cpus: 8 memory: 12884901888 # 12 GiB user_config: From 7510c9f3359946be51a24c222ef28f666143e027 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Sat, 14 Mar 2026 12:14:10 +0100 Subject: [PATCH 28/39] fixes --- docker/Dockerfile.gpu | 4 ++-- 
.../virchow2_downloader/virchow2_downloader_job.yaml | 2 -- models/binary_classifier.py | 5 ----- models/semantic_segmentation.py | 6 ------ models/virchow2.py | 12 ------------ 5 files changed, 2 insertions(+), 27 deletions(-) diff --git a/docker/Dockerfile.gpu b/docker/Dockerfile.gpu index f0f29b6..9766c77 100644 --- a/docker/Dockerfile.gpu +++ b/docker/Dockerfile.gpu @@ -55,7 +55,7 @@ RUN sudo sh -c 'echo "/usr/local/lib" > /etc/ld.so.conf.d/custom-libs.conf' && \ RUN pip install --no-cache-dir \ onnxruntime-gpu tensorrt lz4 ratiopath "mlflow<3.0" \ - https://download.pytorch.org/whl/cu118/torch-2.4.0%2Bcu118-cp312-cp312-linux_x86_64.whl \ - https://download.pytorch.org/whl/cu118/torchvision-0.19.0%2Bcu118-cp312-cp312-linux_x86_64.whl \ + https://download.pytorch.org/whl/cu121/torch-2.4.0%2Bcu121-cp312-cp312-linux_x86_64.whl \ + https://download.pytorch.org/whl/cu121/torchvision-0.19.0%2Bcu121-cp312-cp312-linux_x86_64.whl \ "timm>=1.0.0" \ "huggingface-hub>=0.23.0" diff --git a/misc/virchow2_downloader/virchow2_downloader_job.yaml b/misc/virchow2_downloader/virchow2_downloader_job.yaml index e6c77af..570e3d0 100644 --- a/misc/virchow2_downloader/virchow2_downloader_job.yaml +++ b/misc/virchow2_downloader/virchow2_downloader_job.yaml @@ -41,8 +41,6 @@ spec: key: token - name: HTTPS_PROXY value: "http://proxy.ics.muni.cz:3128" - - name: HTTP_PROXY - value: "http://proxy.ics.muni.cz:3128" - name: TORCH_HOME value: /tmp/torch - name: TORCHINDUCTOR_CACHE_DIR diff --git a/models/binary_classifier.py b/models/binary_classifier.py index 4a844a7..a343e33 100644 --- a/models/binary_classifier.py +++ b/models/binary_classifier.py @@ -111,11 +111,6 @@ def reconfigure(self, config: Config) -> None: self.predict.set_max_batch_size(config["max_batch_size"]) # type: ignore[attr-defined] self.predict.set_batch_wait_timeout_s(config["batch_wait_timeout_s"]) # type: ignore[attr-defined] - dummy_shape = (config["max_batch_size"], 3, self.tile_size, self.tile_size) - dummy_input = 
np.random.randint(0, 256, dummy_shape, dtype=np.uint8) - - self.session.run([self.output_name], {self.input_name: dummy_input}) - @serve.batch async def predict(self, images: list[NDArray[np.uint8]]) -> list[float]: """Run inference on a batch of images.""" diff --git a/models/semantic_segmentation.py b/models/semantic_segmentation.py index cbc916f..951821d 100644 --- a/models/semantic_segmentation.py +++ b/models/semantic_segmentation.py @@ -106,12 +106,6 @@ def reconfigure(self, config: Config) -> None: self.predict.set_max_batch_size(config["max_batch_size"]) # type: ignore[attr-defined] self.predict.set_batch_wait_timeout_s(config["batch_wait_timeout_s"]) # type: ignore[attr-defined] - # Warmup - dummy_shape = (config["max_batch_size"], 3, self.tile_size, self.tile_size) - dummy_input = np.random.randint(0, 256, dummy_shape, dtype=np.uint8) - - self.session.run([self.output_name], {self.input_name: dummy_input}) - def get_config(self) -> dict[str, Any]: return {"tile_size": self.tile_size, "mpp": self.mpp} diff --git a/models/virchow2.py b/models/virchow2.py index bcf165e..e3d2abf 100644 --- a/models/virchow2.py +++ b/models/virchow2.py @@ -76,18 +76,6 @@ def reconfigure(self, config: Config) -> None: self.predict.set_max_batch_size(config["max_batch_size"]) # type: ignore[attr-defined] self.predict.set_batch_wait_timeout_s(config["batch_wait_timeout_s"]) # type: ignore[attr-defined] - # Warmup - logger.info("Starting warmup...") - dummy_batch = torch.randn( - 1, 3, self.tile_size, self.tile_size, device=self.device - ) - with ( - torch.inference_mode(), - torch.autocast(device_type="cuda", dtype=torch.float16), - ): - self.model(dummy_batch) - logger.info("Warmup complete.") - @serve.batch async def predict(self, images: list[NDArray[np.uint8]]) -> list[list[float]]: from PIL import Image From 2eae5032f765d3d0cae641c0993fe551c62b45d5 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Sat, 14 Mar 2026 13:38:36 +0100 Subject: [PATCH 29/39] 
docker files --- docker/Dockerfile.cpu | 5 +---- docker/Dockerfile.gpu | 6 ++---- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 9538fda..c517093 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -52,7 +52,4 @@ RUN sudo apt-get update && sudo apt-get -y upgrade && \ RUN sudo apt-get remove -y --purge systemd systemd-sysv && sudo apt-get autoremove --purge -y && sudo apt-get clean && sudo rm -rf /var/lib/apt/lists/* RUN pip install --no-cache-dir \ - onnxruntime lz4 ratiopath "mlflow<3.0" \ - torch==2.4.0 torchvision==0.19.0 \ - "timm>=1.0.0" \ - "huggingface-hub>=0.23.0" + onnxruntime lz4 ratiopath "mlflow<3.0" diff --git a/docker/Dockerfile.gpu b/docker/Dockerfile.gpu index 9766c77..310096f 100644 --- a/docker/Dockerfile.gpu +++ b/docker/Dockerfile.gpu @@ -53,9 +53,7 @@ RUN sudo sh -c 'echo "/usr/local/lib" > /etc/ld.so.conf.d/custom-libs.conf' && \ sudo sh -c 'echo "/home/ray/anaconda3/lib/python3.12/site-packages/nvidia/cudnn/lib" > /etc/ld.so.conf.d/nvidia-libs.conf' && \ sudo ldconfig -RUN pip install --no-cache-dir \ - onnxruntime-gpu tensorrt lz4 ratiopath "mlflow<3.0" \ - https://download.pytorch.org/whl/cu121/torch-2.4.0%2Bcu121-cp312-cp312-linux_x86_64.whl \ - https://download.pytorch.org/whl/cu121/torchvision-0.19.0%2Bcu121-cp312-cp312-linux_x86_64.whl \ +RUN pip install --no-cache-dir onnxruntime-gpu tensorrt lz4 ratiopath "mlflow<3.0" \ + torch==2.4.0 torchvision==0.19.0 \ "timm>=1.0.0" \ "huggingface-hub>=0.23.0" From c5095bd767f49bb85dfcc9c7d758448205755e50 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Sat, 14 Mar 2026 14:26:02 +0100 Subject: [PATCH 30/39] fix: docker --- docker/Dockerfile.gpu | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile.gpu b/docker/Dockerfile.gpu index 310096f..a50a183 100644 --- a/docker/Dockerfile.gpu +++ b/docker/Dockerfile.gpu @@ -53,7 +53,14 @@ RUN sudo sh -c 'echo 
"/usr/local/lib" > /etc/ld.so.conf.d/custom-libs.conf' && \ sudo sh -c 'echo "/home/ray/anaconda3/lib/python3.12/site-packages/nvidia/cudnn/lib" > /etc/ld.so.conf.d/nvidia-libs.conf' && \ sudo ldconfig -RUN pip install --no-cache-dir onnxruntime-gpu tensorrt lz4 ratiopath "mlflow<3.0" \ - torch==2.4.0 torchvision==0.19.0 \ +RUN pip install --no-cache-dir \ + --extra-index-url https://pypi.nvidia.com \ + onnxruntime-gpu tensorrt-cu12==10.3.0 lz4 ratiopath "mlflow<3.0" + +RUN pip install --no-cache-dir \ + torch==2.4.0+cu121 torchvision==0.19.0+cu121 \ + --index-url https://download.pytorch.org/whl/cu121 + +RUN pip install --no-cache-dir \ "timm>=1.0.0" \ "huggingface-hub>=0.23.0" From e94baec5b68dec8d56a8d0d15f7426092111f6d0 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Sat, 14 Mar 2026 14:57:03 +0100 Subject: [PATCH 31/39] chore: new docker image --- ray-service.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray-service.yaml b/ray-service.yaml index 31b37de..531def3 100644 --- a/ray-service.yaml +++ b/ray-service.yaml @@ -253,7 +253,7 @@ spec: type: RuntimeDefault containers: - name: ray-worker - image: cerit.io/rationai/model-service:2.53.0-gpu + image: cerit.io/rationai/model-service:latest-gpu imagePullPolicy: Always resources: limits: From 7cdd29081b45e127ecdf152e9b8a0faa55cdebfc Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Sat, 14 Mar 2026 15:29:53 +0100 Subject: [PATCH 32/39] chore: cpu docker --- docker/Dockerfile.cpu | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index c517093..fd9314a 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -52,4 +52,12 @@ RUN sudo apt-get update && sudo apt-get -y upgrade && \ RUN sudo apt-get remove -y --purge systemd systemd-sysv && sudo apt-get autoremove --purge -y && sudo apt-get clean && sudo rm -rf /var/lib/apt/lists/* RUN pip install --no-cache-dir \ - 
onnxruntime lz4 ratiopath "mlflow<3.0" + onnxruntime lz4 ratiopath "mlflow<3.0" "pillow>=11.3.0" + +RUN pip install --no-cache-dir \ + torch==2.4.0+cpu torchvision==0.19.0+cpu \ + --index-url https://download.pytorch.org/whl/cpu + +RUN pip install --no-cache-dir \ + "timm>=1.0.0" \ + "huggingface-hub>=0.23.0" From 8cce2cb7f8aa79ad40a313bcfcb9e8d932566d62 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Sat, 14 Mar 2026 16:14:15 +0100 Subject: [PATCH 33/39] fix --- models/virchow2.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/models/virchow2.py b/models/virchow2.py index e3d2abf..3b01beb 100644 --- a/models/virchow2.py +++ b/models/virchow2.py @@ -2,14 +2,9 @@ from typing import Any, TypedDict import numpy as np -import timm -import torch from fastapi import FastAPI, Request from numpy.typing import NDArray from ray import serve -from timm.data.config import resolve_data_config -from timm.data.transforms_factory import create_transform -from timm.layers.mlp import SwiGLUPacked class Config(TypedDict): @@ -35,8 +30,11 @@ def __init__(self) -> None: # Enforce offline mode for timm/huggingface_hub os.environ["HF_HUB_OFFLINE"] = "1" + import torch + + self.torch = torch self.lz4 = lz4.frame - self.model: torch.nn.Module | None = None + self.model: Any = None self.transforms: Any = None self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.tile_size: int = 0 @@ -45,6 +43,13 @@ def reconfigure(self, config: Config) -> None: import importlib import logging + import timm + from timm.data.config import resolve_data_config + from timm.data.transforms_factory import create_transform + from timm.layers.mlp import SwiGLUPacked + + torch = self.torch + logger = logging.getLogger("ray.serve") self.tile_size = config["tile_size"] @@ -83,6 +88,8 @@ async def predict(self, images: list[NDArray[np.uint8]]) -> list[list[float]]: if self.model is None or self.transforms is None: raise 
RuntimeError("Model or transforms not initialized") + torch = self.torch + pil_images = [Image.fromarray(img) for img in images] tensors = torch.stack([self.transforms(img) for img in pil_images]).to( self.device From 7e329a8c53ae60a554816af55575c86f1c93d7c6 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Sat, 14 Mar 2026 23:50:45 +0100 Subject: [PATCH 34/39] final changes --- docker/Dockerfile.cpu | 10 +--------- ray-service.yaml | 2 +- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index fd9314a..7e51375 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -52,12 +52,4 @@ RUN sudo apt-get update && sudo apt-get -y upgrade && \ RUN sudo apt-get remove -y --purge systemd systemd-sysv && sudo apt-get autoremove --purge -y && sudo apt-get clean && sudo rm -rf /var/lib/apt/lists/* RUN pip install --no-cache-dir \ - onnxruntime lz4 ratiopath "mlflow<3.0" "pillow>=11.3.0" - -RUN pip install --no-cache-dir \ - torch==2.4.0+cpu torchvision==0.19.0+cpu \ - --index-url https://download.pytorch.org/whl/cpu - -RUN pip install --no-cache-dir \ - "timm>=1.0.0" \ - "huggingface-hub>=0.23.0" + onnxruntime lz4 ratiopath "mlflow<3.0" \ \ No newline at end of file diff --git a/ray-service.yaml b/ray-service.yaml index 531def3..ab11465 100644 --- a/ray-service.yaml +++ b/ray-service.yaml @@ -92,7 +92,7 @@ spec: runtime_env: config: setup_timeout_seconds: 1800 - working_dir: https://gitlab.ics.muni.cz/rationai/infrastructure/model-service/-/archive/master/model-service-master.zip + working_dir: https://github.com/RationAI/model-service/archive/refs/heads/feature/virchow2-model.zip deployments: - name: Virchow2 max_ongoing_requests: 160 From 8dfea82424a7773b1ccc56de00db66f8037f9cb6 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Sun, 15 Mar 2026 13:03:50 +0100 Subject: [PATCH 35/39] fix: usage of master branch --- ray-service.yaml | 2 +- 1 file changed, 1 insertion(+), 
1 deletion(-) diff --git a/ray-service.yaml b/ray-service.yaml index ab11465..f9a7507 100644 --- a/ray-service.yaml +++ b/ray-service.yaml @@ -92,7 +92,7 @@ spec: runtime_env: config: setup_timeout_seconds: 1800 - working_dir: https://github.com/RationAI/model-service/archive/refs/heads/feature/virchow2-model.zip + working_dir: https://github.com/RationAI/model-service/archive/refs/heads/master.zip deployments: - name: Virchow2 max_ongoing_requests: 160 From e6f8603eaac4e192de8e43ffd424be4bf91dee60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C5=A0t=C3=ADpek?= <91186480+Jurgee@users.noreply.github.com> Date: Sun, 15 Mar 2026 13:10:17 +0100 Subject: [PATCH 36/39] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- docker/Dockerfile.cpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 7e51375..9f778f5 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -52,4 +52,4 @@ RUN sudo apt-get update && sudo apt-get -y upgrade && \ RUN sudo apt-get remove -y --purge systemd systemd-sysv && sudo apt-get autoremove --purge -y && sudo apt-get clean && sudo rm -rf /var/lib/apt/lists/* RUN pip install --no-cache-dir \ - onnxruntime lz4 ratiopath "mlflow<3.0" \ \ No newline at end of file + onnxruntime lz4 ratiopath "mlflow<3.0" \ No newline at end of file From fb646c4121cdb094fa2bbb8ad6f706ee510227cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C5=A0t=C3=ADpek?= <91186480+Jurgee@users.noreply.github.com> Date: Sun, 15 Mar 2026 13:11:23 +0100 Subject: [PATCH 37/39] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- ray-service.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray-service.yaml b/ray-service.yaml index f9a7507..6a1d223 100644 --- a/ray-service.yaml +++ b/ray-service.yaml @@ 
-88,7 +88,7 @@ spec: - name: virchow2 import_path: models.virchow2:app - route_prefix: /virchow2 + route_prefix: /virchow2 runtime_env: config: setup_timeout_seconds: 1800 From b2d083c8ab386f8da179d47254f07975fd96a1de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C5=A0t=C3=ADpek?= <91186480+Jurgee@users.noreply.github.com> Date: Sun, 15 Mar 2026 13:18:46 +0100 Subject: [PATCH 38/39] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- providers/model_provider.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/providers/model_provider.py b/providers/model_provider.py index 68daafe..126f6ea 100644 --- a/providers/model_provider.py +++ b/providers/model_provider.py @@ -9,20 +9,18 @@ def huggingface(repo_id: str, filename: str | None = None) -> str: from huggingface_hub import hf_hub_download, snapshot_download - cache_dir = os.environ.get("HF_HOME", "/mnt/huggingface_cache") - os.makedirs(cache_dir, exist_ok=True) - os.environ["HF_HOME"] = cache_dir + hf_home = os.environ.get("HF_HOME", "/mnt/huggingface_cache") + os.makedirs(hf_home, exist_ok=True) + os.environ["HF_HOME"] = hf_home if filename: return hf_hub_download( repo_id=repo_id, filename=filename, - cache_dir=cache_dir, local_files_only=True, ) else: return snapshot_download( repo_id=repo_id, - cache_dir=cache_dir, local_files_only=True, ) From bfd90a9a020dfba5d6bbe9ce214b64b1302a7b3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C5=A0t=C3=ADpek?= <91186480+Jurgee@users.noreply.github.com> Date: Sun, 15 Mar 2026 13:20:53 +0100 Subject: [PATCH 39/39] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- models/virchow2.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/models/virchow2.py b/models/virchow2.py index 3b01beb..e29f829 100644 --- a/models/virchow2.py +++ 
b/models/virchow2.py @@ -95,9 +95,12 @@ async def predict(self, images: list[NDArray[np.uint8]]) -> list[list[float]]: self.device ) + device_type = self.device.type + autocast_dtype = torch.float16 if device_type == "cuda" else torch.bfloat16 + with ( torch.inference_mode(), - torch.autocast(device_type="cuda", dtype=torch.float16), + torch.autocast(device_type=device_type, dtype=autocast_dtype), ): output = self.model(tensors)