From a167a22002d697a392df339d147a4b7adaeb5f58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Pek=C3=A1r?= <525077@mail.muni.cz> Date: Sat, 17 Jan 2026 11:47:17 +0100 Subject: [PATCH 01/39] feat: tensorrt support --- Dockerfile | 16 ++++++++------ models/semantic_segmentation.py | 39 +++++++++++++++++++++++---------- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/Dockerfile b/Dockerfile index ea4e4dc..fd31e28 100644 --- a/Dockerfile +++ b/Dockerfile @@ -31,14 +31,10 @@ RUN cd /tmp && \ ninja -C builddir install -FROM rayproject/ray:2.53.0-py312 +FROM rayproject/ray:2.53.0-py312-gpu COPY --from=builder /usr/local/ /usr/local/ -# Make sure the dynamic linker can find libraries built into /usr/local/lib, LD_LIBRARY_PATH starts with a colon -ENV LD_LIBRARY_PATH="/usr/local/lib${LD_LIBRARY_PATH}" -RUN sudo sh -c 'echo "/usr/local/lib" > /etc/ld.so.conf.d/custom-libs.conf' && sudo ldconfig - # Update & Package installation RUN sudo apt-get update && sudo apt-get -y upgrade && \ sudo apt-get install -y --no-install-recommends \ @@ -46,9 +42,15 @@ RUN sudo apt-get update && sudo apt-get -y upgrade && \ # Vips & Openslide packages zlib1g-dev libzstd-dev libpng-dev libjpeg-turbo8-dev libtiff-dev \ libopenjp2-7-dev libgdk-pixbuf2.0-dev libxml2-dev sqlite3 libsqlite3-dev \ - libcairo2-dev libglib2.0-dev libdcmtk-dev libjxr-dev python3-dev + libcairo2-dev libglib2.0-dev libdcmtk-dev libjxr-dev python3-dev # Cleanup RUN sudo apt-get remove -y --purge systemd systemd-sysv && sudo apt-get autoremove --purge -y && sudo apt-get clean && sudo rm -rf /var/lib/apt/lists/* -RUN pip install --no-cache-dir onnxruntime lz4 ratiopath "mlflow<3.0" +ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib:/home/ray/anaconda3/lib/python3.12/site-packages/tensorrt_libs:/home/ray/anaconda3/lib/python3.12/site-packages/nvidia/cudnn/lib" +RUN sudo sh -c 'echo "/usr/local/lib" > /etc/ld.so.conf.d/custom-libs.conf' && \ + sudo sh -c 'echo 
"/home/ray/anaconda3/lib/python3.12/site-packages/tensorrt_libs" > /etc/ld.so.conf.d/trt-libs.conf' && \ + sudo sh -c 'echo "/home/ray/anaconda3/lib/python3.12/site-packages/nvidia/cudnn/lib" > /etc/ld.so.conf.d/nvidia-libs.conf' && \ + sudo ldconfig + +RUN pip install --no-cache-dir onnxruntime-gpu tensorrt lz4 ratiopath "mlflow<3.0" \ No newline at end of file diff --git a/models/semantic_segmentation.py b/models/semantic_segmentation.py index ab2847a..59fa7fa 100644 --- a/models/semantic_segmentation.py +++ b/models/semantic_segmentation.py @@ -12,7 +12,6 @@ class Config(TypedDict): model: dict[str, Any] max_batch_size: int batch_wait_timeout_s: float - intra_op_num_threads: int fastapi = FastAPI() @@ -36,16 +35,35 @@ async def reconfigure(self, config: Config) -> None: self.tile_size = config["tile_size"] self.mpp = config["mpp"] - sess_options = ort.SessionOptions() - sess_options.intra_op_num_threads = config["intra_op_num_threads"] - sess_options.inter_op_num_threads = 1 - module_path, attr_name = config["model"].pop("_target_").split(":") provider = getattr(importlib.import_module(module_path), attr_name) + + min_shape = f"input:1x3x{self.tile_size}x{self.tile_size}" + opt_shape = ( + f"input:{config['max_batch_size']}x3x{self.tile_size}x{self.tile_size}" + ) + max_shape = ( + f"input:{config['max_batch_size']}x3x{self.tile_size}x{self.tile_size}" + ) + providers = [ + ( + "TensorrtExecutionProvider", + { + "device_id": 0, + "trt_fp16_enable": True, + "trt_engine_cache_enable": True, + "trt_engine_cache_path": "./trt_cache", + "trt_profile_min_shapes": min_shape, + "trt_profile_max_shapes": max_shape, + "trt_profile_opt_shapes": opt_shape, + }, + ), + "CUDAExecutionProvider", + "CPUExecutionProvider", + ] + self.session = ort.InferenceSession( - provider(**config["model"]), - providers=["CPUExecutionProvider", "CUDAExecutionProvider"], - session_options=sess_options, + provider(**config["model"]), providers=providers ) self.input_name = 
self.session.get_inputs()[0].name self.output_name = self.session.get_outputs()[0].name @@ -54,10 +72,7 @@ async def reconfigure(self, config: Config) -> None: self.predict.set_batch_wait_timeout_s(config["batch_wait_timeout_s"]) # type: ignore[attr-defined] def get_config(self) -> dict[str, Any]: - return { - "tile_size": self.tile_size, - "mpp": self.mpp, - } + return {"tile_size": self.tile_size, "mpp": self.mpp} @serve.batch async def predict( From 1d3310f67130d14187eec04ea661f8a30d172fa5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mat=C4=9Bj=20Pek=C3=A1r?= <525077@mail.muni.cz> Date: Sat, 17 Jan 2026 12:04:39 +0100 Subject: [PATCH 02/39] fix: remove flush --- builders/heatmap_builder.py | 1 + misc/tile_heatmap_builder.py | 1 + ray-service.yaml | 93 +++++++++++++++++++++++++++++------- 3 files changed, 79 insertions(+), 16 deletions(-) diff --git a/builders/heatmap_builder.py b/builders/heatmap_builder.py index 4fbb833..5b40636 100644 --- a/builders/heatmap_builder.py +++ b/builders/heatmap_builder.py @@ -100,6 +100,7 @@ async def process_tile(x: int, y: int) -> None: await asyncio.wait(tasks) + mask_builder.flush() mask_builder.save( output_path, tile_height=output_bigtiff_tile_height, diff --git a/misc/tile_heatmap_builder.py b/misc/tile_heatmap_builder.py index 0b8e7b6..843536c 100644 --- a/misc/tile_heatmap_builder.py +++ b/misc/tile_heatmap_builder.py @@ -38,6 +38,7 @@ def update(self, tile: np.ndarray, x: int, y: int) -> None: self.image[y : y + mm_y, x : x + mm_x] += tile[:mm_y, :mm_x] self.count[y : y + mm_y, x : x + mm_x] += 1 + def flush(self) -> None: self.image.flush() self.count.flush() diff --git a/ray-service.yaml b/ray-service.yaml index 35265f6..2a6a0a2 100644 --- a/ray-service.yaml +++ b/ray-service.yaml @@ -39,27 +39,27 @@ spec: import_path: models.semantic_segmentation:app route_prefix: /episeg-1 runtime_env: - working_dir: https://gitlab.ics.muni.cz/rationai/infrastructure/model-service/-/archive/master/model-service-master.zip + working_dir: 
https://gitlab.ics.muni.cz/rationai/infrastructure/model-service/-/archive/feature/gpu/model-service-feature-gpu.zip deployments: - name: SemanticSegmentation max_ongoing_requests: 16 - max_queued_requests: 64 + max_queued_requests: 32 autoscaling_config: min_replicas: 0 - max_replicas: 4 - target_ongoing_requests: 4 + max_replicas: 2 + target_ongoing_requests: 8 ray_actor_options: - num_cpus: 12 + num_cpus: 4 memory: 12884901888 # 12 GiB + num_gpus: 1 runtime_env: env_vars: MLFLOW_TRACKING_URI: http://mlflow.rationai-mlflow:5000 user_config: tile_size: 1024 mpp: 0.468 - max_batch_size: 2 - batch_wait_timeout_s: 0.5 - intra_op_num_threads: 11 + max_batch_size: 8 + batch_wait_timeout_s: 0.1 model: _target_: providers.model_provider:mlflow artifact_uri: mlflow-artifacts:/10/39f821ed5b964c71a603cc6db196f9fd/artifacts/checkpoints/epoch=19-step=32020/model.onnx/model.onnx @@ -76,14 +76,14 @@ spec: max_queued_requests: 64 autoscaling_config: min_replicas: 0 - max_replicas: 2 + max_replicas: 4 target_ongoing_requests: 2 ray_actor_options: - num_cpus: 4 + num_cpus: 8 memory: 12884901888 # 12 GiB user_config: - num_threads: 4 - max_concurrent_tasks: 8 + num_threads: 8 + max_concurrent_tasks: 24 rayClusterConfig: rayVersion: 2.53.0 @@ -151,11 +151,72 @@ spec: imagePullPolicy: Always resources: limits: - cpu: 32 - memory: 32Gi + cpu: 16 + memory: 24Gi + requests: + cpu: 16 + memory: 24Gi + env: + - name: HTTPS_PROXY + value: http://proxy.ics.muni.cz:3128 + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + runAsUser: 1000 + lifecycle: + preStop: + exec: + command: ["/bin/sh", "-c", "ray stop"] + volumeMounts: + - name: data + mountPath: /mnt/data + - name: public-data + mountPath: /mnt/data/Public + - name: projects + mountPath: /mnt/projects + - name: bioptic-tree + mountPath: /mnt/bioptic_tree + + volumes: + - name: data + persistentVolumeClaim: + claimName: data-ro + - name: public-data + persistentVolumeClaim: + claimName: 
rationai-data-ro-pvc-jobs + - name: projects + persistentVolumeClaim: + claimName: projects-rw + - name: bioptic-tree + persistentVolumeClaim: + claimName: bioptictree-ro + + - groupName: gpu-workers + replicas: 0 + minReplicas: 0 + maxReplicas: 2 + template: + spec: + securityContext: + fsGroupChangePolicy: OnRootMismatch + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A40 + containers: + - name: ray-worker + image: cerit.io/rationai/model-service:2.53.0-gpu + imagePullPolicy: Always + resources: + limits: + cpu: 4 + memory: 12Gi + nvidia.com/gpu: 1 requests: - cpu: 32 - memory: 32Gi + cpu: 4 + memory: 12Gi env: - name: HTTPS_PROXY value: http://proxy.ics.muni.cz:3128 From 4e27a4801db9985c8284923dcd882c0fc956e649 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Sun, 8 Feb 2026 19:27:35 +0100 Subject: [PATCH 03/39] feat: add docker files for cpu/gpu --- docker/Dockerfile.cpu | 54 +++++++++++++++++++++++++++++ Dockerfile => docker/Dockerfile.gpu | 2 +- 2 files changed, 55 insertions(+), 1 deletion(-) create mode 100644 docker/Dockerfile.cpu rename Dockerfile => docker/Dockerfile.gpu (99%) diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu new file mode 100644 index 0000000..ea4e4dc --- /dev/null +++ b/docker/Dockerfile.cpu @@ -0,0 +1,54 @@ +# We need to build libvips and openslide from source to get the required features. +# The base image needs to extend rayproject/ray image which is based on ubuntu:22.04. +# Our images are based on ubuntu:24.04, therefore, we can't reuse them. +FROM ubuntu:22.04 AS builder + +ARG VIPS_VERSION=8.17.2 + +# Install all build-time dependencies in a single layer. 
+RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + build-essential ca-certificates git wget meson ninja-build \ + zlib1g-dev libzstd-dev libpng-dev libjpeg-turbo8-dev libtiff-dev \ + libopenjp2-7-dev libgdk-pixbuf2.0-dev libxml2-dev sqlite3 libsqlite3-dev \ + libcairo2-dev libglib2.0-dev libdcmtk-dev libjxr-dev && \ + rm -rf /var/lib/apt/lists/* + +# Build OpenSlide from the specified GitHub fork. +RUN git clone https://github.com/iewchen/openslide.git /tmp/openslide-lib && \ + cd /tmp/openslide-lib && \ + meson setup builddir --prefix=/usr/local && \ + meson compile -C builddir && \ + meson install -C builddir + +# Download and build libvips from source. +RUN cd /tmp && \ + wget https://github.com/libvips/libvips/releases/download/v${VIPS_VERSION}/vips-${VIPS_VERSION}.tar.xz && \ + tar xf vips-${VIPS_VERSION}.tar.xz && \ + cd vips-${VIPS_VERSION} && \ + meson setup builddir --prefix=/usr/local && \ + ninja -C builddir && \ + ninja -C builddir install + + +FROM rayproject/ray:2.53.0-py312 + +COPY --from=builder /usr/local/ /usr/local/ + +# Make sure the dynamic linker can find libraries built into /usr/local/lib, LD_LIBRARY_PATH starts with a colon +ENV LD_LIBRARY_PATH="/usr/local/lib${LD_LIBRARY_PATH}" +RUN sudo sh -c 'echo "/usr/local/lib" > /etc/ld.so.conf.d/custom-libs.conf' && sudo ldconfig + +# Update & Package installation +RUN sudo apt-get update && sudo apt-get -y upgrade && \ + sudo apt-get install -y --no-install-recommends \ + gcc \ + # Vips & Openslide packages + zlib1g-dev libzstd-dev libpng-dev libjpeg-turbo8-dev libtiff-dev \ + libopenjp2-7-dev libgdk-pixbuf2.0-dev libxml2-dev sqlite3 libsqlite3-dev \ + libcairo2-dev libglib2.0-dev libdcmtk-dev libjxr-dev python3-dev + +# Cleanup +RUN sudo apt-get remove -y --purge systemd systemd-sysv && sudo apt-get autoremove --purge -y && sudo apt-get clean && sudo rm -rf /var/lib/apt/lists/* + +RUN pip install --no-cache-dir onnxruntime lz4 ratiopath "mlflow<3.0" diff --git a/Dockerfile 
b/docker/Dockerfile.gpu similarity index 99% rename from Dockerfile rename to docker/Dockerfile.gpu index fd31e28..f5bf633 100644 --- a/Dockerfile +++ b/docker/Dockerfile.gpu @@ -53,4 +53,4 @@ RUN sudo sh -c 'echo "/usr/local/lib" > /etc/ld.so.conf.d/custom-libs.conf' && \ sudo sh -c 'echo "/home/ray/anaconda3/lib/python3.12/site-packages/nvidia/cudnn/lib" > /etc/ld.so.conf.d/nvidia-libs.conf' && \ sudo ldconfig -RUN pip install --no-cache-dir onnxruntime-gpu tensorrt lz4 ratiopath "mlflow<3.0" \ No newline at end of file +RUN pip install --no-cache-dir onnxruntime-gpu tensorrt lz4 ratiopath "mlflow<3.0" From fd3154d2628a6eb380c2d33789a41e652a08419f Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Sun, 8 Feb 2026 19:27:48 +0100 Subject: [PATCH 04/39] feat: add PVC for TensorRT --- pvc/tensorrt-cache-pvc.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 pvc/tensorrt-cache-pvc.yaml diff --git a/pvc/tensorrt-cache-pvc.yaml b/pvc/tensorrt-cache-pvc.yaml new file mode 100644 index 0000000..01aef8f --- /dev/null +++ b/pvc/tensorrt-cache-pvc.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: tensorrt-cache-pvc +spec: + accessModes: + - ReadWriteMany + storageClassName: nfs-csi + resources: + requests: + storage: 20Gi From eaac807a60846de706068030901b176ceca50076 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Sun, 8 Feb 2026 19:28:06 +0100 Subject: [PATCH 05/39] feat: add support of TensorRT for models --- models/binary_classifier.py | 96 ++++++++++++++++++++++++++------- models/semantic_segmentation.py | 69 +++++++++++++++++------- 2 files changed, 126 insertions(+), 39 deletions(-) diff --git a/models/binary_classifier.py b/models/binary_classifier.py index 087a73e..efbe6a7 100644 --- a/models/binary_classifier.py +++ b/models/binary_classifier.py @@ -1,3 +1,5 @@ +import asyncio +import os from typing import Any, TypedDict import numpy as np @@ -7,9 +9,9 @@ class 
Config(TypedDict): + """Configuration for BinaryClassifier deployment.""" + tile_size: int - mean: list[float] - std: list[float] model: dict[str, Any] max_batch_size: int batch_wait_timeout_s: float @@ -19,63 +21,119 @@ class Config(TypedDict): fastapi = FastAPI() -@serve.deployment(num_replicas="auto") +@serve.deployment( + num_replicas="auto", + ray_actor_options={"num_gpus": 1}, +) @serve.ingress(fastapi) class BinaryClassifier: + """Binary classifier for tissue tiles using ONNX Runtime with GPU support.""" + tile_size: int def __init__(self) -> None: import lz4.frame - self.decompress = lz4.frame.decompress + self.lz4 = lz4.frame - async def reconfigure(self, config: Config) -> None: + def reconfigure(self, config: Config) -> None: + """Load the ONNX model and configure inference settings.""" import importlib import onnxruntime as ort self.tile_size = config["tile_size"] - self.mean = np.array(config["mean"], dtype=np.float32).reshape(1, 3, 1, 1) - self.inv_std = 1 / np.array(config["std"], dtype=np.float32).reshape(1, 3, 1, 1) + cache_path = "/mnt/cache/trt_cache" + os.makedirs(cache_path, exist_ok=True) + min_shape = f"input:1x3x{self.tile_size}x{self.tile_size}" + opt_shape = ( + f"input:{config['max_batch_size']}x3x{self.tile_size}x{self.tile_size}" + ) + max_shape = ( + f"input:{config['max_batch_size']}x3x{self.tile_size}x{self.tile_size}" + ) + + trt_options = { + "device_id": 0, + "trt_fp16_enable": True, + "trt_engine_cache_enable": True, + "trt_engine_cache_path": cache_path, + "trt_max_workspace_size": 4 * 1024 * 1024 * 1024, # 4GB + "trt_builder_optimization_level": 5, + "trt_timing_cache_enable": True, + "trt_profile_min_shapes": min_shape, + "trt_profile_max_shapes": max_shape, + "trt_profile_opt_shapes": opt_shape, + } + + # Configure ONNX Runtime session sess_options = ort.SessionOptions() sess_options.intra_op_num_threads = config["intra_op_num_threads"] sess_options.inter_op_num_threads = 1 + # Enable graph optimizations + 
sess_options.graph_optimization_level = ( + ort.GraphOptimizationLevel.ORT_ENABLE_ALL + ) + sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL + + # Load model from provider (e.g., MLflow) module_path, attr_name = config["model"].pop("_target_").split(":") provider = getattr(importlib.import_module(module_path), attr_name) self.session = ort.InferenceSession( provider(**config["model"]), - providers=["CPUExecutionProvider"], + providers=[ + ( + "TensorrtExecutionProvider", + trt_options, + ), + "CUDAExecutionProvider", + "CPUExecutionProvider", + ], session_options=sess_options, ) + self.input_name = self.session.get_inputs()[0].name self.output_name = self.session.get_outputs()[0].name + # Configure batching self.predict.set_max_batch_size(config["max_batch_size"]) # type: ignore[attr-defined] self.predict.set_batch_wait_timeout_s(config["batch_wait_timeout_s"]) # type: ignore[attr-defined] + dummy_shape = (config["max_batch_size"], 3, self.tile_size, self.tile_size) + dummy_input = np.random.randint(0, 256, dummy_shape, dtype=np.uint8) + + self.session.run([self.output_name], {self.input_name: dummy_input}) + @serve.batch async def predict(self, images: list[NDArray[np.uint8]]) -> list[float]: - batch = np.stack(images, axis=0).astype(np.float32) - - # Normalization - batch -= self.mean - batch *= self.inv_std + """Run inference on a batch of images.""" + batch = np.ascontiguousarray(np.stack(images, axis=0).astype(np.uint8)) - outputs = self.session.run([self.output_name], {self.input_name: batch}) + outputs = self.session.run( + [self.output_name], + {self.input_name: batch}, + ) - return outputs[0].squeeze(1).tolist() + return outputs[0].flatten().tolist() @fastapi.post("/") async def root(self, request: Request) -> float: - data = self.decompress(await request.body()) - image = np.frombuffer(data, dtype=np.uint8).reshape( - self.tile_size, self.tile_size, 3 + """Handle inference request with LZ4-compressed image.""" + data = await 
asyncio.to_thread(self.lz4.decompress, await request.body()) + + image = ( + np.frombuffer(data, dtype=np.uint8) + .reshape(self.tile_size, self.tile_size, 3) + .transpose(2, 0, 1) ) - return await self.predict(image.transpose(2, 0, 1)) + image = np.ascontiguousarray(image) + + result = await self.predict(image) + return result app = BinaryClassifier.bind() # type: ignore[attr-defined] diff --git a/models/semantic_segmentation.py b/models/semantic_segmentation.py index 59fa7fa..650e739 100644 --- a/models/semantic_segmentation.py +++ b/models/semantic_segmentation.py @@ -1,3 +1,4 @@ +import os from typing import Any, TypedDict import numpy as np @@ -12,6 +13,7 @@ class Config(TypedDict): model: dict[str, Any] max_batch_size: int batch_wait_timeout_s: float + intra_op_num_threads: int fastapi = FastAPI() @@ -27,7 +29,7 @@ def __init__(self) -> None: self.lz4 = lz4.frame - async def reconfigure(self, config: Config) -> None: + def reconfigure(self, config: Config) -> None: import importlib import onnxruntime as ort @@ -35,8 +37,8 @@ async def reconfigure(self, config: Config) -> None: self.tile_size = config["tile_size"] self.mpp = config["mpp"] - module_path, attr_name = config["model"].pop("_target_").split(":") - provider = getattr(importlib.import_module(module_path), attr_name) + cache_path = "/mnt/cache/trt_cache" + os.makedirs(cache_path, exist_ok=True) min_shape = f"input:1x3x{self.tile_size}x{self.tile_size}" opt_shape = ( @@ -45,32 +47,59 @@ async def reconfigure(self, config: Config) -> None: max_shape = ( f"input:{config['max_batch_size']}x3x{self.tile_size}x{self.tile_size}" ) - providers = [ - ( - "TensorrtExecutionProvider", - { - "device_id": 0, - "trt_fp16_enable": True, - "trt_engine_cache_enable": True, - "trt_engine_cache_path": "./trt_cache", - "trt_profile_min_shapes": min_shape, - "trt_profile_max_shapes": max_shape, - "trt_profile_opt_shapes": opt_shape, - }, - ), - "CUDAExecutionProvider", - "CPUExecutionProvider", - ] + + trt_options = { + 
"device_id": 0, + "trt_fp16_enable": True, + "trt_engine_cache_enable": True, + "trt_engine_cache_path": cache_path, + "trt_max_workspace_size": 4 * 1024 * 1024 * 1024, # 4GB + "trt_builder_optimization_level": 5, + "trt_timing_cache_enable": True, + "trt_profile_min_shapes": min_shape, + "trt_profile_max_shapes": max_shape, + "trt_profile_opt_shapes": opt_shape, + } + + # Configure ONNX Runtime session + sess_options = ort.SessionOptions() + sess_options.intra_op_num_threads = config["intra_op_num_threads"] + sess_options.inter_op_num_threads = 1 + + # Enable graph optimizations + sess_options.graph_optimization_level = ( + ort.GraphOptimizationLevel.ORT_ENABLE_ALL + ) + sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL + + module_path, attr_name = config["model"].pop("_target_").split(":") + provider = getattr(importlib.import_module(module_path), attr_name) self.session = ort.InferenceSession( - provider(**config["model"]), providers=providers + provider(**config["model"]), + providers=[ + ( + "TensorrtExecutionProvider", + trt_options, + ), + "CUDAExecutionProvider", + "CPUExecutionProvider", + ], + session_options=sess_options, ) + self.input_name = self.session.get_inputs()[0].name self.output_name = self.session.get_outputs()[0].name self.predict.set_max_batch_size(config["max_batch_size"]) # type: ignore[attr-defined] self.predict.set_batch_wait_timeout_s(config["batch_wait_timeout_s"]) # type: ignore[attr-defined] + # Warmup + dummy_shape = (config["max_batch_size"], 3, self.tile_size, self.tile_size) + dummy_input = np.random.randint(0, 256, dummy_shape, dtype=np.uint8) + + self.session.run([self.output_name], {self.input_name: dummy_input}) + def get_config(self) -> dict[str, Any]: return {"tile_size": self.tile_size, "mpp": self.mpp} From 46fe8b1b244f339adc204d8d6b5256ac244bb3ab Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Sun, 8 Feb 2026 19:28:57 +0100 Subject: [PATCH 06/39] feat: add TensorRT cache to workers --- 
ray-service.yaml | 48 +++++++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/ray-service.yaml b/ray-service.yaml index 2a6a0a2..4361544 100644 --- a/ray-service.yaml +++ b/ray-service.yaml @@ -19,27 +19,26 @@ spec: max_replicas: 4 target_ongoing_requests: 32 ray_actor_options: - num_cpus: 6 - memory: 6442450944 # 6 GiB + num_cpus: 4 + num_gpus: 1 + memory: 4294967296 # 4 GiB runtime_env: env_vars: MLFLOW_TRACKING_URI: http://mlflow.rationai-mlflow:5000 user_config: tile_size: 512 - max_batch_size: 32 - batch_wait_timeout_s: 0.5 - mean: [228.5544, 178.8584, 219.8793] - std: [27.8285, 51.4639, 26.4458] - intra_op_num_threads: 5 + max_batch_size: 16 + batch_wait_timeout_s: 0.01 + intra_op_num_threads: 4 model: _target_: providers.model_provider:mlflow - artifact_uri: mlflow-artifacts:/65/aebc892f526047249b972f200bef4381/artifacts/checkpoints/epoch=0-step=6972/model.onnx + artifact_uri: mlflow-artifacts:/65/aebc892f526047249b972f200bef4381/artifacts/checkpoints/epoch=0-step=6972/prostate_model_norm.onnx - name: episeg-1 import_path: models.semantic_segmentation:app route_prefix: /episeg-1 runtime_env: - working_dir: https://gitlab.ics.muni.cz/rationai/infrastructure/model-service/-/archive/feature/gpu/model-service-feature-gpu.zip + working_dir: https://gitlab.ics.muni.cz/rationai/infrastructure/model-service/-/archive/master/model-service-master.zip deployments: - name: SemanticSegmentation max_ongoing_requests: 16 @@ -118,6 +117,9 @@ spec: requests: cpu: 0 memory: 4Gi + env: + - name: HTTPS_PROXY + value: http://proxy.ics.muni.cz:3128 ports: - containerPort: 6379 name: gcs-server @@ -151,11 +153,11 @@ spec: imagePullPolicy: Always resources: limits: - cpu: 16 - memory: 24Gi + cpu: 8 + memory: 16Gi requests: - cpu: 16 - memory: 24Gi + cpu: 8 + memory: 16Gi env: - name: HTTPS_PROXY value: http://proxy.ics.muni.cz:3128 @@ -177,6 +179,8 @@ spec: mountPath: /mnt/projects - name: bioptic-tree mountPath: 
/mnt/bioptic_tree + - name: trt-cache-volume + mountPath: /mnt/cache volumes: - name: data @@ -191,6 +195,9 @@ spec: - name: bioptic-tree persistentVolumeClaim: claimName: bioptictree-ro + - name: trt-cache-volume + persistentVolumeClaim: + claimName: tensorrt-cache-pvc - groupName: gpu-workers replicas: 0 @@ -199,8 +206,10 @@ spec: template: spec: securityContext: + fsGroup: 1000 fsGroupChangePolicy: OnRootMismatch runAsNonRoot: true + runAsUser: 1000 seccompProfile: type: RuntimeDefault nodeSelector: @@ -211,12 +220,12 @@ spec: imagePullPolicy: Always resources: limits: - cpu: 4 - memory: 12Gi + cpu: 8 + memory: 24Gi nvidia.com/gpu: 1 requests: - cpu: 4 - memory: 12Gi + cpu: 8 + memory: 24Gi env: - name: HTTPS_PROXY value: http://proxy.ics.muni.cz:3128 @@ -238,6 +247,8 @@ spec: mountPath: /mnt/projects - name: bioptic-tree mountPath: /mnt/bioptic_tree + - name: trt-cache-volume + mountPath: /mnt/cache volumes: - name: data @@ -252,3 +263,6 @@ spec: - name: bioptic-tree persistentVolumeClaim: claimName: bioptictree-ro + - name: trt-cache-volume + persistentVolumeClaim: + claimName: tensorrt-cache-pvc From f07723e982d684e0e4f2df144d8df45f9566a822 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Sun, 8 Feb 2026 19:29:07 +0100 Subject: [PATCH 07/39] add Jiri as coauthor --- pyproject.toml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index acbe907..fbb3a9f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,10 @@ [project] name = "model-service" version = "0.1.0" -authors = [{ name = "Matěj Pekár", email = "matejpekar@mail.muni.cz" }] +authors = [ + { name = "Matěj Pekár", email = "matejpekar@mail.muni.cz" }, + { name = "Jiří Štípek", email = "567776@mail.muni.cz" }, +] readme = "README.md" license = { file = "LICENSE" } requires-python = ">=3.12" From 9d6e26503f22fa610dbcf17727ca99bec7c07d26 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Sun, 8 Feb 2026 20:17:29 +0100 
Subject: [PATCH 08/39] fix: remove gpu number from serve.deployment in code --- models/binary_classifier.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/models/binary_classifier.py b/models/binary_classifier.py index efbe6a7..3b607ba 100644 --- a/models/binary_classifier.py +++ b/models/binary_classifier.py @@ -21,10 +21,7 @@ class Config(TypedDict): fastapi = FastAPI() -@serve.deployment( - num_replicas="auto", - ray_actor_options={"num_gpus": 1}, -) +@serve.deployment(num_replicas="auto") @serve.ingress(fastapi) class BinaryClassifier: """Binary classifier for tissue tiles using ONNX Runtime with GPU support.""" From e7612f97cba08d96c7a61cdaa109eef53095830e Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Mon, 9 Feb 2026 19:46:10 +0100 Subject: [PATCH 09/39] fix: warning suppress --- models/binary_classifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/binary_classifier.py b/models/binary_classifier.py index 3b607ba..2b807ab 100644 --- a/models/binary_classifier.py +++ b/models/binary_classifier.py @@ -114,7 +114,7 @@ async def predict(self, images: list[NDArray[np.uint8]]) -> list[float]: {self.input_name: batch}, ) - return outputs[0].flatten().tolist() + return outputs[0].flatten().tolist() # pyright: ignore[reportAttributeAccessIssue] @fastapi.post("/") async def root(self, request: Request) -> float: From 5945f1024e8da6f3b89b5205fb4fa8f3be29430f Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Tue, 10 Feb 2026 20:00:58 +0100 Subject: [PATCH 10/39] feat: add jobs to download virchow2 --- misc/virchow2_downloader/download_virchow2.py | 48 ++++++++++++++ .../virchow2_downloader_job.yaml | 64 +++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100644 misc/virchow2_downloader/download_virchow2.py create mode 100644 misc/virchow2_downloader/virchow2_downloader_job.yaml diff --git a/misc/virchow2_downloader/download_virchow2.py 
b/misc/virchow2_downloader/download_virchow2.py new file mode 100644 index 0000000..218790b --- /dev/null +++ b/misc/virchow2_downloader/download_virchow2.py @@ -0,0 +1,48 @@ +import os + +from huggingface_hub import login, snapshot_download + + +HF_TOKEN = os.environ.get("HF_TOKEN") +CACHE_DIR = "/mnt/huggingface_cache" +MODEL_ID = "paige-ai/Virchow2" + +os.environ["HF_HOME"] = CACHE_DIR +os.makedirs(CACHE_DIR, exist_ok=True) + +print(f"Starting download for {MODEL_ID} to {CACHE_DIR}") + +if HF_TOKEN: + print("Logging in to Hugging Face...") + login(token=HF_TOKEN) +else: + print("No HF_TOKEN provided! Download might fail for gated models.") + +print("Downloading model snapshot...") +try: + path = snapshot_download( + repo_id=MODEL_ID, + resume_download=True, + local_files_only=False, + ) + print(f"Model downloaded to: {path}") + + print("Verifying model files exist...") + import timm + + try: + model = timm.create_model( + f"hf-hub:{MODEL_ID}", + pretrained=True, + num_classes=0, + ) + print(f"Model successfully loaded! Type: {type(model).__name__}") + del model # Free memory + except Exception as e: + print(f"Verification warning: {e}") + +except Exception as e: + print(f"Download failed: {e}") + exit(1) + +print("DONE. 
Model is cached and ready for offline use.") diff --git a/misc/virchow2_downloader/virchow2_downloader_job.yaml b/misc/virchow2_downloader/virchow2_downloader_job.yaml new file mode 100644 index 0000000..293401a --- /dev/null +++ b/misc/virchow2_downloader/virchow2_downloader_job.yaml @@ -0,0 +1,64 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: virchow-downloader + namespace: rationai-jobs-ns +spec: + template: + spec: + securityContext: + runAsNonRoot: true + runAsUser: 1000 + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault + containers: + - name: downloader + image: python:3.10 + resources: + requests: + memory: "4Gi" + cpu: "1" + limits: + memory: "4Gi" + cpu: "2" + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + command: ["/bin/bash", "-c"] + args: + - | + pip install --user --no-cache-dir huggingface_hub transformers torch timm + python3 /mnt/scripts/download_virchow2.py + env: + - name: HOME + value: /tmp + - name: HF_TOKEN + value: "REDACTED-ROTATE-THIS-TOKEN" # NOTE(review): a real HF token was committed here — revoke/rotate it and inject via a Kubernetes Secret (valueFrom.secretKeyRef); never commit credentials + - name: HTTPS_PROXY + value: "http://proxy.ics.muni.cz:3128" + - name: HTTP_PROXY + value: "http://proxy.ics.muni.cz:3128" + - name: TORCH_HOME + value: /tmp/torch + - name: TORCHINDUCTOR_CACHE_DIR + value: /tmp/torch/inductor_cache + volumeMounts: + - name: huggingface-cache + mountPath: /mnt/huggingface_cache + - name: scripts + mountPath: /mnt/scripts + - name: temp + mountPath: /tmp + restartPolicy: Never + volumes: + - name: huggingface-cache + persistentVolumeClaim: + claimName: huggingface-cache-pvc + - name: scripts + configMap: + name: downloader-script + defaultMode: 0777 + - name: temp + emptyDir: {} From 8ef4cc52e75e9f76ce41314db69af9ee4d4c65ff Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Tue, 10 Feb 2026 20:01:12 +0100 Subject: [PATCH 11/39] feat: add model provider for hf --- providers/model_provider.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/providers/model_provider.py 
b/providers/model_provider.py index faf6e5c..68daafe 100644 --- a/providers/model_provider.py +++ b/providers/model_provider.py @@ -2,3 +2,27 @@ def mlflow(artifact_uri: str) -> str: import mlflow.artifacts return mlflow.artifacts.download_artifacts(artifact_uri=artifact_uri) + + +def huggingface(repo_id: str, filename: str | None = None) -> str: + import os + + from huggingface_hub import hf_hub_download, snapshot_download + + cache_dir = os.environ.get("HF_HOME", "/mnt/huggingface_cache") + os.makedirs(cache_dir, exist_ok=True) + os.environ["HF_HOME"] = cache_dir + + if filename: + return hf_hub_download( + repo_id=repo_id, + filename=filename, + cache_dir=cache_dir, + local_files_only=True, + ) + else: + return snapshot_download( + repo_id=repo_id, + cache_dir=cache_dir, + local_files_only=True, + ) From a6c427eb99a7e7fa1015dc3f405c4f7568ca0650 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Tue, 10 Feb 2026 20:01:29 +0100 Subject: [PATCH 12/39] feat: add pvc for huggingface --- pvc/huggingface-pvc.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 pvc/huggingface-pvc.yaml diff --git a/pvc/huggingface-pvc.yaml b/pvc/huggingface-pvc.yaml new file mode 100644 index 0000000..8cf3047 --- /dev/null +++ b/pvc/huggingface-pvc.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: huggingface-cache-pvc + namespace: rationai-jobs-ns +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 15Gi + storageClassName: nfs-csi From 27c78018bfe4fb92c451b8db06df7c4646cfa730 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Tue, 10 Feb 2026 20:56:02 +0100 Subject: [PATCH 13/39] feat: add virchow2 model --- models/virchow2.py | 135 +++++++++++++++++++++++++++++++++++++++++++++ ray-service.yaml | 47 ++++++++++++++++ 2 files changed, 182 insertions(+) create mode 100644 models/virchow2.py diff --git a/models/virchow2.py b/models/virchow2.py new file mode 100644 index 
0000000..fb8ecc0 --- /dev/null +++ b/models/virchow2.py @@ -0,0 +1,135 @@ +import asyncio +from typing import Any, TypedDict + +import numpy as np +import timm +import torch +from fastapi import FastAPI, Request +from numpy.typing import NDArray +from ray import serve +from timm.data.config import resolve_data_config +from timm.data.transforms_factory import create_transform +from timm.layers.mlp import SwiGLUPacked + + +class Config(TypedDict): + tile_size: int + model: dict[str, Any] + max_batch_size: int + batch_wait_timeout_s: float + + +fastapi = FastAPI() + + +@serve.deployment( + num_replicas="auto", + ray_actor_options={"num_gpus": 1}, +) +@serve.ingress(fastapi) +class Virchow2: + """Virchow2 foundation model for pathology.""" + + def __init__(self) -> None: + import os + + import lz4.frame + + # Enforce offline mode for timm/huggingface_hub + os.environ["HF_HUB_OFFLINE"] = "1" + + self.lz4 = lz4.frame + self.model: torch.nn.Module | None = None + self.transforms: Any = None + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + def reconfigure(self, config: Config) -> None: + import importlib + import logging + + logger = logging.getLogger("ray.serve") + self.tile_size = config["tile_size"] + + # Load model using the provider + module_path, attr_name = config["model"].pop("_target_").split(":") + provider = getattr(importlib.import_module(module_path), attr_name) + repo_id = config["model"]["repo_id"] + + logger.info(f"Loading Virchow2 model from {repo_id}...") + provider(**config["model"]) + + self.model = timm.create_model( + f"hf-hub:{repo_id}", + pretrained=True, + num_classes=0, + mlp_layer=SwiGLUPacked, + act_layer=torch.nn.SiLU, + ) + self.model = self.model.to(self.device).eval() + logger.info("Virchow2 model loaded and moved to GPU.") + + self.transforms = create_transform( + **resolve_data_config(self.model.pretrained_cfg, model=self.model) + ) + + # Configure batching + 
self.predict.set_max_batch_size(config["max_batch_size"]) # type: ignore[attr-defined] + self.predict.set_batch_wait_timeout_s(config["batch_wait_timeout_s"]) # type: ignore[attr-defined] + + # Warmup + logger.info("Starting warmup...") + with ( + torch.inference_mode(), + torch.autocast(device_type="cuda", dtype=torch.float16), + ): + # Create a dummy input tensor + dummy_input = torch.randn( + 1, 3, self.tile_size, self.tile_size, device=self.device + ) + self.model(dummy_input) + logger.info("Warmup complete.") + + @serve.batch + async def predict(self, images: list[NDArray[np.uint8]]) -> list[list[float]]: + from PIL import Image + + if self.model is None or self.transforms is None: + raise RuntimeError("Model or transforms not initialized") + + # Convert numpy arrays to PIL Images and apply transforms + pil_images = [Image.fromarray(img) for img in images] + tensors = torch.stack([self.transforms(img) for img in pil_images]).to( + self.device + ) + + with ( + torch.inference_mode(), + torch.autocast(device_type="cuda", dtype=torch.float16), + ): + output = self.model(tensors) + + class_token = output[:, 0] # size: batch x 1280 + patch_tokens = output[ + :, 5: + ] # size: batch x 256 x 1280 (skip register tokens 1-4) + embedding = torch.cat( + [class_token, patch_tokens.mean(1)], dim=-1 + ) # size: batch x 2560 + embedding = embedding.to(torch.float16) + + return embedding.cpu().tolist() + + @fastapi.post("/") + async def root(self, request: Request) -> list[float]: + data = await asyncio.to_thread(self.lz4.decompress, await request.body()) + + # Reshape to (height, width, channels) - RGB image + image = np.frombuffer(data, dtype=np.uint8).reshape( + self.tile_size, self.tile_size, 3 + ) + + results = await self.predict(image) + return results[0] + + +app = Virchow2.bind() diff --git a/ray-service.yaml b/ray-service.yaml index 4361544..4eeecd4 100644 --- a/ray-service.yaml +++ b/ray-service.yaml @@ -84,6 +84,43 @@ spec: num_threads: 8 max_concurrent_tasks: 24 + 
- name: virchow2 + import_path: models.virchow2:app + route_prefix: /virchow2 + runtime_env: + config: + setup_timeout_seconds: 1800 + working_dir: https://gitlab.ics.muni.cz/rationai/infrastructure/model-service/-/archive/master/model-service-master.zip + pip: + - https://download.pytorch.org/whl/cu118/torch-2.4.0%2Bcu118-cp312-cp312-linux_x86_64.whl + - https://download.pytorch.org/whl/cu118/torchvision-0.19.0%2Bcu118-cp312-cp312-linux_x86_64.whl + - timm>=1.0.0 + - huggingface-hub>=0.23.0 + env_vars: + HF_TOKEN: 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 + deployments: + - name: Virchow2 + max_ongoing_requests: 32 + max_queued_requests: 64 + autoscaling_config: + min_replicas: 0 + max_replicas: 1 + target_ongoing_requests: 16 + ray_actor_options: + num_cpus: 4 + num_gpus: 1 + memory: 8589934592 # 8 GiB + runtime_env: + env_vars: + HF_HOME: "/mnt/huggingface_cache" + user_config: + tile_size: 224 + max_batch_size: 16 + batch_wait_timeout_s: 0.1 + model: + _target_: providers.model_provider:huggingface + repo_id: paige-ai/Virchow2 + rayClusterConfig: rayVersion: 2.53.0 enableInTreeAutoscaling: true @@ -181,6 +218,8 @@ spec: mountPath: /mnt/bioptic_tree - name: trt-cache-volume mountPath: /mnt/cache + - name: huggingface-cache + mountPath: /mnt/huggingface_cache volumes: - name: data @@ -198,6 +237,9 @@ spec: - name: trt-cache-volume persistentVolumeClaim: claimName: tensorrt-cache-pvc + - name: huggingface-cache + persistentVolumeClaim: + claimName: huggingface-cache-pvc - groupName: gpu-workers replicas: 0 @@ -249,6 +291,8 @@ spec: mountPath: /mnt/bioptic_tree - name: trt-cache-volume mountPath: /mnt/cache + - name: huggingface-cache + mountPath: /mnt/huggingface_cache volumes: - name: data @@ -266,3 +310,6 @@ spec: - name: trt-cache-volume persistentVolumeClaim: claimName: tensorrt-cache-pvc + - name: huggingface-cache + persistentVolumeClaim: + claimName: huggingface-cache-pvc From e5d84cbdd645b6659fe98d014cd50739fad5e649 Mon Sep 17 00:00:00 2001 From: JiriStipek 
<567776@muni.cz> Date: Tue, 10 Feb 2026 21:16:01 +0100 Subject: [PATCH 14/39] fix --- models/virchow2.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/models/virchow2.py b/models/virchow2.py index fb8ecc0..95afe29 100644 --- a/models/virchow2.py +++ b/models/virchow2.py @@ -22,10 +22,7 @@ class Config(TypedDict): fastapi = FastAPI() -@serve.deployment( - num_replicas="auto", - ray_actor_options={"num_gpus": 1}, -) +@serve.deployment(num_replicas="auto") @serve.ingress(fastapi) class Virchow2: """Virchow2 foundation model for pathology.""" @@ -129,7 +126,7 @@ async def root(self, request: Request) -> list[float]: ) results = await self.predict(image) - return results[0] + return results # type: ignore[attr-defined] -app = Virchow2.bind() +app = Virchow2.bind() # type: ignore[attr-defined] From e1fcb6ca9c88fd77ea52aea7c78aa9365ecf1086 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Tue, 10 Feb 2026 21:26:37 +0100 Subject: [PATCH 15/39] fix: fine tune --- models/virchow2.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/models/virchow2.py b/models/virchow2.py index 95afe29..781968b 100644 --- a/models/virchow2.py +++ b/models/virchow2.py @@ -39,6 +39,7 @@ def __init__(self) -> None: self.model: torch.nn.Module | None = None self.transforms: Any = None self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.tile_size: int = 0 def reconfigure(self, config: Config) -> None: import importlib @@ -55,6 +56,7 @@ def reconfigure(self, config: Config) -> None: logger.info(f"Loading Virchow2 model from {repo_id}...") provider(**config["model"]) + # Load model with official architecture self.model = timm.create_model( f"hf-hub:{repo_id}", pretrained=True, @@ -63,27 +65,28 @@ def reconfigure(self, config: Config) -> None: act_layer=torch.nn.SiLU, ) self.model = self.model.to(self.device).eval() - logger.info("Virchow2 model loaded and moved to GPU.") + # Get 
transforms from model config self.transforms = create_transform( **resolve_data_config(self.model.pretrained_cfg, model=self.model) ) + logger.info("Virchow2 model loaded and moved to GPU.") + # Configure batching self.predict.set_max_batch_size(config["max_batch_size"]) # type: ignore[attr-defined] self.predict.set_batch_wait_timeout_s(config["batch_wait_timeout_s"]) # type: ignore[attr-defined] # Warmup logger.info("Starting warmup...") + dummy_batch = torch.randn( + 1, 3, self.tile_size, self.tile_size, device=self.device + ) with ( torch.inference_mode(), torch.autocast(device_type="cuda", dtype=torch.float16), ): - # Create a dummy input tensor - dummy_input = torch.randn( - 1, 3, self.tile_size, self.tile_size, device=self.device - ) - self.model(dummy_input) + self.model(dummy_batch) logger.info("Warmup complete.") @serve.batch @@ -105,16 +108,18 @@ async def predict(self, images: list[NDArray[np.uint8]]) -> list[list[float]]: ): output = self.model(tensors) - class_token = output[:, 0] # size: batch x 1280 + # Extract embeddings as per official model card + class_token = output[:, 0] # CLS token: batch x 1280 patch_tokens = output[ :, 5: - ] # size: batch x 256 x 1280 (skip register tokens 1-4) + ] # Skip register tokens (1-4): batch x 256 x 1280 + + # Concatenate CLS token with mean of patch tokens embedding = torch.cat( - [class_token, patch_tokens.mean(1)], dim=-1 - ) # size: batch x 2560 - embedding = embedding.to(torch.float16) + [class_token, patch_tokens.mean(dim=1)], dim=-1 + ) # batch x 2560 - return embedding.cpu().tolist() + return embedding.cpu().float().tolist() @fastapi.post("/") async def root(self, request: Request) -> list[float]: From e7ac073d87c0e0173a1f0c23fbd97126a537fc2e Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Sat, 14 Feb 2026 11:30:19 +0100 Subject: [PATCH 16/39] feat: add into dockerfile --- docker/Dockerfile.gpu | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile.gpu 
b/docker/Dockerfile.gpu index f5bf633..f0f29b6 100644 --- a/docker/Dockerfile.gpu +++ b/docker/Dockerfile.gpu @@ -53,4 +53,9 @@ RUN sudo sh -c 'echo "/usr/local/lib" > /etc/ld.so.conf.d/custom-libs.conf' && \ sudo sh -c 'echo "/home/ray/anaconda3/lib/python3.12/site-packages/nvidia/cudnn/lib" > /etc/ld.so.conf.d/nvidia-libs.conf' && \ sudo ldconfig -RUN pip install --no-cache-dir onnxruntime-gpu tensorrt lz4 ratiopath "mlflow<3.0" +RUN pip install --no-cache-dir \ + onnxruntime-gpu tensorrt lz4 ratiopath "mlflow<3.0" \ + https://download.pytorch.org/whl/cu118/torch-2.4.0%2Bcu118-cp312-cp312-linux_x86_64.whl \ + https://download.pytorch.org/whl/cu118/torchvision-0.19.0%2Bcu118-cp312-cp312-linux_x86_64.whl \ + "timm>=1.0.0" \ + "huggingface-hub>=0.23.0" From 51f07a4befaddb62d5b1a50ca4deba77e206d5bc Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Sat, 14 Feb 2026 11:31:28 +0100 Subject: [PATCH 17/39] fix: remove installs from model --- ray-service.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/ray-service.yaml b/ray-service.yaml index 4eeecd4..e795857 100644 --- a/ray-service.yaml +++ b/ray-service.yaml @@ -63,7 +63,6 @@ spec: _target_: providers.model_provider:mlflow artifact_uri: mlflow-artifacts:/10/39f821ed5b964c71a603cc6db196f9fd/artifacts/checkpoints/epoch=19-step=32020/model.onnx/model.onnx - - name: heatmap-builder import_path: builders.heatmap_builder:app route_prefix: /heatmap-builder @@ -91,11 +90,6 @@ spec: config: setup_timeout_seconds: 1800 working_dir: https://gitlab.ics.muni.cz/rationai/infrastructure/model-service/-/archive/master/model-service-master.zip - pip: - - https://download.pytorch.org/whl/cu118/torch-2.4.0%2Bcu118-cp312-cp312-linux_x86_64.whl - - https://download.pytorch.org/whl/cu118/torchvision-0.19.0%2Bcu118-cp312-cp312-linux_x86_64.whl - - timm>=1.0.0 - - huggingface-hub>=0.23.0 env_vars: HF_TOKEN: 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 deployments: From 178f226ce73223911b6bc84f1b04cd622c6f6d88 Mon Sep 17 
00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Sat, 14 Feb 2026 11:31:55 +0100 Subject: [PATCH 18/39] fix: based on official docs --- models/virchow2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/models/virchow2.py b/models/virchow2.py index 781968b..48fd549 100644 --- a/models/virchow2.py +++ b/models/virchow2.py @@ -119,7 +119,8 @@ async def predict(self, images: list[NDArray[np.uint8]]) -> list[list[float]]: [class_token, patch_tokens.mean(dim=1)], dim=-1 ) # batch x 2560 - return embedding.cpu().float().tolist() + # Convert to fp16 for efficiency as recommended in official docs + return embedding.half().cpu().tolist() @fastapi.post("/") async def root(self, request: Request) -> list[float]: From 5cc123f1096ed7c559a5a9fd6ed3358e597bc4ec Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Sat, 14 Feb 2026 11:32:20 +0100 Subject: [PATCH 19/39] fix --- models/virchow2.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/models/virchow2.py b/models/virchow2.py index 48fd549..50717af 100644 --- a/models/virchow2.py +++ b/models/virchow2.py @@ -73,7 +73,6 @@ def reconfigure(self, config: Config) -> None: logger.info("Virchow2 model loaded and moved to GPU.") - # Configure batching self.predict.set_max_batch_size(config["max_batch_size"]) # type: ignore[attr-defined] self.predict.set_batch_wait_timeout_s(config["batch_wait_timeout_s"]) # type: ignore[attr-defined] @@ -96,7 +95,6 @@ async def predict(self, images: list[NDArray[np.uint8]]) -> list[list[float]]: if self.model is None or self.transforms is None: raise RuntimeError("Model or transforms not initialized") - # Convert numpy arrays to PIL Images and apply transforms pil_images = [Image.fromarray(img) for img in images] tensors = torch.stack([self.transforms(img) for img in pil_images]).to( self.device @@ -119,7 +117,6 @@ async def predict(self, images: list[NDArray[np.uint8]]) -> list[list[float]]: [class_token, patch_tokens.mean(dim=1)], dim=-1 ) # batch x 2560 - 
# Convert to fp16 for efficiency as recommended in official docs return embedding.half().cpu().tolist() @fastapi.post("/") From 964114e73e43ac12160b144b6628eece5ea550e0 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@muni.cz> Date: Sat, 14 Feb 2026 11:45:18 +0100 Subject: [PATCH 20/39] fix: remove comment --- models/virchow2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/virchow2.py b/models/virchow2.py index 50717af..bcf165e 100644 --- a/models/virchow2.py +++ b/models/virchow2.py @@ -129,7 +129,7 @@ async def root(self, request: Request) -> list[float]: ) results = await self.predict(image) - return results # type: ignore[attr-defined] + return results app = Virchow2.bind() # type: ignore[attr-defined] From 181f79e7ed4cf6c42f20ac38fc9665443cff0098 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Fri, 13 Mar 2026 20:35:16 +0100 Subject: [PATCH 21/39] chore: update docker gpu file --- docker/Dockerfile.cpu | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index ea4e4dc..9538fda 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -51,4 +51,8 @@ RUN sudo apt-get update && sudo apt-get -y upgrade && \ # Cleanup RUN sudo apt-get remove -y --purge systemd systemd-sysv && sudo apt-get autoremove --purge -y && sudo apt-get clean && sudo rm -rf /var/lib/apt/lists/* -RUN pip install --no-cache-dir onnxruntime lz4 ratiopath "mlflow<3.0" +RUN pip install --no-cache-dir \ + onnxruntime lz4 ratiopath "mlflow<3.0" \ + torch==2.4.0 torchvision==0.19.0 \ + "timm>=1.0.0" \ + "huggingface-hub>=0.23.0" From 156a7d47ef59cef03402acf050ec8e6373ac7bb1 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Fri, 13 Mar 2026 20:36:15 +0100 Subject: [PATCH 22/39] feat: optimalize virchow2 deployment --- ray-service.yaml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/ray-service.yaml b/ray-service.yaml 
index e795857..7f7386f 100644 --- a/ray-service.yaml +++ b/ray-service.yaml @@ -94,14 +94,14 @@ spec: HF_TOKEN: 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 deployments: - name: Virchow2 - max_ongoing_requests: 32 - max_queued_requests: 64 + max_ongoing_requests: 160 + max_queued_requests: 256 autoscaling_config: min_replicas: 0 max_replicas: 1 - target_ongoing_requests: 16 + target_ongoing_requests: 128 ray_actor_options: - num_cpus: 4 + num_cpus: 8 num_gpus: 1 memory: 8589934592 # 8 GiB runtime_env: @@ -109,8 +109,8 @@ spec: HF_HOME: "/mnt/huggingface_cache" user_config: tile_size: 224 - max_batch_size: 16 - batch_wait_timeout_s: 0.1 + max_batch_size: 128 + batch_wait_timeout_s: 0.05 model: _target_: providers.model_provider:huggingface repo_id: paige-ai/Virchow2 @@ -139,15 +139,15 @@ spec: type: RuntimeDefault containers: - name: ray-head - image: rayproject/ray:2.53.0-py312 + image: cerit.io/rationai/model-service:2.53.0 imagePullPolicy: Always resources: limits: cpu: 0 - memory: 4Gi + memory: 8Gi requests: cpu: 0 - memory: 4Gi + memory: 8Gi env: - name: HTTPS_PROXY value: http://proxy.ics.muni.cz:3128 @@ -239,6 +239,8 @@ spec: replicas: 0 minReplicas: 0 maxReplicas: 2 + rayStartParams: + num-gpus: "1" template: spec: securityContext: @@ -248,8 +250,6 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A40 containers: - name: ray-worker image: cerit.io/rationai/model-service:2.53.0-gpu @@ -258,7 +258,7 @@ spec: limits: cpu: 8 memory: 24Gi - nvidia.com/gpu: 1 + nvidia.com/mig-2g.20gb: 1 requests: cpu: 8 memory: 24Gi From 57176d64fb5dcf32ca475a1c304a0f8b3b632f45 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Sat, 14 Mar 2026 11:38:19 +0100 Subject: [PATCH 23/39] fix: remove hf token, create new secret --- misc/virchow2_downloader/virchow2_downloader_job.yaml | 5 ++++- ray-service.yaml | 7 +++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git 
a/misc/virchow2_downloader/virchow2_downloader_job.yaml b/misc/virchow2_downloader/virchow2_downloader_job.yaml index 293401a..2fe2647 100644 --- a/misc/virchow2_downloader/virchow2_downloader_job.yaml +++ b/misc/virchow2_downloader/virchow2_downloader_job.yaml @@ -35,7 +35,10 @@ spec: - name: HOME value: /tmp - name: HF_TOKEN - value: "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" + valueFrom: + secretKeyRef: + name: huggingface-secret + key: token - name: HTTPS_PROXY value: "http://proxy.ics.muni.cz:3128" - name: HTTP_PROXY diff --git a/ray-service.yaml b/ray-service.yaml index 7f7386f..303381c 100644 --- a/ray-service.yaml +++ b/ray-service.yaml @@ -90,8 +90,6 @@ spec: config: setup_timeout_seconds: 1800 working_dir: https://gitlab.ics.muni.cz/rationai/infrastructure/model-service/-/archive/master/model-service-master.zip - env_vars: - HF_TOKEN: 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 deployments: - name: Virchow2 max_ongoing_requests: 160 @@ -265,6 +263,11 @@ spec: env: - name: HTTPS_PROXY value: http://proxy.ics.muni.cz:3128 + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: huggingface-secret + key: token securityContext: allowPrivilegeEscalation: false capabilities: From fe51ee24cefed04bcf8d2dc2b2006415092110d1 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Sat, 14 Mar 2026 11:43:27 +0100 Subject: [PATCH 24/39] fix --- misc/virchow2_downloader/virchow2_downloader_job.yaml | 4 ++-- models/binary_classifier.py | 2 -- models/semantic_segmentation.py | 2 -- ray-service.yaml | 1 - 4 files changed, 2 insertions(+), 7 deletions(-) diff --git a/misc/virchow2_downloader/virchow2_downloader_job.yaml b/misc/virchow2_downloader/virchow2_downloader_job.yaml index 2fe2647..e6c77af 100644 --- a/misc/virchow2_downloader/virchow2_downloader_job.yaml +++ b/misc/virchow2_downloader/virchow2_downloader_job.yaml @@ -14,7 +14,7 @@ spec: type: RuntimeDefault containers: - name: downloader - image: python:3.10 + image: python:3.12 resources: requests: memory: 
"4Gi" @@ -62,6 +62,6 @@ spec: - name: scripts configMap: name: downloader-script - defaultMode: 0777 + defaultMode: 0755 - name: temp emptyDir: {} diff --git a/models/binary_classifier.py b/models/binary_classifier.py index 2b807ab..e774713 100644 --- a/models/binary_classifier.py +++ b/models/binary_classifier.py @@ -15,7 +15,6 @@ class Config(TypedDict): model: dict[str, Any] max_batch_size: int batch_wait_timeout_s: float - intra_op_num_threads: int fastapi = FastAPI() @@ -67,7 +66,6 @@ def reconfigure(self, config: Config) -> None: # Configure ONNX Runtime session sess_options = ort.SessionOptions() - sess_options.intra_op_num_threads = config["intra_op_num_threads"] sess_options.inter_op_num_threads = 1 # Enable graph optimizations diff --git a/models/semantic_segmentation.py b/models/semantic_segmentation.py index 650e739..9cfebd0 100644 --- a/models/semantic_segmentation.py +++ b/models/semantic_segmentation.py @@ -13,7 +13,6 @@ class Config(TypedDict): model: dict[str, Any] max_batch_size: int batch_wait_timeout_s: float - intra_op_num_threads: int fastapi = FastAPI() @@ -63,7 +62,6 @@ def reconfigure(self, config: Config) -> None: # Configure ONNX Runtime session sess_options = ort.SessionOptions() - sess_options.intra_op_num_threads = config["intra_op_num_threads"] sess_options.inter_op_num_threads = 1 # Enable graph optimizations diff --git a/ray-service.yaml b/ray-service.yaml index 303381c..ef2561d 100644 --- a/ray-service.yaml +++ b/ray-service.yaml @@ -29,7 +29,6 @@ spec: tile_size: 512 max_batch_size: 16 batch_wait_timeout_s: 0.01 - intra_op_num_threads: 4 model: _target_: providers.model_provider:mlflow artifact_uri: mlflow-artifacts:/65/aebc892f526047249b972f200bef4381/artifacts/checkpoints/epoch=0-step=6972/prostate_model_norm.onnx From 210c7e6af680ce6e4ac3eb9872fc9a6441d9c6d2 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Sat, 14 Mar 2026 11:53:43 +0100 Subject: [PATCH 25/39] fix: remove intra threads --- 
models/semantic_segmentation.py | 1 - ray-service.yaml | 1 - 2 files changed, 2 deletions(-) diff --git a/models/semantic_segmentation.py b/models/semantic_segmentation.py index 7e7845d..21a0674 100644 --- a/models/semantic_segmentation.py +++ b/models/semantic_segmentation.py @@ -13,7 +13,6 @@ class Config(TypedDict): model: dict[str, Any] max_batch_size: int batch_wait_timeout_s: float - intra_op_num_threads: int trt_cache_path: str diff --git a/ray-service.yaml b/ray-service.yaml index 99596d7..dbcd32a 100644 --- a/ray-service.yaml +++ b/ray-service.yaml @@ -32,7 +32,6 @@ spec: tile_size: 512 max_batch_size: 16 batch_wait_timeout_s: 0.01 - intra_op_num_threads: 4 trt_max_workspace_size: 8589934592 # 8 GiB trt_cache_path: /mnt/cache/trt_cache model: From bf7cff13f234f29b2bff5c08248ff57d44b9c566 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Sat, 14 Mar 2026 12:00:27 +0100 Subject: [PATCH 26/39] fix: lint --- models/binary_classifier.py | 2 +- models/semantic_segmentation.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/models/binary_classifier.py b/models/binary_classifier.py index 6edd3c1..4a844a7 100644 --- a/models/binary_classifier.py +++ b/models/binary_classifier.py @@ -68,7 +68,7 @@ def reconfigure(self, config: Config) -> None: "trt_engine_cache_path": cache_path, "trt_max_workspace_size": config.get( "trt_max_workspace_size", 8 * 1024 * 1024 * 1024 - ), # type: ignore[typeddict-item] + ), "trt_builder_optimization_level": 5, "trt_timing_cache_enable": True, "trt_profile_min_shapes": min_shape, diff --git a/models/semantic_segmentation.py b/models/semantic_segmentation.py index 21a0674..cbc916f 100644 --- a/models/semantic_segmentation.py +++ b/models/semantic_segmentation.py @@ -64,7 +64,7 @@ def reconfigure(self, config: Config) -> None: "trt_engine_cache_path": cache_path, "trt_max_workspace_size": config.get( "trt_max_workspace_size", 8 * 1024 * 1024 * 1024 - ), # type: ignore[typeddict-item] + ), 
"trt_builder_optimization_level": 5, "trt_timing_cache_enable": True, "trt_profile_min_shapes": min_shape, From 6813264567a08f2ca2496a111ba6a68c297d095a Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Sat, 14 Mar 2026 12:06:35 +0100 Subject: [PATCH 27/39] fix: remove duplicity --- ray-service.yaml | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/ray-service.yaml b/ray-service.yaml index dbcd32a..31b37de 100644 --- a/ray-service.yaml +++ b/ray-service.yaml @@ -19,9 +19,6 @@ spec: max_replicas: 4 target_ongoing_requests: 32 ray_actor_options: - num_cpus: 4 - num_gpus: 1 - memory: 4294967296 # 4 GiB num_cpus: 4 num_gpus: 1 memory: 4294967296 # 4 GiB @@ -37,7 +34,6 @@ spec: model: _target_: providers.model_provider:mlflow artifact_uri: mlflow-artifacts:/65/aebc892f526047249b972f200bef4381/artifacts/checkpoints/epoch=0-step=6972/prostate_model_norm.onnx - artifact_uri: mlflow-artifacts:/65/aebc892f526047249b972f200bef4381/artifacts/checkpoints/epoch=0-step=6972/prostate_model_norm.onnx - name: episeg-1 import_path: models.semantic_segmentation:app @@ -48,19 +44,14 @@ spec: - name: SemanticSegmentation max_ongoing_requests: 16 max_queued_requests: 32 - max_queued_requests: 32 autoscaling_config: min_replicas: 0 max_replicas: 2 target_ongoing_requests: 8 - max_replicas: 2 - target_ongoing_requests: 8 ray_actor_options: - num_cpus: 4 num_cpus: 4 memory: 12884901888 # 12 GiB num_gpus: 1 - num_gpus: 1 runtime_env: env_vars: MLFLOW_TRACKING_URI: http://mlflow.rationai-mlflow:5000 @@ -87,10 +78,8 @@ spec: autoscaling_config: min_replicas: 0 max_replicas: 4 - max_replicas: 4 target_ongoing_requests: 2 ray_actor_options: - num_cpus: 8 num_cpus: 8 memory: 12884901888 # 12 GiB user_config: From 7510c9f3359946be51a24c222ef28f666143e027 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Sat, 14 Mar 2026 12:14:10 +0100 Subject: [PATCH 28/39] fixes --- docker/Dockerfile.gpu | 4 ++-- 
.../virchow2_downloader/virchow2_downloader_job.yaml | 2 -- models/binary_classifier.py | 5 ----- models/semantic_segmentation.py | 6 ------ models/virchow2.py | 12 ------------ 5 files changed, 2 insertions(+), 27 deletions(-) diff --git a/docker/Dockerfile.gpu b/docker/Dockerfile.gpu index f0f29b6..9766c77 100644 --- a/docker/Dockerfile.gpu +++ b/docker/Dockerfile.gpu @@ -55,7 +55,7 @@ RUN sudo sh -c 'echo "/usr/local/lib" > /etc/ld.so.conf.d/custom-libs.conf' && \ RUN pip install --no-cache-dir \ onnxruntime-gpu tensorrt lz4 ratiopath "mlflow<3.0" \ - https://download.pytorch.org/whl/cu118/torch-2.4.0%2Bcu118-cp312-cp312-linux_x86_64.whl \ - https://download.pytorch.org/whl/cu118/torchvision-0.19.0%2Bcu118-cp312-cp312-linux_x86_64.whl \ + https://download.pytorch.org/whl/cu121/torch-2.4.0%2Bcu121-cp312-cp312-linux_x86_64.whl \ + https://download.pytorch.org/whl/cu121/torchvision-0.19.0%2Bcu121-cp312-cp312-linux_x86_64.whl \ "timm>=1.0.0" \ "huggingface-hub>=0.23.0" diff --git a/misc/virchow2_downloader/virchow2_downloader_job.yaml b/misc/virchow2_downloader/virchow2_downloader_job.yaml index e6c77af..570e3d0 100644 --- a/misc/virchow2_downloader/virchow2_downloader_job.yaml +++ b/misc/virchow2_downloader/virchow2_downloader_job.yaml @@ -41,8 +41,6 @@ spec: key: token - name: HTTPS_PROXY value: "http://proxy.ics.muni.cz:3128" - - name: HTTP_PROXY - value: "http://proxy.ics.muni.cz:3128" - name: TORCH_HOME value: /tmp/torch - name: TORCHINDUCTOR_CACHE_DIR diff --git a/models/binary_classifier.py b/models/binary_classifier.py index 4a844a7..a343e33 100644 --- a/models/binary_classifier.py +++ b/models/binary_classifier.py @@ -111,11 +111,6 @@ def reconfigure(self, config: Config) -> None: self.predict.set_max_batch_size(config["max_batch_size"]) # type: ignore[attr-defined] self.predict.set_batch_wait_timeout_s(config["batch_wait_timeout_s"]) # type: ignore[attr-defined] - dummy_shape = (config["max_batch_size"], 3, self.tile_size, self.tile_size) - dummy_input = 
np.random.randint(0, 256, dummy_shape, dtype=np.uint8) - - self.session.run([self.output_name], {self.input_name: dummy_input}) - @serve.batch async def predict(self, images: list[NDArray[np.uint8]]) -> list[float]: """Run inference on a batch of images.""" diff --git a/models/semantic_segmentation.py b/models/semantic_segmentation.py index cbc916f..951821d 100644 --- a/models/semantic_segmentation.py +++ b/models/semantic_segmentation.py @@ -106,12 +106,6 @@ def reconfigure(self, config: Config) -> None: self.predict.set_max_batch_size(config["max_batch_size"]) # type: ignore[attr-defined] self.predict.set_batch_wait_timeout_s(config["batch_wait_timeout_s"]) # type: ignore[attr-defined] - # Warmup - dummy_shape = (config["max_batch_size"], 3, self.tile_size, self.tile_size) - dummy_input = np.random.randint(0, 256, dummy_shape, dtype=np.uint8) - - self.session.run([self.output_name], {self.input_name: dummy_input}) - def get_config(self) -> dict[str, Any]: return {"tile_size": self.tile_size, "mpp": self.mpp} diff --git a/models/virchow2.py b/models/virchow2.py index bcf165e..e3d2abf 100644 --- a/models/virchow2.py +++ b/models/virchow2.py @@ -76,18 +76,6 @@ def reconfigure(self, config: Config) -> None: self.predict.set_max_batch_size(config["max_batch_size"]) # type: ignore[attr-defined] self.predict.set_batch_wait_timeout_s(config["batch_wait_timeout_s"]) # type: ignore[attr-defined] - # Warmup - logger.info("Starting warmup...") - dummy_batch = torch.randn( - 1, 3, self.tile_size, self.tile_size, device=self.device - ) - with ( - torch.inference_mode(), - torch.autocast(device_type="cuda", dtype=torch.float16), - ): - self.model(dummy_batch) - logger.info("Warmup complete.") - @serve.batch async def predict(self, images: list[NDArray[np.uint8]]) -> list[list[float]]: from PIL import Image From 2eae5032f765d3d0cae641c0993fe551c62b45d5 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Sat, 14 Mar 2026 13:38:36 +0100 Subject: [PATCH 29/39] 
docker files --- docker/Dockerfile.cpu | 5 +---- docker/Dockerfile.gpu | 6 ++---- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 9538fda..c517093 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -52,7 +52,4 @@ RUN sudo apt-get update && sudo apt-get -y upgrade && \ RUN sudo apt-get remove -y --purge systemd systemd-sysv && sudo apt-get autoremove --purge -y && sudo apt-get clean && sudo rm -rf /var/lib/apt/lists/* RUN pip install --no-cache-dir \ - onnxruntime lz4 ratiopath "mlflow<3.0" \ - torch==2.4.0 torchvision==0.19.0 \ - "timm>=1.0.0" \ - "huggingface-hub>=0.23.0" + onnxruntime lz4 ratiopath "mlflow<3.0" diff --git a/docker/Dockerfile.gpu b/docker/Dockerfile.gpu index 9766c77..310096f 100644 --- a/docker/Dockerfile.gpu +++ b/docker/Dockerfile.gpu @@ -53,9 +53,7 @@ RUN sudo sh -c 'echo "/usr/local/lib" > /etc/ld.so.conf.d/custom-libs.conf' && \ sudo sh -c 'echo "/home/ray/anaconda3/lib/python3.12/site-packages/nvidia/cudnn/lib" > /etc/ld.so.conf.d/nvidia-libs.conf' && \ sudo ldconfig -RUN pip install --no-cache-dir \ - onnxruntime-gpu tensorrt lz4 ratiopath "mlflow<3.0" \ - https://download.pytorch.org/whl/cu121/torch-2.4.0%2Bcu121-cp312-cp312-linux_x86_64.whl \ - https://download.pytorch.org/whl/cu121/torchvision-0.19.0%2Bcu121-cp312-cp312-linux_x86_64.whl \ +RUN pip install --no-cache-dir onnxruntime-gpu tensorrt lz4 ratiopath "mlflow<3.0" \ + torch==2.4.0 torchvision==0.19.0 \ "timm>=1.0.0" \ "huggingface-hub>=0.23.0" From c5095bd767f49bb85dfcc9c7d758448205755e50 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Sat, 14 Mar 2026 14:26:02 +0100 Subject: [PATCH 30/39] fix: docker --- docker/Dockerfile.gpu | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile.gpu b/docker/Dockerfile.gpu index 310096f..a50a183 100644 --- a/docker/Dockerfile.gpu +++ b/docker/Dockerfile.gpu @@ -53,7 +53,14 @@ RUN sudo sh -c 'echo 
"/usr/local/lib" > /etc/ld.so.conf.d/custom-libs.conf' && \ sudo sh -c 'echo "/home/ray/anaconda3/lib/python3.12/site-packages/nvidia/cudnn/lib" > /etc/ld.so.conf.d/nvidia-libs.conf' && \ sudo ldconfig -RUN pip install --no-cache-dir onnxruntime-gpu tensorrt lz4 ratiopath "mlflow<3.0" \ - torch==2.4.0 torchvision==0.19.0 \ +RUN pip install --no-cache-dir \ + --extra-index-url https://pypi.nvidia.com \ + onnxruntime-gpu tensorrt-cu12==10.3.0 lz4 ratiopath "mlflow<3.0" + +RUN pip install --no-cache-dir \ + torch==2.4.0+cu121 torchvision==0.19.0+cu121 \ + --index-url https://download.pytorch.org/whl/cu121 + +RUN pip install --no-cache-dir \ "timm>=1.0.0" \ "huggingface-hub>=0.23.0" From e94baec5b68dec8d56a8d0d15f7426092111f6d0 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Sat, 14 Mar 2026 14:57:03 +0100 Subject: [PATCH 31/39] chore: new docker image --- ray-service.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray-service.yaml b/ray-service.yaml index 31b37de..531def3 100644 --- a/ray-service.yaml +++ b/ray-service.yaml @@ -253,7 +253,7 @@ spec: type: RuntimeDefault containers: - name: ray-worker - image: cerit.io/rationai/model-service:2.53.0-gpu + image: cerit.io/rationai/model-service:latest-gpu imagePullPolicy: Always resources: limits: From 7cdd29081b45e127ecdf152e9b8a0faa55cdebfc Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Sat, 14 Mar 2026 15:29:53 +0100 Subject: [PATCH 32/39] chore: cpu docker --- docker/Dockerfile.cpu | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index c517093..fd9314a 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -52,4 +52,12 @@ RUN sudo apt-get update && sudo apt-get -y upgrade && \ RUN sudo apt-get remove -y --purge systemd systemd-sysv && sudo apt-get autoremove --purge -y && sudo apt-get clean && sudo rm -rf /var/lib/apt/lists/* RUN pip install --no-cache-dir \ - 
onnxruntime lz4 ratiopath "mlflow<3.0" + onnxruntime lz4 ratiopath "mlflow<3.0" "pillow>=11.3.0" + +RUN pip install --no-cache-dir \ + torch==2.4.0+cpu torchvision==0.19.0+cpu \ + --index-url https://download.pytorch.org/whl/cpu + +RUN pip install --no-cache-dir \ + "timm>=1.0.0" \ + "huggingface-hub>=0.23.0" From 8cce2cb7f8aa79ad40a313bcfcb9e8d932566d62 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Sat, 14 Mar 2026 16:14:15 +0100 Subject: [PATCH 33/39] fix --- models/virchow2.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/models/virchow2.py b/models/virchow2.py index e3d2abf..3b01beb 100644 --- a/models/virchow2.py +++ b/models/virchow2.py @@ -2,14 +2,9 @@ from typing import Any, TypedDict import numpy as np -import timm -import torch from fastapi import FastAPI, Request from numpy.typing import NDArray from ray import serve -from timm.data.config import resolve_data_config -from timm.data.transforms_factory import create_transform -from timm.layers.mlp import SwiGLUPacked class Config(TypedDict): @@ -35,8 +30,11 @@ def __init__(self) -> None: # Enforce offline mode for timm/huggingface_hub os.environ["HF_HUB_OFFLINE"] = "1" + import torch + + self.torch = torch self.lz4 = lz4.frame - self.model: torch.nn.Module | None = None + self.model: Any = None self.transforms: Any = None self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.tile_size: int = 0 @@ -45,6 +43,13 @@ def reconfigure(self, config: Config) -> None: import importlib import logging + import timm + from timm.data.config import resolve_data_config + from timm.data.transforms_factory import create_transform + from timm.layers.mlp import SwiGLUPacked + + torch = self.torch + logger = logging.getLogger("ray.serve") self.tile_size = config["tile_size"] @@ -83,6 +88,8 @@ async def predict(self, images: list[NDArray[np.uint8]]) -> list[list[float]]: if self.model is None or self.transforms is None: raise 
RuntimeError("Model or transforms not initialized") + torch = self.torch + pil_images = [Image.fromarray(img) for img in images] tensors = torch.stack([self.transforms(img) for img in pil_images]).to( self.device From 7e329a8c53ae60a554816af55575c86f1c93d7c6 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Sat, 14 Mar 2026 23:50:45 +0100 Subject: [PATCH 34/39] final changes --- docker/Dockerfile.cpu | 10 +--------- ray-service.yaml | 2 +- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index fd9314a..7e51375 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -52,12 +52,4 @@ RUN sudo apt-get update && sudo apt-get -y upgrade && \ RUN sudo apt-get remove -y --purge systemd systemd-sysv && sudo apt-get autoremove --purge -y && sudo apt-get clean && sudo rm -rf /var/lib/apt/lists/* RUN pip install --no-cache-dir \ - onnxruntime lz4 ratiopath "mlflow<3.0" "pillow>=11.3.0" - -RUN pip install --no-cache-dir \ - torch==2.4.0+cpu torchvision==0.19.0+cpu \ - --index-url https://download.pytorch.org/whl/cpu - -RUN pip install --no-cache-dir \ - "timm>=1.0.0" \ - "huggingface-hub>=0.23.0" + onnxruntime lz4 ratiopath "mlflow<3.0" \ \ No newline at end of file diff --git a/ray-service.yaml b/ray-service.yaml index 531def3..ab11465 100644 --- a/ray-service.yaml +++ b/ray-service.yaml @@ -92,7 +92,7 @@ spec: runtime_env: config: setup_timeout_seconds: 1800 - working_dir: https://gitlab.ics.muni.cz/rationai/infrastructure/model-service/-/archive/master/model-service-master.zip + working_dir: https://github.com/RationAI/model-service/archive/refs/heads/feature/virchow2-model.zip deployments: - name: Virchow2 max_ongoing_requests: 160 From 8dfea82424a7773b1ccc56de00db66f8037f9cb6 Mon Sep 17 00:00:00 2001 From: JiriStipek <567776@mail.muni.cz> Date: Sun, 15 Mar 2026 13:03:50 +0100 Subject: [PATCH 35/39] fix: usage of master branch --- ray-service.yaml | 2 +- 1 file changed, 1 insertion(+), 
1 deletion(-) diff --git a/ray-service.yaml b/ray-service.yaml index ab11465..f9a7507 100644 --- a/ray-service.yaml +++ b/ray-service.yaml @@ -92,7 +92,7 @@ spec: runtime_env: config: setup_timeout_seconds: 1800 - working_dir: https://github.com/RationAI/model-service/archive/refs/heads/feature/virchow2-model.zip + working_dir: https://github.com/RationAI/model-service/archive/refs/heads/master.zip deployments: - name: Virchow2 max_ongoing_requests: 160 From e6f8603eaac4e192de8e43ffd424be4bf91dee60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C5=A0t=C3=ADpek?= <91186480+Jurgee@users.noreply.github.com> Date: Sun, 15 Mar 2026 13:10:17 +0100 Subject: [PATCH 36/39] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- docker/Dockerfile.cpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index 7e51375..9f778f5 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -52,4 +52,4 @@ RUN sudo apt-get update && sudo apt-get -y upgrade && \ RUN sudo apt-get remove -y --purge systemd systemd-sysv && sudo apt-get autoremove --purge -y && sudo apt-get clean && sudo rm -rf /var/lib/apt/lists/* RUN pip install --no-cache-dir \ - onnxruntime lz4 ratiopath "mlflow<3.0" \ \ No newline at end of file + onnxruntime lz4 ratiopath "mlflow<3.0" \ No newline at end of file From fb646c4121cdb094fa2bbb8ad6f706ee510227cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C5=A0t=C3=ADpek?= <91186480+Jurgee@users.noreply.github.com> Date: Sun, 15 Mar 2026 13:11:23 +0100 Subject: [PATCH 37/39] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- ray-service.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ray-service.yaml b/ray-service.yaml index f9a7507..6a1d223 100644 --- a/ray-service.yaml +++ b/ray-service.yaml @@ 
-88,7 +88,7 @@ spec: - name: virchow2 import_path: models.virchow2:app - route_prefix: /virchow2 + route_prefix: /virchow2 runtime_env: config: setup_timeout_seconds: 1800 From b2d083c8ab386f8da179d47254f07975fd96a1de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C5=A0t=C3=ADpek?= <91186480+Jurgee@users.noreply.github.com> Date: Sun, 15 Mar 2026 13:18:46 +0100 Subject: [PATCH 38/39] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- providers/model_provider.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/providers/model_provider.py b/providers/model_provider.py index 68daafe..126f6ea 100644 --- a/providers/model_provider.py +++ b/providers/model_provider.py @@ -9,20 +9,18 @@ def huggingface(repo_id: str, filename: str | None = None) -> str: from huggingface_hub import hf_hub_download, snapshot_download - cache_dir = os.environ.get("HF_HOME", "/mnt/huggingface_cache") - os.makedirs(cache_dir, exist_ok=True) - os.environ["HF_HOME"] = cache_dir + hf_home = os.environ.get("HF_HOME", "/mnt/huggingface_cache") + os.makedirs(hf_home, exist_ok=True) + os.environ["HF_HOME"] = hf_home if filename: return hf_hub_download( repo_id=repo_id, filename=filename, - cache_dir=cache_dir, local_files_only=True, ) else: return snapshot_download( repo_id=repo_id, - cache_dir=cache_dir, local_files_only=True, ) From bfd90a9a020dfba5d6bbe9ce214b64b1302a7b3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ji=C5=99=C3=AD=20=C5=A0t=C3=ADpek?= <91186480+Jurgee@users.noreply.github.com> Date: Sun, 15 Mar 2026 13:20:53 +0100 Subject: [PATCH 39/39] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- models/virchow2.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/models/virchow2.py b/models/virchow2.py index 3b01beb..e29f829 100644 --- a/models/virchow2.py +++ 
b/models/virchow2.py @@ -95,9 +95,12 @@ async def predict(self, images: list[NDArray[np.uint8]]) -> list[list[float]]: self.device ) + device_type = self.device.type + autocast_dtype = torch.float16 if device_type == "cuda" else torch.bfloat16 + with ( torch.inference_mode(), - torch.autocast(device_type="cuda", dtype=torch.float16), + torch.autocast(device_type=device_type, dtype=autocast_dtype), ): output = self.model(tensors)