diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index ea4e4dc..9f778f5 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -51,4 +51,5 @@ RUN sudo apt-get update && sudo apt-get -y upgrade && \ # Cleanup RUN sudo apt-get remove -y --purge systemd systemd-sysv && sudo apt-get autoremove --purge -y && sudo apt-get clean && sudo rm -rf /var/lib/apt/lists/* -RUN pip install --no-cache-dir onnxruntime lz4 ratiopath "mlflow<3.0" +RUN pip install --no-cache-dir \ + onnxruntime lz4 ratiopath "mlflow<3.0" \ No newline at end of file diff --git a/docker/Dockerfile.gpu b/docker/Dockerfile.gpu index f5bf633..a50a183 100644 --- a/docker/Dockerfile.gpu +++ b/docker/Dockerfile.gpu @@ -53,4 +53,14 @@ RUN sudo sh -c 'echo "/usr/local/lib" > /etc/ld.so.conf.d/custom-libs.conf' && \ sudo sh -c 'echo "/home/ray/anaconda3/lib/python3.12/site-packages/nvidia/cudnn/lib" > /etc/ld.so.conf.d/nvidia-libs.conf' && \ sudo ldconfig -RUN pip install --no-cache-dir onnxruntime-gpu tensorrt lz4 ratiopath "mlflow<3.0" +RUN pip install --no-cache-dir \ + --extra-index-url https://pypi.nvidia.com \ + onnxruntime-gpu tensorrt-cu12==10.3.0 lz4 ratiopath "mlflow<3.0" + +RUN pip install --no-cache-dir \ + torch==2.4.0+cu121 torchvision==0.19.0+cu121 \ + --index-url https://download.pytorch.org/whl/cu121 + +RUN pip install --no-cache-dir \ + "timm>=1.0.0" \ + "huggingface-hub>=0.23.0" diff --git a/misc/virchow2_downloader/download_virchow2.py b/misc/virchow2_downloader/download_virchow2.py new file mode 100644 index 0000000..218790b --- /dev/null +++ b/misc/virchow2_downloader/download_virchow2.py @@ -0,0 +1,48 @@ +import os + +HF_TOKEN = os.environ.get("HF_TOKEN") +CACHE_DIR = "/mnt/huggingface_cache" +MODEL_ID = "paige-ai/Virchow2" + +# HF_HOME must be set before importing huggingface_hub: the cache path is resolved at import time +os.environ["HF_HOME"] = CACHE_DIR +os.makedirs(CACHE_DIR, exist_ok=True) + +from huggingface_hub import login, snapshot_download + +print(f"Starting download for {MODEL_ID} to {CACHE_DIR}") + +if HF_TOKEN: + print("Logging in to Hugging 
Face...") + login(token=HF_TOKEN) +else: + print("No HF_TOKEN provided! Download might fail for gated models.") + +print("Downloading model snapshot...") +try: + path = snapshot_download( + repo_id=MODEL_ID, + resume_download=True, + local_files_only=False, + ) + print(f"Model downloaded to: {path}") + + print("Verifying model files exist...") + import timm + + try: + model = timm.create_model( + f"hf-hub:{MODEL_ID}", + pretrained=True, + num_classes=0, + ) + print(f"Model successfully loaded! Type: {type(model).__name__}") + del model # Free memory + except Exception as e: + print(f"Verification warning: {e}") + +except Exception as e: + print(f"Download failed: {e}") + exit(1) + +print("DONE. Model is cached and ready for offline use.") diff --git a/misc/virchow2_downloader/virchow2_downloader_job.yaml b/misc/virchow2_downloader/virchow2_downloader_job.yaml new file mode 100644 index 0000000..570e3d0 --- /dev/null +++ b/misc/virchow2_downloader/virchow2_downloader_job.yaml @@ -0,0 +1,65 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: virchow-downloader + namespace: rationai-jobs-ns +spec: + template: + spec: + securityContext: + runAsNonRoot: true + runAsUser: 1000 + fsGroup: 1000 + seccompProfile: + type: RuntimeDefault + containers: + - name: downloader + image: python:3.12 + resources: + requests: + memory: "4Gi" + cpu: "1" + limits: + memory: "4Gi" + cpu: "2" + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: ["ALL"] + command: ["/bin/bash", "-c"] + args: + - | + pip install --user --no-cache-dir huggingface_hub transformers torch timm + python3 /mnt/scripts/download_virchow2.py + env: + - name: HOME + value: /tmp + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: huggingface-secret + key: token + - name: HTTPS_PROXY + value: "http://proxy.ics.muni.cz:3128" + - name: TORCH_HOME + value: /tmp/torch + - name: TORCHINDUCTOR_CACHE_DIR + value: /tmp/torch/inductor_cache + volumeMounts: + - name: huggingface-cache + mountPath: 
/mnt/huggingface_cache + - name: scripts + mountPath: /mnt/scripts + - name: temp + mountPath: /tmp + restartPolicy: Never + volumes: + - name: huggingface-cache + persistentVolumeClaim: + claimName: huggingface-cache-pvc + - name: scripts + configMap: + name: downloader-script + defaultMode: 0755 + - name: temp + emptyDir: {} diff --git a/models/binary_classifier.py b/models/binary_classifier.py index 39e12c2..a343e33 100644 --- a/models/binary_classifier.py +++ b/models/binary_classifier.py @@ -15,7 +15,6 @@ class Config(TypedDict): model: dict[str, Any] max_batch_size: int batch_wait_timeout_s: float - intra_op_num_threads: int trt_cache_path: str @@ -69,7 +68,7 @@ def reconfigure(self, config: Config) -> None: "trt_engine_cache_path": cache_path, "trt_max_workspace_size": config.get( "trt_max_workspace_size", 8 * 1024 * 1024 * 1024 - ), # type: ignore[typeddict-item] + ), "trt_builder_optimization_level": 5, "trt_timing_cache_enable": True, "trt_profile_min_shapes": min_shape, @@ -79,7 +78,6 @@ def reconfigure(self, config: Config) -> None: # Configure ONNX Runtime session sess_options = ort.SessionOptions() - sess_options.intra_op_num_threads = config["intra_op_num_threads"] sess_options.inter_op_num_threads = 1 # Enable all graph optimizations (constant folding, node fusion, etc.) for maximum inference performance. 
@@ -118,7 +116,10 @@ async def predict(self, images: list[NDArray[np.uint8]]) -> list[float]: """Run inference on a batch of images.""" batch = np.stack(images, axis=0, dtype=np.uint8) - outputs = self.session.run([self.output_name], {self.input_name: batch}) + outputs = self.session.run( + [self.output_name], + {self.input_name: batch}, + ) return outputs[0].flatten().tolist() # pyright: ignore[reportAttributeAccessIssue] diff --git a/models/semantic_segmentation.py b/models/semantic_segmentation.py index 9d05cce..951821d 100644 --- a/models/semantic_segmentation.py +++ b/models/semantic_segmentation.py @@ -13,7 +13,6 @@ class Config(TypedDict): model: dict[str, Any] max_batch_size: int batch_wait_timeout_s: float - intra_op_num_threads: int trt_cache_path: str @@ -65,7 +64,7 @@ def reconfigure(self, config: Config) -> None: "trt_engine_cache_path": cache_path, "trt_max_workspace_size": config.get( "trt_max_workspace_size", 8 * 1024 * 1024 * 1024 - ), # type: ignore[typeddict-item] + ), "trt_builder_optimization_level": 5, "trt_timing_cache_enable": True, "trt_profile_min_shapes": min_shape, @@ -75,7 +74,6 @@ def reconfigure(self, config: Config) -> None: # Configure ONNX Runtime session sess_options = ort.SessionOptions() - sess_options.intra_op_num_threads = config["intra_op_num_threads"] sess_options.inter_op_num_threads = 1 # Enable all graph optimizations (constant folding, node fusion, etc.) for maximum inference performance. 
diff --git a/models/virchow2.py b/models/virchow2.py new file mode 100644 index 0000000..e29f829 --- /dev/null +++ b/models/virchow2.py @@ -0,0 +1,133 @@ +import asyncio +from typing import Any, TypedDict + +import numpy as np +from fastapi import FastAPI, Request +from numpy.typing import NDArray +from ray import serve + + +class Config(TypedDict): + tile_size: int + model: dict[str, Any] + max_batch_size: int + batch_wait_timeout_s: float + + +fastapi = FastAPI() + + +@serve.deployment(num_replicas="auto") +@serve.ingress(fastapi) +class Virchow2: + """Virchow2 foundation model for pathology.""" + + def __init__(self) -> None: + import os + + import lz4.frame + + # Enforce offline mode for timm/huggingface_hub + os.environ["HF_HUB_OFFLINE"] = "1" + + import torch + + self.torch = torch + self.lz4 = lz4.frame + self.model: Any = None + self.transforms: Any = None + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + self.tile_size: int = 0 + + def reconfigure(self, config: Config) -> None: + import importlib + import logging + + import timm + from timm.data.config import resolve_data_config + from timm.data.transforms_factory import create_transform + from timm.layers.mlp import SwiGLUPacked + + torch = self.torch + + logger = logging.getLogger("ray.serve") + self.tile_size = config["tile_size"] + + # Load model using the provider + module_path, attr_name = config["model"].pop("_target_").split(":") + provider = getattr(importlib.import_module(module_path), attr_name) + repo_id = config["model"]["repo_id"] + + logger.info(f"Loading Virchow2 model from {repo_id}...") + provider(**config["model"]) + + # Load model with official architecture + self.model = timm.create_model( + f"hf-hub:{repo_id}", + pretrained=True, + num_classes=0, + mlp_layer=SwiGLUPacked, + act_layer=torch.nn.SiLU, + ) + self.model = self.model.to(self.device).eval() + + # Get transforms from model config + self.transforms = create_transform( + 
**resolve_data_config(self.model.pretrained_cfg, model=self.model) + ) + + logger.info("Virchow2 model loaded and moved to GPU.") + + self.predict.set_max_batch_size(config["max_batch_size"]) # type: ignore[attr-defined] + self.predict.set_batch_wait_timeout_s(config["batch_wait_timeout_s"]) # type: ignore[attr-defined] + + @serve.batch + async def predict(self, images: list[NDArray[np.uint8]]) -> list[list[float]]: + from PIL import Image + + if self.model is None or self.transforms is None: + raise RuntimeError("Model or transforms not initialized") + + torch = self.torch + + pil_images = [Image.fromarray(img) for img in images] + tensors = torch.stack([self.transforms(img) for img in pil_images]).to( + self.device + ) + + device_type = self.device.type + autocast_dtype = torch.float16 if device_type == "cuda" else torch.bfloat16 + + with ( + torch.inference_mode(), + torch.autocast(device_type=device_type, dtype=autocast_dtype), + ): + output = self.model(tensors) + + # Extract embeddings as per official model card + class_token = output[:, 0] # CLS token: batch x 1280 + patch_tokens = output[ + :, 5: + ] # Skip register tokens (1-4): batch x 256 x 1280 + + # Concatenate CLS token with mean of patch tokens + embedding = torch.cat( + [class_token, patch_tokens.mean(dim=1)], dim=-1 + ) # batch x 2560 + + return embedding.half().cpu().tolist() + + @fastapi.post("/") + async def root(self, request: Request) -> list[float]: + data = await asyncio.to_thread(self.lz4.decompress, await request.body()) + + # Reshape to (height, width, channels) - RGB image + image = np.frombuffer(data, dtype=np.uint8).reshape( + self.tile_size, self.tile_size, 3 + ) + + results = await self.predict(image) + return results + + +app = Virchow2.bind() # type: ignore[attr-defined] diff --git a/providers/model_provider.py b/providers/model_provider.py index faf6e5c..126f6ea 100644 --- a/providers/model_provider.py +++ b/providers/model_provider.py @@ -2,3 +2,25 @@ def mlflow(artifact_uri: 
str) -> str: import mlflow.artifacts return mlflow.artifacts.download_artifacts(artifact_uri=artifact_uri) + + +def huggingface(repo_id: str, filename: str | None = None) -> str: + import os + + hf_home = os.environ.get("HF_HOME", "/mnt/huggingface_cache") + os.makedirs(hf_home, exist_ok=True) + os.environ["HF_HOME"] = hf_home + + from huggingface_hub import hf_hub_download, snapshot_download + + if filename: + return hf_hub_download( + repo_id=repo_id, + filename=filename, + local_files_only=True, + ) + else: + return snapshot_download( + repo_id=repo_id, + local_files_only=True, + ) diff --git a/pvc/huggingface-pvc.yaml b/pvc/huggingface-pvc.yaml new file mode 100644 index 0000000..8cf3047 --- /dev/null +++ b/pvc/huggingface-pvc.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: huggingface-cache-pvc + namespace: rationai-jobs-ns +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 15Gi + storageClassName: nfs-csi diff --git a/ray-service.yaml b/ray-service.yaml index ce19967..6a1d223 100644 --- a/ray-service.yaml +++ b/ray-service.yaml @@ -29,7 +29,6 @@ spec: tile_size: 512 max_batch_size: 16 batch_wait_timeout_s: 0.01 - intra_op_num_threads: 4 trt_max_workspace_size: 8589934592 # 8 GiB trt_cache_path: /mnt/cache/trt_cache model: @@ -67,7 +66,6 @@ spec: _target_: providers.model_provider:mlflow artifact_uri: mlflow-artifacts:/10/39f821ed5b964c71a603cc6db196f9fd/artifacts/checkpoints/epoch=19-step=32020/model.onnx/model.onnx - - name: heatmap-builder import_path: builders.heatmap_builder:app route_prefix: /heatmap-builder @@ -88,6 +86,36 @@ spec: num_threads: 8 max_concurrent_tasks: 24 + - name: virchow2 + import_path: models.virchow2:app + route_prefix: /virchow2 + runtime_env: + config: + setup_timeout_seconds: 1800 + working_dir: https://github.com/RationAI/model-service/archive/refs/heads/master.zip + deployments: + - name: Virchow2 + max_ongoing_requests: 160 + max_queued_requests: 256 + 
autoscaling_config: + min_replicas: 0 + max_replicas: 1 + target_ongoing_requests: 128 + ray_actor_options: + num_cpus: 8 + num_gpus: 1 + memory: 8589934592 # 8 GiB + runtime_env: + env_vars: + HF_HOME: "/mnt/huggingface_cache" + user_config: + tile_size: 224 + max_batch_size: 128 + batch_wait_timeout_s: 0.05 + model: + _target_: providers.model_provider:huggingface + repo_id: paige-ai/Virchow2 + rayClusterConfig: rayVersion: 2.53.0 enableInTreeAutoscaling: true @@ -112,15 +140,15 @@ spec: type: RuntimeDefault containers: - name: ray-head - image: rayproject/ray:2.53.0-py312 + image: cerit.io/rationai/model-service:2.53.0 imagePullPolicy: Always resources: limits: cpu: 0 - memory: 4Gi + memory: 8Gi requests: cpu: 0 - memory: 4Gi + memory: 8Gi env: - name: HTTPS_PROXY value: http://proxy.ics.muni.cz:3128 @@ -185,6 +213,8 @@ spec: mountPath: /mnt/bioptic_tree - name: trt-cache-volume mountPath: /mnt/cache + - name: huggingface-cache + mountPath: /mnt/huggingface_cache volumes: - name: data @@ -202,11 +232,16 @@ spec: - name: trt-cache-volume persistentVolumeClaim: claimName: tensorrt-cache-pvc + - name: huggingface-cache + persistentVolumeClaim: + claimName: huggingface-cache-pvc - groupName: gpu-workers replicas: 0 minReplicas: 0 maxReplicas: 2 + rayStartParams: + num-gpus: "1" template: spec: securityContext: @@ -216,23 +251,26 @@ spec: runAsUser: 1000 seccompProfile: type: RuntimeDefault - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A40 containers: - name: ray-worker - image: cerit.io/rationai/model-service:2.53.0-gpu + image: cerit.io/rationai/model-service:latest-gpu imagePullPolicy: Always resources: limits: cpu: 8 memory: 24Gi - nvidia.com/gpu: 1 + nvidia.com/mig-2g.20gb: 1 requests: cpu: 8 memory: 24Gi env: - name: HTTPS_PROXY value: http://proxy.ics.muni.cz:3128 + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: huggingface-secret + key: token securityContext: allowPrivilegeEscalation: false capabilities: @@ -253,6 +291,8 @@ spec: mountPath: 
/mnt/bioptic_tree - name: trt-cache-volume mountPath: /mnt/cache + - name: huggingface-cache + mountPath: /mnt/huggingface_cache volumes: - name: data @@ -270,3 +310,6 @@ spec: - name: trt-cache-volume persistentVolumeClaim: claimName: tensorrt-cache-pvc + - name: huggingface-cache + persistentVolumeClaim: + claimName: huggingface-cache-pvc