diff --git a/.mypy.ini b/.mypy.ini index abd4983..d895db7 100644 --- a/.mypy.ini +++ b/.mypy.ini @@ -3,4 +3,4 @@ strict = True ignore_missing_imports = True disallow_untyped_calls = False disable_error_code = no-any-return -exclude = scripts \ No newline at end of file +exclude = ^(scripts|project_name)/ \ No newline at end of file diff --git a/configs/dataset/processed/ftn.yaml b/configs/dataset/processed/ftn.yaml new file mode 100644 index 0000000..49f2ea6 --- /dev/null +++ b/configs/dataset/processed/ftn.yaml @@ -0,0 +1,5 @@ +defaults: + - /dataset/raw/ftn@_here_ + - _self_ + +uri: mlflow-artifacts:/86/d2b2f1835fc647e2ba3639ce606f4768/artifacts/dataset.csv \ No newline at end of file diff --git a/configs/dataset/processed/ikem.yaml b/configs/dataset/processed/ikem.yaml new file mode 100644 index 0000000..d230341 --- /dev/null +++ b/configs/dataset/processed/ikem.yaml @@ -0,0 +1,5 @@ +defaults: + - /dataset/raw/ikem@_here_ + - _self_ + +uri: mlflow-artifacts:/86/7c6e7cc142494d45b6513185318d4462/artifacts/dataset.csv \ No newline at end of file diff --git a/configs/dataset/processed/knl_patos.yaml b/configs/dataset/processed/knl_patos.yaml new file mode 100644 index 0000000..9f575c8 --- /dev/null +++ b/configs/dataset/processed/knl_patos.yaml @@ -0,0 +1,5 @@ +defaults: + - /dataset/raw/knl_patos@_here_ + - _self_ + +uri: mlflow-artifacts:/86/f690f64ded624da9a7150a7a92385aec/artifacts/dataset.csv \ No newline at end of file diff --git a/configs/preprocessing/tissue_masks.yaml b/configs/preprocessing/tissue_masks.yaml new file mode 100644 index 0000000..284940d --- /dev/null +++ b/configs/preprocessing/tissue_masks.yaml @@ -0,0 +1,11 @@ +# @package _global_ + +level: 3 +max_concurrent: 64 +artifact_path: tissue_masks + +metadata: + run_name: "🎭 Tissue Masks: ${dataset.institution}" + description: Tissue masks for ${dataset.institution} at level ${level} + hyperparams: + level: ${level} \ No newline at end of file diff --git a/preprocessing/.gitkeep b/preprocessing/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/preprocessing/create_dataset.py b/preprocessing/create_dataset.py index af2ef37..551dd39 100644 --- a/preprocessing/create_dataset.py +++ b/preprocessing/create_dataset.py @@ -32,7 +32,7 @@ def get_labels(folder_path: Path, labels: list[str]) -> pd.DataFrame: return labels_df -def get_slides(folder_path: Path, pattern: re.Pattern) -> pd.DataFrame: +def get_slides(folder_path: Path, pattern: re.Pattern[str]) -> pd.DataFrame: slides = [] for slide_path in folder_path.iterdir(): if not pattern.fullmatch(slide_path.name): @@ -48,7 +48,7 @@ def get_slides(folder_path: Path, pattern: re.Pattern) -> pd.DataFrame: def create_dataset( - folder: str, labels: list[str], institution: str, pattern: re.Pattern + folder: str, labels: list[str], institution: str, pattern: re.Pattern[str] ) -> tuple[pd.DataFrame, list[str], list[str]]: folder_path = Path(folder) labels_df = get_labels(folder_path, labels) diff --git a/preprocessing/tissue_masks.py b/preprocessing/tissue_masks.py new file mode 100644 index 0000000..5590edf --- /dev/null +++ b/preprocessing/tissue_masks.py @@ -0,0 +1,56 @@ +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import cast + +import hydra +import pandas as pd +import pyvips +import ray +from mlflow.artifacts import download_artifacts +from omegaconf import DictConfig +from openslide import OpenSlide +from rationai.masks import ( + process_items, + slide_resolution, + tissue_mask, + write_big_tiff, +) +from rationai.mlkit import autolog, with_cli_args +from rationai.mlkit.lightning.loggers import MLFlowLogger + + +@ray.remote(memory=4 * 1024**3) +def process_slide(slide_path: str, level: int, output_path: Path) -> None: + with OpenSlide(slide_path) as slide: + mpp_x, mpp_y = slide_resolution(slide, level) + + image = cast("pyvips.Image", pyvips.Image.new_from_file(slide_path, level=level)) + mask = tissue_mask(image, mpp=(mpp_x + mpp_y) / 2) + mask_path = output_path / Path(slide_path).with_suffix(".tiff").name + + write_big_tiff(mask, path=mask_path, mpp_x=mpp_x, mpp_y=mpp_y) + + +@with_cli_args(["+preprocessing=tissue_masks"]) +@hydra.main(config_path="../configs", config_name="preprocessing", version_base=None) +@autolog +def main(config: DictConfig, logger: MLFlowLogger) -> None: + dataset = pd.read_csv(download_artifacts(config.dataset.uri)) + + with TemporaryDirectory() as output_dir: + process_items( + dataset["path"], + process_item=process_slide, + fn_kwargs={ + "level": config.level, + "output_path": Path(output_dir), + }, + max_concurrent=config.max_concurrent, + ) + + logger.log_artifacts(local_dir=output_dir, artifact_path=config.artifact_path) + + +if __name__ == "__main__": + with ray.init(runtime_env={"excludes": [".git", ".venv"]}): + main() diff --git a/pyproject.toml b/pyproject.toml index 9d94243..657fd36 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ dependencies = [ "pandas>=2.3.3", "pyvips<3.1", "rationai-mlkit", + "rationai-masks", "ray>=2.52.1", "torch>=2.9.0", "torchmetrics>=1.8.2", @@ -25,4 +26,5 @@ job = ["rationai-kube-jobs"] [tool.uv.sources] rationai-mlkit = { git = "https://gitlab.ics.muni.cz/rationai/digital-pathology/libraries/mlkit.git" } +rationai-masks = { git = "https://gitlab.ics.muni.cz/rationai/digital-pathology/libraries/masks.git" } rationai-kube-jobs = { git = "ssh://git@gitlab.ics.muni.cz/rationai/infrastructure/kube-jobs" } diff --git a/scripts/preprocessing/create_dataset.py b/scripts/preprocessing/create_dataset.py index 245eb97..79ec7ba 100644 --- a/scripts/preprocessing/create_dataset.py +++ b/scripts/preprocessing/create_dataset.py @@ -11,7 +11,7 @@ "git clone https://github.com/RationAI/ulcerative-colitis.git workdir", "cd workdir", "uv sync --frozen", - "uv run -m preprocessing.create_dataset +data=raw/...", + "uv run -m preprocessing.create_dataset +dataset=raw/...", ], storage=[storage.secure.DATA], ) diff --git a/scripts/preprocessing/tissue_masks.py b/scripts/preprocessing/tissue_masks.py new file mode 100644 index 0000000..2da75b2 --- /dev/null +++ b/scripts/preprocessing/tissue_masks.py @@ -0,0 +1,18 @@ +from kube_jobs import storage, submit_job + + +submit_job( + job_name="ulcerative-colitis-tissue-masks-...", + username=..., + public=False, + cpu=64, + memory="32Gi", + shm="16Gi", + script=[ + "git clone https://github.com/RationAI/ulcerative-colitis.git workdir", + "cd workdir", + "uv sync --frozen", + "uv run --active -m preprocessing.tissue_masks +dataset=processed/...", + ], + storage=[storage.secure.DATA], +) diff --git a/uv.lock b/uv.lock index bd68d0c..4178183 100644 --- a/uv.lock +++ b/uv.lock @@ -2177,6 +2177,7 @@ dependencies = [ { name = "openpyxl" }, { name = "pandas" }, { name = "pyvips" }, + { name = "rationai-masks" }, { name = "rationai-mlkit" }, { name = "ray" }, { name = "torch" }, @@ -2202,6 +2203,7 @@ requires-dist = [ { name = "openpyxl", specifier = ">=3.1.5" }, { name = "pandas", specifier = ">=2.3.3" }, { name = "pyvips", specifier = "<3.1" }, + { name = "rationai-masks", git = "https://gitlab.ics.muni.cz/rationai/digital-pathology/libraries/masks.git" }, { name = "rationai-mlkit", git = "https://gitlab.ics.muni.cz/rationai/digital-pathology/libraries/mlkit.git" }, { name = "ray", specifier = ">=2.52.1" }, { name = "torch", specifier = ">=2.9.0" },