diff --git a/.mypy.ini b/.mypy.ini index d895db7..8da987d 100644 --- a/.mypy.ini +++ b/.mypy.ini @@ -3,4 +3,4 @@ strict = True ignore_missing_imports = True disallow_untyped_calls = False disable_error_code = no-any-return -exclude = ^(scripts|project_name)/ \ No newline at end of file +exclude = ^(scripts|project_name)/ diff --git a/configs/dataset/processed/ftn.yaml b/configs/dataset/processed/ftn.yaml index 49f2ea6..3467508 100644 --- a/configs/dataset/processed/ftn.yaml +++ b/configs/dataset/processed/ftn.yaml @@ -2,4 +2,4 @@ defaults: - /dataset/raw/ftn@_here_ - _self_ -uri: mlflow-artifacts:/86/d2b2f1835fc647e2ba3639ce606f4768/artifacts/dataset.csv \ No newline at end of file +uri: mlflow-artifacts:/86/d2b2f1835fc647e2ba3639ce606f4768/artifacts/dataset.csv diff --git a/configs/dataset/processed/ikem.yaml b/configs/dataset/processed/ikem.yaml index d230341..0201136 100644 --- a/configs/dataset/processed/ikem.yaml +++ b/configs/dataset/processed/ikem.yaml @@ -2,4 +2,4 @@ defaults: - /dataset/raw/ikem@_here_ - _self_ -uri: mlflow-artifacts:/86/7c6e7cc142494d45b6513185318d4462/artifacts/dataset.csv \ No newline at end of file +uri: mlflow-artifacts:/86/7c6e7cc142494d45b6513185318d4462/artifacts/dataset.csv diff --git a/configs/dataset/processed/knl_patos.yaml b/configs/dataset/processed/knl_patos.yaml index 9f575c8..fa203cb 100644 --- a/configs/dataset/processed/knl_patos.yaml +++ b/configs/dataset/processed/knl_patos.yaml @@ -2,4 +2,4 @@ defaults: - /dataset/raw/knl_patos@_here_ - _self_ -uri: mlflow-artifacts:/86/f690f64ded624da9a7150a7a92385aec/artifacts/dataset.csv \ No newline at end of file +uri: mlflow-artifacts:/86/f690f64ded624da9a7150a7a92385aec/artifacts/dataset.csv diff --git a/configs/dataset/raw/ftn.yaml b/configs/dataset/raw/ftn.yaml index 59b9d69..8f148ec 100644 --- a/configs/dataset/raw/ftn.yaml +++ b/configs/dataset/raw/ftn.yaml @@ -7,4 +7,4 @@ folder: /mnt/data/FTN/colon/IBD_AI regex_pattern: ^[0-9]{1,6}_2[0-5]\.czi$ labels: - IBD_AI_FTN.xlsx - 
- IBD_AI_FTN_doplnek.xlsx \ No newline at end of file + - IBD_AI_FTN_doplnek.xlsx diff --git a/configs/dataset/raw/ikem.yaml b/configs/dataset/raw/ikem.yaml index 4ae65db..ac1d9f1 100644 --- a/configs/dataset/raw/ikem.yaml +++ b/configs/dataset/raw/ikem.yaml @@ -11,4 +11,4 @@ regex_pattern: ^[0-9]{1,5}_2[1-4]_HE(?:_0[1-6])?\.czi$ labels: - Fab_IBD_AI_12_2024.csv - IBD_AI_2.xlsx - - missing.xlsx \ No newline at end of file + - missing.xlsx diff --git a/configs/dataset/raw/knl_patos.yaml b/configs/dataset/raw/knl_patos.yaml index 450ee36..4a199d2 100644 --- a/configs/dataset/raw/knl_patos.yaml +++ b/configs/dataset/raw/knl_patos.yaml @@ -14,4 +14,4 @@ labels: - IBD_AI_Liberec.xlsx - IBD_AI_Liberec_02.xlsx - IBD_AI_Liberec_10_2025.xlsx - - IBD_AI_Liberec_28_10_2025.xlsx \ No newline at end of file + - IBD_AI_Liberec_28_10_2025.xlsx diff --git a/configs/preprocessing/quality_control.yaml b/configs/preprocessing/quality_control.yaml new file mode 100644 index 0000000..ffd08ce --- /dev/null +++ b/configs/preprocessing/quality_control.yaml @@ -0,0 +1,20 @@ +# @package _global_ + +output_dir: ${project_dir}/quality_control/${dataset.institution} + +request_timeout: 18000 +max_concurrent: 5 + +qc_parameters: + mask_level: 3 + sample_level: 1 + check_residual: True + check_folding: False + check_focus: True + wb_correction: True + + +metadata: + run_name: "🎭 QC Masks: ${dataset.institution}" + description: Quality control masks for ${dataset.institution} institution + hyperparams: ${qc_parameters} diff --git a/configs/preprocessing/tissue_masks.yaml b/configs/preprocessing/tissue_masks.yaml index 284940d..a2c5be6 100644 --- a/configs/preprocessing/tissue_masks.yaml +++ b/configs/preprocessing/tissue_masks.yaml @@ -8,4 +8,4 @@ metadata: run_name: "🎭 Tissue Masks: ${dataset.institution}" description: Tissue masks for ${dataset.institution} at level ${level} hyperparams: - level: ${level} \ No newline at end of file + level: ${level} diff --git 
a/preprocessing/quality_control.py b/preprocessing/quality_control.py
new file mode 100644
index 0000000..ba7a4d8
--- /dev/null
+++ b/preprocessing/quality_control.py
@@ -0,0 +1,142 @@
+# credits: https://gitlab.ics.muni.cz/rationai/digital-pathology/pathology/lymph-nodes/-/blob/develop/preprocessing/qc.py?ref_type=heads
+
+import asyncio
+from collections.abc import Generator
+from pathlib import Path
+from typing import TypedDict
+
+import hydra
+import mlflow.artifacts
+import pandas as pd
+import rationai
+from omegaconf import DictConfig
+from rationai.mlkit import autolog, with_cli_args
+from rationai.mlkit.lightning.loggers import MLFlowLogger
+from rationai.types import SlideCheckConfig
+from tqdm.asyncio import tqdm
+
+
+class QCParameters(TypedDict):
+    """Quality-control options; unpacked as keyword arguments of SlideCheckConfig."""
+
+    mask_level: int
+    sample_level: int
+    check_residual: bool
+    check_folding: bool
+    check_focus: bool
+    wb_correction: bool
+
+
+def get_qc_masks(qc_parameters: QCParameters) -> Generator[tuple[str, str], None, None]:
+    """Yield (mask file prefix, target subdirectory) pairs for the enabled checks."""
+    if qc_parameters["check_focus"]:
+        yield ("Piqe_focus_score_piqe_median", "blur_per_tile")
+        yield ("Piqe_piqe_median_activity_mask", "blur_per_pixel")
+
+    if qc_parameters["check_residual"]:
+        yield ("ResidualArtifactsAndCoverage_cov_percent_heatmap", "artifacts_per_tile")
+        yield ("ResidualArtifactsAndCoverage_coverage_mask", "artifacts_per_pixel")
+
+    if qc_parameters["check_folding"]:
+        yield ("FoldingFunction_folding_test", "folds_per_pixel")
+
+
+def organize_masks(output_path: Path, subdir: str, mask_prefix: str) -> None:
+    """Move ``<mask_prefix>_<slide>.tiff`` files into ``output_path / subdir``.
+
+    The moved files are renamed to ``<slide>.tiff``.
+    """
+    prefix_dir = output_path / subdir
+    prefix_dir.mkdir(parents=True, exist_ok=True)
+
+    # Glob has to be wrapped in list, because we're modifying the directory!!!
+    for file in list(output_path.glob(f"{mask_prefix}_*.tiff")):
+        # Strip only the leading prefix; str.replace would also remove any later
+        # occurrence of the same text inside the slide name.
+        slide_name = file.name.removeprefix(f"{mask_prefix}_")
+        destination = prefix_dir / slide_name
+        file.rename(destination)
+
+
+async def qc_main(
+    output_path: Path,
+    slides: list[str],
+    logger: MLFlowLogger,
+    request_timeout: int,
+    max_concurrent: int,
+    qc_parameters: QCParameters,
+) -> None:
+    """Run QC on the slides, tidy the output directory and log it as artifacts.
+
+    Failed slides are appended to ``qc_errors.log``; masks are sorted into one
+    subdirectory per mask type and the per-slide CSV files are merged into
+    ``qc_metrics.csv``.
+    """
+    async with rationai.AsyncClient() as client:  # type: ignore[attr-defined]
+        async for result in tqdm(
+            client.qc.check_slides(
+                slides,
+                output_path,
+                config=SlideCheckConfig(**qc_parameters),
+                timeout=request_timeout,
+                max_concurrent=max_concurrent,
+            ),
+            total=len(slides),
+        ):
+            if not result.success:
+                with open(output_path / "qc_errors.log", "a") as log_file:
+                    log_file.write(
+                        f"Failed to process {result.wsi_path}: {result.error}\n"
+                    )
+
+    # Organize generated masks into subdirectories
+    for prefix, artifact_name in get_qc_masks(qc_parameters):
+        organize_masks(output_path, artifact_name, prefix)
+
+    # Merge generated csv files. A merged file left over from a previous run is
+    # excluded so it is neither merged into itself nor deleted below.
+    merged_csv = output_path / "qc_metrics.csv"
+    csvs = [f for f in output_path.glob("*.csv") if f.name != merged_csv.name]
+
+    # pd.concat raises ValueError on an empty list (e.g. every slide failed).
+    if csvs:
+        pd.concat([pd.read_csv(f) for f in csvs]).to_csv(merged_csv, index=False)
+
+        # Remove individual csv files
+        for f in csvs:
+            f.unlink()
+
+    logger.log_artifacts(local_dir=str(output_path))
+
+
+def download_dataset(uri: str) -> pd.DataFrame:
+    """Download the artifact at ``uri`` via MLflow and read it as a CSV table."""
+    path = mlflow.artifacts.download_artifacts(artifact_uri=uri)
+    df = pd.read_csv(path)
+    return df
+
+
+@with_cli_args(["+preprocessing=quality_control"])
+@hydra.main(config_path="../configs", config_name="preprocessing", version_base=None)
+@autolog
+def main(config: DictConfig, logger: MLFlowLogger) -> None:
+    """Download the dataset table and run quality control on its ``path`` column."""
+    df = download_dataset(config.dataset.uri)
+
+    output_path = Path(config.output_dir)
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    asyncio.run(
+        qc_main(
+            output_path=output_path,
+            slides=df["path"].to_list(),
+            logger=logger,
+            request_timeout=config.request_timeout,
+            max_concurrent=config.max_concurrent,
+            qc_parameters=config.qc_parameters,
+        )
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index 657fd36..d5d1d7a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,6 +13,7 @@ dependencies = [
     "openpyxl>=3.1.5",
     "pandas>=2.3.3",
     "pyvips<3.1",
+    "rationai-sdk",
     "rationai-mlkit",
     "rationai-masks",
     "ray>=2.52.1",
@@ -28,3 +29,4 @@ job = ["rationai-kube-jobs"]
 rationai-mlkit = { git = "https://gitlab.ics.muni.cz/rationai/digital-pathology/libraries/mlkit.git" }
 rationai-masks = { git = "https://gitlab.ics.muni.cz/rationai/digital-pathology/libraries/masks.git" }
 rationai-kube-jobs = { git = "ssh://git@gitlab.ics.muni.cz/rationai/infrastructure/kube-jobs" }
+rationai-sdk = { git = "https://gitlab.ics.muni.cz/rationai/infrastructure/rationai-sdk-python.git" }
diff --git a/scripts/preprocessing/quality_control.py b/scripts/preprocessing/quality_control.py
new file mode 100644
index 0000000..dcb56b6
--- /dev/null
+++ b/scripts/preprocessing/quality_control.py
@@ -0,0 +1,17 @@
+from kube_jobs import storage, submit_job
+
+
+submit_job(
+    job_name="ulcerative-colitis-quality-control-...",
+    username=...,
+    public=False,
+    cpu=2,
+    memory="4Gi",
+    script=[
+        "git clone https://github.com/RationAI/ulcerative-colitis.git workdir",
+        "cd workdir",
+        "uv sync --frozen",
+        "uv run -m preprocessing.quality_control +dataset=processed/...",
+    ],
+    storage=[storage.secure.DATA],
+)
diff --git a/uv.lock b/uv.lock
index 4178183..52b1258 100644
--- a/uv.lock
+++ b/uv.lock
@@ -589,6 +589,34 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" },
 ]
 
+[[package]]
+name = "httpcore"
+version = "1.0.9"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "certifi" },
+    { name = "h11"
}, +] +sdist = { url = "https://files.pythonhosted.org/packages/06/94/82699a10bca87a5556c9c59b5963f2d039dbd239f25bc2a63907a05a14cb/httpcore-1.0.9.tar.gz", hash = "sha256:6e34463af53fd2ab5d807f399a9b45ea31c3dfa2276f15a2c3f00afff6e176e8", size = 85484, upload-time = "2025-04-24T22:06:22.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/7e/f5/f66802a942d491edb555dd61e3a9961140fd64c90bce1eafd741609d334d/httpcore-1.0.9-py3-none-any.whl", hash = "sha256:2d400746a40668fc9dec9810239072b40b4484b640a8c38fd654a024c7a1bf55", size = 78784, upload-time = "2025-04-24T22:06:20.566Z" }, +] + +[[package]] +name = "httpx" +version = "0.28.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "anyio" }, + { name = "certifi" }, + { name = "httpcore" }, + { name = "idna" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406, upload-time = "2024-12-06T15:37:23.222Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517, upload-time = "2024-12-06T15:37:21.509Z" }, +] + [[package]] name = "hydra-core" version = "1.3.2" @@ -873,6 +901,22 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/de/73/3d757cb3fc16f0f9794dd289bcd0c4a031d9cf54d8137d6b984b2d02edf3/lightning_utilities-0.15.2-py3-none-any.whl", hash = "sha256:ad3ab1703775044bbf880dbf7ddaaac899396c96315f3aa1779cec9d618a9841", size = 29431, upload-time = "2025-08-06T13:57:38.046Z" }, ] +[[package]] +name = "lz4" +version = "4.4.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/57/51/f1b86d93029f418033dddf9b9f79c8d2641e7454080478ee2aab5123173e/lz4-4.4.5.tar.gz", hash = "sha256:5f0b9e53c1e82e88c10d7c180069363980136b9d7a8306c4dca4f760d60c39f0", size = 172886, upload-time = "2025-11-03T13:02:36.061Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1b/ac/016e4f6de37d806f7cc8f13add0a46c9a7cfc41a5ddc2bc831d7954cf1ce/lz4-4.4.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:df5aa4cead2044bab83e0ebae56e0944cc7fcc1505c7787e9e1057d6d549897e", size = 207163, upload-time = "2025-11-03T13:01:45.895Z" }, + { url = "https://files.pythonhosted.org/packages/8d/df/0fadac6e5bd31b6f34a1a8dbd4db6a7606e70715387c27368586455b7fc9/lz4-4.4.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6d0bf51e7745484d2092b3a51ae6eb58c3bd3ce0300cf2b2c14f76c536d5697a", size = 207150, upload-time = "2025-11-03T13:01:47.205Z" }, + { url = "https://files.pythonhosted.org/packages/b7/17/34e36cc49bb16ca73fb57fbd4c5eaa61760c6b64bce91fcb4e0f4a97f852/lz4-4.4.5-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:7b62f94b523c251cf32aa4ab555f14d39bd1a9df385b72443fd76d7c7fb051f5", size = 1292045, upload-time = "2025-11-03T13:01:48.667Z" }, + { url = "https://files.pythonhosted.org/packages/90/1c/b1d8e3741e9fc89ed3b5f7ef5f22586c07ed6bb04e8343c2e98f0fa7ff04/lz4-4.4.5-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2c3ea562c3af274264444819ae9b14dbbf1ab070aff214a05e97db6896c7597e", size = 1279546, upload-time = "2025-11-03T13:01:50.159Z" }, + { url = "https://files.pythonhosted.org/packages/55/d9/e3867222474f6c1b76e89f3bd914595af69f55bf2c1866e984c548afdc15/lz4-4.4.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:24092635f47538b392c4eaeff14c7270d2c8e806bf4be2a6446a378591c5e69e", size = 1368249, upload-time = "2025-11-03T13:01:51.273Z" }, + { url = 
"https://files.pythonhosted.org/packages/b2/e7/d667d337367686311c38b580d1ca3d5a23a6617e129f26becd4f5dc458df/lz4-4.4.5-cp312-cp312-win32.whl", hash = "sha256:214e37cfe270948ea7eb777229e211c601a3e0875541c1035ab408fbceaddf50", size = 88189, upload-time = "2025-11-03T13:01:52.605Z" }, + { url = "https://files.pythonhosted.org/packages/a5/0b/a54cd7406995ab097fceb907c7eb13a6ddd49e0b231e448f1a81a50af65c/lz4-4.4.5-cp312-cp312-win_amd64.whl", hash = "sha256:713a777de88a73425cf08eb11f742cd2c98628e79a8673d6a52e3c5f0c116f33", size = 99497, upload-time = "2025-11-03T13:01:53.477Z" }, + { url = "https://files.pythonhosted.org/packages/6a/7e/dc28a952e4bfa32ca16fa2eb026e7a6ce5d1411fcd5986cd08c74ec187b9/lz4-4.4.5-cp312-cp312-win_arm64.whl", hash = "sha256:a88cbb729cc333334ccfb52f070463c21560fca63afcf636a9f160a55fac3301", size = 91279, upload-time = "2025-11-03T13:01:54.419Z" }, +] + [[package]] name = "mako" version = "1.3.10" @@ -1765,6 +1809,18 @@ dependencies = [ { name = "torch" }, ] +[[package]] +name = "rationai-sdk" +version = "0.1.0" +source = { git = "https://gitlab.ics.muni.cz/rationai/infrastructure/rationai-sdk-python.git#a4d25084850cd26678783485dda87bbeed949492" } +dependencies = [ + { name = "httpx" }, + { name = "lz4" }, + { name = "numpy" }, + { name = "pillow" }, + { name = "tenacity" }, +] + [[package]] name = "ray" version = "2.53.0" @@ -2025,6 +2081,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" }, ] +[[package]] +name = "tenacity" +version = "9.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0a/d4/2b0cd0fe285e14b36db076e78c93766ff1d529d70408bd1d2a5a84f1d929/tenacity-9.1.2.tar.gz", hash = 
"sha256:1169d376c297e7de388d18b4481760d478b0e99a777cad3a9c86e556f4b697cb", size = 48036, upload-time = "2025-04-02T08:25:09.966Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e5/30/643397144bfbfec6f6ef821f36f33e57d35946c44a2352d3c9f0ae847619/tenacity-9.1.2-py3-none-any.whl", hash = "sha256:f77bf36710d8b73a50b2dd155c97b870017ad21afe6ab300326b0371b3b05138", size = 28248, upload-time = "2025-04-02T08:25:07.678Z" }, +] + [[package]] name = "threadpoolctl" version = "3.6.0" @@ -2177,6 +2242,7 @@ dependencies = [ { name = "openpyxl" }, { name = "pandas" }, { name = "pyvips" }, + { name = "rationai-sdk" }, { name = "rationai-masks" }, { name = "rationai-mlkit" }, { name = "ray" }, @@ -2203,6 +2269,7 @@ requires-dist = [ { name = "openpyxl", specifier = ">=3.1.5" }, { name = "pandas", specifier = ">=2.3.3" }, { name = "pyvips", specifier = "<3.1" }, + { name = "rationai-sdk", git = "https://gitlab.ics.muni.cz/rationai/infrastructure/rationai-sdk-python.git" }, { name = "rationai-masks", git = "https://gitlab.ics.muni.cz/rationai/digital-pathology/libraries/masks.git" }, { name = "rationai-mlkit", git = "https://gitlab.ics.muni.cz/rationai/digital-pathology/libraries/mlkit.git" }, { name = "ray", specifier = ">=2.52.1" },