diff --git a/.gitignore b/.gitignore index 9649798c4..975d1c690 100644 --- a/.gitignore +++ b/.gitignore @@ -169,3 +169,4 @@ data src/polus/plugins/_plugins/manifests/* # allow python scripts insied manifests dir !src/polus/plugins/_plugins/manifests/*.py +uv.lock diff --git a/utils/bbbc-download-plugin/.bumpversion.cfg b/utils/bbbc-download-plugin/.bumpversion.cfg new file mode 100644 index 000000000..ba5924e46 --- /dev/null +++ b/utils/bbbc-download-plugin/.bumpversion.cfg @@ -0,0 +1,27 @@ +[bumpversion] +current_version = 0.1.1-dev0 +commit = True +tag = False +parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+)(?P\d+))? +serialize = + {major}.{minor}.{patch}-{release}{dev} + {major}.{minor}.{patch} + +[bumpversion:part:release] +optional_value = _ +first_value = dev +values = + dev + _ + +[bumpversion:part:dev] + +[bumpversion:file:pyproject.toml] +search = version = "{current_version}" +replace = version = "{new_version}" + +[bumpversion:file:plugin.json] + +[bumpversion:file:VERSION] + +[bumpversion:file:src/polus/plugins/utils/bbbc_download/__init__.py] diff --git a/utils/bbbc-download-plugin/Dockerfile b/utils/bbbc-download-plugin/Dockerfile new file mode 100644 index 000000000..4f10e8d3b --- /dev/null +++ b/utils/bbbc-download-plugin/Dockerfile @@ -0,0 +1,21 @@ +FROM python:3.13-slim + +# environment variables defined in polusai/bfio +ENV EXEC_DIR="/opt/executables" +ENV POLUS_IMG_EXT=".ome.tif" +ENV POLUS_TAB_EXT=".csv" +ENV POLUS_LOG="INFO" + +# Work directory defined in the base container +WORKDIR ${EXEC_DIR} + +# When building from repo root: -f utils/bbbc-download-plugin/Dockerfile . 
+COPY utils/bbbc-download-plugin/pyproject.toml ${EXEC_DIR} +COPY utils/bbbc-download-plugin/VERSION ${EXEC_DIR} +COPY utils/bbbc-download-plugin/README.md ${EXEC_DIR} +COPY utils/bbbc-download-plugin/src ${EXEC_DIR}/src + +RUN pip3 install ${EXEC_DIR} --no-cache-dir + +ENTRYPOINT ["python3", "-m", "polus.plugins.utils.bbbc_download"] +CMD ["--help"] diff --git a/utils/bbbc-download-plugin/README.md b/utils/bbbc-download-plugin/README.md new file mode 100644 index 000000000..d10719a2c --- /dev/null +++ b/utils/bbbc-download-plugin/README.md @@ -0,0 +1,41 @@ +# BBBC Download (0.1.1-dev0) + +This plugin is designed to download the necessary datasets from the Broad Bioimage Benchmark Collection (BBBC) website. + +For information on the BBBC dataset, visit +[BBBC dataset information](https://bbbc.broadinstitute.org/image_sets/). +The tables on this webpage classify datasets by their biological application. Each dataset has a webpage that contains links to the data and describes information about the dataset. Almost every dataset has image data and ground truth data. There are a few datasets that have metadata rather than ground truth data. + +## Building + +To build the Docker image for the download plugin, run +`bash build-docker.sh`. + +## Run the Docker image + +To execute the built docker image for the download plugin, run +`bash run-plugin.sh`. 
+ +## Options + +This plugin takes 1 input argument and +1 output argument: + +| Name | Description | I/O | Type | | --------------- | ------------------------------------------------------------ | ------ | ----------- | +| `--name ` | The name of the datasets to be downloaded | Input | String | +| `--outDir` | Directory to store the downloaded datasets | Output | genericData | + +The following are valid names for datasets: +`"All"`- To download all the datasets from the bbbc website +`"IDAndSegmentation"`- To download the datasets from the Identification and segmentation table +`"PhenotypeClassification"`- To download the datasets from the Phenotype classification table +`"ImageBasedProfiling"`- To download the datasets from the Image-based Profiling table + +To download specific datasets from the website, give the name of each dataset in the input argument separated by a comma. example: `--name="BBBC001,BBBC002,BBBC003"` + +### NOTE +BBBC046 dataset download is not supported by this plugin. 
+ +## Sample docker command: +```docker run -v /home/ec2-user/data/:/home/ec2-user/data/ polusai/bbbc-download-plugin:0.1.0-dev0 --name="BBBC001" --outDir=/home/ec2-user/data/output``` diff --git a/utils/bbbc-download-plugin/VERSION b/utils/bbbc-download-plugin/VERSION new file mode 100644 index 000000000..44bf4db83 --- /dev/null +++ b/utils/bbbc-download-plugin/VERSION @@ -0,0 +1 @@ +0.1.1-dev0 diff --git a/utils/bbbc-download-plugin/build-docker.sh b/utils/bbbc-download-plugin/build-docker.sh new file mode 100644 index 000000000..3bfcb041b --- /dev/null +++ b/utils/bbbc-download-plugin/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +version=$(", + "Matthew McIntyre " + ] +readme = "README.md" +packages = [{include = "polus", from = "src"}] + +[tool.poetry.dependencies] +python = ">=3.12" +typer = ">=0.24.0" +pyarrow = ">=23.0.0" +scikit-image = ">=0.25.0" +bfio = ">=2.5.0" +beautifulsoup4 = ">=4.14.3" +numpy = ">=1.26.0" +pandas = ">=2.2.3" +requests = ">=2.32.5" +pydantic = ">=2.12.5" +bump2version = "1.0.1" +mypy = ">=1.19.1" +tqdm = ">=4.67.0" +pytest = ">=9.0.0" +xmlschema = ">=4.3.1" +lxml = ">=6.0.2" + + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/utils/bbbc-download-plugin/run-plugin.sh b/utils/bbbc-download-plugin/run-plugin.sh new file mode 100644 index 000000000..5cb74e231 --- /dev/null +++ b/utils/bbbc-download-plugin/run-plugin.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +version=$( Self: + if not self.path.exists(): + raise ValueError("No metadata") + return self + + @property + def size(self) -> int: + """Returns the size of the dataset's metadata in bytes.""" + + raw_path = root.joinpath(self.name, "raw/Metadata") + standard_path = root.joinpath(self.name, "standard/Metadata") + raw_sum = sum(os.path.getsize(file) for file in raw_path.rglob("*")) + standard_sum = sum(os.path.getsize(file) for file in standard_path.rglob("*")) + + return raw_sum + standard_sum + + +class GroundTruth(pydantic.BaseModel): 
+ """Class that contains information about a dataset's ground truth.""" + + path: Path + name: str + + @model_validator(mode="after") + def validate_data(self) -> Self: + if not self.path.exists(): + raise ValueError("No ground truth") + + return self + + @property + def size(self) -> int: + """Returns the size of the dataset's ground truth in bytes.""" + + raw_path = root.joinpath(self.name, "raw/Ground_Truth") + standard_path = root.joinpath(self.name, "standard/Ground_Truth") + raw_sum = sum(os.path.getsize(file) for file in raw_path.rglob("*")) + standard_sum = sum(os.path.getsize(file) for file in standard_path.rglob("*")) + + return raw_sum + standard_sum + + +class Images(pydantic.BaseModel): + """Class that contains information about a dataset's images.""" + + path: Path + name: str + + + def validate_data(self) -> Self: + if not self.path.exists(): + raise ValueError("No images") + + return self + + @property + def size(self) -> int: + """Returns the size of the dataset's images in bytes.""" + + raw_path = root.joinpath(self.name, "raw/Images") + standard_path = root.joinpath(self.name, "standard/Images") + raw_sum = sum(os.path.getsize(file) for file in raw_path.rglob("*")) + standard_sum = sum(os.path.getsize(file) for file in standard_path.rglob("*")) + + return raw_sum + standard_sum + + +class BBBCDataset(pydantic.BaseModel): + """Class that models a BBBC dataset. + + Attributes: + name: The name of the dataset. 
+ images: An Images object that contains information about the dataset's images + ground_truth: A GroundTruth object that contains information about the dataset's ground truth + metadata: A Metadata object that contains information about the dataset's metadata + """ + + name: str + images: Optional[Images] = None + ground_truth: Optional[GroundTruth] = None + metadata: Optional[Metadata] = None + output_path: Optional[Path]= None + + @field_validator("name") + @classmethod + def valid_name(cls, v: str) -> str: + """Validates the name of the dataset. + + Args: + v: The name of the dataset to be downloaded. + + Returns: + The name provided if validation is successful. + """ + + if v not in list(BBBC.combined_table()["Accession"]): + raise ValueError( + v + + " is an invalid dataset name. Valid dataset names belong to an existing BBBC dataset." + ) + + return v + + @classmethod + def create_dataset(cls, name: str) -> Union["BBBCDataset", None]: + """Creates a dataset. + + Args: + name: The name of the dataset to be created. + + Returns: + A new instance of a Dataset object or None if the validation fails. + """ + + try: + if name in exception_sets: + dataset_class = globals()[name] + + return dataset_class(name=name) + else: + return BBBCDataset(name=name) + except ValueError as e: + logger.info(f"{e}") + + return None + + @property + def info(self) -> Dict[str, Union[str, np.int64]]: + """Provides information about the dataset such as its description and total images. + + Returns: + A dictionary that contains information about the dataset. 
+ """ + + table = BBBC.combined_table() + + row = table.loc[table["Accession"] == self.name] + + info = { + "Description": row["Description"].values[0], + "Mode": row["Mode"].values[0], + "Fields per sample": row["Fields per sample"].values[0], + "Total Fields": row["Total Fields"].values[0], + "Total Images": row["Total Images"].values[0], + "Ground truth types": self._ground_truth_types(), + } + + return info + + @property + def size(self) -> int: + """Returns the size of the dataset in bytes.""" + + dataset_path = self.output_path.joinpath("BBBC",self.name) + + return sum(os.path.getsize(file) for file in dataset_path.rglob("*")) + + def _ground_truth_types(self) -> List[str]: + """Provides the types of ground truth used by the dataset. + + Returns: + A list of strings where each string is a type of ground truth. + """ + + res = requests.get("https://bbbc.broadinstitute.org/image_sets") + soup = bs4.BeautifulSoup(res.content, "html.parser") + types = [] + + for t in soup.find_all("table")[:3]: + for row in t.find_all("tr"): + cols = row.find_all("td") + + if len(cols) > 0 and cols[0].text == self.name: + for link in cols[6].find_all("a"): + types.append(link.attrs["href"].split("#")[-1]) + + return types + + def _init_data(self,download_path:Path) -> None: + """Initializes the images, ground_truth, and metadata attributes of the dataset.""" + download_path=download_path.joinpath("BBBC") + + images_path = download_path.joinpath(self.name, "raw/Images") + truth_path = download_path.joinpath(self.name, "raw/Ground_Truth") + meta_path = download_path.joinpath(self.name, "raw/Metadata") + + try: + self.images = Images(path=images_path, name=self.name) + except ValueError: + pass + + try: + self.ground_truth = GroundTruth(path=truth_path, name=self.name) + except ValueError: + pass + + try: + self.metadata = Metadata(path=meta_path, name=self.name) + except ValueError: + pass + + if self.images == None: + logger.info(f"{self.name} has no images") + + if 
self.ground_truth == None and self.metadata == None: + logger.info(f"{self.name} has no ground truth or metadata") + + return + + def raw(self,download_path: Path) -> None: + """Download the dataset's raw data.""" + self.output_path=download_path + + download(self.name,download_path) + self._init_data(download_path) + + return + + def standard(self, extension: str) -> None: + """Standardize the dataset's raw data. + + Args: + extension: The extension of the standard image. Can be ".ome.tif" or ".ome.zarr". + """ + + if extension not in [".ome.tif", ".ome.zarr"]: + logger.info( + f"ERROR: {extension} is an invalid extension for standardization. Must be .ome.tif or .ome.zarr." + ) + return + + if self.images == None: + logger.info( + f"ERROR: Images for {self.name} have not been downloaded so they cannot be standardized." + ) + return + + standard_folder = Path(root, self.name, "standard") + arrow_file = Path("arrow", self.name + ".arrow") + arrow_table = pq.read_table(arrow_file) + df = arrow_table.to_pandas() + if not standard_folder.exists(): + standard_folder.mkdir(parents=True, exist_ok=True) + + for i, row in df.iterrows(): + func = globals()[self.name + "_mapping"] + out_file = func(row, extension) + raw_image = io.imread(row["Path"]) + num_channels = 1 if len(raw_image.shape) == 2 else raw_image.shape[2] + + if row["Image Type"] == "Intensity": + sub_folder = "Images" + elif row["Image Type"] == "Ground Truth": + sub_folder = "Ground_Truth" + elif row["Image Type"] == "Metadata": + sub_folder = "Metadata" + else: + logger.info(f"ERROR: Invalid value for attribute Image Type") + return + + save_path = standard_folder.joinpath(sub_folder) + + if not save_path.exists(): + save_path.mkdir(parents=True, exist_ok=True) + + with BioWriter(save_path.joinpath(out_file)) as bw: + bw.X, bw.Y, bw.Z, bw.C = ( + raw_image.shape[1], + raw_image.shape[0], + num_channels, + 1, + ) + bw.dtype = raw_image.dtype + bw[:] = raw_image + + logger.info(f"Finished standardizing 
{self.name}") + + return + + +class BBBC019(BBBCDataset): + def raw(self,download_path:Path) -> None: + download(self.name,download_path) + self.output_path=download_path + save_location=download_path.joinpath("BBBC") + + # Separate images from ground truth + save_location = save_location.joinpath("BBBC019") + images_folder = save_location.joinpath("raw/Images") + truth_folder = save_location.joinpath("raw/Ground_Truth") + for set in [ + x + for x in images_folder.iterdir() + if x.name not in [".DS_Store", "__MACOSX"] + ]: + for obj in [ + x + for x in set.iterdir() + if x.name not in ["images", "measures.mat", "desktop.ini", ".DS_Store"] + ]: + src = images_folder.joinpath(set.name, obj.name) + dst = truth_folder.joinpath(set.name, obj.name) + + if dst.exists(): + try: + shutil.rmtree(src) + except NotADirectoryError as e: + logger.info(f"{e}") + else: + shutil.move(src, dst) + + + self._init_data(download_path) + + return + + +class BBBC029(BBBCDataset): + def raw(self,download_path:Path) -> None: + logger.info(f"Started downloading BBBC029") + self.output_path=download_path + save_location=download_path.joinpath("BBBC") + + save_location = save_location.joinpath("BBBC029", "raw") + + if not save_location.exists(): + save_location.mkdir(parents=True, exist_ok=True) + + file_path = save_location.joinpath("Images") + get_url( + "https://data.broadinstitute.org/bbbc/BBBC029/images.zip", + file_path, + "BBBC029", + ) + + file_path = save_location.joinpath("Ground_Truth") + get_url( + "https://data.broadinstitute.org/bbbc/BBBC029/ground_truth.zip", + file_path, + "BBBC029", + ) + + logger.info(f"BBBC029 has finished downloading") + images_folder=save_location.joinpath("Images") + truth_folder=save_location.joinpath("Ground_Truth") + remove_macosx("BBBC029",images_folder) + remove_macosx("BBBC029",truth_folder) + source_directory=images_folder.joinpath("images") + for source_file in source_directory.glob("*"): + destination_file = images_folder / source_file.name + 
shutil.move(source_file, destination_file) + shutil.rmtree(source_directory) + + source_directory=truth_folder.joinpath("ground_truth") + for source_file in source_directory.glob("*"): + destination_file = truth_folder / source_file.name + shutil.move(source_file, destination_file) + shutil.rmtree(source_directory) + + self._init_data(download_path) + + return + + +class BBBC041(BBBCDataset): + def raw(self,download_path:Path) -> None: + download(self.name,download_path) + self.output_path=download_path + save_location=download_path.joinpath("BBBC") + + # Separate images from ground truth + save_location = save_location.joinpath("BBBC041") + file_names = ["test.json", "training.json"] + + if not save_location.joinpath("raw/Ground_Truth").exists(): + save_location.joinpath("raw/Ground_Truth").mkdir( + parents=True, exist_ok=True + ) + + for file in file_names: + src = save_location.joinpath("raw/Images/malaria", file) + dst = save_location.joinpath("raw/Ground_Truth") + + if dst.joinpath(file).exists(): + os.remove(src) + else: + shutil.move(src, dst) + + self._init_data(download_path) + + return + + +class BBBC042(BBBCDataset): + def raw(self,download_path:Path) -> None: + logger.info(f"Started downloading BBBC042") + self.output_path=download_path + save_location=download_path.joinpath("BBBC") + + save_location = save_location.joinpath("BBBC042", "raw") + + if not save_location.exists(): + save_location.mkdir(parents=True, exist_ok=True) + + file_path = save_location.joinpath("Images") + get_url( + "https://data.broadinstitute.org/bbbc/BBBC042/images.zip", + file_path, + "BBBC042", + ) + + file_path = save_location.joinpath("Ground_Truth") + get_url( + "https://data.broadinstitute.org/bbbc/BBBC042/positions.zip", + file_path, + "BBBC042", + ) + + logger.info(f"BBBC042 has finished downloading") + images_folder=save_location.joinpath("Images") + truth_folder=save_location.joinpath("Ground_Truth") + remove_macosx("BBBC029",images_folder) + 
remove_macosx("BBBC029",truth_folder) + + self._init_data(download_path) + + return + + +class BBBC046(BBBCDataset): + def raw(self, download_path: Path) -> None: + download(self.name,download_path) + self.output_path=download_path + save_location=download_path.joinpath("BBBC") + + # Separate images from ground truth + try: + save_location = save_location.joinpath(self.name) + images_folder = save_location.joinpath("raw/Images") + truth_folder = save_location.joinpath("raw/Ground_Truth") + + # Extract these files because they do not extract automatically + for file in ["OE-ID350-AR-1.zip", "OE-ID350-AR-2.zip", "OE-ID350-AR-4.zip", "OE-ID350-AR-8.zip"]: + with ZipFile(images_folder.joinpath(file), "r") as zfile: + zfile.extractall(images_folder) + + os.remove(images_folder.joinpath(file)) + + if not truth_folder.exists(): + truth_folder.mkdir(parents=True, exist_ok=True) + + # Iterate over folders in the images folder + for folder in images_folder.iterdir(): + if not truth_folder.joinpath(folder.name).exists(): + truth_folder.joinpath(folder.name).mkdir( + parents=True, exist_ok=True + ) + + # Move ground truth data to Ground Truth folder + for obj in folder.iterdir(): + if obj.name.endswith((".txt", ".tif")): + src = obj + dst = truth_folder.joinpath(folder.name, obj.name) + + if dst.exists(): + os.remove(src) + else: + shutil.move(src, dst) + + self._init_data(download_path) + except Exception as e: + logger.info( + f"BBBC046 downloaded successfully but an error occurred when organizing raw data." 
+ ) + logger.info(f"ERROR: {str(e)}") + + return + + +class BBBC054(BBBCDataset): + def raw(self, download_path:Path) -> None: + download(self.name,download_path) + self.output_path=download_path + save_location=download_path.joinpath("BBBC") + + # Separate images from ground truth + save_location = save_location.joinpath(self.name) + src = save_location.joinpath("raw/Images", "Replicate1annotation.csv") + dst = save_location.joinpath("raw/Ground_Truth", "Replicate1annotation.csv") + + if not dst.exists(): + dst.mkdir(parents=True, exist_ok=True) + + if dst.exists(): + os.remove(src) + else: + shutil.move(src, dst) + + self._init_data(download_path) + + return + + +class IDAndSegmentation: + """Class that models the Identification and segmentation table on https://bbbc.broadinstitute.org/image_sets. + + Attributes: + name: The name of the table as seen on the BBBC image set webpage + table: The Identification and segmentation table as a pandas DataFrame + """ + + name: str = "Identification and segmentation" + table: pd.DataFrame = tables[0] + + @classmethod + def datasets(cls) -> List[BBBCDataset]: + """Returns a list of all datasets in the table. + + Returns: + A list containing a Dataset object for each dataset in the table. + """ + + return [BBBCDataset.create_dataset(name) for name in cls.table["Accession"]] + + @classmethod + def raw(cls,download_path:Path) -> None: + """Downloads raw data for every dataset in this table""" + + num_workers = max(cpu_count(), 2) + threads = [] + + with ThreadPoolExecutor(max_workers=num_workers) as executor: + for dataset in IDAndSegmentation.datasets(): + threads.append(executor.submit(dataset.raw(download_path))) + + for f in tqdm( + as_completed(threads), desc=f"Downloading data", total=len(threads) + ): + f.result() + + +class PhenotypeClassification: + """Class that models the Phenotype classification table on https://bbbc.broadinstitute.org/image_sets. 
+ + Attributes: + name: The name of the table as seen on the BBBC image set webpage + table: The Phenotype classification table as a pandas DataFrame + """ + + name: str = "Phenotype classification" + table: pd.DataFrame = tables[1] + + @classmethod + def datasets(cls) -> List[BBBCDataset]: + """Returns a list of all datasets in the table. + + Returns: + A list containing a Dataset object for each dataset in the table. + """ + + return [BBBCDataset.create_dataset(name) for name in cls.table["Accession"]] + + @classmethod + def raw(cls,download_path:Path) -> None: + """Downloads raw data for every dataset in this table""" + + num_workers = max(cpu_count(), 2) + threads = [] + + with ThreadPoolExecutor(max_workers=num_workers) as executor: + for dataset in PhenotypeClassification.datasets(): + threads.append(executor.submit(dataset.raw(download_path))) + + for f in tqdm( + as_completed(threads), desc=f"Downloading data", total=len(threads) + ): + f.result() + + +class ImageBasedProfiling: + """Class that models the Image-based Profiling table on https://bbbc.broadinstitute.org/image_sets. + + Attributes: + name: The name of the table as seen on the BBBC image set webpage + table: The Image-based Profiling table as a pandas DataFrame + """ + + name: str = "Image-based Profiling" + table: pd.DataFrame = tables[2] + + @classmethod + def datasets(cls) -> List[BBBCDataset]: + """Returns a list of all datasets in the table. + + Returns: + A list containing a Dataset object for each dataset in the table. 
+ """ + + return [BBBCDataset.create_dataset(name) for name in cls.table["Accession"]] + + @classmethod + def raw(cls,download_path:Path) -> None: + """Downloads raw data for every dataset in this table""" + + num_workers = max(cpu_count(), 2) + threads = [] + + with ThreadPoolExecutor(max_workers=num_workers) as executor: + for dataset in ImageBasedProfiling.datasets(): + threads.append(executor.submit(dataset.raw(download_path))) + + for f in tqdm( + as_completed(threads), desc=f"Downloading data", total=len(threads) + ): + f.result() + + +class BBBC: + """Class that models the Broad Bioimage Benchmark Collection (BBBC). + + BBBC has tables that contain datasets. Datasets are separated into tables + based on how they can be used. Each dataset has images and ground truth. + Read more about BBBC here: https://bbbc.broadinstitute.org. + """ + + @classmethod + def datasets(cls) -> List[BBBCDataset]: + """Returns a list of all datasets in BBBC. + + Returns: + A list containing a Dataset object for each dataset in BBBC. + """ + + table = BBBC.combined_table() + + return [BBBCDataset.create_dataset(name) for name in table["Accession"]] + + @classmethod + def combined_table(cls) -> pd.DataFrame: + """Combines each table on https://bbbc.broadinstitute.org/image_sets into a single table. + + Returns: + A pandas DataFrame representation of the combined table. 
+ """ + + # Combine each table into one table + combined_table = ( + pd.concat(tables) + .drop(columns=["Ground truth"]) + .drop_duplicates("Accession") + ) + + return combined_table + + @classmethod + def raw(cls,download_path:Path) -> None: + """Downloads raw data for every dataset.""" + + num_workers = max(cpu_count(), 2) + threads = [] + + with ThreadPoolExecutor(max_workers=num_workers) as executor: + for dataset in BBBC.datasets(): + threads.append(executor.submit(dataset.raw(download_path))) + + for f in tqdm( + as_completed(threads), desc=f"Downloading data", total=len(threads) + ): + f.result() diff --git a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/__init__.py b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/__init__.py new file mode 100644 index 000000000..1e5dffd13 --- /dev/null +++ b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/__init__.py @@ -0,0 +1,2 @@ +"""Bbbc Download.""" +__version__ = "0.1.1-dev0" diff --git a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/__main__.py b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/__main__.py new file mode 100644 index 000000000..cb99fb011 --- /dev/null +++ b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/__main__.py @@ -0,0 +1,100 @@ +"""BBBC Download.""" +import logging +import os +import time +from concurrent.futures import ThreadPoolExecutor +from concurrent.futures import as_completed +from multiprocessing import cpu_count +from pathlib import Path +from sys import platform + +import typer +from polus.plugins.utils.bbbc_download.BBBC_model import BBBC +from polus.plugins.utils.bbbc_download.BBBC_model import BBBCDataset +from polus.plugins.utils.bbbc_download.BBBC_model import IDAndSegmentation +from polus.plugins.utils.bbbc_download.BBBC_model import ImageBasedProfiling +from polus.plugins.utils.bbbc_download.BBBC_model import PhenotypeClassification +from tqdm import tqdm + +if platform == 
"linux" or platform == "linux2": + NUM_THREADS = len(os.sched_getaffinity(0)) # type: ignore +else: + NUM_THREADS = max(cpu_count() // 2, 2) + +app = typer.Typer() + +# Initialize the logger +logging.basicConfig( + format="%(asctime)s - %(name)-8s - %(levelname)-8s - %(message)s", + datefmt="%d-%b-%y %H:%M:%S", +) +logger = logging.getLogger("polus.plugins.utils.bbbc_download") +logger.setLevel(os.environ.get("POLUS_LOG", logging.INFO)) + + +@app.command() +def main( + name: str = typer.Option( + ..., + "--name", + help="The name of the dataset that is to be downloaded", + ), + out_dir: Path = typer.Option( + ..., + "--outDir", + help="The path for downloading the dataset", + ), +) -> None: + """Download the required dataset from the BBBC dataaset.""" + logger.info(f"name = {name}") + logger.info(f"outDir = {out_dir}") + """Checking if output directory exists. + If it does not exist then a designated path is created.""" + if not out_dir.exists(): + logger.info(f"{out_dir} did not exists. 
Creating new path.") + out_dir.mkdir() + if not out_dir.exists(): + msg = "Directory does not exist" + raise ValueError(msg) + + with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor: + start_time = time.time() + threads = [] + names = name.split(",") + for n in names: + if n == "IDAndSegmentation": + threads.append(executor.submit(IDAndSegmentation.raw, out_dir)) + + elif n == "PhenotypeClassification": + threads.append(executor.submit(PhenotypeClassification.raw, out_dir)) + + elif n == "ImageBasedProfiling": + threads.append(executor.submit(ImageBasedProfiling.raw, out_dir)) + + elif n == "All": + threads.append(executor.submit(BBBC.raw, out_dir)) + + else: + d = executor.submit(BBBCDataset.create_dataset, n) + d_name = d.result() + threads.append(executor.submit(d_name.raw, out_dir)) + + for f in tqdm( + as_completed(threads), + total=len(threads), + mininterval=5, + desc="donwloading the dataset", + initial=0, + unit_scale=True, + colour="cyan", + ): + f.result() + end_time = time.time() + execution_time = end_time - start_time + execution_time_min = execution_time / 60 + logger.info(f"The execution time is {execution_time} in seconds") + logger.info(f"The execution time is {execution_time_min} in minutes") + + +if __name__ == "__main__": + app() diff --git a/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/download.py b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/download.py new file mode 100644 index 000000000..7936a8817 --- /dev/null +++ b/utils/bbbc-download-plugin/src/polus/plugins/utils/bbbc_download/download.py @@ -0,0 +1,182 @@ +from pathlib import Path +import re +from urllib.request import urlretrieve +from urllib.error import URLError +from zipfile import ZipFile +import logging + +import bs4 +import shutil +import requests + +match_str = ( + "Images|Ground truth|Ground Truth|Metadata|Hand-annotated Ground Truth Images" +) +endings = (".txt", ".csv", ".tif", ".xlsx", ".xls", ".lst") +logger = 
logging.getLogger(__name__) + +def get_lower_tags(tag: bs4.element.Tag) -> list: + """Get all tags between the tag argument and the next tag of the same type. + Args: + tag: Get tags between this tag and the next tag of the same type + """ + + tags = [] + + for sib in tag.find_next_siblings(): + if sib.name == tag.name: + break + else: + tags.append(sib) + + return tags + +def extract_nested_zips(name: str,zip_path:Path, extract_path:Path): + """Unzip nested zip files. + Args: + name: Name of the dataset + zip_path: Path to the zip file + extract_path: The path where the unzipped files will be saved + """ + + with ZipFile(zip_path, 'r') as zip_ref: + zip_ref.extractall(extract_path) + zip_path.unlink() + extracted_folder_name = zip_path.stem # Name with .zip extension + extracted_folder_name = extract_path.joinpath(extracted_folder_name.replace('.zip','')) + remove_macosx(name,extract_path) + + nested_zip_files = list(extracted_folder_name.glob("*.zip")) + + for nested_zip_file in nested_zip_files: + nested_extract_path = nested_zip_file.parent + extract_nested_zips(nested_zip_file, nested_extract_path) + + +def get_url(url: str, save_location: Path, name: str) -> None: + """Get the given url and save it. 
+ Args: + url: The url to get + save_location: The path where the files will be saved + name: The name of the dataset that the url is associated with + """ + + file_name = url.split("/")[-1] + + for download_attempts in range(10): + if url.endswith(endings): + try: + if not save_location.exists(): + save_location.mkdir(parents=True, exist_ok=True) + + urlretrieve(url, save_location.joinpath(file_name)) + except URLError as e: + if download_attempts == 9: + logger.info(f"FAILED TO DOWNLOAD {url} for {name}") + logger.info(f"ERROR {str(e)}") + + continue + elif url.endswith(".zip"): + try: + zip_path, _ = urlretrieve(url) + + with ZipFile(zip_path, "r") as zfile: + zfile.extractall(save_location) + + + + except URLError as e: + if download_attempts == 9: + logger.info(f"FAILED TO DOWNLOAD {url} for {name}") + logger.info(f"ERROR {str(e)}") + + continue + except Exception as e: + logger.info(f"{e}") + + continue + + break + + return + +def remove_macosx(name:str, location:Path)-> None: + """ Remove the __MACOSX folder from the downlpoaded dataset. + Args: + name: The name of the dataset + location: The partent directory of the __MACOSX folder. + """ + folders=[folders for folders in location.iterdir() if folders.is_dir()] + for f in folders: + if f.name=="__MACOSX": + shutil.rmtree(f) + logger.info(f"Deleted the __MACOSX folder in {name}") + +def download(name: str,download_path:Path) -> None: + """Download a single dataset. 
+ Args: + name: The name of the dataset to be downloaded + downlaod_path: Path to donwload the dataset + """ + + logger.info(f"Started downloading {name}") + download_path=download_path.joinpath("BBBC") + + save_location = download_path.joinpath(name, "raw") + + if not save_location.exists(): + save_location.mkdir(parents=True, exist_ok=True) + + dataset_url = "https://bbbc.broadinstitute.org/" + name + + dataset_page = requests.get(dataset_url) + soup = bs4.BeautifulSoup(dataset_page.content, "html.parser") + + for heading in soup.find_all("h3"): + # Ignore headings that we aren't interested in + if re.match(match_str, heading.text.strip()) == None: + continue + + if heading.text.strip() == "Images": + sub_folder = "Images" + elif heading.text.strip() == "Metadata": + sub_folder = "Metadata" + else: + sub_folder = "Ground_Truth" + + # Iterate over every tag under the current heading and above the next heading + for tag in get_lower_tags(heading): + links = tag.find_all("a") + data_links = [ + l for l in links if l.attrs["href"].endswith((".zip", *endings)) + ] + + for link in data_links: + data_url = link.attrs["href"] + file_path = save_location.joinpath(sub_folder) + + get_url(data_url, file_path, name) + + # Manually download BBBC018 ground truth because its webpage structure is incorrect + if name == "BBBC018" and re.match("Ground truth", heading.text.strip()): + url = "https://data.broadinstitute.org/bbbc/BBBC018/BBBC018_v1_outlines.zip" + + file_path = save_location.joinpath(sub_folder) + + get_url(url, file_path, "BBBC018") + + logger.info(f"{name} has finished downloading") + + images_path=save_location.joinpath("Images") + remove_macosx(name,images_path) + ground_path=save_location.joinpath("Ground_Truth") + if ground_path.exists(): + remove_macosx(name,ground_path) + + # unzip nested zip files + zip_files = list(images_path.glob("**/*.zip")) + for zip_file in zip_files: + extract_path = zip_file.parent + extract_nested_zips(name,zip_file, extract_path) + 
"""Tests for the BBBC download plugin."""
import pathlib
import shutil
import tempfile

import pytest
from typer.testing import CliRunner

from polus.plugins.utils.bbbc_download import BBBC_model, download
from polus.plugins.utils.bbbc_download.__main__ import app

runner = CliRunner()


@pytest.fixture
def output_directory():
    """Yield a fresh temporary output directory; remove it after the test."""
    out_dir = pathlib.Path(tempfile.mkdtemp(dir=pathlib.Path.cwd()))
    yield out_dir
    shutil.rmtree(out_dir)


@pytest.fixture
def macosx_directory():
    """Yield an ``Images/__MACOSX`` directory inside a temporary tree.

    The whole temporary tree is removed on teardown, whether or not the
    test already deleted the ``__MACOSX`` folder itself.
    """
    test_dir = pathlib.Path(tempfile.mkdtemp(dir=pathlib.Path.cwd()))
    macosx_dir = test_dir.joinpath("Images", "__MACOSX")
    macosx_dir.mkdir(parents=True)
    yield macosx_dir
    # parents[1] is the temporary root created above (<tmp>/Images/__MACOSX).
    shutil.rmtree(macosx_dir.parents[1])


def test_delete_macosx(macosx_directory) -> None:
    """Testing the delete_macosx function in download.py.

    ``remove_macosx`` is given the ``Images`` directory and should delete
    the ``__MACOSX`` folder inside it.
    """
    images_dir = macosx_directory.parent
    download.remove_macosx("testname", images_dir)
    assert not macosx_directory.exists()


def test_bbbc_datasets() -> None:
    """Test to check if all the datasets on the BBBC website are recognized."""
    datasets = BBBC_model.BBBC.datasets()
    assert len(datasets) == 50


def test_raw(output_directory) -> None:
    """A function to test the download functionality."""
    # Change the dataset name here to exercise a different download.
    dataset = BBBC_model.BBBCDataset.create_dataset("BBBC054")
    dataset.raw(output_directory)
    assert dataset.size > 0


def test_IDAndSegmentation() -> None:
    """Test that all datasets in the Identification and segmentation table are recognized."""
    datasets = BBBC_model.IDAndSegmentation.datasets()
    assert len(datasets) == 32


def test_PhenotypeClassification() -> None:
    """Test that all datasets in the Phenotype Classification table are recognized."""
    datasets = BBBC_model.PhenotypeClassification.datasets()
    assert len(datasets) == 14


def test_ImageBasedProfiling() -> None:
    """Test that all datasets in the Image based profiling table are recognized."""
    datasets = BBBC_model.ImageBasedProfiling.datasets()
    assert len(datasets) == 6


def test_cli(output_directory) -> None:
    """Test Cli.

    The CLI should exit cleanly for a comma-separated list of dataset names.
    """
    result = runner.invoke(
        app,
        [
            "--name",
            "BBBC001,BBBC002",
            "--outDir",
            # argv entries should be strings, not Path objects.
            str(output_directory),
        ],
    )

    assert result.exit_code == 0