From 5004e7d9311935dbf05fe920fe4e50e7cb579b2a Mon Sep 17 00:00:00 2001 From: Zach Barry Date: Sun, 19 Sep 2021 16:03:23 -0400 Subject: [PATCH 1/2] readme updates, remove max value clipping in dataset creation update readme with the proper returned metadata namedtuple structure --- README.md | 6 ++++-- pybbbc/bbbc021.py | 2 +- pybbbc/image.py | 16 +++++++++++++++- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index f9fd9dc..ed40d14 100644 --- a/README.md +++ b/README.md @@ -51,10 +51,12 @@ metadata = ( compound, concentration, moa - ) + ), + image_idx ) ``` - +where `image_idx` is the absolute index of the image in the BBBC021 dataset +without filtering applied. ### Filtering The instance of a dataset can be intuitively filtered during the initiation using metadata keyword arguments as follows: diff --git a/pybbbc/bbbc021.py b/pybbbc/bbbc021.py index 9a864cf..f39570b 100644 --- a/pybbbc/bbbc021.py +++ b/pybbbc/bbbc021.py @@ -8,7 +8,7 @@ from typing import Tuple, Union import h5py -import janitor +import janitor # noqa: F401 import numpy as np import pandas as pd diff --git a/pybbbc/image.py b/pybbbc/image.py index f9398c2..655391d 100644 --- a/pybbbc/image.py +++ b/pybbbc/image.py @@ -20,7 +20,21 @@ def correct_illumination( def scale_pixel_intensity(images: np.ndarray) -> np.ndarray: + """ + Scale image intensities from [0...1] such that 0 represents the 0.1th + percentile and 1 the 99.9th percentile intensity in the original image. + + Intensities are clipped from [0, inf]. + + Args: + images: + + Returns: + Copy of `images` with pixel intensities rescaled. 
+ """ low = np.percentile(images, 0.1) high = np.percentile(images, 99.9) + images = (images - low) / (high - low) - return np.clip(images, 0, 1) + + return np.clip(images, 0, None) From 8affebabc0e655e8717fa0b33c13bbabf2e55e69 Mon Sep 17 00:00:00 2001 From: Zach Barry Date: Sun, 19 Sep 2021 16:22:31 -0400 Subject: [PATCH 2/2] add metadata dataframe documentation to readme --- README.md | 74 +++++++++++++++++++++++++++++++++++++++++++++++-- pybbbc/image.py | 12 ++++++-- 2 files changed, 81 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index ed40d14..0abfc9f 100644 --- a/README.md +++ b/README.md @@ -61,8 +61,12 @@ without filtering applied. ### Filtering The instance of a dataset can be intuitively filtered during the initiation using metadata keyword arguments as follows: ```python -# get the samples without MoA -bbbc021 = BBBC021(moa='null') +# get the samples without known MoA +bbbc021_null = BBBC021(moa='null') +``` +```python +# get only samples with known MoAs +bbbc021_moa = BBBC021(moa=[moa for moa in BBBC021.MOA if moa != "null"]) ``` or using a list of metadata values: ```python @@ -90,6 +94,72 @@ img = bbbc021.images[0] # bbbc021.moa ``` +### View the metadata `DataFrame`s + +The metadata is compiled into two Pandas `DataFrame`s, `image_df` and `moa_df`, +which contain only metadata from the selected subset of the BBBC021 dataset. + +`image_df` contains metadata information on an individual image level. +Each row corresponds to an image in the subset of BBBC021 you selected: + +```python +> bbbc021_moa.image_df + + site well replicate plate compound concentration \ +0 1 B05 1 Week5_28901 AZ-J 1.000000 +1 1 B04 1 Week5_28901 AZ-J 3.000000 +2 1 B03 1 Week5_28901 AZ-J 10.000000 +3 1 B11 1 Week5_28901 taxol 0.300049 +4 1 C11 1 Week5_28901 taxol 0.300049 +... ... ... ... ... ... ... 
+3843 4 C02 1 Week4_27481 DMSO 0.000000 +3844 4 D02 1 Week4_27481 DMSO 0.000000 +3845 4 E11 1 Week4_27481 DMSO 0.000000 +3846 4 F11 1 Week4_27481 DMSO 0.000000 +3847 4 G11 1 Week4_27481 DMSO 0.000000 + + moa image_idx relative_image_idx +0 Epithelial 45 0 +1 Epithelial 46 1 +2 Epithelial 47 2 +3 Microtubule stabilizers 48 3 +4 Microtubule stabilizers 49 4 +... ... ... ... +3843 DMSO 13195 3843 +3844 DMSO 13196 3844 +3845 DMSO 13197 3845 +3846 DMSO 13198 3846 +3847 DMSO 13199 3847 + +[3848 rows x 9 columns] +``` + +`image_idx` corresponds to the absolute index of the image in the full BBBC021 dataset. +`relative_image_idx` is the index you would use to access the given image, as in: + +`image, metadata = your_bbbc021_obj[relative_image_idx]` + +`moa_df` is a metadata `DataFrame` that lists each compound-concentration pair in the selected BBBC021 subset along with its MoA: + +```python +> bbbc021_moa.moa_df + + compound concentration moa +0 ALLN 3.000000 Protein degradation +1 ALLN 100.000000 Protein degradation +2 AZ-A 0.099976 Aurora kinase inhibitors +3 AZ-A 0.300049 Aurora kinase inhibitors +4 AZ-A 1.000000 Aurora kinase inhibitors +.. ... ... ... 
+99 vincristine 0.029999 Microtubule destabilizers +100 vincristine 0.099976 Microtubule destabilizers +101 vincristine 0.300049 Microtubule destabilizers +102 vincristine 1.000000 Microtubule destabilizers +103 vincristine 3.000000 Microtubule destabilizers + +[104 rows x 3 columns] +``` + ## Data download The raw data can be downloaded after importing the BBBC021 dataset as follows: ```python diff --git a/pybbbc/image.py b/pybbbc/image.py index 655391d..449bfd2 100644 --- a/pybbbc/image.py +++ b/pybbbc/image.py @@ -9,13 +9,22 @@ def correct_illumination( images: np.ndarray, sigma=500, min_percentile=0.02 ) -> np.ndarray: + # calculate average of images belonging to the same site and plate img_avg = images.mean(axis=0) + + # smooth the average image with a Gaussian filter img_mask = gaussian_filter(img_avg.astype(np.float32), sigma=sigma).astype( np.float16 ) + + # calculate robust minimum (low percentile of the positive mask values) robust_min = np.percentile(img_mask[img_mask > 0], min_percentile) + + # clip the mask from below at the robust minimum and normalize by it img_mask[img_mask < robust_min] = robust_min img_mask = img_mask / robust_min + + # return corrected images return images / img_mask @@ -26,9 +35,6 @@ def scale_pixel_intensity(images: np.ndarray) -> np.ndarray: Intensities are clipped from [0, inf]. - Args: - images: - Returns: Copy of `images` with pixel intensities rescaled. """