From 5004e7d9311935dbf05fe920fe4e50e7cb579b2a Mon Sep 17 00:00:00 2001
From: Zach Barry <zbarry@protonmail.com>
Date: Sun, 19 Sep 2021 16:03:23 -0400
Subject: [PATCH 1/2] readme updates, remove max value clipping in dataset
 creation update readme with the proper returned metadata namedtuple structure

---
 README.md         |  6 ++++--
 pybbbc/bbbc021.py |  2 +-
 pybbbc/image.py   | 16 +++++++++++++++-
 3 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index f9fd9dc..ed40d14 100644
--- a/README.md
+++ b/README.md
@@ -51,10 +51,12 @@ metadata = (
         compound,
         concentration,
         moa
-    )
+    ),
+    image_idx
 )
 ```
-
+where `image_idx` is the absolute index of the image in the BBBC021 dataset
+without filtering applied.
 
 ### Filtering
 The instance of a dataset can be intuitively filtered during the initiation using metadata keyword arguments as follows:
diff --git a/pybbbc/bbbc021.py b/pybbbc/bbbc021.py
index 9a864cf..f39570b 100644
--- a/pybbbc/bbbc021.py
+++ b/pybbbc/bbbc021.py
@@ -8,7 +8,7 @@
 from typing import Tuple, Union
 
 import h5py
-import janitor
+import janitor  # noqa: F401
 import numpy as np
 import pandas as pd
 
diff --git a/pybbbc/image.py b/pybbbc/image.py
index f9398c2..655391d 100644
--- a/pybbbc/image.py
+++ b/pybbbc/image.py
@@ -20,7 +20,21 @@ def correct_illumination(
 
 
 def scale_pixel_intensity(images: np.ndarray) -> np.ndarray:
+    """
+    Rescale intensities so that the 0.1th percentile of `images` maps to 0 and
+    the 99.9th percentile maps to 1 (brighter pixels end up above 1).
+
+    Intensities are clipped from [0, inf].
+
+    Args:
+        images:
+
+    Returns:
+        Copy of `images` with pixel intensities rescaled.
+    """
     low = np.percentile(images, 0.1)
     high = np.percentile(images, 99.9)
+
     images = (images - low) / (high - low)
-    return np.clip(images, 0, 1)
+
+    return np.clip(images, 0, None)

From 8affebabc0e655e8717fa0b33c13bbabf2e55e69 Mon Sep 17 00:00:00 2001
From: Zach Barry <zbarry@protonmail.com>
Date: Sun, 19 Sep 2021 16:22:31 -0400
Subject: [PATCH 2/2] add metadata dataframe documentation to readme

---
 README.md       | 74 +++++++++++++++++++++++++++++++++++++++++++++++--
 pybbbc/image.py | 12 ++++++--
 2 files changed, 81 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index ed40d14..0abfc9f 100644
--- a/README.md
+++ b/README.md
@@ -61,8 +61,12 @@ without filtering applied.
 ### Filtering
 The instance of a dataset can be intuitively filtered during the initiation using metadata keyword arguments as follows:
 ```python
-# get the samples without MoA
-bbbc021 = BBBC021(moa='null')
+# get the samples without known MoA
+bbbc021_null = BBBC021(moa='null')
+```
+```python
+# get only samples with known MoAs
+bbbc021_moa = BBBC021(moa=[moa for moa in BBBC021.MOA if moa != "null"])
 ```
 or using a list of metadata values:
 ```python
@@ -90,6 +94,72 @@ img = bbbc021.images[0]
 # bbbc021.moa
 ```
 
+### View the metadata `DataFrame`s
+
+The metadata is compiled into two Pandas `DataFrame`s, `image_df` and `moa_df`,
+which contain only metadata from the selected subset of the BBBC021 dataset.
+
+`image_df` contains metadata at the level of individual images.
+Each row corresponds to an image in the subset of BBBC021 you selected:
+
+```python
+> bbbc021_moa.image_df
+
+      site well  replicate        plate compound  concentration  \
+0        1  B05          1  Week5_28901     AZ-J       1.000000
+1        1  B04          1  Week5_28901     AZ-J       3.000000
+2        1  B03          1  Week5_28901     AZ-J      10.000000
+3        1  B11          1  Week5_28901    taxol       0.300049
+4        1  C11          1  Week5_28901    taxol       0.300049
+...    ...  ...        ...          ...      ...            ...
+3843     4  C02          1  Week4_27481     DMSO       0.000000
+3844     4  D02          1  Week4_27481     DMSO       0.000000
+3845     4  E11          1  Week4_27481     DMSO       0.000000
+3846     4  F11          1  Week4_27481     DMSO       0.000000
+3847     4  G11          1  Week4_27481     DMSO       0.000000
+
+                          moa  image_idx  relative_image_idx
+0                  Epithelial         45                   0
+1                  Epithelial         46                   1
+2                  Epithelial         47                   2
+3     Microtubule stabilizers         48                   3
+4     Microtubule stabilizers         49                   4
+...                       ...        ...                 ...
+3843                     DMSO      13195                3843
+3844                     DMSO      13196                3844
+3845                     DMSO      13197                3845
+3846                     DMSO      13198                3846
+3847                     DMSO      13199                3847
+
+[3848 rows x 9 columns]
+```
+
+`image_idx` corresponds to the absolute index of the image in the full BBBC021 dataset.
+`relative_image_idx` is the index within the selected subset, which you would use to access the given image, as in:
+
+`image, metadata = your_bbbc021_obj[relative_image_idx]`
+
+`moa_df` is a metadata `DataFrame` listing every compound-concentration pair in the selected BBBC021 subset along with its MoA:
+
+```python
+> bbbc021_moa.moa_df
+
+        compound  concentration                        moa
+0           ALLN       3.000000        Protein degradation
+1           ALLN     100.000000        Protein degradation
+2           AZ-A       0.099976   Aurora kinase inhibitors
+3           AZ-A       0.300049   Aurora kinase inhibitors
+4           AZ-A       1.000000   Aurora kinase inhibitors
+..           ...            ...                        ...
+99   vincristine       0.029999  Microtubule destabilizers
+100  vincristine       0.099976  Microtubule destabilizers
+101  vincristine       0.300049  Microtubule destabilizers
+102  vincristine       1.000000  Microtubule destabilizers
+103  vincristine       3.000000  Microtubule destabilizers
+
+[104 rows x 3 columns]
+```
+
 ## Data download
 The raw data can be downloaded after importing the BBBC021 dataset as follows:
 ```python
diff --git a/pybbbc/image.py b/pybbbc/image.py
index 655391d..449bfd2 100644
--- a/pybbbc/image.py
+++ b/pybbbc/image.py
@@ -9,13 +9,22 @@
 def correct_illumination(
     images: np.ndarray, sigma=500, min_percentile=0.02
 ) -> np.ndarray:
+    # calculate average of images belonging to the same site and plate
     img_avg = images.mean(axis=0)
+
+    # apply Gaussian filter
     img_mask = gaussian_filter(img_avg.astype(np.float32), sigma=sigma).astype(
         np.float16
     )
+
+    # calculate robust minimum
     robust_min = np.percentile(img_mask[img_mask > 0], min_percentile)
+
+    # clip and scale pixel intensities
     img_mask[img_mask < robust_min] = robust_min
     img_mask = img_mask / robust_min
+
+    # return corrected images
     return images / img_mask
 
 
@@ -26,9 +35,6 @@ def scale_pixel_intensity(images: np.ndarray) -> np.ndarray:
 
     Intensities are clipped from [0, inf].
 
-    Args:
-        images:
-
     Returns:
         Copy of `images` with pixel intensities rescaled.
     """