giacomodeodato · zbarry · Sep 19, 2021 · Sep 19, 2021
diff --git a/README.md b/README.md
@@ -51,16 +51,22 @@ metadata = (
         compound,
         concentration,
         moa
-    )
+    ),
+    image_idx
 )
 ```
-
+where `image_idx` is the absolute index of the image in the BBBC021 dataset
+without filtering applied.
 
 ### Filtering
 The instance of a dataset can be intuitively filtered during the initiation using metadata keyword arguments as follows:
 ```python
-# get the samples without MoA
-bbbc021 = BBBC021(moa='null')
+# get the samples without known MoA
+bbbc021_null = BBBC021(moa='null')
+```
+```python
+# get only samples with known MoAs
+bbbc021_moa = BBBC021(moa=[moa for moa in BBBC021.MOA if moa != "null"])
 ```
 or using a list of metadata values:
 ```python
@@ -88,6 +94,72 @@ img = bbbc021.images[0]
 # bbbc021.moa
 ```
 
+### View the metadata `DataFrame`s
+
+The metadata is compiled into two Pandas `DataFrame`s, `image_df` and `moa_df`,
+which contain only metadata from the selected subset of the BBBC021 dataset.
+
+`image_df` contains per-image metadata.
+Each row corresponds to one image in the subset of BBBC021 you selected:
+
+```python
+> bbbc021_moa.image_df
+
+      site well  replicate        plate compound  concentration  \
+0        1  B05          1  Week5_28901     AZ-J       1.000000
+1        1  B04          1  Week5_28901     AZ-J       3.000000
+2        1  B03          1  Week5_28901     AZ-J      10.000000
+3        1  B11          1  Week5_28901    taxol       0.300049
+4        1  C11          1  Week5_28901    taxol       0.300049
+...    ...  ...        ...          ...      ...            ...
+3843     4  C02          1  Week4_27481     DMSO       0.000000
+3844     4  D02          1  Week4_27481     DMSO       0.000000
+3845     4  E11          1  Week4_27481     DMSO       0.000000
+3846     4  F11          1  Week4_27481     DMSO       0.000000
+3847     4  G11          1  Week4_27481     DMSO       0.000000
+
+                          moa  image_idx  relative_image_idx
+0                  Epithelial         45                   0
+1                  Epithelial         46                   1
+2                  Epithelial         47                   2
+3     Microtubule stabilizers         48                   3
+4     Microtubule stabilizers         49                   4
+...                       ...        ...                 ...
+3843                     DMSO      13195                3843
+3844                     DMSO      13196                3844
+3845                     DMSO      13197                3845
+3846                     DMSO      13198                3846
+3847                     DMSO      13199                3847
+
+[3848 rows x 9 columns]
+```
+
+`image_idx` corresponds to the absolute index of the image in the full BBBC021 dataset.
+`relative_image_idx` is the index you would use to access the given image as in:
+
+`image, metadata = your_bbbc021_obj[relative_image_idx]`
+
+`moa_df` is a metadata `DataFrame` listing every compound-concentration pair in the selected BBBC021 subset, together with its MoA:
+
+```python
+> bbbc021_moa.moa_df
+
+        compound  concentration                        moa
+0           ALLN       3.000000        Protein degradation
+1           ALLN     100.000000        Protein degradation
+2           AZ-A       0.099976   Aurora kinase inhibitors
+3           AZ-A       0.300049   Aurora kinase inhibitors
+4           AZ-A       1.000000   Aurora kinase inhibitors
+..           ...            ...                        ...
+99   vincristine       0.029999  Microtubule destabilizers
+100  vincristine       0.099976  Microtubule destabilizers
+101  vincristine       0.300049  Microtubule destabilizers
+102  vincristine       1.000000  Microtubule destabilizers
+103  vincristine       3.000000  Microtubule destabilizers
+
+[104 rows x 3 columns]
+```
+
 ## Data download
 The raw data can be downloaded after importing the BBBC021 dataset as follows:
 ```python

diff --git a/pybbbc/bbbc021.py b/pybbbc/bbbc021.py
@@ -8,7 +8,7 @@
 from typing import Tuple, Union
 
 import h5py
-import janitor
+import janitor  # noqa: F401
 import numpy as np
 import pandas as pd
 

diff --git a/pybbbc/image.py b/pybbbc/image.py
@@ -9,18 +9,38 @@
 def correct_illumination(
     images: np.ndarray, sigma=500, min_percentile=0.02
 ) -> np.ndarray:
+    # average the images (expected to share site and plate) over axis 0
     img_avg = images.mean(axis=0)
+
+    # smooth the average with a Gaussian filter to get the illumination mask
     img_mask = gaussian_filter(img_avg.astype(np.float32), sigma=sigma).astype(
         np.float16
     )
+
+    # robust minimum: `min_percentile`-th percentile of the positive mask values
     robust_min = np.percentile(img_mask[img_mask > 0], min_percentile)
+
+    # clip the mask from below at the robust minimum and normalize by it
     img_mask[img_mask < robust_min] = robust_min
     img_mask = img_mask / robust_min
+
+    # divide every image by the normalized mask to get the corrected images
     return images / img_mask
 
 
 def scale_pixel_intensity(images: np.ndarray) -> np.ndarray:
+    """
+    Linearly rescale pixel intensities so that 0 corresponds to the 0.1th
+    percentile and 1 to the 99.9th percentile of all values in `images`.
+
+    Values are clipped below at 0 only; values above 1 are left unclipped.
+
+    Returns:
+        New array with the rescaled intensities; `images` is not modified.
+    """
     low = np.percentile(images, 0.1)
     high = np.percentile(images, 99.9)
+
     images = (images - low) / (high - low)
-    return np.clip(images, 0, 1)
+
+    return np.clip(images, 0, None)