Skip to content

Commit b537b7f

Browse files
committed
Fix missing label handling
1 parent aa59606 commit b537b7f

4 files changed

Lines changed: 27 additions & 8 deletions

File tree

chebai/models/electra.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -287,9 +287,13 @@ def _process_for_loss(
287287
tuple: A tuple containing the processed model output, labels, and loss arguments.
288288
"""
289289
kwargs_copy = dict(loss_kwargs)
290+
output = model_output["logits"]
290291
if labels is not None:
291292
labels = labels.float()
292-
return model_output["logits"], labels, kwargs_copy
293+
if "missing_labels" in kwargs_copy:
294+
missing_labels = kwargs_copy.pop("missing_labels")
295+
output = output * (~missing_labels).int()
296+
return output, labels, kwargs_copy
293297

294298
def _get_prediction_and_labels(
295299
self, data: Dict[str, Any], labels: Tensor, model_output: Dict[str, Tensor]
@@ -310,6 +314,11 @@ def _get_prediction_and_labels(
310314
if "non_null_labels" in loss_kwargs:
311315
n = loss_kwargs["non_null_labels"]
312316
d = d[n]
317+
318+
if "missing_labels" in loss_kwargs:
319+
missing_labels = loss_kwargs["missing_labels"]
320+
labels = labels * (~missing_labels).int()
321+
313322
return torch.sigmoid(d), labels.int() if labels is not None else None
314323

315324
def forward(self, data: Dict[str, Tensor], **kwargs: Any) -> Dict[str, Any]:

chebai/preprocessing/collate.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ def __call__(self, data: List[Union[Dict, Tuple]]) -> XYData:
6464
Handles both fully and partially labeled data, where some samples may have `None` as their label. The indices
6565
of non-null labels are stored in the `non_null_labels` field, which is used to filter out predictions for
6666
unlabeled data during evaluation (e.g., F1, MSE). For models supporting partially labeled data, this method
67-
ensures alignment between features and labels.
67+
ensures alignment between features and labels. Missing labels are passed as a loss keyword.
6868
6969
Args:
7070
data (List[Union[Dict, Tuple]]): List of ragged data samples. Each sample can be a dictionary or tuple
@@ -81,10 +81,13 @@ def __call__(self, data: List[Union[Dict, Tuple]]) -> XYData:
8181
if isinstance(data[0], tuple):
8282
# For legacy data
8383
x, y, idents = zip(*data)
84+
missing_labels = None
8485
else:
8586
x, y, idents = zip(
8687
*((d["features"], d["labels"], d.get("ident")) for d in data)
8788
)
89+
missing_labels = [d.get("missing_labels", [False for _ in y[0]]) for d in data]
90+
8891
if any(x is not None for x in y):
8992
# If any label is not None: (None, None, `1`, None)
9093
if any(x is None for x in y):
@@ -97,11 +100,13 @@ def __call__(self, data: List[Union[Dict, Tuple]]) -> XYData:
97100
else:
98101
# If all labels are not None: (`0`, `2`, `1`, `3`)
99102
y = self.process_label_rows(y)
103+
100104
else:
101105
# If all labels are None : (`None`, `None`, `None`, `None`)
102106
y = None
103107
loss_kwargs["non_null_labels"] = []
104108

109+
loss_kwargs["missing_labels"] = torch.tensor(missing_labels)
105110
# Calculate the lengths of each sequence, create a binary mask for valid (non-padded) positions
106111
lens = torch.tensor(list(map(len, x)))
107112
model_kwargs["mask"] = torch.arange(max(lens))[None, :] < lens[:, None]

chebai/preprocessing/datasets/tox21.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,8 @@ def download(self) -> None:
6868
def setup_processed(self) -> None:
6969
"""Processes and splits the dataset."""
7070
print("Create splits")
71-
data = self._load_data_from_file(os.path.join(self.raw_dir, f"tox21.csv"))
72-
groups = np.array([d["group"] for d in data])
71+
data = list(self._load_data_from_file(os.path.join(self.raw_dir, f"tox21.csv")))
72+
groups = np.array([d.get("group") for d in data])
7373
if not all(g is None for g in groups):
7474
split_size = int(len(set(groups)) * self.train_split)
7575
os.makedirs(self.processed_dir, exist_ok=True)
@@ -129,7 +129,7 @@ def setup(self, **kwargs) -> None:
129129
):
130130
self.setup_processed()
131131

132-
def _load_data_from_file(self, input_file_path: str) -> List[Dict]:
132+
def _load_dict(self, input_file_path: str) -> List[Dict]:
133133
"""Loads data from a CSV file.
134134
135135
Args:

chebai/preprocessing/reader.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,13 +92,18 @@ def _read_group(self, raw: Any) -> Any:
9292
return raw
9393

9494
def _read_components(self, row: Dict[str, Any]) -> Dict[str, Any]:
95-
"""Read and return components from the row."""
95+
"""Read and return components from the row. If the data contains any missing labels (`None`), they are tracked
96+
under the additional `missing_labels` keyword."""
97+
labels = self._get_raw_label(row)
98+
additional_kwargs = self._get_additional_kwargs(row)
99+
if any(l is None for l in labels):
100+
additional_kwargs["missing_labels"] = [l is None for l in labels]
96101
return dict(
97102
features=self._get_raw_data(row),
98-
labels=self._get_raw_label(row),
103+
labels=labels,
99104
ident=self._get_raw_id(row),
100105
group=self._get_raw_group(row),
101-
additional_kwargs=self._get_additional_kwargs(row),
106+
additional_kwargs=additional_kwargs,
102107
)
103108

104109
def to_data(self, row: Dict[str, Any]) -> Dict[str, Any]:

0 commit comments

Comments (0)