Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions drevalpy/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,7 @@ def cli_main():
"""Command line interface entry point for the drug response evaluation pipeline."""
args = get_parser().parse_args()
main(args)


if __name__ == "__main__":
cli_main()
15 changes: 9 additions & 6 deletions drevalpy/datasets/curvecurator.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,12 @@
import subprocess
import warnings
from pathlib import Path
from typing import Union

import numpy as np
import pandas as pd
import toml
from pandas.core.groupby import DataFrameGroupBy

from drevalpy.datasets.utils import CELL_LINE_IDENTIFIER, DRUG_IDENTIFIER

Expand All @@ -40,7 +42,7 @@ def _prepare_raw_data(curve_df: pd.DataFrame, output_dir: Path, prefix: str = ""
UserWarning,
stacklevel=1,
)
curve_df = curve_df.groupby(["sample", "drug", "dose", "replicate"], as_index=False)["response"].mean()
curve_df = curve_df.groupby(["sample", "drug", "dose", "replicate"], as_index=False)[["response"]].mean()

df = curve_df.pivot(index=["sample", "drug"], columns=pivot_columns, values="response")

Expand Down Expand Up @@ -177,7 +179,7 @@ def ic50(front, back, slope, pec50):
back = model_params_df["Back"].values
slope = model_params_df["Slope"].values
# we need the pEC50 in uM; now it is in M: -log10(EC50[M] * 10^6) = -log10(EC50[M])-6 = pEC50 -6
pec50 = model_params_df["pEC50_curvecurator"].values - 6
pec50 = model_params_df["pEC50_curvecurator"].to_numpy(dtype=float) - 6

model_params_df["IC50_curvecurator"] = ic50(front, back, slope, pec50)
model_params_df["LN_IC50_curvecurator"] = np.log(model_params_df["IC50_curvecurator"].values)
Expand Down Expand Up @@ -233,6 +235,7 @@ def preprocess(input_file: str, output_dir: str, dataset_name: str, cores: int,
if curve_df["nreplicates"].nunique() > 1:
groupby.append("nreplicates")

drug_df_groups: Union[DataFrameGroupBy, list[tuple[str, pd.DataFrame]]]
if len(groupby) > 0:
drug_df_groups = curve_df.groupby(groupby)
else:
Expand Down Expand Up @@ -301,17 +304,17 @@ def postprocess(output_folder: str, dataset_name: str):
with open(output_path / f"{dataset_name}.csv", "w") as f:
first_file = True
for output_file in curvecurator_output_files:
fitted_curve_data = pd.read_csv(output_file, sep="\t", usecols=required_columns).rename(
fitted_curve_data = pd.read_csv(output_file, sep="\t", usecols=list(required_columns.values())).rename(
columns=required_columns
)
fitted_curve_data[[CELL_LINE_IDENTIFIER, DRUG_IDENTIFIER]] = fitted_curve_data.Name.str.split(
fitted_curve_data[[CELL_LINE_IDENTIFIER, DRUG_IDENTIFIER]] = fitted_curve_data["Name"].str.split(
"|", expand=True
)
fitted_curve_data["EC50_curvecurator"] = (
np.power(10, -fitted_curve_data["pEC50_curvecurator"].values) * 10**6
np.power(10, -fitted_curve_data["pEC50_curvecurator"].to_numpy(dtype=float)) * 10**6
) # in CurveCurator 10^-pEC50 = EC50
_calc_ic50(fitted_curve_data)
fitted_curve_data.to_csv(f, index=None, header=first_file, mode="a")
fitted_curve_data.to_csv(f, index=False, header=first_file, mode="a")
first_file = False
f.close()

Expand Down
156 changes: 156 additions & 0 deletions drevalpy/datasets/featurizer/create_ppi_graphs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
"""
Preprocesses PPI network CSV files into graph representations for PPIGraphGNN.

This script takes a dataset name as input, reads the corresponding
PPI network CSV file, and converts it into a torch_geometric.data.Data object.
The PPI CSV should have columns: gene_id_1, gene_id_2, and optionally interaction_score.
"""

import argparse
from pathlib import Path

import pandas as pd
import torch
from torch_geometric.data import Data


def _load_ppi_network(ppi_file: Path, gene_list_file: Path) -> Data:
    """
    Load PPI network from CSV and create a PyTorch Geometric Data object.

    The gene order in the PPI graph will match the order in the gene list file.
    This ensures consistency when gene expression features are set as node features.

    :param ppi_file: Path to the PPI network CSV file with columns [gene_id_1, gene_id_2, (optional) interaction_score]
    :param gene_list_file: Path to the gene list CSV (e.g., landmark_genes_reduced.csv) that defines gene order
    :raises ValueError: If the PPI CSV does not contain the required columns or if no edge matches the gene list
    :return: A Data object representing the PPI network graph
    """
    # Load the gene list to get the ordered list of genes (same as will be used for gene expression)
    gene_list_df = pd.read_csv(gene_list_file)
    if "Symbol" in gene_list_df.columns:
        genes = gene_list_df["Symbol"].tolist()
    elif "gene" in gene_list_df.columns:
        genes = gene_list_df["gene"].tolist()
    else:
        # Gene expression file -> columns are genes; drop the cell-line identifier columns
        genes = [g for g in gene_list_df.columns if g not in ["cellosaurus_id", "cell_line_name"]]

    # Map gene name -> node index; position in the gene list defines the node id
    gene_to_idx = {gene: idx for idx, gene in enumerate(genes)}

    # Load PPI network
    ppi_df = pd.read_csv(ppi_file)

    # Validate columns
    required_cols = {"gene_id_1", "gene_id_2"}
    if not required_cols.issubset(ppi_df.columns):
        raise ValueError(f"PPI CSV must contain columns 'gene_id_1' and 'gene_id_2'. Found: {ppi_df.columns.tolist()}")

    # Vectorized edge construction (replaces a slow per-row iterrows loop):
    # keep only edges whose endpoints both exist in the gene list.
    gene1 = ppi_df["gene_id_1"].astype(str)
    gene2 = ppi_df["gene_id_2"].astype(str)
    mask = gene1.isin(gene_to_idx) & gene2.isin(gene_to_idx)
    idx1 = torch.as_tensor(gene1[mask].map(gene_to_idx).to_numpy(dtype="int64"), dtype=torch.long)
    idx2 = torch.as_tensor(gene2[mask].map(gene_to_idx).to_numpy(dtype="int64"), dtype=torch.long)

    if idx1.numel() == 0:
        raise ValueError("No valid edges found in PPI network (genes don't match gene expression)")

    # Both directions for an undirected graph, interleaved as [i->j, j->i] per
    # input row — same edge ordering the original row-wise loop produced.
    n_edges = idx1.numel()
    edge_index = torch.empty((2, 2 * n_edges), dtype=torch.long)
    edge_index[0, 0::2] = idx1
    edge_index[1, 0::2] = idx2
    edge_index[0, 1::2] = idx2
    edge_index[1, 1::2] = idx1
    edge_index = edge_index.contiguous()

    # Create node feature placeholder (will be filled with gene expression at runtime)
    num_nodes = len(genes)
    x = torch.zeros((num_nodes, 1), dtype=torch.float)

    # Edge attributes: optional interaction scores, duplicated for both edge directions
    if "interaction_score" in ppi_df.columns:
        weights = torch.as_tensor(
            ppi_df.loc[mask, "interaction_score"].astype(float).to_numpy(), dtype=torch.float
        )
        edge_attr = weights.repeat_interleave(2).view(-1, 1)
    else:
        edge_attr = None

    # Store gene names as metadata
    graph = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
    graph.gene_names = genes  # Store for reference

    return graph


def main():
    """
    Run the PPI preprocessing from the command line.

    Parses arguments, locates the PPI CSV and gene list, builds the graph via
    :func:`_load_ppi_network`, and saves it as ``ppi_graph.pt``. Exits with a
    non-zero status when inputs are missing or processing fails, so shell
    pipelines can detect the error (previously these paths returned status 0).
    """
    parser = argparse.ArgumentParser(description="Preprocess PPI network to graph.")
    parser.add_argument("dataset_name", type=str, help="The name of the dataset to process.")
    parser.add_argument("--path_data", type=str, default="data", help="Path to the data folder")
    parser.add_argument(
        "--ppi_file",
        type=str,
        default=None,
        help="Path to PPI CSV file (default: {path_data}/{dataset_name}/ppi_network.csv)",
    )
    parser.add_argument(
        "--gene_list",
        type=str,
        default="gene_expression.csv",
        help="Gene list name to use (default: gene_expression.csv; will take the columns)",
    )
    args = parser.parse_args()

    dataset_name = args.dataset_name
    data_dir = Path(args.path_data).resolve()

    # Determine PPI file path: an explicit --ppi_file wins over the conventional location
    if args.ppi_file:
        ppi_file = Path(args.ppi_file)
    else:
        ppi_file = data_dir / dataset_name / "ppi_network.csv"

    # Gene list file
    gene_list_file = data_dir / dataset_name / args.gene_list
    output_file = data_dir / dataset_name / "ppi_graph.pt"

    if not ppi_file.exists():
        print(f"Error: {ppi_file} not found.")
        raise SystemExit(1)  # fail the process so callers notice the error

    if not gene_list_file.exists():
        print(f"Error: {gene_list_file} not found.")
        print(f"Available gene lists should be in {data_dir / 'meta' / 'gene_lists'}/")
        raise SystemExit(1)

    print(f"Processing PPI network for dataset {dataset_name}...")
    print(f"Using gene list: {args.gene_list}")

    try:
        graph = _load_ppi_network(ppi_file, gene_list_file)
        torch.save(graph, output_file)
        print(f"PPI graph saved to {output_file}")
        print(f" Nodes (genes): {graph.num_nodes}")
        print(f" Edges (interactions): {graph.num_edges}")
        if graph.edge_attr is not None:
            print(" Edge attributes: Yes")
        print(f"\nGene order matches: {args.gene_list}")
        print(f"First 5 genes: {graph.gene_names[:5]}")
    except Exception as e:  # CLI boundary: report the failure and exit non-zero
        print(f"Error processing PPI network: {e}")
        raise SystemExit(1) from e


if __name__ == "__main__":
    main()
4 changes: 2 additions & 2 deletions drevalpy/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def pearson(y_pred: np.ndarray, y_true: np.ndarray) -> float:
if _check_constant_target_or_small_sample(y_true):
return np.nan

return pearsonr(y_pred, y_true)[0]
return pearsonr(y_pred, y_true).statistic


def spearman(y_pred: np.ndarray, y_true: np.ndarray) -> float:
Expand All @@ -72,7 +72,7 @@ def spearman(y_pred: np.ndarray, y_true: np.ndarray) -> float:
if _check_constant_target_or_small_sample(y_true):
return np.nan

return spearmanr(y_pred, y_true)[0]
return spearmanr(y_pred, y_true).statistic


def kendall(y_pred: np.ndarray, y_true: np.ndarray) -> float:
Expand Down
6 changes: 4 additions & 2 deletions drevalpy/models/DIPK/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,14 +53,16 @@ def load_bionic_features(data_path: str, dataset_name: str, gene_add_num: int =
# Aggregate BIONIC features for selected genes
selected_features = [bionic_gene_dict[gene] for gene in top_genes if gene in bionic_gene_dict]
if selected_features:
aggregated_feature = np.mean(selected_features, axis=0)
aggregated_feature = np.mean(np.array(selected_features), axis=0)
else:
# Handle case where no features are found (padding with zeros)
aggregated_feature = np.zeros(next(iter(bionic_gene_dict.values())).shape)

bionic_feature_dict[cell_line] = aggregated_feature

feature_data = {cell_line: {"bionic_features": features} for cell_line, features in bionic_feature_dict.items()}
feature_data = {
str(cell_line): {"bionic_features": features} for cell_line, features in bionic_feature_dict.items()
}
return FeatureDataset(features=feature_data)


Expand Down
73 changes: 45 additions & 28 deletions drevalpy/models/DrugGNN/drug_gnn.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ class DrugGNN(DRPModel):
def __init__(self):
"""Initialize the DrugGNN model."""
super().__init__()
self.model: DrugGNNModule | None = None
self.model = None
self.hyperparameters = {}

@classmethod
Expand Down Expand Up @@ -456,41 +456,58 @@ def load_drug_features(self, data_path: str, dataset_name: str) -> FeatureDatase

return FeatureDataset(features=feature_dict)

def save_model(self, path: str | Path, drug_name=None):
"""Save the model.
def save(self, directory: str) -> None:
"""
Save the trained model, hyperparameters, and gene expression scaler to the given directory.

This enables full reconstruction of the model using `load`.

Files saved:
- model.pt: PyTorch state_dict of the trained model
- hyperparameters.json: Dictionary containing all relevant model hyperparameters

:param path: The path to save the model to.
:param drug_name: The name of the drug.
:raises RuntimeError: If there is no model to save.
:param directory: Target directory to store all model artifacts
"""
if self.model is None:
raise RuntimeError("No model to save.")
path = Path(path)
path = Path(directory)
path.mkdir(parents=True, exist_ok=True)

trainer = pl.Trainer()
trainer.save_checkpoint(path / "model.ckpt", weights_only=True)
torch.save(self.model.state_dict(), path / "model.pt") # noqa: S614

with open(path / "hyperparameters.json", "w") as f:
json.dump(self.hyperparameters, f)

with open(path / "config.json", "w") as f:
json.dump(self.hyperparameters, f, indent=4)
@classmethod
def load(cls, directory: str) -> "DrugGNN":
"""
Load a trained DrugGNN model from the given directory.

def load_model(self, path: str | Path, drug_name=None):
"""Load the model.
This includes:
- model.pt: PyTorch state_dict of the trained model
- hyperparameters.json: Dictionary containing all relevant model hyperparameters

:param path: The path to load the model from.
:param drug_name: The name of the drug.
:param directory: The path to load the model from.
:return: The loaded DrugGNN model.
:raises FileNotFoundError: If any of the required files are not found.
"""
path = Path(path)
path = Path(directory)

config_path = path / "config.json"
with open(config_path) as f:
self.hyperparameters = json.load(f)
hpam_path = path / "hyperparameters.json"
model_path = path / "model.pt"
if not hpam_path.exists() or not model_path.exists():
raise FileNotFoundError(f"Required files not found in {directory}.")

self.model = DrugGNNModule.load_from_checkpoint(
path / "model.ckpt",
num_node_features=self.hyperparameters["num_node_features"],
num_cell_features=self.hyperparameters["num_cell_features"],
hidden_dim=self.hyperparameters.get("hidden_dim", 64),
dropout=self.hyperparameters.get("dropout", 0.2),
learning_rate=self.hyperparameters.get("learning_rate", 0.001),
instance = cls()

with open(hpam_path) as f:
instance.hyperparameters = json.load(f)

instance.model = DrugGNNModule(
num_node_features=instance.hyperparameters["num_node_features"],
num_cell_features=instance.hyperparameters["num_cell_features"],
hidden_dim=instance.hyperparameters.get("hidden_dim", 64),
dropout=instance.hyperparameters.get("dropout", 0.2),
learning_rate=instance.hyperparameters.get("learning_rate", 0.001),
)
instance.model.load_state_dict(torch.load(model_path, weights_only=True))
instance.model.eval()
return instance
Loading
Loading