Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions drevalpy/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,7 @@ def cli_main():
"""Command line interface entry point for the drug response evaluation pipeline."""
args = get_parser().parse_args()
main(args)


if __name__ == "__main__":
cli_main()
15 changes: 9 additions & 6 deletions drevalpy/datasets/curvecurator.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,12 @@
import subprocess
import warnings
from pathlib import Path
from typing import Union

import numpy as np
import pandas as pd
import toml
from pandas.core.groupby import DataFrameGroupBy

from drevalpy.datasets.utils import CELL_LINE_IDENTIFIER, DRUG_IDENTIFIER

Expand All @@ -40,7 +42,7 @@ def _prepare_raw_data(curve_df: pd.DataFrame, output_dir: Path, prefix: str = ""
UserWarning,
stacklevel=1,
)
curve_df = curve_df.groupby(["sample", "drug", "dose", "replicate"], as_index=False)["response"].mean()
curve_df = curve_df.groupby(["sample", "drug", "dose", "replicate"], as_index=False)[["response"]].mean()

df = curve_df.pivot(index=["sample", "drug"], columns=pivot_columns, values="response")

Expand Down Expand Up @@ -177,7 +179,7 @@ def ic50(front, back, slope, pec50):
back = model_params_df["Back"].values
slope = model_params_df["Slope"].values
# we need the pEC50 in uM; now it is in M: -log10(EC50[M] * 10^6) = -log10(EC50[M])-6 = pEC50 -6
pec50 = model_params_df["pEC50_curvecurator"].values - 6
pec50 = model_params_df["pEC50_curvecurator"].to_numpy(dtype=float) - 6

model_params_df["IC50_curvecurator"] = ic50(front, back, slope, pec50)
model_params_df["LN_IC50_curvecurator"] = np.log(model_params_df["IC50_curvecurator"].values)
Expand Down Expand Up @@ -233,6 +235,7 @@ def preprocess(input_file: str, output_dir: str, dataset_name: str, cores: int,
if curve_df["nreplicates"].nunique() > 1:
groupby.append("nreplicates")

drug_df_groups: Union[DataFrameGroupBy, list[tuple[str, pd.DataFrame]]]
if len(groupby) > 0:
drug_df_groups = curve_df.groupby(groupby)
else:
Expand Down Expand Up @@ -301,17 +304,17 @@ def postprocess(output_folder: str, dataset_name: str):
with open(output_path / f"{dataset_name}.csv", "w") as f:
first_file = True
for output_file in curvecurator_output_files:
fitted_curve_data = pd.read_csv(output_file, sep="\t", usecols=required_columns).rename(
fitted_curve_data = pd.read_csv(output_file, sep="\t", usecols=list(required_columns.values())).rename(
columns=required_columns
)
fitted_curve_data[[CELL_LINE_IDENTIFIER, DRUG_IDENTIFIER]] = fitted_curve_data.Name.str.split(
fitted_curve_data[[CELL_LINE_IDENTIFIER, DRUG_IDENTIFIER]] = fitted_curve_data["Name"].str.split(
"|", expand=True
)
fitted_curve_data["EC50_curvecurator"] = (
np.power(10, -fitted_curve_data["pEC50_curvecurator"].values) * 10**6
np.power(10, -fitted_curve_data["pEC50_curvecurator"].to_numpy(dtype=float)) * 10**6
) # in CurveCurator 10^-pEC50 = EC50
_calc_ic50(fitted_curve_data)
fitted_curve_data.to_csv(f, index=None, header=first_file, mode="a")
fitted_curve_data.to_csv(f, index=False, header=first_file, mode="a")
first_file = False
f.close()

Expand Down
156 changes: 156 additions & 0 deletions drevalpy/datasets/featurizer/create_ppi_graphs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
"""
Preprocesses PPI network CSV files into graph representations for PPIGraphGNN.

This script takes a dataset name as input, reads the corresponding
PPI network CSV file, and converts it into a torch_geometric.data.Data object.
The PPI CSV should have columns: gene_id_1, gene_id_2, and optionally interaction_score.
"""

import argparse
from pathlib import Path

import pandas as pd
import torch
from torch_geometric.data import Data


def _load_ppi_network(ppi_file: Path, gene_list_file: Path) -> Data:
    """
    Load PPI network from CSV and create a PyTorch Geometric Data object.

    The gene order in the PPI graph will match the order in the gene list file.
    This ensures consistency when gene expression features are set as node features.

    :param ppi_file: Path to the PPI network CSV file with columns [gene_id_1, gene_id_2, (optional) interaction_score]
    :param gene_list_file: Path to the gene list CSV (e.g., landmark_genes_reduced.csv) that defines gene order
    :raises ValueError: If the PPI CSV does not contain the required columns or if no edge matches the gene list
    :return: A Data object representing the PPI network graph
    """
    # Load the gene list to get the ordered list of genes (same as will be used for gene expression)
    gene_list_df = pd.read_csv(gene_list_file)
    if "Symbol" in gene_list_df.columns:
        genes = gene_list_df["Symbol"].tolist()
    elif "gene" in gene_list_df.columns:
        genes = gene_list_df["gene"].tolist()
    else:
        # Gene expression file -> columns are genes; drop the cell-line identifier columns
        genes = [g for g in gene_list_df.columns if g not in ["cellosaurus_id", "cell_line_name"]]

    # Map gene name -> node index; position in the gene list defines the node id
    gene_to_idx = {gene: idx for idx, gene in enumerate(genes)}

    # Load PPI network
    ppi_df = pd.read_csv(ppi_file)

    # Validate columns
    required_cols = {"gene_id_1", "gene_id_2"}
    if not required_cols.issubset(ppi_df.columns):
        raise ValueError(f"PPI CSV must contain columns 'gene_id_1' and 'gene_id_2'. Found: {ppi_df.columns.tolist()}")

    # Vectorized edge construction (replaces a slow per-row iterrows loop):
    # keep only edges whose endpoints both exist in the gene list.
    gene1 = ppi_df["gene_id_1"].astype(str)
    gene2 = ppi_df["gene_id_2"].astype(str)
    mask = gene1.isin(gene_to_idx) & gene2.isin(gene_to_idx)
    idx1 = torch.as_tensor(gene1[mask].map(gene_to_idx).to_numpy(dtype="int64"), dtype=torch.long)
    idx2 = torch.as_tensor(gene2[mask].map(gene_to_idx).to_numpy(dtype="int64"), dtype=torch.long)

    if idx1.numel() == 0:
        raise ValueError("No valid edges found in PPI network (genes don't match gene expression)")

    # Both directions for an undirected graph, interleaved as [i->j, j->i] per
    # input row — same edge ordering the original row-wise loop produced.
    n_edges = idx1.numel()
    edge_index = torch.empty((2, 2 * n_edges), dtype=torch.long)
    edge_index[0, 0::2] = idx1
    edge_index[1, 0::2] = idx2
    edge_index[0, 1::2] = idx2
    edge_index[1, 1::2] = idx1
    edge_index = edge_index.contiguous()

    # Create node feature placeholder (will be filled with gene expression at runtime)
    num_nodes = len(genes)
    x = torch.zeros((num_nodes, 1), dtype=torch.float)

    # Edge attributes: optional interaction scores, duplicated for both edge directions
    if "interaction_score" in ppi_df.columns:
        weights = torch.as_tensor(
            ppi_df.loc[mask, "interaction_score"].astype(float).to_numpy(), dtype=torch.float
        )
        edge_attr = weights.repeat_interleave(2).view(-1, 1)
    else:
        edge_attr = None

    # Store gene names as metadata
    graph = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
    graph.gene_names = genes  # Store for reference

    return graph


def main():
    """
    Run the PPI preprocessing from the command line.

    Parses arguments, locates the PPI CSV and gene list, builds the graph via
    :func:`_load_ppi_network`, and saves it as ``ppi_graph.pt``. Exits with a
    non-zero status when inputs are missing or processing fails, so shell
    pipelines can detect the error (previously these paths returned status 0).
    """
    parser = argparse.ArgumentParser(description="Preprocess PPI network to graph.")
    parser.add_argument("dataset_name", type=str, help="The name of the dataset to process.")
    parser.add_argument("--path_data", type=str, default="data", help="Path to the data folder")
    parser.add_argument(
        "--ppi_file",
        type=str,
        default=None,
        help="Path to PPI CSV file (default: {path_data}/{dataset_name}/ppi_network.csv)",
    )
    parser.add_argument(
        "--gene_list",
        type=str,
        default="gene_expression.csv",
        help="Gene list name to use (default: gene_expression.csv; will take the columns)",
    )
    args = parser.parse_args()

    dataset_name = args.dataset_name
    data_dir = Path(args.path_data).resolve()

    # Determine PPI file path: an explicit --ppi_file wins over the conventional location
    if args.ppi_file:
        ppi_file = Path(args.ppi_file)
    else:
        ppi_file = data_dir / dataset_name / "ppi_network.csv"

    # Gene list file
    gene_list_file = data_dir / dataset_name / args.gene_list
    output_file = data_dir / dataset_name / "ppi_graph.pt"

    if not ppi_file.exists():
        print(f"Error: {ppi_file} not found.")
        raise SystemExit(1)  # fail the process so callers notice the error

    if not gene_list_file.exists():
        print(f"Error: {gene_list_file} not found.")
        print(f"Available gene lists should be in {data_dir / 'meta' / 'gene_lists'}/")
        raise SystemExit(1)

    print(f"Processing PPI network for dataset {dataset_name}...")
    print(f"Using gene list: {args.gene_list}")

    try:
        graph = _load_ppi_network(ppi_file, gene_list_file)
        torch.save(graph, output_file)
        print(f"PPI graph saved to {output_file}")
        print(f" Nodes (genes): {graph.num_nodes}")
        print(f" Edges (interactions): {graph.num_edges}")
        if graph.edge_attr is not None:
            print(" Edge attributes: Yes")
        print(f"\nGene order matches: {args.gene_list}")
        print(f"First 5 genes: {graph.gene_names[:5]}")
    except Exception as e:  # CLI boundary: report the failure and exit non-zero
        print(f"Error processing PPI network: {e}")
        raise SystemExit(1) from e


if __name__ == "__main__":
    main()
4 changes: 2 additions & 2 deletions drevalpy/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def pearson(y_pred: np.ndarray, y_true: np.ndarray) -> float:
if _check_constant_target_or_small_sample(y_true):
return np.nan

return pearsonr(y_pred, y_true)[0]
return pearsonr(y_pred, y_true).statistic


def spearman(y_pred: np.ndarray, y_true: np.ndarray) -> float:
Expand All @@ -72,7 +72,7 @@ def spearman(y_pred: np.ndarray, y_true: np.ndarray) -> float:
if _check_constant_target_or_small_sample(y_true):
return np.nan

return spearmanr(y_pred, y_true)[0]
return spearmanr(y_pred, y_true).statistic


def kendall(y_pred: np.ndarray, y_true: np.ndarray) -> float:
Expand Down
6 changes: 4 additions & 2 deletions drevalpy/models/DIPK/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,14 +53,16 @@ def load_bionic_features(data_path: str, dataset_name: str, gene_add_num: int =
# Aggregate BIONIC features for selected genes
selected_features = [bionic_gene_dict[gene] for gene in top_genes if gene in bionic_gene_dict]
if selected_features:
aggregated_feature = np.mean(selected_features, axis=0)
aggregated_feature = np.mean(np.array(selected_features), axis=0)
else:
# Handle case where no features are found (padding with zeros)
aggregated_feature = np.zeros(next(iter(bionic_gene_dict.values())).shape)

bionic_feature_dict[cell_line] = aggregated_feature

feature_data = {cell_line: {"bionic_features": features} for cell_line, features in bionic_feature_dict.items()}
feature_data = {
str(cell_line): {"bionic_features": features} for cell_line, features in bionic_feature_dict.items()
}
return FeatureDataset(features=feature_data)


Expand Down
73 changes: 45 additions & 28 deletions drevalpy/models/DrugGNN/drug_gnn.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ class DrugGNN(DRPModel):
def __init__(self):
"""Initialize the DrugGNN model."""
super().__init__()
self.model: DrugGNNModule | None = None
self.model = None
self.hyperparameters = {}

@classmethod
Expand Down Expand Up @@ -456,41 +456,58 @@ def load_drug_features(self, data_path: str, dataset_name: str) -> FeatureDatase

return FeatureDataset(features=feature_dict)

def save_model(self, path: str | Path, drug_name=None):
"""Save the model.
def save(self, directory: str) -> None:
"""
Save the trained model, hyperparameters, and gene expression scaler to the given directory.

This enables full reconstruction of the model using `load`.

Files saved:
- model.pt: PyTorch state_dict of the trained model
- hyperparameters.json: Dictionary containing all relevant model hyperparameters

:param path: The path to save the model to.
:param drug_name: The name of the drug.
:raises RuntimeError: If there is no model to save.
:param directory: Target directory to store all model artifacts
"""
if self.model is None:
raise RuntimeError("No model to save.")
path = Path(path)
path = Path(directory)
path.mkdir(parents=True, exist_ok=True)

trainer = pl.Trainer()
trainer.save_checkpoint(path / "model.ckpt", weights_only=True)
torch.save(self.model.state_dict(), path / "model.pt") # noqa: S614

with open(path / "hyperparameters.json", "w") as f:
json.dump(self.hyperparameters, f)

with open(path / "config.json", "w") as f:
json.dump(self.hyperparameters, f, indent=4)
@classmethod
def load(cls, directory: str) -> "DrugGNN":
"""
Load a trained DrugGNN model from the given directory.

def load_model(self, path: str | Path, drug_name=None):
"""Load the model.
This includes:
- model.pt: PyTorch state_dict of the trained model
- hyperparameters.json: Dictionary containing all relevant model hyperparameters

:param path: The path to load the model from.
:param drug_name: The name of the drug.
:param directory: The path to load the model from.
:return: The loaded DrugGNN model.
:raises FileNotFoundError: If any of the required files are not found.
"""
path = Path(path)
path = Path(directory)

config_path = path / "config.json"
with open(config_path) as f:
self.hyperparameters = json.load(f)
hpam_path = path / "hyperparameters.json"
model_path = path / "model.pt"
if not hpam_path.exists() or not model_path.exists():
raise FileNotFoundError(f"Required files not found in {directory}.")

self.model = DrugGNNModule.load_from_checkpoint(
path / "model.ckpt",
num_node_features=self.hyperparameters["num_node_features"],
num_cell_features=self.hyperparameters["num_cell_features"],
hidden_dim=self.hyperparameters.get("hidden_dim", 64),
dropout=self.hyperparameters.get("dropout", 0.2),
learning_rate=self.hyperparameters.get("learning_rate", 0.001),
instance = cls()

with open(hpam_path) as f:
instance.hyperparameters = json.load(f)

instance.model = DrugGNNModule(
num_node_features=instance.hyperparameters["num_node_features"],
num_cell_features=instance.hyperparameters["num_cell_features"],
hidden_dim=instance.hyperparameters.get("hidden_dim", 64),
dropout=instance.hyperparameters.get("dropout", 0.2),
learning_rate=instance.hyperparameters.get("learning_rate", 0.001),
)
instance.model.load_state_dict(torch.load(model_path, weights_only=True))
instance.model.eval()
return instance
Loading
Loading