Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 0 additions & 10 deletions .vscode/settings.json

This file was deleted.

10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,16 @@

* Removed `methods/cytovi` from the benchmark. The implementation is preserved in the `add-cytovi-implementation` branch to be revisited in the near future (PR #124).

* Updated `metrics/lisi`:
* iLISI is now computed per biological group. Groups where batch is fully confounded by group are skipped and return NaN.
* Added `helper.py` with `compute_ilisi_per_group` and `compute_clisi` functions.

* Updated `methods/harmonypy` to use the PyTorch GPU backend (harmonypy>=0.2.0):
* Switched Docker image to `openproblems/base_pytorch_nvidia:1.0.0`.
* Added `device=None` to `run_harmony` for automatic GPU detection (CUDA -> MPS -> CPU).
* Removed epsilon workaround as numerical stability is now handled internally by harmonypy.
* Updated Nextflow label to `lowcpu, gpu`.

* Updated file schema (PR #18):
* Add is_control obs to indicate whether a cell should be used as control when correcting batch effect.
* Removed donor_id obs from unintegrated censored.
Expand Down
2 changes: 1 addition & 1 deletion common
7 changes: 4 additions & 3 deletions src/methods/harmonypy/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,15 @@ resources:

engines:
- type: docker
image: openproblems/base_python:1
image: openproblems/base_pytorch_nvidia:1.0.0
setup:
- type: python
packages:
- harmonypy
- harmonypy>=0.2.0
- torch

runners:
- type: executable
- type: nextflow
directives:
label: [midtime,midmem,midcpu]
label: [midtime,midmem,lowcpu,gpu]
7 changes: 2 additions & 5 deletions src/methods/harmonypy/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,11 @@

print("Run harmony", flush=True)

# TODO numerical instability in kmeans causing problem with harmony.
# so adding a very small value to all entries to make sure there are no zeros
epsilon = 1e-20

out = harmonypy.run_harmony(
data_mat=adata_to_correct.layers["preprocessed"] + epsilon,
data_mat=adata_to_correct.layers["preprocessed"],
meta_data=adata_to_correct.obs,
vars_use="batch_str",
device="cuda",
)

# have to add in the uncorrected markers as well
Expand Down
1 change: 1 addition & 0 deletions src/metrics/lisi/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ resources:
# The script of your component (required)
- type: python_script
path: script.py
- path: helper.py
- path: /src/utils/helper_functions.py

engines:
Expand Down
72 changes: 72 additions & 0 deletions src/metrics/lisi/helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import anndata as ad
import numpy as np
import scib_metrics as sm


def compute_ilisi_per_group(integrated: ad.AnnData, split_label: str):
"""
Compute iLISI per biological group, skipping groups where batch is confounded.

Args:
integrated (ad.AnnData): Integrated AnnData with obs columns 'group' and 'batch'.
split_label (str): Label for the split (used in print statements).

Returns:
ilisi_per_group (dict): Per-group normalised iLISI scalar values.
Empty dict if all groups are skipped due to batch-group confounding.
ilisi_per_cell_per_group (dict): Per-group arrays of raw per-cell iLISI values.
Empty dict if all groups are skipped due to batch-group confounding.
"""
groups = integrated.obs["group"].unique()
ilisi_per_group = {}
ilisi_per_cell_per_group = {}

for group in groups:
group_adata = integrated[integrated.obs["group"] == group]
n_batches_in_group = len(group_adata.obs["batch"].unique())

if n_batches_in_group < 2:
print(
f"{split_label} - Group {group}: batch is confounded by group "
f"(only 1 batch present). Skipping.",
flush=True,
)
continue

knn = sm.nearest_neighbors.pynndescent(
group_adata.layers["integrated"], n_neighbors=100, random_state=0
)
ilisi_per_cell = sm.lisi_knn(knn, group_adata.obs["batch"])
ilisi = (np.nanmedian(ilisi_per_cell) - 1) / (n_batches_in_group - 1)

ilisi_per_group[group] = ilisi
ilisi_per_cell_per_group[group] = ilisi_per_cell

if len(ilisi_per_group) == 0:
print(
f"{split_label}: batch is confounded by group in all groups. "
f"Returning NaN for iLISI.",
flush=True,
)

return ilisi_per_group, ilisi_per_cell_per_group


def compute_clisi(integrated: ad.AnnData):
"""
Compute cLISI for an integrated AnnData object.

Args:
integrated (ad.AnnData): Integrated AnnData with obs column 'cell_type'.

Returns:
clisi (float): Normalised cLISI score.
clisi_per_cell (np.ndarray): Raw per-cell cLISI values.
"""
n_celltypes = len(integrated.obs["cell_type"].unique())
knn = sm.nearest_neighbors.pynndescent(
integrated.layers["integrated"], n_neighbors=100, random_state=0
)
clisi_per_cell = sm.lisi_knn(knn, integrated.obs["cell_type"])
clisi = (n_celltypes - np.nanmedian(clisi_per_cell)) / (n_celltypes - 1)
return clisi, clisi_per_cell
93 changes: 55 additions & 38 deletions src/metrics/lisi/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import anndata as ad
import numpy as np
import scib_metrics as sm

## VIASH START
# Note: this section is auto-generated by viash at runtime. To edit it, make changes
Expand All @@ -20,6 +19,7 @@
## VIASH END

sys.path.append(meta["resources_dir"])
import helper as lisi_helper
from helper_functions import (
get_obs_var_for_integrated,
remove_unlabelled,
Expand Down Expand Up @@ -48,55 +48,72 @@
integrated_s2 = remove_unlabelled(integrated_s2)

print("Compute metrics", flush=True)
n_batches = len(integrated_s1.obs.batch.unique())
n_celltypes = len(integrated_s1.obs.cell_type.unique())

print("Compute iLisi and cLisi for split 1", flush=True)
knn = sm.nearest_neighbors.pynndescent(
integrated_s1.layers["integrated"], n_neighbors=100, random_state=0
print("Compute iLisi per group for split 1", flush=True)
ilisi_s1_per_group, ilisi_s1_per_cell_per_group = lisi_helper.compute_ilisi_per_group(
integrated_s1, "split 1"
)

ilisi_s1_per_cell = sm.lisi_knn(knn, integrated_s1.obs.batch)
ilisi_s1 = (np.nanmedian(ilisi_s1_per_cell) - 1) / (n_batches - 1)
print("Compute iLisi per group for split 2", flush=True)
ilisi_s2_per_group, ilisi_s2_per_cell_per_group = lisi_helper.compute_ilisi_per_group(
integrated_s2, "split 2"
)

clisi_s1_per_cell = sm.lisi_knn(knn, integrated_s1.obs.cell_type)
clisi_s1 = (n_celltypes - np.nanmedian(clisi_s1_per_cell)) / (n_celltypes - 1)
# Compute final iLISI as the mean across all groups from both splits.
# If all groups across both splits were skipped (batch confounded by group), return NaN.
all_ilisi_values = list(ilisi_s1_per_group.values()) + list(ilisi_s2_per_group.values())
ilisi = np.nanmean(all_ilisi_values) if len(all_ilisi_values) > 0 else np.nan

print("Compute iLisi and cLisi for split 2", flush=True)
knn = sm.nearest_neighbors.pynndescent(
integrated_s2.layers["integrated"], n_neighbors=100, random_state=0
)
ilisi_s2_per_cell = sm.lisi_knn(knn, integrated_s2.obs.batch)
ilisi_s2 = (np.nanmedian(ilisi_s2_per_cell) - 1) / (n_batches - 1)
print("Compute cLisi for split 1", flush=True)
clisi_s1, clisi_s1_per_cell = lisi_helper.compute_clisi(integrated_s1)

clisi_s2_per_cell = sm.lisi_knn(knn, integrated_s2.obs.cell_type)
clisi_s2 = (n_celltypes - np.nanmedian(clisi_s2_per_cell)) / (n_celltypes - 1)
print("Compute cLisi for split 2", flush=True)
clisi_s2, clisi_s2_per_cell = lisi_helper.compute_clisi(integrated_s2)

ilisi = np.mean([ilisi_s1, ilisi_s2])
clisi = np.mean([clisi_s1, clisi_s2])
uns_metric_ids = ["iLisi", "cLisi"]
uns_metric_values = [ilisi, clisi]

print("iLisi split 1:", ilisi_s1, flush=True)
print("iLisi split 2:", ilisi_s2, flush=True)
# Print all results to stdout for logging purposes.
for group, val in ilisi_s1_per_group.items():
print(f"iLisi split 1 group {group}: {val}", flush=True)
print(f"iLisi split 2 group {group}: {val}", flush=True)

print("cLisi split 1:", clisi_s1, flush=True)
print("cLisi split 2:", clisi_s2, flush=True)
print("Mean iLisi:", ilisi, flush=True)
print("Mean cLisi:", clisi, flush=True)
print("Metric IDs:", uns_metric_ids, flush=True)
print("Metric values:", uns_metric_values, flush=True)

print("Write output AnnData to file", flush=True)
output = ad.AnnData(
uns={
"dataset_id": integrated_s1.uns["dataset_id"],
"method_id": integrated_s1.uns["method_id"],
"metric_ids": uns_metric_ids,
"metric_values": uns_metric_values,
"ilisi_s1_index": ilisi_s1_per_cell,
"ilisi_s2_index": ilisi_s2_per_cell,
"clisi_s1_index": clisi_s1_per_cell,
"clisi_s2_index": clisi_s2_per_cell,
}
)
# uns layout:
# iLisi = mean iLISI across all groups and both splits.
# cLisi = mean cLISI across both splits.
# ilisi_per_split: {split_1, split_2} -> {group: iLISI score}.
# Groups where batch is fully confounded by group are skipped and absent.
# ilisi_per_cell_per_split: same structure, but values are raw per-cell iLISI score
# clisi_per_split: {split_1, split_2} -> cLISI score for that split.
# clisi_per_cell_per_split: {split_1, split_2} -> raw per-cell cLISI score for that split.
uns = {
"dataset_id": integrated_s1.uns["dataset_id"],
"method_id": integrated_s1.uns["method_id"],
"metric_ids": ["iLisi", "cLisi"],
"metric_values": [ilisi, clisi],
"ilisi_per_split": {
"split_1": ilisi_s1_per_group,
"split_2": ilisi_s2_per_group,
},
"ilisi_per_cell_per_split": {
"split_1": ilisi_s1_per_cell_per_group,
"split_2": ilisi_s2_per_cell_per_group,
},
"clisi_per_split": {
"split_1": clisi_s1,
"split_2": clisi_s2,
},
"clisi_per_cell_per_split": {
"split_1": clisi_s1_per_cell,
"split_2": clisi_s2_per_cell,
},
}

output = ad.AnnData(uns=uns)
output.write_h5ad(par["output"], compression="gzip")

print(f"Anndata written to {par['output']}", flush=True)
Loading