diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 752921067..000000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "yaml.schemas": { - "common/schemas/api_component_spec.yaml": "**/api/comp_*.yaml", - "common/schemas/api_file_format.yaml": "**/api/file_*.yaml", - "common/schemas/task_config.yaml": "_viash.yaml", - "common/schemas/task_method.yaml": "**/methods/**/config.vsh.yaml", - "common/schemas/task_control_method.yaml": "**/control_methods/**/config.vsh.yaml", - "common/schemas/task_metric.yaml": "**/metrics/**/config.vsh.yaml" - } -} diff --git a/CHANGELOG.md b/CHANGELOG.md index 317c6394f..c461b66dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -77,6 +77,16 @@ * Removed `methods/cytovi` from the benchmark. The implementation is preserved in the `add-cytovi-implementation` branch to be revisited in the near future (PR #124). +* Updated `metrics/lisi`: + * iLISI is now computed per biological group. Groups where batch is fully confounded by group are skipped and return NaN. + * Added `helper.py` with `compute_ilisi_per_group` and `compute_clisi` functions. + +* Updated `methods/harmonypy` to use the PyTorch GPU backend (harmonypy>=0.2.0): + * Switched Docker image to `openproblems/base_pytorch_nvidia:1.0.0`. + * Added `device=None` to `run_harmony` for automatic GPU detection (CUDA -> MPS -> CPU). + * Removed epsilon workaround as numerical stability is now handled internally by harmonypy. + * Updated Nextflow label to `lowcpu, gpu`. + * Updated file schema (PR #18): * Add is_control obs to indicate whether a cell should be used as control when correcting batch effect. * Removed donor_id obs from unintegrated censored. 
diff --git a/common b/common index 67da19a36..91a35a2e8 160000 --- a/common +++ b/common @@ -1 +1 @@ -Subproject commit 67da19a36ae56ea068804d15ccadec88a06da920 +Subproject commit 91a35a2e808da7029e222456c0e3ca70ab3a06dd diff --git a/src/methods/harmonypy/config.vsh.yaml b/src/methods/harmonypy/config.vsh.yaml index 06ccceac5..9072bf929 100644 --- a/src/methods/harmonypy/config.vsh.yaml +++ b/src/methods/harmonypy/config.vsh.yaml @@ -22,14 +22,15 @@ resources: engines: - type: docker - image: openproblems/base_python:1 + image: openproblems/base_pytorch_nvidia:1.0.0 setup: - type: python packages: - - harmonypy + - harmonypy>=0.2.0 + - torch runners: - type: executable - type: nextflow directives: - label: [midtime,midmem,midcpu] + label: [midtime,midmem,lowcpu,gpu] diff --git a/src/methods/harmonypy/script.py b/src/methods/harmonypy/script.py index 9f2e634ae..99af3052c 100644 --- a/src/methods/harmonypy/script.py +++ b/src/methods/harmonypy/script.py @@ -30,14 +30,11 @@ print("Run harmony", flush=True) -# TODO numerical instability in kmeans causing problem with harmony. 
-# so adding a very small value to all entries to make sure there are no zeros -epsilon = 1e-20 - out = harmonypy.run_harmony( - data_mat=adata_to_correct.layers["preprocessed"] + epsilon, + data_mat=adata_to_correct.layers["preprocessed"], meta_data=adata_to_correct.obs, vars_use="batch_str", + device=None, ) # have to add in the uncorrected markers as well diff --git a/src/metrics/lisi/config.vsh.yaml b/src/metrics/lisi/config.vsh.yaml index f6ebdddf6..253f0cf5c 100644 --- a/src/metrics/lisi/config.vsh.yaml +++ b/src/metrics/lisi/config.vsh.yaml @@ -95,6 +95,7 @@ resources: # The script of your component (required) - type: python_script path: script.py + - path: helper.py - path: /src/utils/helper_functions.py engines: diff --git a/src/metrics/lisi/helper.py b/src/metrics/lisi/helper.py new file mode 100644 index 000000000..f61e22527 --- /dev/null +++ b/src/metrics/lisi/helper.py @@ -0,0 +1,72 @@ +import anndata as ad +import numpy as np +import scib_metrics as sm + + +def compute_ilisi_per_group(integrated: ad.AnnData, split_label: str): + """ + Compute iLISI per biological group, skipping groups where batch is confounded. + + Args: + integrated (ad.AnnData): Integrated AnnData with obs columns 'group' and 'batch'. + split_label (str): Label for the split (used in print statements). + + Returns: + ilisi_per_group (dict): Per-group normalised iLISI scalar values. + Empty dict if all groups are skipped due to batch-group confounding. + ilisi_per_cell_per_group (dict): Per-group arrays of raw per-cell iLISI values. + Empty dict if all groups are skipped due to batch-group confounding. 
+ """ + groups = integrated.obs["group"].unique() + ilisi_per_group = {} + ilisi_per_cell_per_group = {} + + for group in groups: + group_adata = integrated[integrated.obs["group"] == group] + n_batches_in_group = len(group_adata.obs["batch"].unique()) + + if n_batches_in_group < 2: + print( + f"{split_label} - Group {group}: batch is confounded by group " + f"(only 1 batch present). Skipping.", + flush=True, + ) + continue + + knn = sm.nearest_neighbors.pynndescent( + group_adata.layers["integrated"], n_neighbors=100, random_state=0 + ) + ilisi_per_cell = sm.lisi_knn(knn, group_adata.obs["batch"]) + ilisi = (np.nanmedian(ilisi_per_cell) - 1) / (n_batches_in_group - 1) + + ilisi_per_group[group] = ilisi + ilisi_per_cell_per_group[group] = ilisi_per_cell + + if len(ilisi_per_group) == 0: + print( + f"{split_label}: batch is confounded by group in all groups. " + f"Returning NaN for iLISI.", + flush=True, + ) + + return ilisi_per_group, ilisi_per_cell_per_group + + +def compute_clisi(integrated: ad.AnnData): + """ + Compute cLISI for an integrated AnnData object. + + Args: + integrated (ad.AnnData): Integrated AnnData with obs column 'cell_type'. + + Returns: + clisi (float): Normalised cLISI score. + clisi_per_cell (np.ndarray): Raw per-cell cLISI values. + """ + n_celltypes = len(integrated.obs["cell_type"].unique()) + knn = sm.nearest_neighbors.pynndescent( + integrated.layers["integrated"], n_neighbors=100, random_state=0 + ) + clisi_per_cell = sm.lisi_knn(knn, integrated.obs["cell_type"]) + clisi = (n_celltypes - np.nanmedian(clisi_per_cell)) / (n_celltypes - 1) + return clisi, clisi_per_cell diff --git a/src/metrics/lisi/script.py b/src/metrics/lisi/script.py index a6f1c61e3..b40d0aba1 100644 --- a/src/metrics/lisi/script.py +++ b/src/metrics/lisi/script.py @@ -2,7 +2,6 @@ import anndata as ad import numpy as np -import scib_metrics as sm ## VIASH START # Note: this section is auto-generated by viash at runtime. 
To edit it, make changes @@ -20,6 +19,7 @@ ## VIASH END sys.path.append(meta["resources_dir"]) +import helper as lisi_helper from helper_functions import ( get_obs_var_for_integrated, remove_unlabelled, @@ -48,55 +48,72 @@ integrated_s2 = remove_unlabelled(integrated_s2) print("Compute metrics", flush=True) -n_batches = len(integrated_s1.obs.batch.unique()) -n_celltypes = len(integrated_s1.obs.cell_type.unique()) - -print("Compute iLisi and cLisi for split 1", flush=True) -knn = sm.nearest_neighbors.pynndescent( - integrated_s1.layers["integrated"], n_neighbors=100, random_state=0 +print("Compute iLisi per group for split 1", flush=True) +ilisi_s1_per_group, ilisi_s1_per_cell_per_group = lisi_helper.compute_ilisi_per_group( + integrated_s1, "split 1" ) -ilisi_s1_per_cell = sm.lisi_knn(knn, integrated_s1.obs.batch) -ilisi_s1 = (np.nanmedian(ilisi_s1_per_cell) - 1) / (n_batches - 1) +print("Compute iLisi per group for split 2", flush=True) +ilisi_s2_per_group, ilisi_s2_per_cell_per_group = lisi_helper.compute_ilisi_per_group( + integrated_s2, "split 2" +) -clisi_s1_per_cell = sm.lisi_knn(knn, integrated_s1.obs.cell_type) -clisi_s1 = (n_celltypes - np.nanmedian(clisi_s1_per_cell)) / (n_celltypes - 1) +# Compute final iLISI as the mean across all groups from both splits. +# If all groups across both splits were skipped (batch confounded by group), return NaN. 
+all_ilisi_values = list(ilisi_s1_per_group.values()) + list(ilisi_s2_per_group.values()) +ilisi = np.nanmean(all_ilisi_values) if len(all_ilisi_values) > 0 else np.nan -print("Compute iLisi and cLisi for split 2", flush=True) -knn = sm.nearest_neighbors.pynndescent( - integrated_s2.layers["integrated"], n_neighbors=100, random_state=0 -) -ilisi_s2_per_cell = sm.lisi_knn(knn, integrated_s2.obs.batch) -ilisi_s2 = (np.nanmedian(ilisi_s2_per_cell) - 1) / (n_batches - 1) +print("Compute cLisi for split 1", flush=True) +clisi_s1, clisi_s1_per_cell = lisi_helper.compute_clisi(integrated_s1) -clisi_s2_per_cell = sm.lisi_knn(knn, integrated_s2.obs.cell_type) -clisi_s2 = (n_celltypes - np.nanmedian(clisi_s2_per_cell)) / (n_celltypes - 1) +print("Compute cLisi for split 2", flush=True) +clisi_s2, clisi_s2_per_cell = lisi_helper.compute_clisi(integrated_s2) -ilisi = np.mean([ilisi_s1, ilisi_s2]) clisi = np.mean([clisi_s1, clisi_s2]) -uns_metric_ids = ["iLisi", "cLisi"] -uns_metric_values = [ilisi, clisi] -print("iLisi split 1:", ilisi_s1, flush=True) -print("iLisi split 2:", ilisi_s2, flush=True) +# Print all results to stdout for logging purposes. 
+for group, val in ilisi_s1_per_group.items(): + print(f"iLisi split 1 group {group}: {val}", flush=True) +for group, val in ilisi_s2_per_group.items(): + print(f"iLisi split 2 group {group}: {val}", flush=True) print("cLisi split 1:", clisi_s1, flush=True) print("cLisi split 2:", clisi_s2, flush=True) print("Mean iLisi:", ilisi, flush=True) print("Mean cLisi:", clisi, flush=True) -print("Metric IDs:", uns_metric_ids, flush=True) -print("Metric values:", uns_metric_values, flush=True) print("Write output AnnData to file", flush=True) -output = ad.AnnData( - uns={ - "dataset_id": integrated_s1.uns["dataset_id"], - "method_id": integrated_s1.uns["method_id"], - "metric_ids": uns_metric_ids, - "metric_values": uns_metric_values, - "ilisi_s1_index": ilisi_s1_per_cell, - "ilisi_s2_index": ilisi_s2_per_cell, - "clisi_s1_index": clisi_s1_per_cell, - "clisi_s2_index": clisi_s2_per_cell, - } -) +# uns layout: +# iLisi = mean iLISI across all groups and both splits. +# cLisi = mean cLISI across both splits. +# ilisi_per_split: {split_1, split_2} -> {group: iLISI score}. +# Groups where batch is fully confounded by group are skipped and absent. +# ilisi_per_cell_per_split: same structure, but values are raw per-cell iLISI score +# clisi_per_split: {split_1, split_2} -> cLISI score for that split. +# clisi_per_cell_per_split: {split_1, split_2} -> raw per-cell cLISI score for that split. 
+uns = { + "dataset_id": integrated_s1.uns["dataset_id"], + "method_id": integrated_s1.uns["method_id"], + "metric_ids": ["iLisi", "cLisi"], + "metric_values": [ilisi, clisi], + "ilisi_per_split": { + "split_1": ilisi_s1_per_group, + "split_2": ilisi_s2_per_group, + }, + "ilisi_per_cell_per_split": { + "split_1": ilisi_s1_per_cell_per_group, + "split_2": ilisi_s2_per_cell_per_group, + }, + "clisi_per_split": { + "split_1": clisi_s1, + "split_2": clisi_s2, + }, + "clisi_per_cell_per_split": { + "split_1": clisi_s1_per_cell, + "split_2": clisi_s2_per_cell, + }, +} + +output = ad.AnnData(uns=uns) output.write_h5ad(par["output"], compression="gzip") + +print(f"Anndata written to {par['output']}", flush=True)