diff --git a/README.md b/README.md index a0a382e..301a691 100644 --- a/README.md +++ b/README.md @@ -28,9 +28,9 @@ should convince readers of the significance and relevance of your task. ## Authors & contributors -| Name | Roles | Orcid | Twitter | Github | Email | Linkedin | +| Name | Roles | Twitter | Email | Orcid | Github | Linkedin | |:---|:---|:---|:---|:---|:---|:---| -| John Doe | author, maintainer | 0000-0000-0000-0000 | johndoe | johndoe | john@doe.me | johndoe | +| John Doe | author, maintainer | johndoe | john@doe.me | 0000-0000-0000-0000 | johndoe | johndoe | ## API @@ -38,25 +38,32 @@ should convince readers of the significance and relevance of your task. flowchart TB file_common_ist("Common iST Dataset") comp_data_processor[/"Data processor"/] - file_spatial_dataset("Raw iST Dataset") + file_spatial_unlabelled("Unlabelled Spatial Dataset") + file_spatial_solution("Spatial Segmentation Solution") file_scrnaseq_reference("scRNA-seq Reference") comp_control_method[/"Control Method"/] comp_method[/"Method"/] + comp_output_processor[/"Output processor"/] comp_metric[/"Metric"/] file_prediction("Predicted data") + file_processed_prediction("Processed prediction") file_score("Score") file_common_scrnaseq("Common SC Dataset") file_common_ist---comp_data_processor - comp_data_processor-->file_spatial_dataset + comp_data_processor-->file_spatial_unlabelled + comp_data_processor-->file_spatial_solution comp_data_processor-->file_scrnaseq_reference - file_spatial_dataset---comp_control_method - file_spatial_dataset---comp_method - file_scrnaseq_reference---comp_control_method - file_scrnaseq_reference---comp_metric + file_spatial_unlabelled---comp_control_method + file_spatial_unlabelled---comp_method + file_spatial_unlabelled---comp_output_processor + file_spatial_solution---comp_control_method + file_spatial_solution---comp_metric comp_control_method-->file_prediction comp_method-->file_prediction + comp_output_processor-->file_processed_prediction comp_metric-->file_score - file_prediction---comp_metric + file_prediction---comp_output_processor + file_processed_prediction---comp_metric file_common_scrnaseq---comp_data_processor ``` @@ -175,22 +182,24 @@ Arguments: |:---|:---|:---| | `--input_sp` | `file` | An unprocessed spatial imaging dataset stored as a zarr file. | | `--input_sc` | `file` | An unprocessed dataset as output by a dataset loader. | -| `--output_spatial_dataset` | `file` | (*Output*) A spatial transcriptomics dataset, preprocessed for this benchmark. | +| `--output_spatial_unlabelled` | `file` | (*Output*) A spatial transcriptomics dataset, preprocessed for this benchmark. | +| `--output_spatial_solution` | `file` | (*Output*) Ground truth segmentation labels for evaluating spatial segmentation methods. | | `--output_scrnaseq_reference` | `file` | (*Output*) A single-cell reference dataset, preprocessed for this benchmark. | -## File format: Raw iST Dataset +## File format: Unlabelled Spatial Dataset A spatial transcriptomics dataset, preprocessed for this benchmark. Example file: -`resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.zarr` +`resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr` Description: -This dataset contains preprocessed images, labels, points, shapes, and -tables for spatial transcriptomics data. +This dataset contains preprocessed images and transcript point clouds +for spatial transcriptomics data. Ground truth segmentation labels are +intentionally excluded to prevent methods from cheating. Format: @@ -198,9 +207,7 @@ Format: SpatialData object images: 'morphology_mip' - labels: 'cell_labels', 'nucleus_labels' points: 'transcripts' - shapes: 'cell_boundaries', 'nucleus_boundaries' tables: 'table' coordinate_systems: 'global' @@ -212,16 +219,9 @@ Data structure: *images* -| Name | Description | -|:-----------------|:--------------------| -| `morphology_mip` | The raw image data. | - -*labels* - -| Name | Description | -|:-----------------|:---------------------------------------| -| `cell_labels` | (*Optional*) Cell segmentation labels. | -| `nucleus_labels` | (*Optional*) Cell segmentation labels. | +| Name | Description | +|:-----------------|:---------------------------------------------------------| +| `morphology_mip` | The raw morphology image (maximum intensity projection). | *points* @@ -233,26 +233,9 @@ Data structure: | `y` | `float` | y-coordinate of the point. | | `z` | `float` | (*Optional*) z-coordinate of the point. | | `feature_name` | `categorical` | Name of the feature. | -| `cell_id` | `integer` | (*Optional*) Unique identifier of the cell. | -| `nucleus_id` | `integer` | (*Optional*) Unique identifier of the nucleus. | -| `cell_type` | `string` | (*Optional*) Cell type of the cell. | | `qv` | `float` | (*Optional*) Quality value of the point. | | `transcript_id` | `long` | Unique identifier of the transcript. | -| `overlaps_nucleus` | `boolean` | (*Optional*) Whether the point overlaps with a nucleus. | - -*shapes* - -`cell_boundaries`: Cell boundaries. - -| Column | Type | Description | -|:-----------|:---------|:-------------------------------| -| `geometry` | `object` | Geometry of the cell boundary. | - -`nucleus_boundaries`: Nucleus boundaries. - -| Column | Type | Description | -|:-----------|:---------|:----------------------------------| -| `geometry` | `object` | Geometry of the nucleus boundary. | +| `overlaps_nucleus` | `boolean` | (*Optional*) Whether the point overlaps with the nucleus (derived from morphology). | *tables* @@ -260,10 +243,6 @@ Data structure: | Slot | Type | Description | |:---|:---|:---| -| `obs["cell_id"]` | `string` | A unique identifier for the cell. | -| `var["gene_ids"]` | `string` | Unique identifier for the gene. | -| `var["feature_types"]` | `string` | Type of the feature. | -| `obsm["spatial"]` | `double` | Spatial coordinates of the cell. | | `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | | `uns["dataset_name"]` | `string` | A human-readable name for the dataset. | | `uns["dataset_url"]` | `string` | Link to the original source of the dataset. | @@ -271,7 +250,7 @@ Data structure: | `uns["dataset_summary"]` | `string` | Short description of the dataset. | | `uns["dataset_description"]` | `string` | Long description of the dataset. | | `uns["dataset_organism"]` | `string` | The organism of the sample in the dataset. | -| `uns["segmentation_id"]` | `string` | A unique identifier for the segmentation. | +| `uns["orig_dataset_id"]` | `string` | The identifier of the original dataset from which this dataset was derived (if applicable). | *coordinate_systems* @@ -281,6 +260,71 @@ Data structure: +## File format: Spatial Segmentation Solution + +Ground truth segmentation labels for evaluating spatial segmentation +methods. + +Example file: +`resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_solution.zarr` + +Description: + +This dataset contains the ground truth cell and nucleus segmentation +labels, cell boundaries, and a reference table matching each cell to its +label region. + +Format: + +
+ + SpatialData object + labels: 'cell_labels', 'nucleus_labels' + shapes: 'cell_boundaries', 'nucleus_boundaries' + tables: 'table' + +
+ +Data structure: + +
+ +*labels* + +| Name | Description | +|:-----------------|:-------------------------------------------------------| +| `cell_labels` | Ground truth cell segmentation labels. | +| `nucleus_labels` | (*Optional*) Ground truth nucleus segmentation labels. | + +*shapes* + +`cell_boundaries`: Ground truth cell boundary shapes. + +| Column | Type | Description | +|:-----------|:---------|:-------------------------------| +| `geometry` | `object` | Geometry of the cell boundary. | + +`nucleus_boundaries`: Ground truth nucleus boundary shapes. + +| Column | Type | Description | +|:-----------|:---------|:----------------------------------| +| `geometry` | `object` | Geometry of the nucleus boundary. | + +*tables* + +`table`: Reference cell metadata table. + +| Slot | Type | Description | +|:---|:---|:---| +| `obs["cell_id"]` | `integer` | Unique cell identifier, matching instance IDs in the label images. | +| `obs["region"]` | `string` | Name of the label image this cell belongs to (e.g. ‘cell_labels’). | +| `obs["cell_area"]` | `double` | (*Optional*) Area of the cell in pixels. | +| `obs["transcript_counts"]` | `integer` | (*Optional*) Total number of transcripts assigned to this cell. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["orig_dataset_id"]` | `string` | The identifier of the original dataset from which this dataset was derived (if applicable). | + +
+ ## File format: scRNA-seq Reference A single-cell reference dataset, preprocessed for this benchmark. @@ -347,7 +391,7 @@ Arguments: | Name | Type | Description | |:---|:---|:---| | `--input` | `file` | A spatial transcriptomics dataset, preprocessed for this benchmark. | -| `--input_scrnaseq_reference` | `file` | A single-cell reference dataset, preprocessed for this benchmark. | +| `--input_solution` | `file` | Ground truth segmentation labels for evaluating spatial segmentation methods. | | `--output` | `file` | (*Output*) A predicted dataset as output by a method. | @@ -367,6 +411,22 @@ Arguments: +## Component type: Output processor + +An output processor for the prediction. + +Arguments: + +
+ +| Name | Type | Description | +|:---|:---|:---| +| `--input_prediction` | `file` | A predicted dataset as output by a method. | +| `--input_spatial_unlabelled` | `file` | A spatial transcriptomics dataset, preprocessed for this benchmark. | +| `--output` | `file` | (*Output*) A processed predicted dataset, ready to be used as input for the evaluation. | + +
+ ## Component type: Metric A task template metric. @@ -377,8 +437,8 @@ Arguments: | Name | Type | Description | |:---|:---|:---| -| `--input_prediction` | `file` | A predicted dataset as output by a method. | -| `--input_scrnaseq_reference` | `file` | A single-cell reference dataset, preprocessed for this benchmark. | +| `--input_prediction` | `file` | A processed predicted dataset, ready to be used as input for the evaluation. | +| `--input_solution` | `file` | Ground truth segmentation labels for evaluating spatial segmentation methods. | | `--output` | `file` | (*Output*) File indicating the score of a metric. | @@ -388,7 +448,7 @@ Arguments: A predicted dataset as output by a method. Example file: -`resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.h5ad` +`resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.zarr` Format: @@ -416,13 +476,59 @@ Data structure: | Slot | Type | Description | |:--------------------|:---------|:-------------------------------------| -| `obs["cell_id"]` | `string` | Cell ID. | -| `obs["region"]` | `string` | Region. | | `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | | `uns["method_id"]` | `string` | A unique identifier for the method. | +## File format: Processed prediction + +A processed predicted dataset, ready to be used as input for the +evaluation. + +Example file: +`resources_test/task_spatial_segmentation/mouse_brain_combined/processed_prediction.h5ad` + +Format: + +
+ + SpatialData object + labels: 'segmentation' + tables: 'table' + +
+ +Data structure: + +
+ +*labels* + +| Name | Description | +|:---------------|:--------------------------| +| `segmentation` | Segmentation of the data. | + +*tables* + +`table`: AnnData table. + +| Slot | Type | Description | +|:---|:---|:---| +| `obs["cell_id"]` | `string` | Cell ID. | +| `obs["region"]` | `string` | Region. | +| `var["feature_id"]` | `string` | (*Optional*) Unique identifier for the feature, usually a ENSEMBL gene id. | +| `var["feature_name"]` | `string` | A human-readable name for the feature, usually a gene symbol. | +| `var["hvg"]` | `boolean` | Whether or not the feature is considered to be a ‘highly variable gene’. | +| `layers["counts"]` | `integer` | Raw counts. | +| `layers["normalized"]` | `double` | Normalized expression values. | +| `layers["normalized_log"]` | `double` | Log1p normalized expression values. | +| `layers["normalized_log_scaled"]` | `double` | Log1p normalized expression values scaled to unit variance and zero mean. | +| `uns["dataset_id"]` | `string` | A unique identifier for the dataset. | +| `uns["method_id"]` | `string` | A unique identifier for the method. | + +
+ ## File format: Score File indicating the score of a metric. diff --git a/common b/common index 91a35a2..f0816e1 160000 --- a/common +++ b/common @@ -1 +1 @@ -Subproject commit 91a35a2e808da7029e222456c0e3ca70ab3a06dd +Subproject commit f0816e178a2b44749fdfb1a9cdfc76887dcf7462 diff --git a/scripts/create_resources/resources.sh b/scripts/create_resources/resources.sh index 52ee226..4a921f8 100755 --- a/scripts/create_resources/resources.sh +++ b/scripts/create_resources/resources.sh @@ -16,9 +16,9 @@ exit 1 cat > /tmp/params.yaml << 'HERE' input_states: s3://openproblems-data/resources/datasets/**/state.yaml -rename_keys: 'input:output_dataset' +rename_keys: 'input_spatial_unlabelled:output_spatial_unlabelled,input_spatial_solution:output_spatial_solution,input_scrnaseq_reference:output_scrnaseq_reference' output_state: '$id/state.yaml' -settings: '{"output_spatial_dataset": "$id/output_spatial_dataset.zarr", "output_scrnaseq": "$id/output_scrnaseq.h5ad"}' +settings: '{"output_spatial_unlabelled": "$id/output_spatial_unlabelled.zarr", "output_spatial_solution": "$id/output_spatial_solution.zarr", "output_scrnaseq": "$id/output_scrnaseq.h5ad"}' publish_dir: s3://openproblems-data/resources/task_template/datasets/ HERE diff --git a/scripts/create_resources/test_resources.sh b/scripts/create_resources/test_resources.sh index b11d437..7f5c0e5 100755 --- a/scripts/create_resources/test_resources.sh +++ b/scripts/create_resources/test_resources.sh @@ -24,7 +24,8 @@ mkdir -p $DATASET_DIR viash run src/data_processors/process_dataset/config.vsh.yaml -- \ --input_sp $RAW_DATA/2023_10x_mouse_brain_xenium_rep1/dataset.zarr \ --input_sc $RAW_DATA/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad \ - --output_spatial_dataset $DATASET_DIR/spatial_dataset.zarr \ + --output_spatial_unlabelled $DATASET_DIR/spatial_unlabelled.zarr \ + --output_spatial_solution $DATASET_DIR/spatial_solution.zarr \ --output_scrnaseq_reference $DATASET_DIR/scrnaseq_reference.h5ad \ --dataset_id mouse_brain_combined \ --dataset_name "Test data mouse brain combined 2023 tenx Xenium replicate 1 2023 Yao scRNAseq" \ @@ -36,8 +37,14 @@ viash run src/data_processors/process_dataset/config.vsh.yaml -- \ # run one method viash run src/methods/cellpose/config.vsh.yaml -- \ - --input $DATASET_DIR/spatial_dataset.zarr \ - --output $DATASET_DIR/prediction.h5ad + --input $DATASET_DIR/spatial_unlabelled.zarr \ + --output $DATASET_DIR/prediction.zarr + +# run prediction processor +viash run src/data_processors/process_prediction/config.vsh.yaml -- \ + --input_prediction $DATASET_DIR/prediction.zarr \ + --input_spatial_unlabelled $DATASET_DIR/spatial_unlabelled.zarr \ + --output $DATASET_DIR/processed_prediction.zarr # run one metric # TODO: implement this! @@ -49,9 +56,11 @@ viash run src/methods/cellpose/config.vsh.yaml -- \ # write manual state.yaml. this is not actually necessary but you never know it might be useful cat > $DATASET_DIR/state.yaml << HERE id: $DATASET_ID -spatial_dataset: spatial_dataset.zarr +spatial_unlabelled: spatial_unlabelled.zarr +spatial_solution: spatial_solution.zarr scrnaseq_reference: scrnaseq_reference.h5ad prediction: prediction.h5ad +processed_prediction: processed_prediction.zarr score: score.h5ad HERE diff --git a/scripts/run_benchmark/run_full_local.sh b/scripts/run_benchmark/run_full_local.sh index 26bba56..808df7f 100755 --- a/scripts/run_benchmark/run_full_local.sh +++ b/scripts/run_benchmark/run_full_local.sh @@ -31,7 +31,7 @@ publish_dir="resources/results/${RUN_ID}" # write the parameters to file cat > /tmp/params.yaml << HERE input_states: resources/datasets/**/state.yaml -rename_keys: 'input_spatial_dataset:output_spatial_dataset,input_scrnaseq_reference:output_scrnaseq_reference' +rename_keys: 'input_spatial_unlabelled:output_spatial_unlabelled,input_spatial_solution:output_spatial_solution,input_scrnaseq_reference:output_scrnaseq_reference' output_state: "state.yaml" publish_dir: "$publish_dir" HERE diff --git a/scripts/run_benchmark/run_full_seqeracloud.sh b/scripts/run_benchmark/run_full_seqeracloud.sh index 3c31e74..745aa77 100755 --- a/scripts/run_benchmark/run_full_seqeracloud.sh +++ b/scripts/run_benchmark/run_full_seqeracloud.sh @@ -23,7 +23,7 @@ publish_dir="s3://openproblems-data/resources/task_template/results/${RUN_ID}" # write the parameters to file cat > /tmp/params.yaml << HERE input_states: s3://openproblems-data/resources/task_template/datasets/**/state.yaml -rename_keys: 'input_spatial_dataset:output_spatial_dataset,input_scrnaseq_reference:output_scrnaseq_reference' +rename_keys: 'input_spatial_unlabelled:output_spatial_unlabelled,input_spatial_solution:output_spatial_solution,input_scrnaseq_reference:output_scrnaseq_reference' output_state: "state.yaml" publish_dir: "$publish_dir" HERE diff --git a/src/api/comp_control_method.yaml b/src/api/comp_control_method.yaml index 3f4fa2e..990602d 100644 --- a/src/api/comp_control_method.yaml +++ b/src/api/comp_control_method.yaml @@ -13,11 +13,11 @@ info: in the task. arguments: - name: --input - __merge__: file_spatial_dataset.yaml + __merge__: file_spatial_unlabelled.yaml required: true direction: input - - name: "--input_scrnaseq_reference" - __merge__: file_scrnaseq_reference.yaml + - name: "--input_solution" + __merge__: file_spatial_solution.yaml direction: input required: true - name: --output diff --git a/src/api/comp_data_processor.yaml b/src/api/comp_data_processor.yaml index ecd3f9c..69deecc 100644 --- a/src/api/comp_data_processor.yaml +++ b/src/api/comp_data_processor.yaml @@ -19,8 +19,12 @@ argument_groups: direction: input - name: Outputs arguments: - - name: "--output_spatial_dataset" - __merge__: file_spatial_dataset.yaml + - name: "--output_spatial_unlabelled" + __merge__: file_spatial_unlabelled.yaml + direction: output + required: true + - name: "--output_spatial_solution" + __merge__: file_spatial_solution.yaml direction: output required: true - name: "--output_scrnaseq_reference" @@ -80,4 +84,3 @@ test_resources: dest: resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2 - type: python_script path: /common/component_tests/run_and_check_output.py - diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml index 633a8a1..2acf7ef 100644 --- a/src/api/comp_method.yaml +++ b/src/api/comp_method.yaml @@ -8,7 +8,7 @@ info: A method to predict the task effects. arguments: - name: --input - __merge__: file_spatial_dataset.yaml + __merge__: file_spatial_unlabelled.yaml required: true direction: input - name: --output diff --git a/src/api/comp_metric.yaml b/src/api/comp_metric.yaml index a7470e9..6102919 100644 --- a/src/api/comp_metric.yaml +++ b/src/api/comp_metric.yaml @@ -8,11 +8,11 @@ info: A metric for evaluating method predictions. arguments: - name: "--input_prediction" - __merge__: file_prediction.yaml + __merge__: file_processed_prediction.yaml direction: input required: true - - name: "--input_scrnaseq_reference" - __merge__: file_scrnaseq_reference.yaml + - name: "--input_solution" + __merge__: file_spatial_solution.yaml direction: input required: true - name: "--output" diff --git a/src/api/comp_output_processor.yaml b/src/api/comp_output_processor.yaml new file mode 100644 index 0000000..a5db8ec --- /dev/null +++ b/src/api/comp_output_processor.yaml @@ -0,0 +1,30 @@ +namespace: "data_processors" +info: + type: data_processor + type_info: + label: Output processor + summary: An output processor for the prediction. + description: | + A component for a prediction dataset into a processed prediction dataset that can be evaluated by the metrics. +argument_groups: + - name: Inputs + arguments: + - name: "--input_prediction" + __merge__: file_prediction.yaml + required: true + direction: input + - name: "--input_spatial_unlabelled" + __merge__: file_spatial_unlabelled.yaml + required: true + direction: input + - name: Outputs + arguments: + - name: "--output" + __merge__: file_processed_prediction.yaml + direction: output + required: true +test_resources: + - type: python_script + path: /common/component_tests/run_and_check_output.py + - path: /resources_test/task_spatial_segmentation/mouse_brain_combined + dest: resources_test/task_spatial_segmentation/mouse_brain_combined diff --git a/src/api/file_prediction.yaml b/src/api/file_prediction.yaml index b1fc443..e72794b 100644 --- a/src/api/file_prediction.yaml +++ b/src/api/file_prediction.yaml @@ -1,6 +1,6 @@ #TODO: Change to the required and/or optional fields of the anndata type: file -example: "resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.h5ad" +example: "resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.zarr" label: "Predicted data" summary: A predicted dataset as output by a method. info: @@ -16,15 +16,6 @@ info: name: table description: AnnData table required: true - obs: - - type: string - name: cell_id - description: Cell ID - required: true - - type: string - name: region - description: Region - required: true uns: - type: string name: dataset_id diff --git a/src/api/file_processed_prediction.yaml b/src/api/file_processed_prediction.yaml new file mode 100644 index 0000000..77dee47 --- /dev/null +++ b/src/api/file_processed_prediction.yaml @@ -0,0 +1,67 @@ +type: file +example: "resources_test/task_spatial_segmentation/mouse_brain_combined/processed_prediction.h5ad" +label: "Processed prediction" +summary: A processed predicted dataset, ready to be used as input for the evaluation. +info: + format: + type: spatialdata_zarr + labels: + - type: object + name: "segmentation" + description: Segmentation of the data + required: true + tables: + - type: anndata + name: table + description: AnnData table + required: true + # TODO: what is it that this component adds to the anndata? + layers: + - type: integer + name: counts + description: Raw counts + required: true + - type: double + name: normalized + description: Normalized expression values + required: true + - type: double + name: normalized_log + description: Log1p normalized expression values + required: true + - type: double + name: normalized_log_scaled + description: Log1p normalized expression values scaled to unit variance and zero mean + required: true + obs: + - type: string + name: cell_id + description: Cell ID + required: true + - type: string + name: region + description: Region + required: true + # .... cell info ... ? + var: + - type: string + name: feature_id + description: Unique identifier for the feature, usually a ENSEMBL gene id. + required: false + - type: string + name: feature_name + description: A human-readable name for the feature, usually a gene symbol. + required: true + - type: boolean + name: hvg + description: Whether or not the feature is considered to be a 'highly variable gene' + required: true + uns: + - type: string + name: dataset_id + description: "A unique identifier for the dataset" + required: true + - type: string + name: method_id + description: "A unique identifier for the method" + required: true diff --git a/src/api/file_scrnaseq_reference.yaml b/src/api/file_scrnaseq_reference.yaml index 9b855fd..c1da992 100644 --- a/src/api/file_scrnaseq_reference.yaml +++ b/src/api/file_scrnaseq_reference.yaml @@ -1,7 +1,5 @@ type: file example: "resources_test/task_spatial_segmentation/mouse_brain_combined/scrnaseq_reference.h5ad" -# TODO: revert to the original example once file exists -# example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.h5ad" label: "scRNA-seq Reference" summary: A single-cell reference dataset, preprocessed for this benchmark. description: | diff --git a/src/api/file_spatial_solution.yaml b/src/api/file_spatial_solution.yaml new file mode 100644 index 0000000..84944dd --- /dev/null +++ b/src/api/file_spatial_solution.yaml @@ -0,0 +1,69 @@ +type: file +example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_solution.zarr" +label: "Spatial Segmentation Solution" +summary: Ground truth segmentation labels for evaluating spatial segmentation methods. +description: | + This dataset contains the ground truth cell and nucleus segmentation labels, + cell boundaries, and a reference table matching each cell to its label region. +info: + format: + type: spatialdata_zarr + labels: + - type: object + name: "cell_labels" + description: Ground truth cell segmentation labels + required: true + - type: object + name: "nucleus_labels" + description: Ground truth nucleus segmentation labels + required: false + shapes: + - type: dataframe + name: "cell_boundaries" + description: Ground truth cell boundary shapes + required: false + columns: + - type: object + name: "geometry" + required: true + description: Geometry of the cell boundary + - type: dataframe + name: "nucleus_boundaries" + description: Ground truth nucleus boundary shapes + required: false + columns: + - type: object + name: "geometry" + required: true + description: Geometry of the nucleus boundary + tables: + - type: anndata + name: "table" + description: Reference cell metadata table + required: true + obs: + - type: integer + name: cell_id + description: Unique cell identifier, matching instance IDs in the label images + required: true + - type: string + name: region + description: Name of the label image this cell belongs to (e.g. 'cell_labels') + required: true + - type: double + name: cell_area + description: Area of the cell in pixels + required: false + - type: integer + name: transcript_counts + description: Total number of transcripts assigned to this cell + required: false + uns: + - type: string + name: dataset_id + description: A unique identifier for the dataset + required: true + - type: string + name: orig_dataset_id + required: true + description: The identifier of the original dataset from which this dataset was derived (if applicable) diff --git a/src/api/file_spatial_dataset.yaml b/src/api/file_spatial_unlabelled.yaml similarity index 53% rename from src/api/file_spatial_dataset.yaml rename to src/api/file_spatial_unlabelled.yaml index 4c5253e..74056ce 100644 --- a/src/api/file_spatial_dataset.yaml +++ b/src/api/file_spatial_unlabelled.yaml @@ -1,28 +1,18 @@ type: file -example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.zarr" -# TODO: revert to the original example once file exists -# example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.zarr" -label: "Raw iST Dataset" +example: "resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr" +label: "Unlabelled Spatial Dataset" summary: A spatial transcriptomics dataset, preprocessed for this benchmark. description: | - This dataset contains preprocessed images, labels, points, shapes, and tables for spatial transcriptomics data. + This dataset contains preprocessed images and transcript point clouds for spatial transcriptomics data. + Ground truth segmentation labels are intentionally excluded to prevent methods from cheating. info: format: type: spatialdata_zarr images: - type: object name: morphology_mip - description: The raw image data + description: The raw morphology image (maximum intensity projection) required: true - labels: - - type: object - name: "cell_labels" - description: Cell segmentation labels - required: false - - type: object - name: "nucleus_labels" - description: Cell segmentation labels - required: false points: - type: dataframe name: transcripts @@ -45,18 +35,6 @@ info: name: feature_name required: true description: Name of the feature - - type: integer - name: "cell_id" - required: false - description: Unique identifier of the cell - - type: integer - name: "nucleus_id" - required: false - description: Unique identifier of the nucleus - - type: string - name: "cell_type" - required: false - description: Cell type of the cell - type: float name: qv required: false @@ -68,26 +46,7 @@ info: - type: boolean name: overlaps_nucleus required: false - description: Whether the point overlaps with a nucleus - shapes: - - type: dataframe - name: "cell_boundaries" - description: Cell boundaries - required: false - columns: - - type: object - name: "geometry" - required: true - description: Geometry of the cell boundary - - type: dataframe - name: "nucleus_boundaries" - description: Nucleus boundaries - required: false - columns: - - type: object - name: "geometry" - required: true - description: Geometry of the nucleus boundary + description: Whether the point overlaps with the nucleus (derived from morphology) tables: - type: anndata name: "table" @@ -123,29 +82,9 @@ info: required: true description: The organism of the sample in the dataset - type: string - name: segmentation_id - required: true - multiple: true - description: A unique identifier for the segmentation - obs: - - type: string - name: cell_id - required: true - description: A unique identifier for the cell - var: - - type: string - name: gene_ids - required: true - description: Unique identifier for the gene - - type: string - name: feature_types - required: true - description: Type of the feature - obsm: - - type: double - name: spatial + name: orig_dataset_id required: true - description: Spatial coordinates of the cell + description: The identifier of the original dataset from which this dataset was derived (if applicable) coordinate_systems: - type: object name: global diff --git a/src/control_methods/true_labels/config.vsh.yaml b/src/control_methods/true_labels/config.vsh.yaml index 0a71e50..7d47a02 100644 --- a/src/control_methods/true_labels/config.vsh.yaml +++ b/src/control_methods/true_labels/config.vsh.yaml @@ -1,59 +1,24 @@ -# The API specifies which type of component this is. -# It contains specifications for: -# - The input/output files -# - Common parameters -# - A unit test __merge__: ../../api/comp_control_method.yaml -# A unique identifier for your component (required). -# Can contain only lowercase letters or underscores. name: true_labels - -# A relatively short label, used when rendering visualisations (required) label: True Labels -# A one sentence summary of how this method works (required). Used when -# rendering summary tables. -summary: "a positive control, solution labels are copied 1 to 1 to the predicted data." -# A multi-line description of how this component works (required). Used -# when rendering reference documentation. +summary: "A positive control where the ground truth cell_labels are used as the prediction." description: | - A positive control, where the solution labels are copied 1 to 1 to the predicted data. - -# Metadata for your component -info: - # Which normalisation method this component prefers to use (required). - preferred_normalization: counts - -# Component-specific parameters (optional) -# arguments: -# - name: "--n_neighbors" -# type: "integer" -# default: 5 -# description: Number of neighbors to use. + A positive control where the ground truth cell_labels segmentation is copied + directly as the prediction. This represents the upper bound of performance + for any segmentation method. -# Resources required to run the component resources: - # The script of your component (required) - type: python_script path: script.py - # Additional resources your script needs (optional) - # - type: file - # path: weights.pt engines: - # Specifications for the Docker image for this component. - type: docker image: openproblems/base_python:1 - # Add custom dependencies here (optional). For more information, see - # https://viash.io/reference/config/engines/docker/#setup . - # setup: - # - type: python - # packages: scib==1.1.5 + __merge__: ../../base/setup_spatialdata_partial.yaml runners: - # This platform allows running the component natively - type: executable - # Allows turning the component into a Nextflow module / pipeline. - type: nextflow directives: label: [midtime, lowmem, lowcpu] diff --git a/src/control_methods/true_labels/script.py b/src/control_methods/true_labels/script.py index 935f3af..107804a 100644 --- a/src/control_methods/true_labels/script.py +++ b/src/control_methods/true_labels/script.py @@ -1,13 +1,11 @@ import anndata as ad +import spatialdata as sd ## VIASH START -# Note: this section is auto-generated by viash at runtime. To edit it, make changes -# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. par = { - 'input_train': 'resources_test/task_template/cxg_mouse_pancreas_atlas/train.h5ad', - 'input_test': 'resources_test/task_template/cxg_mouse_pancreas_atlas/test.h5ad', - 'input_solution': 'resources_test/task_template/cxg_mouse_pancreas_atlas/solution.h5ad', - 'output': 'output.h5ad' + 'input': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr', + 'input_solution': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_solution.zarr', + 'output': 'output.zarr' } meta = { 'name': 'true_labels' @@ -15,31 +13,23 @@ ## VIASH END print('Reading input files', flush=True) -input_train = ad.read_h5ad(par['input_train']) -input_test = ad.read_h5ad(par['input_test']) -input_solution = ad.read_h5ad(par['input_solution']) +sdata_input = sd.read_zarr(par['input']) +sdata_solution = sd.read_zarr(par['input_solution']) -print('Preprocess data', flush=True) -# ... preprocessing ... - -print('Train model', flush=True) -# ... train model ... - -print('Generate predictions', flush=True) -# ... generate predictions ... -obs_label_pred = input_solution.obs["label"] - -print("Write output AnnData to file", flush=True) -output = ad.AnnData( - uns={ - 'dataset_id': input_train.uns['dataset_id'], - 'normalization_id': input_train.uns['normalization_id'], - 'method_id': meta['name'] +print('Copying ground truth cell_labels as prediction', flush=True) +output = sd.SpatialData( + labels={ + 'segmentation': sdata_solution['cell_labels'] }, - obs={ - 'label_pred': obs_label_pred + tables={ + 'table': ad.AnnData( + uns={ + 'dataset_id': sdata_solution.tables['table'].uns['dataset_id'], + 'method_id': meta['name'] + } + ) } ) -output.obs_names = input_test.obs_names -output.write_h5ad(par['output'], compression='gzip') +print('Writing output', flush=True) +output.write(par['output'], overwrite=True) diff --git a/src/data_processors/process_dataset/config.vsh.yaml b/src/data_processors/process_dataset/config.vsh.yaml index 0ea6508..96cc85c 100644 --- a/src/data_processors/process_dataset/config.vsh.yaml +++ b/src/data_processors/process_dataset/config.vsh.yaml @@ -5,10 +5,6 @@ name: process_dataset argument_groups: - name: "Processing parameters" arguments: - - name: "--seed" - type: "integer" - description: "A seed for the subsampling." - example: 123 - name: "--span" type: double description: The fraction of the data (cells) used when estimating the variance in the loess model fit if flavor='seurat_v3'. @@ -24,7 +20,6 @@ resources: engines: - type: docker - #image: openproblems/base_pytorch_nvidia:1 # TODO: ideally get gpu image to work image: openproblems/base_python:1 setup: - type: python @@ -37,4 +32,4 @@ runners: - type: executable - type: nextflow directives: - label: [highmem, midcpu, midtime] \ No newline at end of file + label: [midmem, midcpu, midtime] diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py index b04cb33..38f07de 100644 --- a/src/data_processors/process_dataset/script.py +++ b/src/data_processors/process_dataset/script.py @@ -1,4 +1,3 @@ -import random import anndata as ad import spatialdata as sd import scanpy as sc @@ -7,8 +6,9 @@ par = { 'input_sp': 'resources_test/common/2023_10x_mouse_brain_xenium_rep1/dataset.zarr', 'input_sc': 'resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2/dataset.h5ad', - 'output_spatial_dataset': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_spatial_dataset.zarr', - 'output_scrnaseq_reference': 'resources_test/task_spatial_segmentation/mouse_brain_combined/output_scrnaseq_reference.h5ad', + 'output_spatial_unlabelled': 'spatial_unlabelled.zarr', + 'output_spatial_solution': 'spatial_solution.zarr', + 'output_scrnaseq_reference': 'scrnaseq_reference.h5ad', 'span': 0.3, 'seed': 123, 'n_top_genes': 3000, @@ -53,12 +53,6 @@ def sc_processing(adata): ) adata.var.rename(columns={"highly_variable": "hvg"}, inplace=True) - -# set seed if need be -if par["seed"]: - print(f">> Setting seed to {par['seed']}") - random.seed(par["seed"]) - print(">> Load data", flush=True) sc_data = ad.read_h5ad(par["input_sc"]) print(f"single cell data: {sc_data}") @@ -71,7 +65,7 @@ def sc_processing(adata): for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_summary", "dataset_description", "dataset_reference", "dataset_organism"]: sc_data.uns[key] = par[key] -print(">> Writing data", flush=True) +print(">> Writing scrnaseq reference", flush=True) sc_data.write_h5ad(par["output_scrnaseq_reference"], compression="gzip") # read input_sp @@ -79,27 +73,65 @@ def sc_processing(adata): sp_data = sd.read_zarr(par["input_sp"]) print(f"spatial data: {sp_data}") -print(">> Processing spatial data", flush=True) -sp_data_table = sp_data.tables['table'] -print(f"single cell part of spatial data: {sp_data_table}") -sc_processing(sp_data_table) - -if "cell_area" not in sp_data_table.obs: - print(">> Perform scanpy qc for cell area", flush=True) - sc.pp.calculate_qc_metrics(sp_data_table, layer="counts", inplace=True) - -for x in ["transcript_counts", "n_genes_by_counts"]: - if f"ca_normalized_{x}" not in sp_data_table.obs and x in sp_data_table.obs: - print(f">> Perform cell area normalization for {x}", flush=True) - sp_data_table.obs[f'ca_normalized_{x}'] = sp_data_table.obs[f"{x}"] / sp_data_table.obs["cell_area"] - -print(">> Override dataset metadata in .uns", flush=True) -sp_data_table.uns["orig_dataset_id"] = sp_data_table.uns.get("dataset_id", None) -for key in ["dataset_id", "dataset_name", "dataset_url", "dataset_summary", "dataset_description", "dataset_reference", "dataset_organism"]: - sp_data_table.uns[key] = par[key] - -print(f"spatial data: {sp_data}") -print(f"spatial data tables['table']: {sp_data.tables['table']}") +dataset_uns = { + "dataset_id": par["dataset_id"], + "dataset_name": par["dataset_name"], + "dataset_url": par["dataset_url"], + "dataset_summary": par["dataset_summary"], + "dataset_description": par["dataset_description"], + "dataset_reference": par["dataset_reference"], + "dataset_organism": par["dataset_organism"], + "orig_dataset_id": sp_data.tables["table"].uns.get("dataset_id", None), +} -print(">> Writing spatial data", flush=True) -sp_data.write(par["output_spatial_dataset"], overwrite=True) +# --------------------------------------------------------------- +# output_spatial_dataset: image + transcripts (no ground truth) +# --------------------------------------------------------------- +print(">> Building spatial dataset for methods (no ground truth)", flush=True) + +# Strip columns that reveal ground truth cell assignments from transcripts +_GROUND_TRUTH_COLS = {"cell_id", "nucleus_id", "cell_type"} +transcripts = sp_data.points["transcripts"] +clean_transcript_cols = [c for c in transcripts.columns if c not in _GROUND_TRUTH_COLS] +clean_transcripts = transcripts[clean_transcript_cols] + +# Minimal table: just dataset metadata in uns, no per-cell obs +minimal_table = ad.AnnData(uns=dataset_uns) + +output_spatial = sd.SpatialData( + images={"morphology_mip": sp_data.images["morphology_mip"]}, + points={"transcripts": clean_transcripts}, + tables={"table": minimal_table}, +) + +print(">> Writing spatial unlabelled dataset", flush=True) +output_spatial.write(par["output_spatial_unlabelled"], overwrite=True) + +# --------------------------------------------------------------- +# output_spatial_solution: ground truth labels, shapes, reference table +# --------------------------------------------------------------- +print(">> Building spatial solution (ground truth)", flush=True) + +ref_table = sp_data.tables["table"] +solution_obs = ref_table.obs[["cell_id", "region"]].copy() +for extra_col in ["cell_area", "transcript_counts"]: + if extra_col in ref_table.obs.columns: + solution_obs[extra_col] = ref_table.obs[extra_col] + +solution_table = ad.AnnData( + obs=solution_obs, + uns={ + "dataset_id": par["dataset_id"], + "orig_dataset_id": sp_data.tables["table"].uns.get("dataset_id", None), + "spatialdata_attrs": ref_table.uns["spatialdata_attrs"], + }, +) + +output_solution = sd.SpatialData( + labels={k: v for k, v in sp_data.labels.items()}, + shapes={k: v for k, v in sp_data.shapes.items()}, + tables={"table": solution_table}, +) + +print(">> Writing spatial solution", flush=True) +output_solution.write(par["output_spatial_solution"], overwrite=True) diff --git a/src/data_processors/process_prediction/config.vsh.yaml b/src/data_processors/process_prediction/config.vsh.yaml new file mode 100644 index 0000000..7e473ac --- /dev/null +++ b/src/data_processors/process_prediction/config.vsh.yaml @@ -0,0 +1,23 @@ +__merge__: ../../api/comp_output_processor.yaml + +name: process_prediction + +resources: + - type: python_script + path: script.py + +engines: + - type: docker + image: openproblems/base_python:1 + setup: + - type: python + packages: [scikit-learn, scikit-misc] + __merge__: + - /src/base/setup_spatialdata_partial.yaml + - type: native + +runners: + - type: executable + - type: nextflow + directives: + label: [midmem, midcpu, midtime] diff --git a/src/data_processors/process_prediction/script.py b/src/data_processors/process_prediction/script.py new file mode 100644 index 0000000..ceed915 --- /dev/null +++ b/src/data_processors/process_prediction/script.py @@ -0,0 +1,94 @@ +import numpy as np +import xarray as xr +import anndata as ad +import pandas as pd +import spatialdata as sd +import scanpy as sc + +## VIASH START +par = { + 'input_prediction': 'resources_test/task_spatial_segmentation/mouse_brain_combined/prediction.zarr', + 'input_spatial_unlabelled': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr', + 'output': 'output.zarr' +} +## VIASH END + +print(">> Reading input files", flush=True) +sdata_pred = sd.read_zarr(par["input_prediction"]) +sdata_sp = sd.read_zarr(par["input_spatial_unlabelled"]) + +dataset_id = sdata_sp.tables["table"].uns["dataset_id"] +method_id = sdata_pred.tables["table"].uns["method_id"] + +print(">> Transforming transcripts to global coordinate system", flush=True) +transcripts = sd.transform(sdata_sp["transcripts"], to_coordinate_system="global") + +# Adjust for any translation applied to the segmentation +trans = sd.transformations.get_transformation( + sdata_pred["segmentation"], get_all=True +)["global"].inverse() +transcripts = sd.transform(transcripts, trans, "global") + +print(">> Assigning transcripts to cells via label image lookup", flush=True) +y_coords = transcripts.y.compute().to_numpy(dtype=np.int64) +x_coords = transcripts.x.compute().to_numpy(dtype=np.int64) + +if isinstance(sdata_pred["segmentation"], xr.DataTree): + label_image = sdata_pred["segmentation"]["scale0"].image.to_numpy() +else: + label_image = sdata_pred["segmentation"].to_numpy() + +# Clip coordinates to valid label image bounds +y_coords = np.clip(y_coords, 0, label_image.shape[0] - 1) +x_coords = np.clip(x_coords, 0, label_image.shape[1] - 1) + +cell_ids = label_image[y_coords, x_coords] + +# NOTE: Is it useful to build a cxg count matrix? Is this used downstream? +print(">> Building cell x gene count matrix", flush=True) +feature_names = transcripts["feature_name"].compute().to_numpy() + +transcript_df = pd.DataFrame({"cell_id": cell_ids, "feature_name": feature_names}) +# Remove background (cell_id == 0) +transcript_df = transcript_df[transcript_df["cell_id"] != 0] + +count_matrix = ( + transcript_df.groupby(["cell_id", "feature_name"]) + .size() + .unstack(fill_value=0) +) + +obs = pd.DataFrame( + {"cell_id": count_matrix.index.astype(str), "region": pd.Categorical(["segmentation"] * len(count_matrix))}, + index=count_matrix.index.astype(str), +) +var = pd.DataFrame(index=count_matrix.columns.astype(str)) +var.index.name = "feature_name" + +table = ad.AnnData(X=count_matrix.values.astype(np.float32), obs=obs, var=var) +table.layers["counts"] = table.X.copy() + +print(">> Normalizing counts", flush=True) +sc.pp.normalize_total(table, target_sum=1e4) +table.layers["normalized"] = table.X.copy() + +sc.pp.log1p(table) +table.layers["normalized_log"] = table.X.copy() + +sc.pp.scale(table) +table.layers["normalized_log_scaled"] = table.X.copy() + +table.uns["dataset_id"] = dataset_id +table.uns["method_id"] = method_id +table.uns["spatialdata_attrs"] = { + "instance_key": "cell_id", + "region": ["segmentation"], + "region_key": "region", +} + +print(">> Writing output", flush=True) +output = sd.SpatialData( + labels={"segmentation": sdata_pred["segmentation"]}, + tables={"table": table}, +) +output.write(par["output"], overwrite=True) diff --git a/src/methods/cellpose/script.py b/src/methods/cellpose/script.py index 6ebae72..3f0236b 100644 --- a/src/methods/cellpose/script.py +++ b/src/methods/cellpose/script.py @@ -10,7 +10,7 @@ ## VIASH START par = { - 'input': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_dataset.zarr', + 'input': 'resources_test/task_spatial_segmentation/mouse_brain_combined/spatial_unlabelled.zarr', 'output': 'prediction.zarr' } meta = { @@ -47,24 +47,23 @@ def convert_to_lower_dtype(arr): print('Cellpose segmentation finished, post-processing results', flush=True) masks = convert_to_lower_dtype(masks) -print('Segmentation done, preparing output', flush=True) -sd_output = sd.SpatialData() -data_array = xr.DataArray(masks, name='segmentation', dims=('y', 'x')) -parsed = Labels2DModel.parse(data_array, transformations=transformation) -sd_output.labels['segmentation'] = parsed - -cell_ids = np.unique(masks)[1:] # exclude background (0) -table = ad.AnnData( - obs=pd.DataFrame( - {'cell_id': cell_ids.astype(str), 'region': 'segmentation'}, - index=cell_ids.astype(str), - ), - uns={ - 'dataset_id': sdata.tables['table'].uns['dataset_id'], - 'method_id': meta['name'] +print('Creating output data structure', flush=True) +sd_output = sd.SpatialData( + labels={ + 'segmentation': Labels2DModel.parse( + xr.DataArray(masks, name='segmentation', dims=('y', 'x')), + transformations=transformation + ) + }, + tables={ + 'table': ad.AnnData( + uns={ + 'dataset_id': sdata.tables['table'].uns['dataset_id'], + 'method_id': meta['name'] + } + ) } ) -sd_output.tables['table'] = table print('Saving output', flush=True) if os.path.exists(par["output"]): diff --git a/src/workflows/process_datasets/config.vsh.yaml b/src/workflows/process_datasets/config.vsh.yaml index c71286a..454c63f 100644 --- a/src/workflows/process_datasets/config.vsh.yaml +++ b/src/workflows/process_datasets/config.vsh.yaml @@ -14,8 +14,12 @@ argument_groups: direction: input - name: Outputs arguments: - - name: "--output_spatial_dataset" - __merge__: /src/api/file_spatial_dataset.yaml + - name: "--output_spatial_unlabelled" + __merge__: /src/api/file_spatial_unlabelled.yaml + direction: output + required: true + - name: "--output_spatial_solution" + __merge__: /src/api/file_spatial_solution.yaml direction: output required: true - name: "--output_scrnaseq_reference" diff --git a/src/workflows/process_datasets/main.nf b/src/workflows/process_datasets/main.nf index 947a8f1..2e0c2d4 100644 --- a/src/workflows/process_datasets/main.nf +++ b/src/workflows/process_datasets/main.nf @@ -44,13 +44,14 @@ workflow run_wf { "input_sc": "input_sc" ], toState: [ - output_spatial_dataset: "output_spatial_dataset", + output_spatial_unlabelled: "output_spatial_unlabelled", + output_spatial_solution: "output_spatial_solution", output_scrnaseq_reference: "output_scrnaseq_reference" ] ) // only output the files for which an output file was specified - | setState(["output_spatial_dataset", "output_scrnaseq_reference"]) + | setState(["output_spatial_unlabelled", "output_spatial_solution", "output_scrnaseq_reference"]) emit: output_ch diff --git a/src/workflows/run_benchmark/config.vsh.yaml b/src/workflows/run_benchmark/config.vsh.yaml index 4ab5f83..9bfee25 100644 --- a/src/workflows/run_benchmark/config.vsh.yaml +++ b/src/workflows/run_benchmark/config.vsh.yaml @@ -4,13 +4,17 @@ namespace: workflows argument_groups: - name: Inputs arguments: - - name: "--input_spatial_dataset" - __merge__: /src/api/file_spatial_dataset.yaml - direction: output + - name: "--input_spatial_unlabelled" + __merge__: /src/api/file_spatial_unlabelled.yaml + direction: input + required: true + - name: "--input_spatial_solution" + __merge__: /src/api/file_spatial_solution.yaml + direction: input required: true - name: "--input_scrnaseq_reference" __merge__: /src/api/file_scrnaseq_reference.yaml - direction: output + direction: input required: true - name: Outputs arguments: diff --git a/src/workflows/run_benchmark/main.nf b/src/workflows/run_benchmark/main.nf index fe25140..9cac4f6 100644 --- a/src/workflows/run_benchmark/main.nf +++ b/src/workflows/run_benchmark/main.nf @@ -33,7 +33,7 @@ workflow run_wf { // extract the dataset metadata | extract_uns_metadata.run( - fromState: [input: "input_solution"], + fromState: [input: "input_spatial_unlabelled"], toState: { id, output, state -> state + [ dataset_uns: readYaml(output.output).uns @@ -52,14 +52,8 @@ workflow run_wf { // use the 'filter' argument to only run a method on the normalisation the component is asking for filter: { id, state, comp -> - def norm = state.dataset_uns.normalization_id - def pref = comp.config.info.preferred_normalization - // if the preferred normalisation is none at all, - // we can pass whichever dataset we want - def norm_check = (norm == "log_cp10k" && pref == "counts") || norm == pref def method_check = !state.method_ids || state.method_ids.contains(comp.config.name) - - method_check && norm_check + method_check }, // define a new 'id' by appending the method name to the dataset id @@ -70,11 +64,10 @@ workflow run_wf { // use 'fromState' to fetch the arguments the component requires from the overall state fromState: { id, state, comp -> def new_args = [ - input_train: state.input_train, - input_test: state.input_test + input: state.input_spatial_unlabelled ] if (comp.config.info.type == "control_method") { - new_args.input_solution = state.input_solution + new_args.input_solution = state.input_spatial_solution } new_args }, @@ -96,7 +89,7 @@ workflow run_wf { }, // use 'fromState' to fetch the arguments the component requires from the overall state fromState: [ - input_solution: "input_solution", + input_solution: "input_solution", input_prediction: "method_output" ], // use 'toState' to publish that component's outputs to the overall state @@ -136,10 +129,8 @@ workflow run_wf { // extract the dataset metadata meta_ch = dataset_ch - // only keep one of the normalization methods - | filter{ id, state -> - state.dataset_uns.normalization_id == "log_cp10k" - } + // only keep one entry per dataset + | filter{ id, state -> true } | joinStates { ids, states -> // store the dataset metadata in a file def dataset_uns = states.collect{state ->