From a7aa12431d6c0e042243f1dbd9e0b7d0e970a2e3 Mon Sep 17 00:00:00 2001 From: tcezard Date: Wed, 11 Mar 2026 15:59:36 +0000 Subject: [PATCH 01/12] New semantic check for the presence of sampleInVCF --- docker/Dockerfile | 2 +- eva_sub_cli/etc/eva_schema.json | 3 +-- .../executables/check_metadata_semantics.py | 11 ++++++++--- eva_sub_cli/nextflow/validation.nf | 16 +++++++++++++--- eva_sub_cli/semantic_metadata.py | 16 +++++++++++++++- 5 files changed, 38 insertions(+), 10 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 1233e0c5..b8c054d3 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,4 +1,4 @@ -FROM --platform=linux/amd64 python:3.10 +FROM python:3.10 ENV vcf_validator_version=0.10.2 ENV NXF_VER=23.10.0 diff --git a/eva_sub_cli/etc/eva_schema.json b/eva_sub_cli/etc/eva_schema.json index 83559c6c..4e28906f 100644 --- a/eva_sub_cli/etc/eva_schema.json +++ b/eva_sub_cli/etc/eva_schema.json @@ -318,7 +318,6 @@ { "required": [ "analysisAlias", - "sampleInVCF", "bioSampleObject" ] } @@ -334,7 +333,7 @@ }, "sampleInVCF": { "type": "string", - "description": "Sample Name used in the VCF file" + "description": "Sample Name used in the VCF file. It is a required field when genotypes are provided." 
}, "bioSampleAccession": { "type": "string", diff --git a/eva_sub_cli/executables/check_metadata_semantics.py b/eva_sub_cli/executables/check_metadata_semantics.py index 68bf75e8..5ed9dbc4 100644 --- a/eva_sub_cli/executables/check_metadata_semantics.py +++ b/eva_sub_cli/executables/check_metadata_semantics.py @@ -1,18 +1,23 @@ import argparse import json +import yaml + from eva_sub_cli.semantic_metadata import SemanticMetadataChecker def main(): arg_parser = argparse.ArgumentParser(description='Perform semantic checks on the metadata') arg_parser.add_argument('--metadata_json', required=True, dest='metadata_json', help='EVA metadata json file') + arg_parser.add_argument('--evidence_type_results', required=True, dest='evidence_type_results', help='Results of the evidence check') arg_parser.add_argument('--output_yaml', required=True, dest='output_yaml', help='Path to the location of the results') args = arg_parser.parse_args() with open(args.metadata_json) as open_json: metadata = json.load(open_json) - checker = SemanticMetadataChecker(metadata) - checker.check_all() - checker.write_result_yaml(args.output_yaml) + with open(args.evidence_type_results) as open_yaml: + evidence_type_results = yaml.safe_load(open_yaml) + checker = SemanticMetadataChecker(metadata, evidence_type_results) + checker.check_all() + checker.write_result_yaml(args.output_yaml) diff --git a/eva_sub_cli/nextflow/validation.nf b/eva_sub_cli/nextflow/validation.nf index 51cec0cb..c19bc0dd 100644 --- a/eva_sub_cli/nextflow/validation.nf +++ b/eva_sub_cli/nextflow/validation.nf @@ -105,17 +105,24 @@ workflow { collect_file_size_and_md5(generate_file_size_and_md5_digests.out.file_size_and_digest_info.collect()) // Task-specific processing + evidence_type_results = null + if (params.tasks.contains(VCF_CHECK)) { check_vcf_valid(vcf_and_ref_ch) evidence_type_check(metadata_json, vcf_files.collect()) + evidence_type_results = evidence_type_check.out.evidence_type_checker_yml } if 
(params.tasks.contains(ASSEMBLY_CHECK)) { check_vcf_reference(vcf_and_ref_ch) insdc_checker(metadata_json, fasta_to_vcfs) } if (params.tasks.contains(METADATA_CHECK)) { + if (!evidence_type_results){ + evidence_type_check(metadata_json, vcf_files.collect()) + evidence_type_results = evidence_type_check.out.evidence_type_checker_yml + } metadata_json_validation(metadata_json) - metadata_semantic_check(metadata_json) + metadata_semantic_check(metadata_json, evidence_type_results) } if (params.tasks.contains(SAMPLE_CHECK)) { sample_name_concordance(metadata_json, vcf_files.collect()) @@ -361,7 +368,7 @@ process metadata_semantic_check { mode: "copy" input: - path(metadata_json) + path(metadata_json), path(evidence_type_results) output: path "metadata_semantic_check.yml", emit: metadata_semantic_check_yml @@ -369,6 +376,9 @@ process metadata_semantic_check { script: """ - $params.python_scripts.semantic_checker --metadata_json $metadata_json --output_yaml metadata_semantic_check.yml > semantic_checker.log 2>&1 + $params.python_scripts.semantic_checker \ + --metadata_json $metadata_json \ + --evidence_type_results $evidence_type_results \ + --output_yaml metadata_semantic_check.yml > semantic_checker.log 2>&1 """ } diff --git a/eva_sub_cli/semantic_metadata.py b/eva_sub_cli/semantic_metadata.py index 5ce27cad..e71ef577 100644 --- a/eva_sub_cli/semantic_metadata.py +++ b/eva_sub_cli/semantic_metadata.py @@ -20,6 +20,7 @@ PARENT_PROJECT_KEY = 'parentProject' CHILD_PROJECTS_KEY = 'childProjects' PEER_PROJECTS_KEY = 'peerProjects' +SAMPLE_IN_VCF_KEY = 'sampleInVcf' BIOSAMPLE_OBJECT_KEY = 'bioSampleObject' BIOSAMPLE_ACCESSION_KEY = 'bioSampleAccession' CHARACTERISTICS_KEY = 'characteristics' @@ -40,9 +41,10 @@ def cast_list(l, type_to_cast=str): class SemanticMetadataChecker(AppLogger): - def __init__(self, metadata, sample_checklist='ERC000011'): + def __init__(self, metadata, evidence_type_results, sample_checklist='ERC000011'): self.sample_checklist = sample_checklist 
self.metadata = metadata + self.evidence_type_results = evidence_type_results self.errors = [] # Caches whether taxonomy code is valid or not, and maps to scientific name if valid self.taxonomy_valid = {} @@ -60,6 +62,7 @@ def check_all(self): self.check_all_analysis_run_accessions() self.check_analysis_alias_coherence() self.check_all_analysis_contain_samples() + self.check_all_samples_have_sample_in_vcf() self.check_hold_date() def check_hold_date(self): @@ -316,3 +319,14 @@ def check_all_analysis_contain_samples(self): json_path = f'/{ANALYSIS_KEY}/{idx}' self.add_error(property=json_path, description=f'No sample found for the analysis. Should have at the least one sample.') + + def check_all_samples_have_sample_in_vcf(self): + for idx, sample in enumerate(self.metadata[SAMPLE_KEY]): + json_path = f'/{SAMPLE_KEY}/{idx}/{SAMPLE_IN_VCF_KEY}' + analysis_aliases = sample.get(ANALYSIS_ALIAS_KEY, []) + if any([self.evidence_type_results.get(analysis_alias, {}).get('evidence_type') != 'allele_frequency' for + analysis_alias in analysis_aliases]): + # SampleInVCF is required + if sample.get(SAMPLE_IN_VCF_KEY) is None or sample.get(SAMPLE_IN_VCF_KEY) == '': + self.add_error(json_path, f'{SAMPLE_IN_VCF_KEY} must be provided when Genotypes are present in the VCF file') + From 37ae5fece9ddba1a1e6db4e8de44b05111617625 Mon Sep 17 00:00:00 2001 From: tcezard Date: Wed, 11 Mar 2026 22:49:11 +0000 Subject: [PATCH 02/12] Fix bug and add tests --- eva_sub_cli/semantic_metadata.py | 2 +- tests/test_semantic_metadata.py | 60 +++++++++++++++++++++++--------- 2 files changed, 44 insertions(+), 18 deletions(-) diff --git a/eva_sub_cli/semantic_metadata.py b/eva_sub_cli/semantic_metadata.py index e71ef577..cd27343d 100644 --- a/eva_sub_cli/semantic_metadata.py +++ b/eva_sub_cli/semantic_metadata.py @@ -20,7 +20,7 @@ PARENT_PROJECT_KEY = 'parentProject' CHILD_PROJECTS_KEY = 'childProjects' PEER_PROJECTS_KEY = 'peerProjects' -SAMPLE_IN_VCF_KEY = 'sampleInVcf' +SAMPLE_IN_VCF_KEY = 
'sampleInVCF' BIOSAMPLE_OBJECT_KEY = 'bioSampleObject' BIOSAMPLE_ACCESSION_KEY = 'bioSampleAccession' CHARACTERISTICS_KEY = 'characteristics' diff --git a/tests/test_semantic_metadata.py b/tests/test_semantic_metadata.py index de22550e..701551d1 100644 --- a/tests/test_semantic_metadata.py +++ b/tests/test_semantic_metadata.py @@ -72,7 +72,7 @@ def test_check_project_exists_and_public_in_ena_true(self): "projectAccession": "PRJEB12345" } } - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) with patch('eva_sub_cli.semantic_metadata.download_xml_from_ena') as m_ena_download: m_ena_download.side_effect = [True, HTTPError('problem downloading', response=Response())] checker.check_all_project_accessions() @@ -84,7 +84,7 @@ def test_check_project_exists_and_public_in_ena_false(self): "projectAccession": "PRJEBXYZ99" } } - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) with patch('eva_sub_cli.semantic_metadata.download_xml_from_ena') as m_ena_download: m_ena_download.side_effect = [HTTPError('problem downloading', response=Response())] checker.check_all_project_accessions() @@ -100,7 +100,7 @@ def test_check_all_project_accessions(self): "childProjects": ["PRJEB456", "PRJEBNA"] }, } - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) with patch('eva_sub_cli.semantic_metadata.download_xml_from_ena') as m_ena_download: m_ena_download.side_effect = [True, True, HTTPError('problem downloading', response=Response())] checker.check_all_project_accessions() @@ -133,7 +133,7 @@ def test_check_all_taxonomy_codes(self): } ] } - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) with patch('eva_sub_cli.semantic_metadata.get_scientific_name_and_common_name') as m_get_sci_name: # Mock should only be called once per taxonomy code m_get_sci_name.side_effect = [('Homo sapiens', 'human'), Exception('problem 
downloading')] @@ -153,7 +153,7 @@ def test_check_uniqueness_analysis_alias(self): {"analysisAlias": "alias1"} ] } - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) checker.check_uniqueness_analysis_alias() self.assertEqual(checker.errors, [ { @@ -194,7 +194,7 @@ def test_check_all_scientific_names(self): } ] } - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) checker.taxonomy_valid = { 1234: False, 9606: "Homo sapiens" @@ -208,7 +208,7 @@ def test_check_all_scientific_names(self): ]) def test_check_existing_biosamples_with_checklist(self): - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) with patch.object(SemanticMetadataChecker, '_get_biosample', side_effect=[valid_sample, ValueError, invalid_sample1, invalid_sample2, old_invalid_sample, old_invalid_sample2]) as m_get_sample: checker.check_existing_biosamples() @@ -234,7 +234,7 @@ def test_check_existing_biosamples_with_checklist(self): self.assertTrue(len(checker.errors) == 5) def test_check_existing_biosamples(self): - checker = SemanticMetadataChecker(metadata, sample_checklist=None) + checker = SemanticMetadataChecker(metadata, {}, sample_checklist=None) with patch.object(NoAuthHALCommunicator, 'follows_link', side_effect=[valid_sample, ValueError, invalid_sample1, invalid_sample2, old_invalid_sample, old_invalid_sample2]) as m_follows_link: checker.check_existing_biosamples() @@ -251,7 +251,7 @@ def test_check_existing_real_biosamples(self): {"bioSampleAccession": "SAMN01894452"} ] } - checker = SemanticMetadataChecker(metadata, sample_checklist=None) + checker = SemanticMetadataChecker(metadata, {}, sample_checklist=None) checker.check_existing_biosamples() print(checker.errors) @@ -282,7 +282,7 @@ def test_check_analysis_alias_coherence(self): } ] } - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) 
checker.check_analysis_alias_coherence() self.assertEqual(checker.errors, [ {'property': '/sample/analysisAlias', 'description': 'alias1 present in Analysis not in Samples'}, @@ -295,7 +295,7 @@ def test_check_all_analysis_run_accessions(self): {'runAccessions': ['SRR000001', 'SRR000002']} ] } - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) checker.check_all_analysis_run_accessions() assert checker.errors == [] @@ -318,7 +318,7 @@ def test_check_all_analysis_contain_samples(self): ] } - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) checker.check_all_analysis_contain_samples() assert checker.errors == [] @@ -334,7 +334,7 @@ def test_check_all_analysis_contain_samples(self): ] } - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) checker.check_all_analysis_contain_samples() self.assertEqual(len(checker.errors), 2) self.assertEqual(checker.errors[0]["property"], "/analysis/1") @@ -350,25 +350,51 @@ def test_check_all_analysis_contain_samples(self): "sample": [] } - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) checker.check_all_analysis_contain_samples() self.assertEqual(len(checker.errors), 1) self.assertEqual(checker.errors[0]["property"], "/analysis/0") self.assertEqual(checker.errors[0]["description"], "No sample found for the analysis. 
Should have at the least one sample.") + def test_check_all_samples_have_sample_in_vcf(self): + # Sample with genotype evidence + sampleInVCF present → no error + metadata = { + "sample": [{"analysisAlias": ["A1"], "sampleInVCF": "sample1"}] + } + checker = SemanticMetadataChecker(metadata, evidence_type_results={'A1': {'evidence_type': 'genotype'}}) + checker.check_all_samples_have_sample_in_vcf() + self.assertEqual(checker.errors, []) + + # Sample with genotype evidence + sampleInVCF missing → error + metadata = { + "sample": [{"analysisAlias": ["A1"]}] + } + checker = SemanticMetadataChecker(metadata, evidence_type_results={'A1': {'evidence_type': 'genotype'}}) + checker.check_all_samples_have_sample_in_vcf() + self.assertEqual(len(checker.errors), 1) + self.assertEqual(checker.errors[0]['property'], '/sample/0/sampleInVCF') + + # Sample with allele_frequency evidence + sampleInVCF missing → no error + metadata = { + "sample": [{"analysisAlias": ["A1"]}] + } + checker = SemanticMetadataChecker(metadata, evidence_type_results={'A1': {'evidence_type': 'allele_frequency'}}) + checker.check_all_samples_have_sample_in_vcf() + self.assertEqual(checker.errors, []) + def test_check_hold_date(self): # No error when holdDate is within 2 years hold_date_ok = (datetime.now() + timedelta(days=365)).strftime('%Y-%m-%d') metadata = {"project": {"holdDate": hold_date_ok}, "sample": [], "analysis": [], "files": []} - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) checker.check_hold_date() self.assertEqual(checker.errors, []) # Error when holdDate is more than 2 years in the future hold_date_bad = (datetime.now() + timedelta(days=365 * 3)).strftime('%Y-%m-%d') metadata = {"project": {"holdDate": hold_date_bad}, "sample": [], "analysis": [], "files": []} - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) checker.check_hold_date() self.assertEqual(checker.errors, [ {'property': 
'/project/holdDate', 'description': 'holdDate is more than 2 years in the future'} @@ -376,6 +402,6 @@ def test_check_hold_date(self): # No error when holdDate is absent metadata = {"project": {}, "sample": [], "analysis": [], "files": []} - checker = SemanticMetadataChecker(metadata) + checker = SemanticMetadataChecker(metadata, {}) checker.check_hold_date() self.assertEqual(checker.errors, []) \ No newline at end of file From 6682b61a3a36db7b2ccadcba6503076b28c1b5a2 Mon Sep 17 00:00:00 2001 From: tcezard Date: Thu, 12 Mar 2026 15:45:12 +0000 Subject: [PATCH 03/12] Ad integration tests --- eva_sub_cli/nextflow/validation.nf | 3 +- .../sample_in_vcf_check/allele_freq.vcf | 9 ++ .../sample_in_vcf_check/fake_fasta.fa | 2 + .../sample_in_vcf_check/genotype.vcf | 7 ++ .../metadata_af_no_sample_in_vcf.json | 1 + .../metadata_af_with_sample_in_vcf.json | 33 +++++++ .../metadata_genotype_no_sample_in_vcf.json | 1 + .../metadata_genotype_with_sample_in_vcf.json | 1 + tests/test_native_validator_sample_in_vcf.py | 95 +++++++++++++++++++ tests/test_utils.py | 4 +- 10 files changed, 154 insertions(+), 2 deletions(-) create mode 100644 tests/resources/sample_in_vcf_check/allele_freq.vcf create mode 100644 tests/resources/sample_in_vcf_check/fake_fasta.fa create mode 100644 tests/resources/sample_in_vcf_check/genotype.vcf create mode 100644 tests/resources/sample_in_vcf_check/metadata_af_no_sample_in_vcf.json create mode 100644 tests/resources/sample_in_vcf_check/metadata_af_with_sample_in_vcf.json create mode 100644 tests/resources/sample_in_vcf_check/metadata_genotype_no_sample_in_vcf.json create mode 100644 tests/resources/sample_in_vcf_check/metadata_genotype_with_sample_in_vcf.json create mode 100644 tests/test_native_validator_sample_in_vcf.py diff --git a/eva_sub_cli/nextflow/validation.nf b/eva_sub_cli/nextflow/validation.nf index c19bc0dd..89c5d58d 100644 --- a/eva_sub_cli/nextflow/validation.nf +++ b/eva_sub_cli/nextflow/validation.nf @@ -368,7 +368,8 @@ process 
metadata_semantic_check { mode: "copy" input: - path(metadata_json), path(evidence_type_results) + path(metadata_json) + path(evidence_type_results) output: path "metadata_semantic_check.yml", emit: metadata_semantic_check_yml diff --git a/tests/resources/sample_in_vcf_check/allele_freq.vcf b/tests/resources/sample_in_vcf_check/allele_freq.vcf new file mode 100644 index 00000000..2e018a96 --- /dev/null +++ b/tests/resources/sample_in_vcf_check/allele_freq.vcf @@ -0,0 +1,9 @@ +##fileformat=VCFv4.1 +##FILTER= +##contig= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 10177 rs367896724 A AC 100 PASS AF=0.11;AN=2000;AC=220 +1 10505 rs548419688 A T 100 PASS AF=0.09;AN=2000;AC=180 diff --git a/tests/resources/sample_in_vcf_check/fake_fasta.fa b/tests/resources/sample_in_vcf_check/fake_fasta.fa new file mode 100644 index 00000000..8e8fc17c --- /dev/null +++ b/tests/resources/sample_in_vcf_check/fake_fasta.fa @@ -0,0 +1,2 @@ +>fasta +AAA diff --git a/tests/resources/sample_in_vcf_check/genotype.vcf b/tests/resources/sample_in_vcf_check/genotype.vcf new file mode 100644 index 00000000..7e316729 --- /dev/null +++ b/tests/resources/sample_in_vcf_check/genotype.vcf @@ -0,0 +1,7 @@ +##fileformat=VCFv4.1 +##FILTER= +##contig= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample1 sample2 sample3 +1 10177 rs367896724 A AC 100 PASS . GT 1|0 0|1 0|0 +1 10505 rs548419688 A T 100 PASS . 
GT 0|0 0|0 0|1 \ No newline at end of file diff --git a/tests/resources/sample_in_vcf_check/metadata_af_no_sample_in_vcf.json b/tests/resources/sample_in_vcf_check/metadata_af_no_sample_in_vcf.json new file mode 100644 index 00000000..7d1b74ab --- /dev/null +++ b/tests/resources/sample_in_vcf_check/metadata_af_no_sample_in_vcf.json @@ -0,0 +1 @@ +{"project": {"title": "Test AF Project", "description": "Project with allele frequency VCF", "taxId": 9606, "centre": "Test Centre"}, "sample": [{"analysisAlias": ["AF1"], "bioSampleAccession": "SAME00001"}], "analysis": [{"analysisTitle": "AF Analysis", "analysisAlias": "AF1", "description": "Allele frequency analysis", "experimentType": "Whole genome sequencing", "referenceGenome": "GCA_000001405.27"}], "files": [{"analysisAlias": "AF1", "fileName": "allele_freq.vcf", "fileType": "vcf", "md5": "b8ab2c9d58e5f430ce70783d8d0a0b88", "fileSize": 458}]} \ No newline at end of file diff --git a/tests/resources/sample_in_vcf_check/metadata_af_with_sample_in_vcf.json b/tests/resources/sample_in_vcf_check/metadata_af_with_sample_in_vcf.json new file mode 100644 index 00000000..6a4ed5ee --- /dev/null +++ b/tests/resources/sample_in_vcf_check/metadata_af_with_sample_in_vcf.json @@ -0,0 +1,33 @@ +{ + "project": { + "title": "Test AF Project", + "description": "Project with allele frequency VCF", + "taxId": 9606, + "centre": "Test Centre" + }, + "sample": [ + { + "analysisAlias": ["AF1"], + "sampleInVCF": "sample1", + "bioSampleAccession": "SAME00001" + } + ], + "analysis": [ + { + "analysisTitle": "AF Analysis", + "analysisAlias": "AF1", + "description": "Allele frequency analysis", + "experimentType": "Whole genome sequencing", + "referenceGenome": "GCA_000001405.27" + } + ], + "files": [ + { + "analysisAlias": "AF1", + "fileName": "allele_freq.vcf", + "fileType": "vcf", + "md5": "b8ab2c9d58e5f430ce70783d8d0a0b88", + "fileSize": 458 + } + ] +} \ No newline at end of file diff --git 
a/tests/resources/sample_in_vcf_check/metadata_genotype_no_sample_in_vcf.json b/tests/resources/sample_in_vcf_check/metadata_genotype_no_sample_in_vcf.json new file mode 100644 index 00000000..78335608 --- /dev/null +++ b/tests/resources/sample_in_vcf_check/metadata_genotype_no_sample_in_vcf.json @@ -0,0 +1 @@ +{"project": {"title": "Test Genotype Project", "description": "Project with genotype VCF", "taxId": 9606, "centre": "Test Centre"}, "sample": [{"analysisAlias": ["GT1"], "bioSampleAccession": "SAME00001"}, {"analysisAlias": ["GT1"], "bioSampleAccession": "SAME00002"}, {"analysisAlias": ["GT1"], "bioSampleAccession": "SAME00003"}], "analysis": [{"analysisTitle": "Genotype Analysis", "analysisAlias": "GT1", "description": "Genotype analysis", "experimentType": "Whole genome sequencing", "referenceGenome": "GCA_000001405.27"}], "files": [{"analysisAlias": "GT1", "fileName": "genotype.vcf", "fileType": "vcf", "md5": "81ca0b3a6e5b657bc2be50085c76546a", "fileSize": 350}]} \ No newline at end of file diff --git a/tests/resources/sample_in_vcf_check/metadata_genotype_with_sample_in_vcf.json b/tests/resources/sample_in_vcf_check/metadata_genotype_with_sample_in_vcf.json new file mode 100644 index 00000000..d4873d6f --- /dev/null +++ b/tests/resources/sample_in_vcf_check/metadata_genotype_with_sample_in_vcf.json @@ -0,0 +1 @@ +{"project": {"title": "Test Genotype Project", "description": "Project with genotype VCF", "taxId": 9606, "centre": "Test Centre"}, "sample": [{"analysisAlias": ["GT1"], "sampleInVCF": "sample1", "bioSampleAccession": "SAME00001"}, {"analysisAlias": ["GT1"], "sampleInVCF": "sample2", "bioSampleAccession": "SAME00002"}, {"analysisAlias": ["GT1"], "sampleInVCF": "sample3", "bioSampleAccession": "SAME00003"}], "analysis": [{"analysisTitle": "Genotype Analysis", "analysisAlias": "GT1", "description": "Genotype analysis", "experimentType": "Whole genome sequencing", "referenceGenome": "GCA_000001405.27"}], "files": [{"analysisAlias": "GT1", 
"fileName": "genotype.vcf", "fileType": "vcf", "md5": "81ca0b3a6e5b657bc2be50085c76546a", "fileSize": 350}]} \ No newline at end of file diff --git a/tests/test_native_validator_sample_in_vcf.py b/tests/test_native_validator_sample_in_vcf.py new file mode 100644 index 00000000..85b93147 --- /dev/null +++ b/tests/test_native_validator_sample_in_vcf.py @@ -0,0 +1,95 @@ +import os +import shutil +import tempfile +from unittest import TestCase + +import pytest +import yaml + +from eva_sub_cli.validators.native_validator import NativeValidator +from eva_sub_cli.validators.validator import METADATA_CHECK +from tests.test_utils import create_mapping_file + + + + +@pytest.mark.integration('You need to install java, nextflow, vcf_validator, vcf_assembly_checker, biovalidator (and md5sum, stat for mac)') +class TestNativeValidatorSampleInVCF(TestCase): + resource_dir = os.path.join(os.path.dirname(__file__), 'resources') + sample_in_vcf_dir = os.path.join(resource_dir, 'sample_in_vcf_check') + fasta_file = os.path.join(sample_in_vcf_dir, 'fake_fasta.fa') + + def setUp(self): + self.test_run_dir = os.path.join(self.resource_dir, 'test_native_run') + os.makedirs(self.test_run_dir, exist_ok=True) + self.mapping_file = os.path.join(self.test_run_dir, 'vcf_files_metadata.csv') + + def tearDown(self): + shutil.rmtree(self.test_run_dir) + + def _build_validator(self, metadata_json, vcf_file): + create_mapping_file( + self.mapping_file, + vcf_files=[vcf_file], + fasta_files=[self.fasta_file], + assembly_reports=None, + ) + return NativeValidator( + mapping_file=self.mapping_file, + submission_dir=self.test_run_dir, + project_title='Test Project', + metadata_json=metadata_json, + validation_tasks=[METADATA_CHECK], + ) + + def _get_semantic_errors(self): + semantic_yaml = os.path.join( + self.test_run_dir, 'other_validations', 'metadata_semantic_check.yml' + ) + with open(semantic_yaml) as f: + return yaml.safe_load(f) or [] + + def _sample_in_vcf_errors(self, errors): + return [e for 
e in errors if 'sampleInVCF' in e.get('property', '')] + + + def test_af_vcf_without_sample_in_vcf(self): + """AF evidence type: omitting sampleInVCF should produce no error.""" + validator = self._build_validator( + os.path.join(self.sample_in_vcf_dir, 'metadata_af_no_sample_in_vcf.json'), + os.path.join(self.sample_in_vcf_dir, 'allele_freq.vcf') + ) + validator.validate() + errors = self._get_semantic_errors() + self.assertEqual(self._sample_in_vcf_errors(errors), []) + + def test_af_vcf_with_sample_in_vcf(self): + """AF evidence type: providing sampleInVCF is permitted and should produce no error.""" + validator = self._build_validator( + os.path.join(self.sample_in_vcf_dir, 'metadata_af_with_sample_in_vcf.json'), + os.path.join(self.sample_in_vcf_dir, 'allele_freq.vcf'), + ) + validator.validate() + errors = self._get_semantic_errors() + self.assertEqual(self._sample_in_vcf_errors(errors), []) + + def test_genotype_vcf_without_sample_in_vcf(self): + """Genotype evidence type: omitting sampleInVCF must produce one error per sample.""" + validator = self._build_validator( + os.path.join(self.sample_in_vcf_dir, 'metadata_genotype_no_sample_in_vcf.json'), + os.path.join(self.sample_in_vcf_dir, 'genotype.vcf'), + ) + validator.validate() + errors = self._get_semantic_errors() + sample_in_vcf_errors = self._sample_in_vcf_errors(errors) + self.assertEqual(len(sample_in_vcf_errors), 3) + + def test_genotype_vcf_with_sample_in_vcf(self): + """Genotype evidence type: providing sampleInVCF for every sample should produce no error.""" + validator = self._build_validator( + os.path.join(self.sample_in_vcf_dir, 'metadata_genotype_with_sample_in_vcf.json'), + os.path.join(self.sample_in_vcf_dir, 'genotype.vcf'), + ) + validator.validate() + errors = self._get_semantic_errors() + self.assertEqual(self._sample_in_vcf_errors(errors), []) diff --git a/tests/test_utils.py b/tests/test_utils.py index 6cf26d2c..d1be59a1 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ 
-1,10 +1,12 @@ import csv -def create_mapping_file(mapping_file, vcf_files, fasta_files, assembly_reports): +def create_mapping_file(mapping_file, vcf_files, fasta_files, assembly_reports=None): with open(mapping_file, 'w', encoding='UTF8') as f: writer = csv.writer(f) writer.writerow(['vcf', 'fasta', 'report']) + if not assembly_reports: + assembly_reports = ['' for _ in range(len(vcf_files))] for vcf_file, fasta_file, assembly_reports in zip(vcf_files, fasta_files, assembly_reports): writer.writerow([vcf_file, fasta_file, assembly_reports]) From 239356ce86c280fc91f699dc4a3a4c1cffc27098 Mon Sep 17 00:00:00 2001 From: tcezard Date: Thu, 12 Mar 2026 21:06:40 +0000 Subject: [PATCH 04/12] Document the case of allele frequency with no SampleInVCF --- docs/input_file_overview.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/input_file_overview.md b/docs/input_file_overview.md index a52399ab..aa7e9fd7 100644 --- a/docs/input_file_overview.md +++ b/docs/input_file_overview.md @@ -98,7 +98,8 @@ alias, which is a shortened identifier you must provide for each analysis. This is where you describe the biological samples used for your analyses. Each row describes one sample and must include the Analysis Alias to indicate which analysis it belongs to, and "Sample Name in VCF" which is the exact name of the -sample as it appears in the VCF file. +sample as it appears in the VCF file. If you are submitting a VCF without sample names (containing only allele frequencies), +you can leave the "Sample Name in VCF" column empty. We accept preregistered samples, which should be provided using BioSamples sample or sampleset accessions. Please ensure these are publicly accessible, as otherwise EVA will not be able to validate them. 
From c408c7a0bfd2dfe1319ff090cab90d25099c91f5 Mon Sep 17 00:00:00 2001 From: tcezard Date: Fri, 13 Mar 2026 09:02:07 +0000 Subject: [PATCH 05/12] Fix error in integration test --- .../metadata_af_with_sample_in_vcf.json | 34 +------------------ tests/test_native_validator_sample_in_vcf.py | 3 +- 2 files changed, 2 insertions(+), 35 deletions(-) diff --git a/tests/resources/sample_in_vcf_check/metadata_af_with_sample_in_vcf.json b/tests/resources/sample_in_vcf_check/metadata_af_with_sample_in_vcf.json index 6a4ed5ee..330ef9fc 100644 --- a/tests/resources/sample_in_vcf_check/metadata_af_with_sample_in_vcf.json +++ b/tests/resources/sample_in_vcf_check/metadata_af_with_sample_in_vcf.json @@ -1,33 +1 @@ -{ - "project": { - "title": "Test AF Project", - "description": "Project with allele frequency VCF", - "taxId": 9606, - "centre": "Test Centre" - }, - "sample": [ - { - "analysisAlias": ["AF1"], - "sampleInVCF": "sample1", - "bioSampleAccession": "SAME00001" - } - ], - "analysis": [ - { - "analysisTitle": "AF Analysis", - "analysisAlias": "AF1", - "description": "Allele frequency analysis", - "experimentType": "Whole genome sequencing", - "referenceGenome": "GCA_000001405.27" - } - ], - "files": [ - { - "analysisAlias": "AF1", - "fileName": "allele_freq.vcf", - "fileType": "vcf", - "md5": "b8ab2c9d58e5f430ce70783d8d0a0b88", - "fileSize": 458 - } - ] -} \ No newline at end of file +{"project": {"title": "Test AF Project", "description": "Project with allele frequency VCF", "taxId": 9606, "centre": "Test Centre"}, "sample": [{"analysisAlias": ["AF1"], "sampleInVCF": "sample1", "bioSampleAccession": "SAME00001"}], "analysis": [{"analysisTitle": "AF Analysis", "analysisAlias": "AF1", "description": "Allele frequency analysis", "experimentType": "Whole genome sequencing", "referenceGenome": "GCA_000001405.27"}], "files": [{"analysisAlias": "AF1", "fileName": "allele_freq.vcf", "fileType": "vcf", "md5": "b8ab2c9d58e5f430ce70783d8d0a0b88", "fileSize": 458}]} \ No newline at 
end of file diff --git a/tests/test_native_validator_sample_in_vcf.py b/tests/test_native_validator_sample_in_vcf.py index 85b93147..03da65fa 100644 --- a/tests/test_native_validator_sample_in_vcf.py +++ b/tests/test_native_validator_sample_in_vcf.py @@ -1,6 +1,5 @@ import os import shutil -import tempfile from unittest import TestCase import pytest @@ -44,7 +43,7 @@ def _build_validator(self, metadata_json, vcf_file): def _get_semantic_errors(self): semantic_yaml = os.path.join( - self.test_run_dir, 'other_validations', 'metadata_semantic_check.yml' + self.test_run_dir, 'validation_output', 'other_validations', 'metadata_semantic_check.yml' ) with open(semantic_yaml) as f: return yaml.safe_load(f) or [] From 3c618b2e8fbba1a5f3d71c0f27b13a5f4a2c3a43 Mon Sep 17 00:00:00 2001 From: tcezard Date: Fri, 13 Mar 2026 15:47:35 +0000 Subject: [PATCH 06/12] First attempt at integration tests --- .github/workflows/integration-tests.yml | 60 ++++++++++++++++++++ tests/test_native_validator_sample_in_vcf.py | 10 +++- 2 files changed, 67 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/integration-tests.yml diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml new file mode 100644 index 00000000..ab01c03c --- /dev/null +++ b/.github/workflows/integration-tests.yml @@ -0,0 +1,60 @@ +name: Integration tests + +on: + push: + branches: [ main ] + pull_request: + branches: [ main] + +jobs: + integration-tests: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python 3.11 + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Set up Java + uses: actions/setup-java@v4 + with: + distribution: temurin + java-version: "21" + + - name: Install Nextflow + run: | + curl -s https://get.nextflow.io | bash + sudo mv nextflow /usr/local/bin/ + nextflow -version + + - name: Install biovalidator + run: | + npm install -g @elixir-europe/biovalidator + biovalidator --version + + - 
name: Install vcf-validator + run: | + wget -q https://github.com/EBIvariation/vcf-validator/releases/latest/download/vcf_validator_linux + chmod +x vcf_validator_linux + sudo mv vcf_validator_linux /usr/local/bin/vcf_validator + vcf_validator --version + + - name: Install vcf-assembly-checker + run: | + wget -q https://github.com/EBIvariation/vcf-validator/releases/latest/download/vcf_assembly_checker_linux + chmod +x vcf_assembly_checker_linux + sudo mv vcf_assembly_checker_linux /usr/local/bin/vcf_assembly_checker + vcf_assembly_checker --version + + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + pip install pytest + python -m pip install . + + - name: Run integration tests + run: | + PYTHONPATH=. pytest tests -m integration diff --git a/tests/test_native_validator_sample_in_vcf.py b/tests/test_native_validator_sample_in_vcf.py index 03da65fa..cb487180 100644 --- a/tests/test_native_validator_sample_in_vcf.py +++ b/tests/test_native_validator_sample_in_vcf.py @@ -26,7 +26,7 @@ def setUp(self): def tearDown(self): shutil.rmtree(self.test_run_dir) - def _build_validator(self, metadata_json, vcf_file): + def _build_validator(self, metadata_json, vcf_file, tasks): create_mapping_file( self.mapping_file, vcf_files=[vcf_file], @@ -38,7 +38,7 @@ def _build_validator(self, metadata_json, vcf_file): submission_dir=self.test_run_dir, project_title='Test Project', metadata_json=metadata_json, - validation_tasks=[METADATA_CHECK], + validation_tasks=tasks, ) def _get_semantic_errors(self): @@ -56,7 +56,8 @@ def test_af_vcf_without_sample_in_vcf(self): """AF evidence type: omitting sampleInVCF should produce no error.""" validator = self._build_validator( os.path.join(self.sample_in_vcf_dir, 'metadata_af_no_sample_in_vcf.json'), - os.path.join(self.sample_in_vcf_dir, 'allele_freq.vcf') + os.path.join(self.sample_in_vcf_dir, 'allele_freq.vcf'), + [METADATA_CHECK] ) validator.validate() errors = self._get_semantic_errors() @@ -67,6 +68,7 @@ 
def test_af_vcf_with_sample_in_vcf(self): validator = self._build_validator( os.path.join(self.sample_in_vcf_dir, 'metadata_af_with_sample_in_vcf.json'), os.path.join(self.sample_in_vcf_dir, 'allele_freq.vcf'), + [METADATA_CHECK] ) validator.validate() errors = self._get_semantic_errors() @@ -77,6 +79,7 @@ def test_genotype_vcf_without_sample_in_vcf(self): validator = self._build_validator( os.path.join(self.sample_in_vcf_dir, 'metadata_genotype_no_sample_in_vcf.json'), os.path.join(self.sample_in_vcf_dir, 'genotype.vcf'), + [METADATA_CHECK] ) validator.validate() errors = self._get_semantic_errors() @@ -88,6 +91,7 @@ def test_genotype_vcf_with_sample_in_vcf(self): validator = self._build_validator( os.path.join(self.sample_in_vcf_dir, 'metadata_genotype_with_sample_in_vcf.json'), os.path.join(self.sample_in_vcf_dir, 'genotype.vcf'), + [METADATA_CHECK] ) validator.validate() errors = self._get_semantic_errors() From fd7657fec88e62fb3e314dc540b9e75cf5281fb8 Mon Sep 17 00:00:00 2001 From: tcezard Date: Fri, 13 Mar 2026 15:54:41 +0000 Subject: [PATCH 07/12] Fix biovalidator and nextflow install --- .github/workflows/integration-tests.yml | 42 +++++++++++-------------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index ab01c03c..0912307d 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -6,6 +6,10 @@ on: pull_request: branches: [ main] +env: + VCF_VALIDATOR_VERSION: "0.10.0" + NXF_VER: "23.10.0" + jobs: integration-tests: runs-on: ubuntu-latest @@ -18,36 +22,28 @@ jobs: with: python-version: "3.11" - - name: Set up Java - uses: actions/setup-java@v4 - with: - distribution: temurin - java-version: "21" + - name: Install Java and Node + run: sudo apt update && sudo apt install -y default-jdk nodejs npm git curl - - name: Install Nextflow + - name: Install vcf-validator and vcf-assembly-checker run: | - curl -s 
https://get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ - nextflow -version + curl -LJo /usr/local/bin/vcf_validator \ + https://github.com/EBIvariation/vcf-validator/releases/download/v${VCF_VALIDATOR_VERSION}/vcf_validator_linux + curl -LJo /usr/local/bin/vcf_assembly_checker \ + https://github.com/EBIvariation/vcf-validator/releases/download/v${VCF_VALIDATOR_VERSION}/vcf_assembly_checker_linux + chmod 755 /usr/local/bin/vcf_validator /usr/local/bin/vcf_assembly_checker - name: Install biovalidator run: | - npm install -g @elixir-europe/biovalidator - biovalidator --version + git clone https://github.com/elixir-europe/biovalidator.git + cd biovalidator + npm install + sudo npm link - - name: Install vcf-validator - run: | - wget -q https://github.com/EBIvariation/vcf-validator/releases/latest/download/vcf_validator_linux - chmod +x vcf_validator_linux - sudo mv vcf_validator_linux /usr/local/bin/vcf_validator - vcf_validator --version - - - name: Install vcf-assembly-checker + - name: Install Nextflow run: | - wget -q https://github.com/EBIvariation/vcf-validator/releases/latest/download/vcf_assembly_checker_linux - chmod +x vcf_assembly_checker_linux - sudo mv vcf_assembly_checker_linux /usr/local/bin/vcf_assembly_checker - vcf_assembly_checker --version + curl -L "https://github.com/nextflow-io/nextflow/releases/download/v${NXF_VER}/nextflow-${NXF_VER}-all" | bash + sudo mv nextflow /usr/local/bin/ - name: Install Python dependencies run: | From 2198770110dcaddedb099c23ccabe9682547244e Mon Sep 17 00:00:00 2001 From: tcezard Date: Fri, 13 Mar 2026 15:59:45 +0000 Subject: [PATCH 08/12] Use newer vcf-validator --- .github/workflows/integration-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml index 0912307d..11073128 100644 --- a/.github/workflows/integration-tests.yml +++ b/.github/workflows/integration-tests.yml @@ -7,7 +7,7 @@ on: branches: [ 
main] env: - VCF_VALIDATOR_VERSION: "0.10.0" + VCF_VALIDATOR_VERSION: "0.10.2" NXF_VER: "23.10.0" jobs: From f845974b01b2a3a7279f0d63efa412f2f03ca098 Mon Sep 17 00:00:00 2001 From: Timothee Cezard Date: Mon, 16 Mar 2026 18:03:17 +0000 Subject: [PATCH 09/12] Apply suggestions from code review Co-authored-by: April Shen --- docs/input_file_overview.md | 4 ++-- eva_sub_cli/nextflow/validation.nf | 2 +- eva_sub_cli/semantic_metadata.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/input_file_overview.md b/docs/input_file_overview.md index aa7e9fd7..c2f302e2 100644 --- a/docs/input_file_overview.md +++ b/docs/input_file_overview.md @@ -98,8 +98,8 @@ alias, which is a shortened identifier you must provide for each analysis. This is where you describe the biological samples used for your analyses. Each row describes one sample and must include the Analysis Alias to indicate which analysis it belongs to, and "Sample Name in VCF" which is the exact name of the -sample as it appears in the VCF file. If you are submitting a VCF without sample names (containing only allele frequencies) -Then you can omit to fill the "Sample Name in VCF" column. +sample as it appears in the VCF file. If you are submitting a VCF without sample names (containing only allele frequencies), +then you can omit the "Sample Name in VCF" column. We accept preregistered samples, which should be provided using BioSamples sample or sampleset accessions. Please ensure these are publicly accessible, as otherwise EVA will not be able to validate them. 
diff --git a/eva_sub_cli/nextflow/validation.nf b/eva_sub_cli/nextflow/validation.nf index 89c5d58d..e626b6cf 100644 --- a/eva_sub_cli/nextflow/validation.nf +++ b/eva_sub_cli/nextflow/validation.nf @@ -117,7 +117,7 @@ workflow { insdc_checker(metadata_json, fasta_to_vcfs) } if (params.tasks.contains(METADATA_CHECK)) { - if (!evidence_type_results){ + if (!evidence_type_results) { evidence_type_check(metadata_json, vcf_files.collect()) evidence_type_results = evidence_type_check.out.evidence_type_checker_yml } diff --git a/eva_sub_cli/semantic_metadata.py b/eva_sub_cli/semantic_metadata.py index cd27343d..b4fa1112 100644 --- a/eva_sub_cli/semantic_metadata.py +++ b/eva_sub_cli/semantic_metadata.py @@ -325,7 +325,7 @@ def check_all_samples_have_sample_in_vcf(self): json_path = f'/{SAMPLE_KEY}/{idx}/{SAMPLE_IN_VCF_KEY}' analysis_aliases = sample.get(ANALYSIS_ALIAS_KEY, []) if any([self.evidence_type_results.get(analysis_alias, {}).get('evidence_type') != 'allele_frequency' for - analysis_alias in analysis_aliases]): + analysis_alias in analysis_aliases]): # SampleInVCF is required if sample.get(SAMPLE_IN_VCF_KEY) is None or sample.get(SAMPLE_IN_VCF_KEY) == '': self.add_error(json_path, f'{SAMPLE_IN_VCF_KEY} must be provided when Genotypes are present in the VCF file') From 90905183a21ea32e758cbd11ab8bf12784704ef6 Mon Sep 17 00:00:00 2001 From: Timothee Cezard Date: Mon, 16 Mar 2026 18:04:02 +0000 Subject: [PATCH 10/12] Update tests/test_native_validator_sample_in_vcf.py Co-authored-by: April Shen --- tests/test_native_validator_sample_in_vcf.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test_native_validator_sample_in_vcf.py b/tests/test_native_validator_sample_in_vcf.py index cb487180..307704cb 100644 --- a/tests/test_native_validator_sample_in_vcf.py +++ b/tests/test_native_validator_sample_in_vcf.py @@ -10,8 +10,6 @@ from tests.test_utils import create_mapping_file - - @pytest.mark.integration('You need to install java, nextflow, vcf_validator, 
vcf_assembly_checker, biovalidator (and md5sum, stat for mac)') class TestNativeValidatorSampleInVCF(TestCase): resource_dir = os.path.join(os.path.dirname(__file__), 'resources') From d7be8e81ec9497951b2c177f08ba7af25aa086c6 Mon Sep 17 00:00:00 2001 From: tcezard Date: Tue, 17 Mar 2026 16:19:07 +0000 Subject: [PATCH 11/12] Use preexisting evidence types samples_checker.py --- eva_sub_cli/executables/samples_checker.py | 25 +++++++++++----------- eva_sub_cli/nextflow/validation.nf | 11 ++++++++-- tests/test_samples_checker.py | 3 ++- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/eva_sub_cli/executables/samples_checker.py b/eva_sub_cli/executables/samples_checker.py index 1627fdf7..c9823caf 100644 --- a/eva_sub_cli/executables/samples_checker.py +++ b/eva_sub_cli/executables/samples_checker.py @@ -54,7 +54,7 @@ def compare_names_in_files_and_samples(sample_name_in_analysis, sample_name_per_ more_metadata_submitted_files) -def compare_all_analysis(metadata, files_per_analysis): +def compare_all_analysis(metadata, files_per_analysis, evidence_type_results): overall_differences = False results_per_analysis_alias = {} all_analysis_alias = set(metadata.samples_per_analysis) | set(files_per_analysis) @@ -65,7 +65,7 @@ def compare_all_analysis(metadata, files_per_analysis): for file_path in files_per_analysis.get(analysis_alias, []) } - if need_to_check_samples(sample_name_per_file): + if need_to_check_samples(evidence_type_results, analysis_alias): ( has_difference, more_per_submitted_files_metadata, more_submitted_files_metadata, more_metadata_submitted_files @@ -89,13 +89,9 @@ def compare_all_analysis(metadata, files_per_analysis): return overall_differences, results_per_analysis_alias -def need_to_check_samples(sample_name_per_file): - no_samples_in_vcf = all(len(v) == 0 for v in sample_name_per_file.values()) - if no_samples_in_vcf: - evidence_types_for_vcf_files = [detect_vcf_evidence_type(vcf_file) for vcf_file in sample_name_per_file.keys()] - if 
set(evidence_types_for_vcf_files) == {'allele_frequency'}: - return False - +def need_to_check_samples(evidence_type_results, analysis_alias): + if evidence_type_results.get(analysis_alias, {}).get('evidence_type') == 'allele_frequency': + return False return True @@ -107,14 +103,16 @@ def write_result_yaml(output_yaml, overall_differences, results_per_analysis_ali }, stream=open_yaml) -def check_sample_name_concordance(metadata_json, vcf_files, output_yaml): +def check_sample_name_concordance(metadata_json, vcf_files, output_yaml, evidence_type_result_file): """ Take the metadata following EVA standard and formatted in JSON then compare the sample names in it to the ones found in the VCF files """ metadata = EvaMetadataJson(metadata_json) + with open(evidence_type_result_file) as open_yaml: + evidence_type_results = yaml.safe_load(open_yaml) file_path_per_analysis = associate_vcf_path_with_analysis(metadata, vcf_files) - overall_differences, results_per_analysis_alias = compare_all_analysis(metadata, file_path_per_analysis) + overall_differences, results_per_analysis_alias = compare_all_analysis(metadata, file_path_per_analysis, evidence_type_results) write_result_yaml(output_yaml, overall_differences, results_per_analysis_alias) @@ -127,7 +125,10 @@ def main(): help='Path to the vcf files to compare to the metadata') arg_parser.add_argument('--output_yaml', required=True, dest='output_yaml', help='Path to the location of the results') + arg_parser.add_argument('--evidence_type_results', required=True, dest='evidence_type_results', + help='Results of the evidence check') + args = arg_parser.parse_args() logging_config.add_stdout_handler() - check_sample_name_concordance(args.metadata_json, args.vcf_files, args.output_yaml) + check_sample_name_concordance(args.metadata_json, args.vcf_files, args.output_yaml, args.evidence_type_results) diff --git a/eva_sub_cli/nextflow/validation.nf b/eva_sub_cli/nextflow/validation.nf index e626b6cf..4d287401 100644 --- 
a/eva_sub_cli/nextflow/validation.nf +++ b/eva_sub_cli/nextflow/validation.nf @@ -125,7 +125,11 @@ workflow { metadata_semantic_check(metadata_json, evidence_type_results) } if (params.tasks.contains(SAMPLE_CHECK)) { - sample_name_concordance(metadata_json, vcf_files.collect()) + if (!evidence_type_results){ + evidence_type_check(metadata_json, vcf_files.collect()) + evidence_type_results = evidence_type_check.out.evidence_type_checker_yml + } + sample_name_concordance(metadata_json, vcf_files.collect(), evidence_type_results) } } @@ -301,6 +305,7 @@ process sample_name_concordance { input: path(metadata_json) path(vcf_files) + path(evidence_type_results) output: path "sample_checker.yml", emit: sample_checker_yml @@ -308,7 +313,9 @@ process sample_name_concordance { script: """ - $params.python_scripts.samples_checker --metadata_json $metadata_json --vcf_files $vcf_files --output_yaml sample_checker.yml > sample_checker.log 2>&1 + $params.python_scripts.samples_checker --metadata_json $metadata_json \ + --vcf_files $vcf_files --output_yaml sample_checker.yml \ + --evidence_type_results $evidence_type_results > sample_checker.log 2>&1 """ } diff --git a/tests/test_samples_checker.py b/tests/test_samples_checker.py index 5508e602..bef7bd1d 100644 --- a/tests/test_samples_checker.py +++ b/tests/test_samples_checker.py @@ -49,7 +49,8 @@ def test_check_sample_name_concordance_absolute_paths(self): os.remove(updated_metadata) def run_and_assert_sample_check(self, metadata_json, vcf_files): - check_sample_name_concordance(metadata_json, vcf_files, self.output_yaml) + evidence_type_yaml = os.path.join(self.resource_dir, 'sample_checker', 'evidence_type.yaml') + check_sample_name_concordance(metadata_json, vcf_files, self.output_yaml, evidence_type_yaml) expected_results = { 'overall_differences': True, 'results_per_analysis': { From c7edfd49dfbefee26e5c8d78f238b12643300344 Mon Sep 17 00:00:00 2001 From: tcezard Date: Tue, 17 Mar 2026 16:34:39 +0000 Subject: [PATCH 12/12] 
Add missing evidence type file --- tests/resources/sample_checker/evidence_type.yaml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 tests/resources/sample_checker/evidence_type.yaml diff --git a/tests/resources/sample_checker/evidence_type.yaml b/tests/resources/sample_checker/evidence_type.yaml new file mode 100644 index 00000000..154be1da --- /dev/null +++ b/tests/resources/sample_checker/evidence_type.yaml @@ -0,0 +1,6 @@ +VD1: + evidence_type: genotype +VD2: + evidence_type: genotype +VD3: + evidence_type: genotype